1 /* 2 * Copyright (c) 2011-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 38 * 39 * Chains are the in-memory version on media objects (volume header, inodes, 40 * indirect blocks, data blocks, etc). Chains represent a portion of the 41 * HAMMER2 topology. 42 * 43 * Chains are no-longer delete-duplicated. Instead, the original in-memory 44 * chain will be moved along with its block reference (e.g. for things like 45 * renames, hardlink operations, modifications, etc), and will be indexed 46 * on a secondary list for flush handling instead of propagating a flag 47 * upward to the root. 48 * 49 * Concurrent front-end operations can still run against backend flushes 50 * as long as they do not cross the current flush boundary. An operation 51 * running above the current flush (in areas not yet flushed) can become 52 * part of the current flush while ano peration running below the current 53 * flush can become part of the next flush. 54 */ 55 #include <sys/cdefs.h> 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/types.h> 59 #include <sys/lock.h> 60 #include <sys/buf.h> 61 62 #include <crypto/sha2/sha2.h> 63 64 #include "hammer2.h" 65 66 static hammer2_chain_t *hammer2_chain_create_indirect( 67 hammer2_chain_t *parent, 68 hammer2_key_t key, int keybits, 69 hammer2_tid_t mtid, int for_type, int *errorp); 70 static int hammer2_chain_delete_obref(hammer2_chain_t *parent, 71 hammer2_chain_t *chain, 72 hammer2_tid_t mtid, int flags, 73 hammer2_blockref_t *obref); 74 static hammer2_chain_t *hammer2_combined_find( 75 hammer2_chain_t *parent, 76 hammer2_blockref_t *base, int count, 77 hammer2_key_t *key_nextp, 78 hammer2_key_t key_beg, hammer2_key_t key_end, 79 hammer2_blockref_t **brefp); 80 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain, 81 int depth); 82 static void hammer2_chain_lru_flush(hammer2_pfs_t *pmp); 83 84 /* 85 * There are many degenerate situations where an extreme rate of console 86 * output can occur from warnings and errors. Make sure this output does 87 * not impede operations. 88 */ 89 static struct krate krate_h2chk = { .freq = 5 }; 90 static struct krate krate_h2me = { .freq = 1 }; 91 static struct krate krate_h2em = { .freq = 1 }; 92 93 /* 94 * Basic RBTree for chains (core.rbtree). 95 */ 96 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 97 98 int 99 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 100 { 101 hammer2_key_t c1_beg; 102 hammer2_key_t c1_end; 103 hammer2_key_t c2_beg; 104 hammer2_key_t c2_end; 105 106 /* 107 * Compare chains. Overlaps are not supposed to happen and catch 108 * any software issues early we count overlaps as a match. 109 */ 110 c1_beg = chain1->bref.key; 111 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1; 112 c2_beg = chain2->bref.key; 113 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1; 114 115 if (c1_end < c2_beg) /* fully to the left */ 116 return(-1); 117 if (c1_beg > c2_end) /* fully to the right */ 118 return(1); 119 return(0); /* overlap (must not cross edge boundary) */ 120 } 121 122 /* 123 * Assert that a chain has no media data associated with it. 124 */ 125 static __inline void 126 hammer2_chain_assert_no_data(hammer2_chain_t *chain) 127 { 128 KKASSERT(chain->dio == NULL); 129 if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME && 130 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP && 131 chain->data) { 132 panic("hammer2_chain_assert_no_data: chain %p still has data", 133 chain); 134 } 135 } 136 137 /* 138 * Make a chain visible to the flusher. The flusher operates using a top-down 139 * recursion based on the ONFLUSH flag. It locates MODIFIED and UPDATE chains, 140 * flushes them, and updates blocks back to the volume root. 141 * 142 * This routine sets the ONFLUSH flag upward from the triggering chain until 143 * it hits an inode root or the volume root. Inode chains serve as inflection 144 * points, requiring the flusher to bridge across trees. Inodes include 145 * regular inodes, PFS roots (pmp->iroot), and the media super root 146 * (spmp->iroot). 147 */ 148 void 149 hammer2_chain_setflush(hammer2_chain_t *chain) 150 { 151 hammer2_chain_t *parent; 152 153 if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 154 hammer2_spin_sh(&chain->core.spin); 155 while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 156 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH); 157 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) 158 break; 159 if ((parent = chain->parent) == NULL) 160 break; 161 hammer2_spin_sh(&parent->core.spin); 162 hammer2_spin_unsh(&chain->core.spin); 163 chain = parent; 164 } 165 hammer2_spin_unsh(&chain->core.spin); 166 } 167 } 168 169 /* 170 * Allocate a new disconnected chain element representing the specified 171 * bref. chain->refs is set to 1 and the passed bref is copied to 172 * chain->bref. chain->bytes is derived from the bref. 173 * 174 * chain->pmp inherits pmp unless the chain is an inode (other than the 175 * super-root inode). 176 * 177 * NOTE: Returns a referenced but unlocked (because there is no core) chain. 178 */ 179 hammer2_chain_t * 180 hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp, 181 hammer2_blockref_t *bref) 182 { 183 hammer2_chain_t *chain; 184 u_int bytes; 185 186 /* 187 * Special case - radix of 0 indicates a chain that does not 188 * need a data reference (context is completely embedded in the 189 * bref). 190 */ 191 if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX)) 192 bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 193 else 194 bytes = 0; 195 196 switch(bref->type) { 197 case HAMMER2_BREF_TYPE_INODE: 198 case HAMMER2_BREF_TYPE_INDIRECT: 199 case HAMMER2_BREF_TYPE_DATA: 200 case HAMMER2_BREF_TYPE_DIRENT: 201 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 202 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 203 case HAMMER2_BREF_TYPE_FREEMAP: 204 case HAMMER2_BREF_TYPE_VOLUME: 205 chain = kmalloc_obj(sizeof(*chain), hmp->mchain, 206 M_WAITOK | M_ZERO); 207 atomic_add_long(&hammer2_chain_allocs, 1); 208 break; 209 case HAMMER2_BREF_TYPE_EMPTY: 210 default: 211 panic("hammer2_chain_alloc: unrecognized blockref type: %d", 212 bref->type); 213 break; 214 } 215 216 /* 217 * Initialize the new chain structure. pmp must be set to NULL for 218 * chains belonging to the super-root topology of a device mount. 219 */ 220 if (pmp == hmp->spmp) 221 chain->pmp = NULL; 222 else 223 chain->pmp = pmp; 224 225 chain->hmp = hmp; 226 chain->bref = *bref; 227 chain->bytes = bytes; 228 chain->refs = 1; 229 chain->flags = HAMMER2_CHAIN_ALLOCATED; 230 lockinit(&chain->diolk, "chdio", 0, 0); 231 232 /* 233 * Set the PFS boundary flag if this chain represents a PFS root. 234 */ 235 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT) 236 atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY); 237 hammer2_chain_core_init(chain); 238 239 return (chain); 240 } 241 242 /* 243 * Initialize a chain's core structure. This structure used to be allocated 244 * but is now embedded. 245 * 246 * The core is not locked. No additional refs on the chain are made. 247 */ 248 void 249 hammer2_chain_core_init(hammer2_chain_t *chain) 250 { 251 RB_INIT(&chain->core.rbtree); /* live chains */ 252 hammer2_mtx_init(&chain->lock, "h2chain"); 253 hammer2_spin_init(&chain->core.spin, "h2chain"); 254 } 255 256 /* 257 * Add a reference to a chain element, preventing its destruction. 258 * 259 * (can be called with spinlock held) 260 */ 261 void 262 hammer2_chain_ref(hammer2_chain_t *chain) 263 { 264 if (atomic_fetchadd_int(&chain->refs, 1) == 0) { 265 /* 266 * Just flag that the chain was used and should be recycled 267 * on the LRU if it encounters it later. 268 */ 269 if (chain->flags & HAMMER2_CHAIN_ONLRU) 270 atomic_set_int(&chain->flags, HAMMER2_CHAIN_LRUHINT); 271 272 #if 0 273 /* 274 * REMOVED - reduces contention, lru_list is more heuristical 275 * now. 276 * 277 * 0->non-zero transition must ensure that chain is removed 278 * from the LRU list. 279 * 280 * NOTE: Already holding lru_spin here so we cannot call 281 * hammer2_chain_ref() to get it off lru_list, do 282 * it manually. 283 */ 284 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 285 hammer2_pfs_t *pmp = chain->pmp; 286 hammer2_spin_ex(&pmp->lru_spin); 287 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 288 atomic_add_int(&pmp->lru_count, -1); 289 atomic_clear_int(&chain->flags, 290 HAMMER2_CHAIN_ONLRU); 291 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 292 } 293 hammer2_spin_unex(&pmp->lru_spin); 294 } 295 #endif 296 } 297 } 298 299 /* 300 * Ref a locked chain and force the data to be held across an unlock. 301 * Chain must be currently locked. The user of the chain who desires 302 * to release the hold must call hammer2_chain_lock_unhold() to relock 303 * and unhold the chain, then unlock normally, or may simply call 304 * hammer2_chain_drop_unhold() (which is safer against deadlocks). 305 */ 306 void 307 hammer2_chain_ref_hold(hammer2_chain_t *chain) 308 { 309 atomic_add_int(&chain->lockcnt, 1); 310 hammer2_chain_ref(chain); 311 } 312 313 /* 314 * Insert the chain in the core rbtree. 315 * 316 * Normal insertions are placed in the live rbtree. Insertion of a deleted 317 * chain is a special case used by the flush code that is placed on the 318 * unstaged deleted list to avoid confusing the live view. 319 */ 320 #define HAMMER2_CHAIN_INSERT_SPIN 0x0001 321 #define HAMMER2_CHAIN_INSERT_LIVE 0x0002 322 #define HAMMER2_CHAIN_INSERT_RACE 0x0004 323 324 static 325 int 326 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain, 327 int flags, int generation) 328 { 329 hammer2_chain_t *xchain; 330 int error = 0; 331 332 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 333 hammer2_spin_ex(&parent->core.spin); 334 335 /* 336 * Interlocked by spinlock, check for race 337 */ 338 if ((flags & HAMMER2_CHAIN_INSERT_RACE) && 339 parent->core.generation != generation) { 340 error = HAMMER2_ERROR_EAGAIN; 341 goto failed; 342 } 343 344 /* 345 * Insert chain 346 */ 347 xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain); 348 KASSERT(xchain == NULL, 349 ("hammer2_chain_insert: collision %p %p (key=%016jx)", 350 chain, xchain, chain->bref.key)); 351 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 352 chain->parent = parent; 353 ++parent->core.chain_count; 354 ++parent->core.generation; /* XXX incs for _get() too, XXX */ 355 356 /* 357 * We have to keep track of the effective live-view blockref count 358 * so the create code knows when to push an indirect block. 359 */ 360 if (flags & HAMMER2_CHAIN_INSERT_LIVE) 361 atomic_add_int(&parent->core.live_count, 1); 362 failed: 363 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 364 hammer2_spin_unex(&parent->core.spin); 365 return error; 366 } 367 368 /* 369 * Drop the caller's reference to the chain. When the ref count drops to 370 * zero this function will try to disassociate the chain from its parent and 371 * deallocate it, then recursely drop the parent using the implied ref 372 * from the chain's chain->parent. 373 * 374 * Nobody should own chain's mutex on the 1->0 transition, unless this drop 375 * races an acquisition by another cpu. Therefore we can loop if we are 376 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again 377 * race against another drop. 378 */ 379 void 380 hammer2_chain_drop(hammer2_chain_t *chain) 381 { 382 u_int refs; 383 384 KKASSERT(chain->refs > 0); 385 386 while (chain) { 387 refs = chain->refs; 388 cpu_ccfence(); 389 KKASSERT(refs > 0); 390 391 if (refs == 1) { 392 if (hammer2_mtx_ex_try(&chain->lock) == 0) 393 chain = hammer2_chain_lastdrop(chain, 0); 394 /* retry the same chain, or chain from lastdrop */ 395 } else { 396 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 397 break; 398 /* retry the same chain */ 399 } 400 cpu_pause(); 401 } 402 } 403 404 /* 405 * Unhold a held and probably not-locked chain, ensure that the data is 406 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive 407 * lock and then simply unlocking the chain. 408 */ 409 void 410 hammer2_chain_unhold(hammer2_chain_t *chain) 411 { 412 u_int lockcnt; 413 int iter = 0; 414 415 for (;;) { 416 lockcnt = chain->lockcnt; 417 cpu_ccfence(); 418 if (lockcnt > 1) { 419 if (atomic_cmpset_int(&chain->lockcnt, 420 lockcnt, lockcnt - 1)) { 421 break; 422 } 423 } else if (hammer2_mtx_ex_try(&chain->lock) == 0) { 424 hammer2_chain_unlock(chain); 425 break; 426 } else { 427 /* 428 * This situation can easily occur on SMP due to 429 * the gap inbetween the 1->0 transition and the 430 * final unlock. We cannot safely block on the 431 * mutex because lockcnt might go above 1. 432 * 433 * XXX Sleep for one tick if it takes too long. 434 */ 435 if (++iter > 1000) { 436 if (iter > 1000 + hz) { 437 kprintf("hammer2: h2race1 %p\n", chain); 438 iter = 1000; 439 } 440 tsleep(&iter, 0, "h2race1", 1); 441 } 442 cpu_pause(); 443 } 444 } 445 } 446 447 void 448 hammer2_chain_drop_unhold(hammer2_chain_t *chain) 449 { 450 hammer2_chain_unhold(chain); 451 hammer2_chain_drop(chain); 452 } 453 454 void 455 hammer2_chain_rehold(hammer2_chain_t *chain) 456 { 457 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED); 458 atomic_add_int(&chain->lockcnt, 1); 459 hammer2_chain_unlock(chain); 460 } 461 462 /* 463 * Handles the (potential) last drop of chain->refs from 1->0. Called with 464 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are 465 * possible against refs and lockcnt. We must dispose of the mutex on chain. 466 * 467 * This function returns an unlocked chain for recursive drop or NULL. It 468 * can return the same chain if it determines it has raced another ref. 469 * 470 * -- 471 * 472 * When two chains need to be recursively dropped we use the chain we 473 * would otherwise free to placehold the additional chain. It's a bit 474 * convoluted but we can't just recurse without potentially blowing out 475 * the kernel stack. 476 * 477 * The chain cannot be freed if it has any children. 478 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it. 479 * The chain cannot be freed if flagged UPDATE unless we can dispose of it. 480 * Any dedup registration can remain intact. 481 * 482 * The core spinlock is allowed to nest child-to-parent (not parent-to-child). 483 */ 484 static 485 hammer2_chain_t * 486 hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth) 487 { 488 hammer2_pfs_t *pmp; 489 hammer2_dev_t *hmp; 490 hammer2_chain_t *parent; 491 hammer2_chain_t *rdrop; 492 493 /* 494 * We need chain's spinlock to interlock the sub-tree test. 495 * We already have chain's mutex, protecting chain->parent. 496 * 497 * Remember that chain->refs can be in flux. 498 */ 499 hammer2_spin_ex(&chain->core.spin); 500 501 if (chain->parent != NULL) { 502 /* 503 * If the chain has a parent the UPDATE bit prevents scrapping 504 * as the chain is needed to properly flush the parent. Try 505 * to complete the 1->0 transition and return NULL. Retry 506 * (return chain) if we are unable to complete the 1->0 507 * transition, else return NULL (nothing more to do). 508 * 509 * If the chain has a parent the MODIFIED bit prevents 510 * scrapping. 511 * 512 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 513 */ 514 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 515 HAMMER2_CHAIN_MODIFIED)) { 516 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 517 hammer2_spin_unex(&chain->core.spin); 518 hammer2_chain_assert_no_data(chain); 519 hammer2_mtx_unlock(&chain->lock); 520 chain = NULL; 521 } else { 522 hammer2_spin_unex(&chain->core.spin); 523 hammer2_mtx_unlock(&chain->lock); 524 } 525 return (chain); 526 } 527 /* spinlock still held */ 528 } else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME || 529 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) { 530 /* 531 * Retain the static vchain and fchain. Clear bits that 532 * are not relevant. Do not clear the MODIFIED bit, 533 * and certainly do not put it on the delayed-flush queue. 534 */ 535 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 536 } else { 537 /* 538 * The chain has no parent and can be flagged for destruction. 539 * Since it has no parent, UPDATE can also be cleared. 540 */ 541 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 542 if (chain->flags & HAMMER2_CHAIN_UPDATE) 543 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 544 545 /* 546 * If the chain has children we must propagate the DESTROY 547 * flag downward and rip the disconnected topology apart. 548 * This is accomplished by calling hammer2_flush() on the 549 * chain. 550 * 551 * Any dedup is already handled by the underlying DIO, so 552 * we do not have to specifically flush it here. 553 */ 554 if (chain->core.chain_count) { 555 hammer2_spin_unex(&chain->core.spin); 556 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 557 HAMMER2_FLUSH_ALL); 558 hammer2_mtx_unlock(&chain->lock); 559 560 return(chain); /* retry drop */ 561 } 562 563 /* 564 * Otherwise we can scrap the MODIFIED bit if it is set, 565 * and continue along the freeing path. 566 * 567 * Be sure to clean-out any dedup bits. Without a parent 568 * this chain will no longer be visible to the flush code. 569 * Easy check data_off to avoid the volume root. 570 */ 571 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 572 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 573 atomic_add_long(&hammer2_count_modified_chains, -1); 574 if (chain->pmp) 575 hammer2_pfs_memory_wakeup(chain->pmp, -1); 576 } 577 /* spinlock still held */ 578 } 579 580 /* spinlock still held */ 581 582 /* 583 * If any children exist we must leave the chain intact with refs == 0. 584 * They exist because chains are retained below us which have refs or 585 * may require flushing. 586 * 587 * Retry (return chain) if we fail to transition the refs to 0, else 588 * return NULL indication nothing more to do. 589 * 590 * Chains with children are NOT put on the LRU list. 591 */ 592 if (chain->core.chain_count) { 593 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 594 hammer2_spin_unex(&chain->core.spin); 595 hammer2_chain_assert_no_data(chain); 596 hammer2_mtx_unlock(&chain->lock); 597 chain = NULL; 598 } else { 599 hammer2_spin_unex(&chain->core.spin); 600 hammer2_mtx_unlock(&chain->lock); 601 } 602 return (chain); 603 } 604 /* spinlock still held */ 605 /* no chains left under us */ 606 607 /* 608 * chain->core has no children left so no accessors can get to our 609 * chain from there. Now we have to lock the parent core to interlock 610 * remaining possible accessors that might bump chain's refs before 611 * we can safely drop chain's refs with intent to free the chain. 612 */ 613 hmp = chain->hmp; 614 pmp = chain->pmp; /* can be NULL */ 615 rdrop = NULL; 616 617 parent = chain->parent; 618 619 /* 620 * WARNING! chain's spin lock is still held here, and other spinlocks 621 * will be acquired and released in the code below. We 622 * cannot be making fancy procedure calls! 623 */ 624 625 /* 626 * We can cache the chain if it is associated with a pmp 627 * and not flagged as being destroyed or requesting a full 628 * release. In this situation the chain is not removed 629 * from its parent, i.e. it can still be looked up. 630 * 631 * We intentionally do not cache DATA chains because these 632 * were likely used to load data into the logical buffer cache 633 * and will not be accessed again for some time. 634 */ 635 if ((chain->flags & 636 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 637 chain->pmp && 638 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 639 if (parent) 640 hammer2_spin_ex(&parent->core.spin); 641 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 642 /* 643 * 1->0 transition failed, retry. Do not drop 644 * the chain's data yet! 645 */ 646 if (parent) 647 hammer2_spin_unex(&parent->core.spin); 648 hammer2_spin_unex(&chain->core.spin); 649 hammer2_mtx_unlock(&chain->lock); 650 651 return(chain); 652 } 653 654 /* 655 * Success 656 */ 657 hammer2_chain_assert_no_data(chain); 658 659 /* 660 * Make sure we are on the LRU list, clean up excessive 661 * LRU entries. We can only really drop one but there might 662 * be other entries that we can remove from the lru_list 663 * without dropping. 664 * 665 * NOTE: HAMMER2_CHAIN_ONLRU may only be safely set when 666 * chain->core.spin AND pmp->lru_spin are held, but 667 * can be safely cleared only holding pmp->lru_spin. 668 */ 669 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 670 hammer2_spin_ex(&pmp->lru_spin); 671 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 672 atomic_set_int(&chain->flags, 673 HAMMER2_CHAIN_ONLRU); 674 TAILQ_INSERT_TAIL(&pmp->lru_list, 675 chain, lru_node); 676 atomic_add_int(&pmp->lru_count, 1); 677 } 678 if (pmp->lru_count < HAMMER2_LRU_LIMIT) 679 depth = 1; /* disable lru_list flush */ 680 hammer2_spin_unex(&pmp->lru_spin); 681 } else { 682 /* disable lru flush */ 683 depth = 1; 684 } 685 686 if (parent) { 687 hammer2_spin_unex(&parent->core.spin); 688 parent = NULL; /* safety */ 689 } 690 hammer2_spin_unex(&chain->core.spin); 691 hammer2_mtx_unlock(&chain->lock); 692 693 /* 694 * lru_list hysteresis (see above for depth overrides). 695 * Note that depth also prevents excessive lastdrop recursion. 696 */ 697 if (depth == 0) 698 hammer2_chain_lru_flush(pmp); 699 700 return NULL; 701 /* NOT REACHED */ 702 } 703 704 /* 705 * Make sure we are not on the LRU list. 706 */ 707 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 708 hammer2_spin_ex(&pmp->lru_spin); 709 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 710 atomic_add_int(&pmp->lru_count, -1); 711 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 712 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 713 } 714 hammer2_spin_unex(&pmp->lru_spin); 715 } 716 717 /* 718 * Spinlock the parent and try to drop the last ref on chain. 719 * On success determine if we should dispose of the chain 720 * (remove the chain from its parent, etc). 721 * 722 * (normal core locks are top-down recursive but we define 723 * core spinlocks as bottom-up recursive, so this is safe). 724 */ 725 if (parent) { 726 hammer2_spin_ex(&parent->core.spin); 727 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 728 /* 729 * 1->0 transition failed, retry. 730 */ 731 hammer2_spin_unex(&parent->core.spin); 732 hammer2_spin_unex(&chain->core.spin); 733 hammer2_mtx_unlock(&chain->lock); 734 735 return(chain); 736 } 737 738 /* 739 * 1->0 transition successful, parent spin held to prevent 740 * new lookups, chain spinlock held to protect parent field. 741 * Remove chain from the parent. 742 * 743 * If the chain is being removed from the parent's btree but 744 * is not blkmapped, we have to adjust live_count downward. If 745 * it is blkmapped then the blockref is retained in the parent 746 * as is its associated live_count. This case can occur when 747 * a chain added to the topology is unable to flush and is 748 * then later deleted. 749 */ 750 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 751 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 752 (chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0) { 753 atomic_add_int(&parent->core.live_count, -1); 754 } 755 RB_REMOVE(hammer2_chain_tree, 756 &parent->core.rbtree, chain); 757 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 758 --parent->core.chain_count; 759 chain->parent = NULL; 760 } 761 762 /* 763 * If our chain was the last chain in the parent's core the 764 * core is now empty and its parent might have to be 765 * re-dropped if it has 0 refs. 766 */ 767 if (parent->core.chain_count == 0) { 768 rdrop = parent; 769 atomic_add_int(&rdrop->refs, 1); 770 /* 771 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 772 rdrop = NULL; 773 */ 774 } 775 hammer2_spin_unex(&parent->core.spin); 776 parent = NULL; /* safety */ 777 /* FALL THROUGH */ 778 } else { 779 /* 780 * No-parent case. 781 */ 782 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 783 /* 784 * 1->0 transition failed, retry. 785 */ 786 hammer2_spin_unex(&parent->core.spin); 787 hammer2_spin_unex(&chain->core.spin); 788 hammer2_mtx_unlock(&chain->lock); 789 790 return(chain); 791 } 792 } 793 794 /* 795 * Successful 1->0 transition, no parent, no children... no way for 796 * anyone to ref this chain any more. We can clean-up and free it. 797 * 798 * We still have the core spinlock, and core's chain_count is 0. 799 * Any parent spinlock is gone. 800 */ 801 hammer2_spin_unex(&chain->core.spin); 802 hammer2_chain_assert_no_data(chain); 803 hammer2_mtx_unlock(&chain->lock); 804 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 805 chain->core.chain_count == 0); 806 807 /* 808 * All locks are gone, no pointers remain to the chain, finish 809 * freeing it. 810 */ 811 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 812 HAMMER2_CHAIN_MODIFIED)) == 0); 813 814 /* 815 * Once chain resources are gone we can use the now dead chain 816 * structure to placehold what might otherwise require a recursive 817 * drop, because we have potentially two things to drop and can only 818 * return one directly. 819 */ 820 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 821 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 822 chain->hmp = NULL; 823 kfree_obj(chain, hmp->mchain); 824 atomic_add_long(&hammer2_chain_allocs, -1); 825 } 826 827 /* 828 * Possible chaining loop when parent re-drop needed. 829 */ 830 return(rdrop); 831 } 832 833 /* 834 * Heuristical flush of the LRU, try to reduce the number of entries 835 * on the LRU to (HAMMER2_LRU_LIMIT * 2 / 3). This procedure is called 836 * only when lru_count exceeds HAMMER2_LRU_LIMIT. 837 */ 838 static 839 void 840 hammer2_chain_lru_flush(hammer2_pfs_t *pmp) 841 { 842 hammer2_chain_t *chain; 843 844 again: 845 chain = NULL; 846 hammer2_spin_ex(&pmp->lru_spin); 847 while (pmp->lru_count > HAMMER2_LRU_LIMIT * 2 / 3) { 848 /* 849 * Pick a chain off the lru_list, just recycle it quickly 850 * if LRUHINT is set (the chain was ref'd but left on 851 * the lru_list, so cycle to the end). 852 */ 853 chain = TAILQ_FIRST(&pmp->lru_list); 854 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 855 856 if (chain->flags & HAMMER2_CHAIN_LRUHINT) { 857 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_LRUHINT); 858 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 859 chain = NULL; 860 continue; 861 } 862 863 /* 864 * Ok, we are off the LRU. We must adjust refs before we 865 * can safely clear the ONLRU flag. 866 */ 867 atomic_add_int(&pmp->lru_count, -1); 868 if (atomic_cmpset_int(&chain->refs, 0, 1)) { 869 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 870 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 871 break; 872 } 873 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 874 chain = NULL; 875 } 876 hammer2_spin_unex(&pmp->lru_spin); 877 if (chain == NULL) 878 return; 879 880 /* 881 * If we picked a chain off the lru list we may be able to lastdrop 882 * it. Use a depth of 1 to prevent excessive lastdrop recursion. 883 */ 884 while (chain) { 885 u_int refs; 886 887 refs = chain->refs; 888 cpu_ccfence(); 889 KKASSERT(refs > 0); 890 891 if (refs == 1) { 892 if (hammer2_mtx_ex_try(&chain->lock) == 0) 893 chain = hammer2_chain_lastdrop(chain, 1); 894 /* retry the same chain, or chain from lastdrop */ 895 } else { 896 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 897 break; 898 /* retry the same chain */ 899 } 900 cpu_pause(); 901 } 902 goto again; 903 } 904 905 /* 906 * On last lock release. 907 */ 908 static hammer2_io_t * 909 hammer2_chain_drop_data(hammer2_chain_t *chain) 910 { 911 hammer2_io_t *dio; 912 913 if ((dio = chain->dio) != NULL) { 914 chain->dio = NULL; 915 chain->data = NULL; 916 } else { 917 switch(chain->bref.type) { 918 case HAMMER2_BREF_TYPE_VOLUME: 919 case HAMMER2_BREF_TYPE_FREEMAP: 920 break; 921 default: 922 if (chain->data != NULL) { 923 hammer2_spin_unex(&chain->core.spin); 924 panic("chain data not null: " 925 "chain %p bref %016jx.%02x " 926 "refs %d parent %p dio %p data %p", 927 chain, chain->bref.data_off, 928 chain->bref.type, chain->refs, 929 chain->parent, 930 chain->dio, chain->data); 931 } 932 KKASSERT(chain->data == NULL); 933 break; 934 } 935 } 936 return dio; 937 } 938 939 /* 940 * Lock a referenced chain element, acquiring its data with I/O if necessary, 941 * and specify how you would like the data to be resolved. 942 * 943 * If an I/O or other fatal error occurs, chain->error will be set to non-zero. 944 * 945 * The lock is allowed to recurse, multiple locking ops will aggregate 946 * the requested resolve types. Once data is assigned it will not be 947 * removed until the last unlock. 948 * 949 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 950 * (typically used to avoid device/logical buffer 951 * aliasing for data) 952 * 953 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 954 * the INITIAL-create state (indirect blocks only). 955 * 956 * Do not resolve data elements for DATA chains. 957 * (typically used to avoid device/logical buffer 958 * aliasing for data) 959 * 960 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 961 * 962 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 963 * it will be locked exclusive. 964 * 965 * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking. If 966 * the lock fails, EAGAIN is returned. 967 * 968 * NOTE: Embedded elements (volume header, inodes) are always resolved 969 * regardless. 970 * 971 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded 972 * element will instantiate and zero its buffer, and flush it on 973 * release. 974 * 975 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE 976 * so as not to instantiate a device buffer, which could alias against 977 * a logical file buffer. However, if ALWAYS is specified the 978 * device buffer will be instantiated anyway. 979 * 980 * NOTE: The return value is always 0 unless NONBLOCK is specified, in which 981 * case it can be either 0 or EAGAIN. 982 * 983 * WARNING! This function blocks on I/O if data needs to be fetched. This 984 * blocking can run concurrent with other compatible lock holders 985 * who do not need data returning. The lock is not upgraded to 986 * exclusive during a data fetch, a separate bit is used to 987 * interlock I/O. However, an exclusive lock holder can still count 988 * on being interlocked against an I/O fetch managed by a shared 989 * lock holder. 990 */ 991 int 992 hammer2_chain_lock(hammer2_chain_t *chain, int how) 993 { 994 KKASSERT(chain->refs > 0); 995 996 if (how & HAMMER2_RESOLVE_NONBLOCK) { 997 /* 998 * We still have to bump lockcnt before acquiring the lock, 999 * even for non-blocking operation, because the unlock code 1000 * live-loops on lockcnt == 1 when dropping the last lock. 1001 * 1002 * If the non-blocking operation fails we have to use an 1003 * unhold sequence to undo the mess. 1004 * 1005 * NOTE: LOCKAGAIN must always succeed without blocking, 1006 * even if NONBLOCK is specified. 1007 */ 1008 atomic_add_int(&chain->lockcnt, 1); 1009 if (how & HAMMER2_RESOLVE_SHARED) { 1010 if (how & HAMMER2_RESOLVE_LOCKAGAIN) { 1011 hammer2_mtx_sh_again(&chain->lock); 1012 } else { 1013 if (hammer2_mtx_sh_try(&chain->lock) != 0) { 1014 hammer2_chain_unhold(chain); 1015 return EAGAIN; 1016 } 1017 } 1018 } else { 1019 if (hammer2_mtx_ex_try(&chain->lock) != 0) { 1020 hammer2_chain_unhold(chain); 1021 return EAGAIN; 1022 } 1023 } 1024 } else { 1025 /* 1026 * Get the appropriate lock. If LOCKAGAIN is flagged with 1027 * SHARED the caller expects a shared lock to already be 1028 * present and we are giving it another ref. This case must 1029 * importantly not block if there is a pending exclusive lock 1030 * request. 1031 */ 1032 atomic_add_int(&chain->lockcnt, 1); 1033 if (how & HAMMER2_RESOLVE_SHARED) { 1034 if (how & HAMMER2_RESOLVE_LOCKAGAIN) { 1035 hammer2_mtx_sh_again(&chain->lock); 1036 } else { 1037 hammer2_mtx_sh(&chain->lock); 1038 } 1039 } else { 1040 hammer2_mtx_ex(&chain->lock); 1041 } 1042 } 1043 1044 /* 1045 * If we already have a valid data pointer make sure the data is 1046 * synchronized to the current cpu, and then no further action is 1047 * necessary. 1048 */ 1049 if (chain->data) { 1050 if (chain->dio) 1051 hammer2_io_bkvasync(chain->dio); 1052 return 0; 1053 } 1054 1055 /* 1056 * Do we have to resolve the data? This is generally only 1057 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased. 1058 * Other BREF types expects the data to be there. 1059 */ 1060 switch(how & HAMMER2_RESOLVE_MASK) { 1061 case HAMMER2_RESOLVE_NEVER: 1062 return 0; 1063 case HAMMER2_RESOLVE_MAYBE: 1064 if (chain->flags & HAMMER2_CHAIN_INITIAL) 1065 return 0; 1066 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) 1067 return 0; 1068 #if 0 1069 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) 1070 return 0; 1071 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) 1072 return 0; 1073 #endif 1074 /* fall through */ 1075 case HAMMER2_RESOLVE_ALWAYS: 1076 default: 1077 break; 1078 } 1079 1080 /* 1081 * Caller requires data 1082 */ 1083 hammer2_chain_load_data(chain); 1084 1085 return 0; 1086 } 1087 1088 /* 1089 * Lock the chain, retain the hold, and drop the data persistence count. 1090 * The data should remain valid because we never transitioned lockcnt 1091 * through 0. 1092 */ 1093 void 1094 hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how) 1095 { 1096 hammer2_chain_lock(chain, how); 1097 atomic_add_int(&chain->lockcnt, -1); 1098 } 1099 1100 #if 0 1101 /* 1102 * Downgrade an exclusive chain lock to a shared chain lock. 1103 * 1104 * NOTE: There is no upgrade equivalent due to the ease of 1105 * deadlocks in that direction. 1106 */ 1107 void 1108 hammer2_chain_lock_downgrade(hammer2_chain_t *chain) 1109 { 1110 hammer2_mtx_downgrade(&chain->lock); 1111 } 1112 #endif 1113 1114 /* 1115 * Issue I/O and install chain->data. Caller must hold a chain lock, lock 1116 * may be of any type. 1117 * 1118 * Once chain->data is set it cannot be disposed of until all locks are 1119 * released. 1120 * 1121 * Make sure the data is synchronized to the current cpu. 1122 */ 1123 void 1124 hammer2_chain_load_data(hammer2_chain_t *chain) 1125 { 1126 hammer2_blockref_t *bref; 1127 hammer2_dev_t *hmp; 1128 hammer2_io_t *dio; 1129 char *bdata; 1130 int error; 1131 1132 /* 1133 * Degenerate case, data already present, or chain has no media 1134 * reference to load. 1135 */ 1136 KKASSERT(chain->lock.mtx_lock & MTX_MASK); 1137 if (chain->data) { 1138 if (chain->dio) 1139 hammer2_io_bkvasync(chain->dio); 1140 return; 1141 } 1142 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) 1143 return; 1144 1145 hmp = chain->hmp; 1146 KKASSERT(hmp != NULL); 1147 1148 /* 1149 * Gain the IOINPROG bit, interlocked block. 1150 */ 1151 for (;;) { 1152 u_int oflags; 1153 u_int nflags; 1154 1155 oflags = chain->flags; 1156 cpu_ccfence(); 1157 if (oflags & HAMMER2_CHAIN_IOINPROG) { 1158 nflags = oflags | HAMMER2_CHAIN_IOSIGNAL; 1159 tsleep_interlock(&chain->flags, 0); 1160 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1161 tsleep(&chain->flags, PINTERLOCKED, 1162 "h2iocw", 0); 1163 } 1164 /* retry */ 1165 } else { 1166 nflags = oflags | HAMMER2_CHAIN_IOINPROG; 1167 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1168 break; 1169 } 1170 /* retry */ 1171 } 1172 } 1173 1174 /* 1175 * We own CHAIN_IOINPROG 1176 * 1177 * Degenerate case if we raced another load. 1178 */ 1179 if (chain->data) { 1180 if (chain->dio) 1181 hammer2_io_bkvasync(chain->dio); 1182 goto done; 1183 } 1184 1185 /* 1186 * We must resolve to a device buffer, either by issuing I/O or 1187 * by creating a zero-fill element. We do not mark the buffer 1188 * dirty when creating a zero-fill element (the hammer2_chain_modify() 1189 * API must still be used to do that). 1190 * 1191 * The device buffer is variable-sized in powers of 2 down 1192 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 1193 * chunk always contains buffers of the same size. (XXX) 1194 * 1195 * The minimum physical IO size may be larger than the variable 1196 * block size. 1197 */ 1198 bref = &chain->bref; 1199 1200 /* 1201 * The getblk() optimization can only be used on newly created 1202 * elements if the physical block size matches the request. 1203 */ 1204 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1205 error = hammer2_io_new(hmp, bref->type, 1206 bref->data_off, chain->bytes, 1207 &chain->dio); 1208 } else { 1209 error = hammer2_io_bread(hmp, bref->type, 1210 bref->data_off, chain->bytes, 1211 &chain->dio); 1212 hammer2_adjreadcounter(chain->bref.type, chain->bytes); 1213 } 1214 if (error) { 1215 chain->error = HAMMER2_ERROR_EIO; 1216 kprintf("hammer2_chain_load_data: I/O error %016jx: %d\n", 1217 (intmax_t)bref->data_off, error); 1218 hammer2_io_bqrelse(&chain->dio); 1219 goto done; 1220 } 1221 chain->error = 0; 1222 1223 /* 1224 * This isn't perfect and can be ignored on OSs which do not have 1225 * an indication as to whether a buffer is coming from cache or 1226 * if I/O was actually issued for the read. TESTEDGOOD will work 1227 * pretty well without the B_IOISSUED logic because chains are 1228 * cached, but in that situation (without B_IOISSUED) it will not 1229 * detect whether a re-read via I/O is corrupted verses the original 1230 * read. 1231 * 1232 * We can't re-run the CRC on every fresh lock. That would be 1233 * insanely expensive. 1234 * 1235 * If the underlying kernel buffer covers the entire chain we can 1236 * use the B_IOISSUED indication to determine if we have to re-run 1237 * the CRC on chain data for chains that managed to stay cached 1238 * across the kernel disposal of the original buffer. 1239 */ 1240 if ((dio = chain->dio) != NULL && dio->bp) { 1241 struct buf *bp = dio->bp; 1242 1243 if (dio->psize == chain->bytes && 1244 (bp->b_flags & B_IOISSUED)) { 1245 atomic_clear_int(&chain->flags, 1246 HAMMER2_CHAIN_TESTEDGOOD); 1247 bp->b_flags &= ~B_IOISSUED; 1248 } 1249 } 1250 1251 /* 1252 * NOTE: A locked chain's data cannot be modified without first 1253 * calling hammer2_chain_modify(). 1254 */ 1255 1256 /* 1257 * NOTE: hammer2_io_data() call issues bkvasync() 1258 */ 1259 bdata = hammer2_io_data(chain->dio, chain->bref.data_off); 1260 1261 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1262 /* 1263 * Clear INITIAL. In this case we used io_new() and the 1264 * buffer has been zero'd and marked dirty. 1265 * 1266 * CHAIN_MODIFIED has not been set yet, and we leave it 1267 * that way for now. Set a temporary CHAIN_NOTTESTED flag 1268 * to prevent hammer2_chain_testcheck() from trying to match 1269 * a check code that has not yet been generated. This bit 1270 * should NOT end up on the actual media. 1271 */ 1272 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1273 atomic_set_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED); 1274 } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 1275 /* 1276 * check data not currently synchronized due to 1277 * modification. XXX assumes data stays in the buffer 1278 * cache, which might not be true (need biodep on flush 1279 * to calculate crc? or simple crc?). 1280 */ 1281 } else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) { 1282 if (hammer2_chain_testcheck(chain, bdata) == 0) { 1283 chain->error = HAMMER2_ERROR_CHECK; 1284 } else { 1285 atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD); 1286 } 1287 } 1288 1289 /* 1290 * Setup the data pointer, either pointing it to an embedded data 1291 * structure and copying the data from the buffer, or pointing it 1292 * into the buffer. 1293 * 1294 * The buffer is not retained when copying to an embedded data 1295 * structure in order to avoid potential deadlocks or recursions 1296 * on the same physical buffer. 1297 * 1298 * WARNING! Other threads can start using the data the instant we 1299 * set chain->data non-NULL. 1300 */ 1301 switch (bref->type) { 1302 case HAMMER2_BREF_TYPE_VOLUME: 1303 case HAMMER2_BREF_TYPE_FREEMAP: 1304 /* 1305 * Copy data from bp to embedded buffer 1306 */ 1307 panic("hammer2_chain_load_data: unresolved volume header"); 1308 break; 1309 case HAMMER2_BREF_TYPE_DIRENT: 1310 KKASSERT(chain->bytes != 0); 1311 /* fall through */ 1312 case HAMMER2_BREF_TYPE_INODE: 1313 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1314 case HAMMER2_BREF_TYPE_INDIRECT: 1315 case HAMMER2_BREF_TYPE_DATA: 1316 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1317 default: 1318 /* 1319 * Point data at the device buffer and leave dio intact. 1320 */ 1321 chain->data = (void *)bdata; 1322 break; 1323 } 1324 1325 /* 1326 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested. 1327 */ 1328 done: 1329 for (;;) { 1330 u_int oflags; 1331 u_int nflags; 1332 1333 oflags = chain->flags; 1334 nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG | 1335 HAMMER2_CHAIN_IOSIGNAL); 1336 KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG); 1337 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1338 if (oflags & HAMMER2_CHAIN_IOSIGNAL) 1339 wakeup(&chain->flags); 1340 break; 1341 } 1342 } 1343 } 1344 1345 /* 1346 * Unlock and deref a chain element. 1347 * 1348 * Remember that the presence of children under chain prevent the chain's 1349 * destruction but do not add additional references, so the dio will still 1350 * be dropped. 1351 */ 1352 void 1353 hammer2_chain_unlock(hammer2_chain_t *chain) 1354 { 1355 hammer2_io_t *dio; 1356 u_int lockcnt; 1357 int iter = 0; 1358 1359 /* 1360 * If multiple locks are present (or being attempted) on this 1361 * particular chain we can just unlock, drop refs, and return. 1362 * 1363 * Otherwise fall-through on the 1->0 transition. 1364 */ 1365 for (;;) { 1366 lockcnt = chain->lockcnt; 1367 KKASSERT(lockcnt > 0); 1368 cpu_ccfence(); 1369 if (lockcnt > 1) { 1370 if (atomic_cmpset_int(&chain->lockcnt, 1371 lockcnt, lockcnt - 1)) { 1372 hammer2_mtx_unlock(&chain->lock); 1373 return; 1374 } 1375 } else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) { 1376 /* while holding the mutex exclusively */ 1377 if (atomic_cmpset_int(&chain->lockcnt, 1, 0)) 1378 break; 1379 } else { 1380 /* 1381 * This situation can easily occur on SMP due to 1382 * the gap inbetween the 1->0 transition and the 1383 * final unlock. We cannot safely block on the 1384 * mutex because lockcnt might go above 1. 1385 * 1386 * XXX Sleep for one tick if it takes too long. 1387 */ 1388 if (++iter > 1000) { 1389 if (iter > 1000 + hz) { 1390 kprintf("hammer2: h2race2 %p\n", chain); 1391 iter = 1000; 1392 } 1393 tsleep(&iter, 0, "h2race2", 1); 1394 } 1395 cpu_pause(); 1396 } 1397 /* retry */ 1398 } 1399 1400 /* 1401 * Last unlock / mutex upgraded to exclusive. Drop the data 1402 * reference. 1403 */ 1404 dio = hammer2_chain_drop_data(chain); 1405 if (dio) 1406 hammer2_io_bqrelse(&dio); 1407 hammer2_mtx_unlock(&chain->lock); 1408 } 1409 1410 /* 1411 * Unlock and hold chain data intact 1412 */ 1413 void 1414 hammer2_chain_unlock_hold(hammer2_chain_t *chain) 1415 { 1416 atomic_add_int(&chain->lockcnt, 1); 1417 hammer2_chain_unlock(chain); 1418 } 1419 1420 /* 1421 * Helper to obtain the blockref[] array base and count for a chain. 1422 * 1423 * XXX Not widely used yet, various use cases need to be validated and 1424 * converted to use this function. 1425 */ 1426 static 1427 hammer2_blockref_t * 1428 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1429 { 1430 hammer2_blockref_t *base; 1431 int count; 1432 1433 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1434 base = NULL; 1435 1436 switch(parent->bref.type) { 1437 case HAMMER2_BREF_TYPE_INODE: 1438 count = HAMMER2_SET_COUNT; 1439 break; 1440 case HAMMER2_BREF_TYPE_INDIRECT: 1441 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1442 count = parent->bytes / sizeof(hammer2_blockref_t); 1443 break; 1444 case HAMMER2_BREF_TYPE_VOLUME: 1445 count = HAMMER2_SET_COUNT; 1446 break; 1447 case HAMMER2_BREF_TYPE_FREEMAP: 1448 count = HAMMER2_SET_COUNT; 1449 break; 1450 default: 1451 panic("hammer2_chain_base_and_count: " 1452 "unrecognized blockref type: %d", 1453 parent->bref.type); 1454 count = 0; 1455 break; 1456 } 1457 } else { 1458 switch(parent->bref.type) { 1459 case HAMMER2_BREF_TYPE_INODE: 1460 base = &parent->data->ipdata.u.blockset.blockref[0]; 1461 count = HAMMER2_SET_COUNT; 1462 break; 1463 case HAMMER2_BREF_TYPE_INDIRECT: 1464 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1465 base = &parent->data->npdata[0]; 1466 count = parent->bytes / sizeof(hammer2_blockref_t); 1467 break; 1468 case HAMMER2_BREF_TYPE_VOLUME: 1469 base = &parent->data->voldata. 1470 sroot_blockset.blockref[0]; 1471 count = HAMMER2_SET_COUNT; 1472 break; 1473 case HAMMER2_BREF_TYPE_FREEMAP: 1474 base = &parent->data->blkset.blockref[0]; 1475 count = HAMMER2_SET_COUNT; 1476 break; 1477 default: 1478 panic("hammer2_chain_base_and_count: " 1479 "unrecognized blockref type: %d", 1480 parent->bref.type); 1481 base = NULL; 1482 count = 0; 1483 break; 1484 } 1485 } 1486 *countp = count; 1487 1488 return base; 1489 } 1490 1491 /* 1492 * This counts the number of live blockrefs in a block array and 1493 * also calculates the point at which all remaining blockrefs are empty. 1494 * This routine can only be called on a live chain. 1495 * 1496 * Caller holds the chain locked, but possibly with a shared lock. We 1497 * must use an exclusive spinlock to prevent corruption. 1498 * 1499 * NOTE: Flag is not set until after the count is complete, allowing 1500 * callers to test the flag without holding the spinlock. 1501 * 1502 * NOTE: If base is NULL the related chain is still in the INITIAL 1503 * state and there are no blockrefs to count. 1504 * 1505 * NOTE: live_count may already have some counts accumulated due to 1506 * creation and deletion and could even be initially negative. 1507 */ 1508 void 1509 hammer2_chain_countbrefs(hammer2_chain_t *chain, 1510 hammer2_blockref_t *base, int count) 1511 { 1512 hammer2_spin_ex(&chain->core.spin); 1513 if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) { 1514 if (base) { 1515 while (--count >= 0) { 1516 if (base[count].type != HAMMER2_BREF_TYPE_EMPTY) 1517 break; 1518 } 1519 chain->core.live_zero = count + 1; 1520 while (count >= 0) { 1521 if (base[count].type != HAMMER2_BREF_TYPE_EMPTY) 1522 atomic_add_int(&chain->core.live_count, 1523 1); 1524 --count; 1525 } 1526 } else { 1527 chain->core.live_zero = 0; 1528 } 1529 /* else do not modify live_count */ 1530 atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS); 1531 } 1532 hammer2_spin_unex(&chain->core.spin); 1533 } 1534 1535 /* 1536 * Resize the chain's physical storage allocation in-place. This function does 1537 * not usually adjust the data pointer and must be followed by (typically) a 1538 * hammer2_chain_modify() call to copy any old data over and adjust the 1539 * data pointer. 1540 * 1541 * Chains can be resized smaller without reallocating the storage. Resizing 1542 * larger will reallocate the storage. Excess or prior storage is reclaimed 1543 * asynchronously at a later time. 1544 * 1545 * An nradix value of 0 is special-cased to mean that the storage should 1546 * be disassociated, that is the chain is being resized to 0 bytes (not 1 1547 * byte). 1548 * 1549 * Must be passed an exclusively locked parent and chain. 1550 * 1551 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order 1552 * to avoid instantiating a device buffer that conflicts with the vnode data 1553 * buffer. However, because H2 can compress or encrypt data, the chain may 1554 * have a dio assigned to it in those situations, and they do not conflict. 1555 * 1556 * XXX return error if cannot resize. 1557 */ 1558 int 1559 hammer2_chain_resize(hammer2_chain_t *chain, 1560 hammer2_tid_t mtid, hammer2_off_t dedup_off, 1561 int nradix, int flags) 1562 { 1563 hammer2_dev_t *hmp; 1564 size_t obytes; 1565 size_t nbytes; 1566 int error; 1567 1568 hmp = chain->hmp; 1569 1570 /* 1571 * Only data and indirect blocks can be resized for now. 1572 * (The volu root, inodes, and freemap elements use a fixed size). 1573 */ 1574 KKASSERT(chain != &hmp->vchain); 1575 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1576 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 1577 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT); 1578 1579 /* 1580 * Nothing to do if the element is already the proper size 1581 */ 1582 obytes = chain->bytes; 1583 nbytes = (nradix) ? (1U << nradix) : 0; 1584 if (obytes == nbytes) 1585 return (chain->error); 1586 1587 /* 1588 * Make sure the old data is instantiated so we can copy it. If this 1589 * is a data block, the device data may be superfluous since the data 1590 * might be in a logical block, but compressed or encrypted data is 1591 * another matter. 1592 * 1593 * NOTE: The modify will set BLKMAPUPD for us if BLKMAPPED is set. 1594 */ 1595 error = hammer2_chain_modify(chain, mtid, dedup_off, 0); 1596 if (error) 1597 return error; 1598 1599 /* 1600 * Reallocate the block, even if making it smaller (because different 1601 * block sizes may be in different regions). 1602 * 1603 * NOTE: Operation does not copy the data and may only be used 1604 * to resize data blocks in-place, or directory entry blocks 1605 * which are about to be modified in some manner. 1606 */ 1607 error = hammer2_freemap_alloc(chain, nbytes); 1608 if (error) 1609 return error; 1610 1611 chain->bytes = nbytes; 1612 1613 /* 1614 * We don't want the followup chain_modify() to try to copy data 1615 * from the old (wrong-sized) buffer. It won't know how much to 1616 * copy. This case should only occur during writes when the 1617 * originator already has the data to write in-hand. 1618 */ 1619 if (chain->dio) { 1620 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1621 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT); 1622 hammer2_io_brelse(&chain->dio); 1623 chain->data = NULL; 1624 } 1625 return (chain->error); 1626 } 1627 1628 /* 1629 * Set the chain modified so its data can be changed by the caller, or 1630 * install deduplicated data. The caller must call this routine for each 1631 * set of modifications it makes, even if the chain is already flagged 1632 * MODIFIED. 1633 * 1634 * Sets bref.modify_tid to mtid only if mtid != 0. Note that bref.modify_tid 1635 * is a CLC (cluster level change) field and is not updated by parent 1636 * propagation during a flush. 1637 * 1638 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect 1639 * chain->error except for HAMMER2_ERROR_ENOSPC. If the allocation fails 1640 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain 1641 * remains unmodified with its old data ref intact and chain->error 1642 * unchanged. 1643 * 1644 * Dedup Handling 1645 * 1646 * If the DEDUPABLE flag is set in the chain the storage must be reallocated 1647 * even if the chain is still flagged MODIFIED. In this case the chain's 1648 * DEDUPABLE flag will be cleared once the new storage has been assigned. 1649 * 1650 * If the caller passes a non-zero dedup_off we will use it to assign the 1651 * new storage. The MODIFIED flag will be *CLEARED* in this case, and 1652 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set). The caller 1653 * must not modify the data content upon return. 1654 */ 1655 int 1656 hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid, 1657 hammer2_off_t dedup_off, int flags) 1658 { 1659 hammer2_dev_t *hmp; 1660 hammer2_io_t *dio; 1661 int error; 1662 int wasinitial; 1663 int setmodified; 1664 int setupdate; 1665 int newmod; 1666 char *bdata; 1667 1668 hmp = chain->hmp; 1669 KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE); 1670 1671 /* 1672 * Data is not optional for freemap chains (we must always be sure 1673 * to copy the data on COW storage allocations). 1674 */ 1675 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 1676 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 1677 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) || 1678 (flags & HAMMER2_MODIFY_OPTDATA) == 0); 1679 } 1680 1681 /* 1682 * Data must be resolved if already assigned, unless explicitly 1683 * flagged otherwise. If we cannot safety load the data the 1684 * modification fails and we return early. 1685 */ 1686 if (chain->data == NULL && chain->bytes != 0 && 1687 (flags & HAMMER2_MODIFY_OPTDATA) == 0 && 1688 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 1689 hammer2_chain_load_data(chain); 1690 if (chain->error) 1691 return (chain->error); 1692 } 1693 error = 0; 1694 1695 /* 1696 * Set MODIFIED to indicate that the chain has been modified. A new 1697 * allocation is required when modifying a chain. 1698 * 1699 * Set UPDATE to ensure that the blockref is updated in the parent. 1700 * 1701 * If MODIFIED is already set determine if we can reuse the assigned 1702 * data block or if we need a new data block. 1703 */ 1704 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { 1705 /* 1706 * Must set modified bit. 1707 */ 1708 atomic_add_long(&hammer2_count_modified_chains, 1); 1709 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1710 hammer2_pfs_memory_inc(chain->pmp); /* can be NULL */ 1711 setmodified = 1; 1712 1713 /* 1714 * We may be able to avoid a copy-on-write if the chain's 1715 * check mode is set to NONE and the chain's current 1716 * modify_tid is beyond the last explicit snapshot tid. 1717 * 1718 * This implements HAMMER2's overwrite-in-place feature. 1719 * 1720 * NOTE! This data-block cannot be used as a de-duplication 1721 * source when the check mode is set to NONE. 1722 */ 1723 if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1724 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) && 1725 (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 && 1726 (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 && 1727 HAMMER2_DEC_CHECK(chain->bref.methods) == 1728 HAMMER2_CHECK_NONE && 1729 chain->pmp && 1730 chain->bref.modify_tid > 1731 chain->pmp->iroot->meta.pfs_lsnap_tid) { 1732 /* 1733 * Sector overwrite allowed. 1734 */ 1735 newmod = 0; 1736 } else if ((hmp->hflags & HMNT2_EMERG) && 1737 chain->pmp && 1738 chain->bref.modify_tid > 1739 chain->pmp->iroot->meta.pfs_lsnap_tid) { 1740 /* 1741 * If in emergency delete mode then do a modify-in- 1742 * place on any chain type belonging to the PFS as 1743 * long as it doesn't mess up a snapshot. We might 1744 * be forced to do this anyway a little further down 1745 * in the code if the allocation fails. 1746 * 1747 * Also note that in emergency mode, these modify-in- 1748 * place operations are NOT SAFE. A storage failure, 1749 * power failure, or panic can corrupt the filesystem. 1750 */ 1751 newmod = 0; 1752 } else { 1753 /* 1754 * Sector overwrite not allowed, must copy-on-write. 1755 */ 1756 newmod = 1; 1757 } 1758 } else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) { 1759 /* 1760 * If the modified chain was registered for dedup we need 1761 * a new allocation. This only happens for delayed-flush 1762 * chains (i.e. which run through the front-end buffer 1763 * cache). 1764 */ 1765 newmod = 1; 1766 setmodified = 0; 1767 } else { 1768 /* 1769 * Already flagged modified, no new allocation is needed. 1770 */ 1771 newmod = 0; 1772 setmodified = 0; 1773 } 1774 1775 /* 1776 * Flag parent update required. 1777 */ 1778 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) { 1779 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1780 setupdate = 1; 1781 } else { 1782 setupdate = 0; 1783 } 1784 1785 /* 1786 * The XOP code returns held but unlocked focus chains. This 1787 * prevents the chain from being destroyed but does not prevent 1788 * it from being modified. diolk is used to interlock modifications 1789 * against XOP frontend accesses to the focus. 1790 * 1791 * This allows us to theoretically avoid deadlocking the frontend 1792 * if one of the backends lock up by not formally locking the 1793 * focused chain in the frontend. In addition, the synchronization 1794 * code relies on this mechanism to avoid deadlocking concurrent 1795 * synchronization threads. 1796 */ 1797 lockmgr(&chain->diolk, LK_EXCLUSIVE); 1798 1799 /* 1800 * The modification or re-modification requires an allocation and 1801 * possible COW. If an error occurs, the previous content and data 1802 * reference is retained and the modification fails. 1803 * 1804 * If dedup_off is non-zero, the caller is requesting a deduplication 1805 * rather than a modification. The MODIFIED bit is not set and the 1806 * data offset is set to the deduplication offset. The data cannot 1807 * be modified. 1808 * 1809 * NOTE: The dedup offset is allowed to be in a partially free state 1810 * and we must be sure to reset it to a fully allocated state 1811 * to force two bulkfree passes to free it again. 1812 * 1813 * NOTE: Only applicable when chain->bytes != 0. 1814 * 1815 * XXX can a chain already be marked MODIFIED without a data 1816 * assignment? If not, assert here instead of testing the case. 1817 */ 1818 if (chain != &hmp->vchain && chain != &hmp->fchain && 1819 chain->bytes) { 1820 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 || 1821 newmod 1822 ) { 1823 /* 1824 * NOTE: We do not have to remove the dedup 1825 * registration because the area is still 1826 * allocated and the underlying DIO will 1827 * still be flushed. 1828 */ 1829 if (dedup_off) { 1830 chain->bref.data_off = dedup_off; 1831 if ((int)(dedup_off & HAMMER2_OFF_MASK_RADIX)) 1832 chain->bytes = 1 << 1833 (int)(dedup_off & 1834 HAMMER2_OFF_MASK_RADIX); 1835 else 1836 chain->bytes = 0; 1837 chain->error = 0; 1838 atomic_clear_int(&chain->flags, 1839 HAMMER2_CHAIN_MODIFIED); 1840 atomic_add_long(&hammer2_count_modified_chains, 1841 -1); 1842 if (chain->pmp) { 1843 hammer2_pfs_memory_wakeup( 1844 chain->pmp, -1); 1845 } 1846 hammer2_freemap_adjust(hmp, &chain->bref, 1847 HAMMER2_FREEMAP_DORECOVER); 1848 atomic_set_int(&chain->flags, 1849 HAMMER2_CHAIN_DEDUPABLE); 1850 } else { 1851 error = hammer2_freemap_alloc(chain, 1852 chain->bytes); 1853 atomic_clear_int(&chain->flags, 1854 HAMMER2_CHAIN_DEDUPABLE); 1855 1856 /* 1857 * If we are unable to allocate a new block 1858 * but we are in emergency mode, issue a 1859 * warning to the console and reuse the same 1860 * block. 1861 * 1862 * We behave as if the allocation were 1863 * successful. 1864 * 1865 * THIS IS IMPORTANT: These modifications 1866 * are virtually guaranteed to corrupt any 1867 * snapshots related to this filesystem. 1868 */ 1869 if (error && (hmp->hflags & HMNT2_EMERG)) { 1870 error = 0; 1871 chain->bref.flags |= 1872 HAMMER2_BREF_FLAG_EMERG_MIP; 1873 1874 krateprintf(&krate_h2em, 1875 "hammer2: Emergency Mode WARNING: " 1876 "Operation will likely corrupt " 1877 "related snapshot: " 1878 "%016jx.%02x key=%016jx\n", 1879 chain->bref.data_off, 1880 chain->bref.type, 1881 chain->bref.key); 1882 } else if (error == 0) { 1883 chain->bref.flags &= 1884 ~HAMMER2_BREF_FLAG_EMERG_MIP; 1885 } 1886 } 1887 } 1888 } 1889 1890 /* 1891 * Stop here if error. We have to undo any flag bits we might 1892 * have set above. 1893 */ 1894 if (error) { 1895 if (setmodified) { 1896 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1897 atomic_add_long(&hammer2_count_modified_chains, -1); 1898 if (chain->pmp) 1899 hammer2_pfs_memory_wakeup(chain->pmp, -1); 1900 } 1901 if (setupdate) { 1902 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1903 } 1904 lockmgr(&chain->diolk, LK_RELEASE); 1905 1906 return error; 1907 } 1908 1909 /* 1910 * Update mirror_tid and modify_tid. modify_tid is only updated 1911 * if not passed as zero (during flushes, parent propagation passes 1912 * the value 0). 1913 * 1914 * NOTE: chain->pmp could be the device spmp. 1915 */ 1916 chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1; 1917 if (mtid) 1918 chain->bref.modify_tid = mtid; 1919 1920 /* 1921 * Set BLKMAPUPD to tell the flush code that an existing blockmap entry 1922 * requires updating as well as to tell the delete code that the 1923 * chain's blockref might not exactly match (in terms of physical size 1924 * or block offset) the one in the parent's blocktable. The base key 1925 * of course will still match. 1926 */ 1927 if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) 1928 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD); 1929 1930 /* 1931 * Short-cut data block handling when the caller does not need an 1932 * actual data reference to (aka OPTDATA), as long as the chain does 1933 * not already have a data pointer to the data and no de-duplication 1934 * occurred. 1935 * 1936 * This generally means that the modifications are being done via the 1937 * logical buffer cache. 1938 * 1939 * NOTE: If deduplication occurred we have to run through the data 1940 * stuff to clear INITIAL, and the caller will likely want to 1941 * assign the check code anyway. Leaving INITIAL set on a 1942 * dedup can be deadly (it can cause the block to be zero'd!). 1943 * 1944 * This code also handles bytes == 0 (most dirents). 1945 */ 1946 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA && 1947 (flags & HAMMER2_MODIFY_OPTDATA) && 1948 chain->data == NULL) { 1949 if (dedup_off == 0) { 1950 KKASSERT(chain->dio == NULL); 1951 goto skip2; 1952 } 1953 } 1954 1955 /* 1956 * Clearing the INITIAL flag (for indirect blocks) indicates that 1957 * we've processed the uninitialized storage allocation. 1958 * 1959 * If this flag is already clear we are likely in a copy-on-write 1960 * situation but we have to be sure NOT to bzero the storage if 1961 * no data is present. 1962 * 1963 * Clearing of NOTTESTED is allowed if the MODIFIED bit is set, 1964 */ 1965 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1966 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1967 wasinitial = 1; 1968 } else { 1969 wasinitial = 0; 1970 } 1971 1972 /* 1973 * Instantiate data buffer and possibly execute COW operation 1974 */ 1975 switch(chain->bref.type) { 1976 case HAMMER2_BREF_TYPE_VOLUME: 1977 case HAMMER2_BREF_TYPE_FREEMAP: 1978 /* 1979 * The data is embedded, no copy-on-write operation is 1980 * needed. 1981 */ 1982 KKASSERT(chain->dio == NULL); 1983 break; 1984 case HAMMER2_BREF_TYPE_DIRENT: 1985 /* 1986 * The data might be fully embedded. 1987 */ 1988 if (chain->bytes == 0) { 1989 KKASSERT(chain->dio == NULL); 1990 break; 1991 } 1992 /* fall through */ 1993 case HAMMER2_BREF_TYPE_INODE: 1994 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1995 case HAMMER2_BREF_TYPE_DATA: 1996 case HAMMER2_BREF_TYPE_INDIRECT: 1997 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1998 /* 1999 * Perform the copy-on-write operation 2000 * 2001 * zero-fill or copy-on-write depending on whether 2002 * chain->data exists or not and set the dirty state for 2003 * the new buffer. hammer2_io_new() will handle the 2004 * zero-fill. 2005 * 2006 * If a dedup_off was supplied this is an existing block 2007 * and no COW, copy, or further modification is required. 2008 */ 2009 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain); 2010 2011 if (wasinitial && dedup_off == 0) { 2012 error = hammer2_io_new(hmp, chain->bref.type, 2013 chain->bref.data_off, 2014 chain->bytes, &dio); 2015 } else { 2016 error = hammer2_io_bread(hmp, chain->bref.type, 2017 chain->bref.data_off, 2018 chain->bytes, &dio); 2019 } 2020 hammer2_adjreadcounter(chain->bref.type, chain->bytes); 2021 2022 /* 2023 * If an I/O error occurs make sure callers cannot accidently 2024 * modify the old buffer's contents and corrupt the filesystem. 2025 * 2026 * NOTE: hammer2_io_data() call issues bkvasync() 2027 */ 2028 if (error) { 2029 kprintf("hammer2_chain_modify: hmp=%p I/O error\n", 2030 hmp); 2031 chain->error = HAMMER2_ERROR_EIO; 2032 hammer2_io_brelse(&dio); 2033 hammer2_io_brelse(&chain->dio); 2034 chain->data = NULL; 2035 break; 2036 } 2037 chain->error = 0; 2038 bdata = hammer2_io_data(dio, chain->bref.data_off); 2039 2040 if (chain->data) { 2041 /* 2042 * COW (unless a dedup). 2043 */ 2044 KKASSERT(chain->dio != NULL); 2045 if (chain->data != (void *)bdata && dedup_off == 0) { 2046 bcopy(chain->data, bdata, chain->bytes); 2047 } 2048 } else if (wasinitial == 0 && dedup_off == 0) { 2049 /* 2050 * We have a problem. We were asked to COW but 2051 * we don't have any data to COW with! 2052 */ 2053 panic("hammer2_chain_modify: having a COW %p\n", 2054 chain); 2055 } 2056 2057 /* 2058 * Retire the old buffer, replace with the new. Dirty or 2059 * redirty the new buffer. 2060 * 2061 * WARNING! The system buffer cache may have already flushed 2062 * the buffer, so we must be sure to [re]dirty it 2063 * for further modification. 2064 * 2065 * If dedup_off was supplied, the caller is not 2066 * expected to make any further modification to the 2067 * buffer. 2068 * 2069 * WARNING! hammer2_get_gdata() assumes dio never transitions 2070 * through NULL in order to optimize away unnecessary 2071 * diolk operations. 2072 */ 2073 { 2074 hammer2_io_t *tio; 2075 2076 if ((tio = chain->dio) != NULL) 2077 hammer2_io_bqrelse(&tio); 2078 chain->data = (void *)bdata; 2079 chain->dio = dio; 2080 if (dedup_off == 0) 2081 hammer2_io_setdirty(dio); 2082 } 2083 break; 2084 default: 2085 panic("hammer2_chain_modify: illegal non-embedded type %d", 2086 chain->bref.type); 2087 break; 2088 2089 } 2090 skip2: 2091 /* 2092 * setflush on parent indicating that the parent must recurse down 2093 * to us. Do not call on chain itself which might already have it 2094 * set. 2095 */ 2096 if (chain->parent) 2097 hammer2_chain_setflush(chain->parent); 2098 lockmgr(&chain->diolk, LK_RELEASE); 2099 2100 return (chain->error); 2101 } 2102 2103 /* 2104 * Modify the chain associated with an inode. 2105 */ 2106 int 2107 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain, 2108 hammer2_tid_t mtid, int flags) 2109 { 2110 int error; 2111 2112 hammer2_inode_modify(ip); 2113 error = hammer2_chain_modify(chain, mtid, 0, flags); 2114 2115 return error; 2116 } 2117 2118 /* 2119 * This function returns the chain at the nearest key within the specified 2120 * range. The returned chain will be referenced but not locked. 2121 * 2122 * This function will recurse through chain->rbtree as necessary and will 2123 * return a *key_nextp suitable for iteration. *key_nextp is only set if 2124 * the iteration value is less than the current value of *key_nextp. 2125 * 2126 * The caller should use (*key_nextp) to calculate the actual range of 2127 * the returned element, which will be (key_beg to *key_nextp - 1), because 2128 * there might be another element which is superior to the returned element 2129 * and overlaps it. 2130 * 2131 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL 2132 * chains continue to be returned. On EOF (*key_nextp) may overflow since 2133 * it will wind up being (key_end + 1). 2134 * 2135 * WARNING! Must be called with child's spinlock held. Spinlock remains 2136 * held through the operation. 2137 */ 2138 struct hammer2_chain_find_info { 2139 hammer2_chain_t *best; 2140 hammer2_key_t key_beg; 2141 hammer2_key_t key_end; 2142 hammer2_key_t key_next; 2143 }; 2144 2145 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data); 2146 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data); 2147 2148 static 2149 hammer2_chain_t * 2150 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp, 2151 hammer2_key_t key_beg, hammer2_key_t key_end) 2152 { 2153 struct hammer2_chain_find_info info; 2154 2155 info.best = NULL; 2156 info.key_beg = key_beg; 2157 info.key_end = key_end; 2158 info.key_next = *key_nextp; 2159 2160 RB_SCAN(hammer2_chain_tree, &parent->core.rbtree, 2161 hammer2_chain_find_cmp, hammer2_chain_find_callback, 2162 &info); 2163 *key_nextp = info.key_next; 2164 #if 0 2165 kprintf("chain_find %p %016jx:%016jx next=%016jx\n", 2166 parent, key_beg, key_end, *key_nextp); 2167 #endif 2168 2169 return (info.best); 2170 } 2171 2172 static 2173 int 2174 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data) 2175 { 2176 struct hammer2_chain_find_info *info = data; 2177 hammer2_key_t child_beg; 2178 hammer2_key_t child_end; 2179 2180 child_beg = child->bref.key; 2181 child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1; 2182 2183 if (child_end < info->key_beg) 2184 return(-1); 2185 if (child_beg > info->key_end) 2186 return(1); 2187 return(0); 2188 } 2189 2190 static 2191 int 2192 hammer2_chain_find_callback(hammer2_chain_t *child, void *data) 2193 { 2194 struct hammer2_chain_find_info *info = data; 2195 hammer2_chain_t *best; 2196 hammer2_key_t child_end; 2197 2198 /* 2199 * WARNING! Layerq is scanned forwards, exact matches should keep 2200 * the existing info->best. 2201 */ 2202 if ((best = info->best) == NULL) { 2203 /* 2204 * No previous best. Assign best 2205 */ 2206 info->best = child; 2207 } else if (best->bref.key <= info->key_beg && 2208 child->bref.key <= info->key_beg) { 2209 /* 2210 * Illegal overlap. 2211 */ 2212 KKASSERT(0); 2213 /*info->best = child;*/ 2214 } else if (child->bref.key < best->bref.key) { 2215 /* 2216 * Child has a nearer key and best is not flush with key_beg. 2217 * Set best to child. Truncate key_next to the old best key. 2218 */ 2219 info->best = child; 2220 if (info->key_next > best->bref.key || info->key_next == 0) 2221 info->key_next = best->bref.key; 2222 } else if (child->bref.key == best->bref.key) { 2223 /* 2224 * If our current best is flush with the child then this 2225 * is an illegal overlap. 2226 * 2227 * key_next will automatically be limited to the smaller of 2228 * the two end-points. 2229 */ 2230 KKASSERT(0); 2231 info->best = child; 2232 } else { 2233 /* 2234 * Keep the current best but truncate key_next to the child's 2235 * base. 2236 * 2237 * key_next will also automatically be limited to the smaller 2238 * of the two end-points (probably not necessary for this case 2239 * but we do it anyway). 2240 */ 2241 if (info->key_next > child->bref.key || info->key_next == 0) 2242 info->key_next = child->bref.key; 2243 } 2244 2245 /* 2246 * Always truncate key_next based on child's end-of-range. 2247 */ 2248 child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits); 2249 if (child_end && (info->key_next > child_end || info->key_next == 0)) 2250 info->key_next = child_end; 2251 2252 return(0); 2253 } 2254 2255 /* 2256 * Retrieve the specified chain from a media blockref, creating the 2257 * in-memory chain structure which reflects it. The returned chain is 2258 * held and locked according to (how) (HAMMER2_RESOLVE_*). The caller must 2259 * handle crc-checks and so forth, and should check chain->error before 2260 * assuming that the data is good. 2261 * 2262 * To handle insertion races pass the INSERT_RACE flag along with the 2263 * generation number of the core. NULL will be returned if the generation 2264 * number changes before we have a chance to insert the chain. Insert 2265 * races can occur because the parent might be held shared. 2266 * 2267 * Caller must hold the parent locked shared or exclusive since we may 2268 * need the parent's bref array to find our block. 2269 * 2270 * WARNING! chain->pmp is always set to NULL for any chain representing 2271 * part of the super-root topology. 2272 */ 2273 hammer2_chain_t * 2274 hammer2_chain_get(hammer2_chain_t *parent, int generation, 2275 hammer2_blockref_t *bref, int how) 2276 { 2277 hammer2_dev_t *hmp = parent->hmp; 2278 hammer2_chain_t *chain; 2279 int error; 2280 2281 /* 2282 * Allocate a chain structure representing the existing media 2283 * entry. Resulting chain has one ref and is not locked. 2284 */ 2285 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT) 2286 chain = hammer2_chain_alloc(hmp, NULL, bref); 2287 else 2288 chain = hammer2_chain_alloc(hmp, parent->pmp, bref); 2289 /* ref'd chain returned */ 2290 2291 /* 2292 * Flag that the chain is in the parent's blockmap so delete/flush 2293 * knows what to do with it. 2294 */ 2295 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED); 2296 2297 /* 2298 * chain must be locked to avoid unexpected ripouts 2299 */ 2300 hammer2_chain_lock(chain, how); 2301 2302 /* 2303 * Link the chain into its parent. A spinlock is required to safely 2304 * access the RBTREE, and it is possible to collide with another 2305 * hammer2_chain_get() operation because the caller might only hold 2306 * a shared lock on the parent. 2307 * 2308 * NOTE: Get races can occur quite often when we distribute 2309 * asynchronous read-aheads across multiple threads. 2310 */ 2311 KKASSERT(parent->refs > 0); 2312 error = hammer2_chain_insert(parent, chain, 2313 HAMMER2_CHAIN_INSERT_SPIN | 2314 HAMMER2_CHAIN_INSERT_RACE, 2315 generation); 2316 if (error) { 2317 KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 2318 /*kprintf("chain %p get race\n", chain);*/ 2319 hammer2_chain_unlock(chain); 2320 hammer2_chain_drop(chain); 2321 chain = NULL; 2322 } else { 2323 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 2324 } 2325 2326 /* 2327 * Return our new chain referenced but not locked, or NULL if 2328 * a race occurred. 2329 */ 2330 return (chain); 2331 } 2332 2333 /* 2334 * Lookup initialization/completion API 2335 */ 2336 hammer2_chain_t * 2337 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags) 2338 { 2339 hammer2_chain_ref(parent); 2340 if (flags & HAMMER2_LOOKUP_SHARED) { 2341 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS | 2342 HAMMER2_RESOLVE_SHARED); 2343 } else { 2344 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2345 } 2346 return (parent); 2347 } 2348 2349 void 2350 hammer2_chain_lookup_done(hammer2_chain_t *parent) 2351 { 2352 if (parent) { 2353 hammer2_chain_unlock(parent); 2354 hammer2_chain_drop(parent); 2355 } 2356 } 2357 2358 /* 2359 * Take the locked chain and return a locked parent. The chain remains 2360 * locked on return, but may have to be temporarily unlocked to acquire 2361 * the parent. Because of this, (chain) must be stable and cannot be 2362 * deleted while it was temporarily unlocked (typically means that (chain) 2363 * is an inode). 2364 * 2365 * Pass HAMMER2_RESOLVE_* flags in flags. 2366 * 2367 * This will work even if the chain is errored, and the caller can check 2368 * parent->error on return if desired since the parent will be locked. 2369 * 2370 * This function handles the lock order reversal. 2371 */ 2372 hammer2_chain_t * 2373 hammer2_chain_getparent(hammer2_chain_t *chain, int flags) 2374 { 2375 hammer2_chain_t *parent; 2376 2377 /* 2378 * Be careful of order, chain must be unlocked before parent 2379 * is locked below to avoid a deadlock. Try it trivially first. 2380 */ 2381 parent = chain->parent; 2382 if (parent == NULL) 2383 panic("hammer2_chain_getparent: no parent"); 2384 hammer2_chain_ref(parent); 2385 if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) 2386 return parent; 2387 2388 for (;;) { 2389 hammer2_chain_unlock(chain); 2390 hammer2_chain_lock(parent, flags); 2391 hammer2_chain_lock(chain, flags); 2392 2393 /* 2394 * Parent relinking races are quite common. We have to get 2395 * it right or we will blow up the block table. 2396 */ 2397 if (chain->parent == parent) 2398 break; 2399 hammer2_chain_unlock(parent); 2400 hammer2_chain_drop(parent); 2401 cpu_ccfence(); 2402 parent = chain->parent; 2403 if (parent == NULL) 2404 panic("hammer2_chain_getparent: no parent"); 2405 hammer2_chain_ref(parent); 2406 } 2407 return parent; 2408 } 2409 2410 /* 2411 * Take the locked chain and return a locked parent. The chain is unlocked 2412 * and dropped. *chainp is set to the returned parent as a convenience. 2413 * Pass HAMMER2_RESOLVE_* flags in flags. 2414 * 2415 * This will work even if the chain is errored, and the caller can check 2416 * parent->error on return if desired since the parent will be locked. 2417 * 2418 * The chain does NOT need to be stable. We use a tracking structure 2419 * to track the expected parent if the chain is deleted out from under us. 2420 * 2421 * This function handles the lock order reversal. 2422 */ 2423 hammer2_chain_t * 2424 hammer2_chain_repparent(hammer2_chain_t **chainp, int flags) 2425 { 2426 hammer2_chain_t *chain; 2427 hammer2_chain_t *parent; 2428 struct hammer2_reptrack reptrack; 2429 struct hammer2_reptrack **repp; 2430 2431 /* 2432 * Be careful of order, chain must be unlocked before parent 2433 * is locked below to avoid a deadlock. Try it trivially first. 2434 */ 2435 chain = *chainp; 2436 parent = chain->parent; 2437 if (parent == NULL) { 2438 hammer2_spin_unex(&chain->core.spin); 2439 panic("hammer2_chain_repparent: no parent"); 2440 } 2441 hammer2_chain_ref(parent); 2442 if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) { 2443 hammer2_chain_unlock(chain); 2444 hammer2_chain_drop(chain); 2445 *chainp = parent; 2446 2447 return parent; 2448 } 2449 2450 /* 2451 * Ok, now it gets a bit nasty. There are multiple situations where 2452 * the parent might be in the middle of a deletion, or where the child 2453 * (chain) might be deleted the instant we let go of its lock. 2454 * We can potentially end up in a no-win situation! 2455 * 2456 * In particular, the indirect_maintenance() case can cause these 2457 * situations. 2458 * 2459 * To deal with this we install a reptrack structure in the parent 2460 * This reptrack structure 'owns' the parent ref and will automatically 2461 * migrate to the parent's parent if the parent is deleted permanently. 2462 */ 2463 hammer2_spin_init(&reptrack.spin, "h2reptrk"); 2464 reptrack.chain = parent; 2465 hammer2_chain_ref(parent); /* for the reptrack */ 2466 2467 hammer2_spin_ex(&parent->core.spin); 2468 reptrack.next = parent->core.reptrack; 2469 parent->core.reptrack = &reptrack; 2470 hammer2_spin_unex(&parent->core.spin); 2471 2472 hammer2_chain_unlock(chain); 2473 hammer2_chain_drop(chain); 2474 chain = NULL; /* gone */ 2475 2476 /* 2477 * At the top of this loop, chain is gone and parent is refd both 2478 * by us explicitly AND via our reptrack. We are attempting to 2479 * lock parent. 2480 */ 2481 for (;;) { 2482 hammer2_chain_lock(parent, flags); 2483 2484 if (reptrack.chain == parent) 2485 break; 2486 hammer2_chain_unlock(parent); 2487 hammer2_chain_drop(parent); 2488 2489 kprintf("hammer2: debug REPTRACK %p->%p\n", 2490 parent, reptrack.chain); 2491 hammer2_spin_ex(&reptrack.spin); 2492 parent = reptrack.chain; 2493 hammer2_chain_ref(parent); 2494 hammer2_spin_unex(&reptrack.spin); 2495 } 2496 2497 /* 2498 * Once parent is locked and matches our reptrack, our reptrack 2499 * will be stable and we have our parent. We can unlink our 2500 * reptrack. 2501 * 2502 * WARNING! Remember that the chain lock might be shared. Chains 2503 * locked shared have stable parent linkages. 2504 */ 2505 hammer2_spin_ex(&parent->core.spin); 2506 repp = &parent->core.reptrack; 2507 while (*repp != &reptrack) 2508 repp = &(*repp)->next; 2509 *repp = reptrack.next; 2510 hammer2_spin_unex(&parent->core.spin); 2511 2512 hammer2_chain_drop(parent); /* reptrack ref */ 2513 *chainp = parent; /* return parent lock+ref */ 2514 2515 return parent; 2516 } 2517 2518 /* 2519 * Dispose of any linked reptrack structures in (chain) by shifting them to 2520 * (parent). Both (chain) and (parent) must be exclusively locked. 2521 * 2522 * This is interlocked against any children of (chain) on the other side. 2523 * No children so remain as-of when this is called so we can test 2524 * core.reptrack without holding the spin-lock. 2525 * 2526 * Used whenever the caller intends to permanently delete chains related 2527 * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE), 2528 * where the chains underneath the node being deleted are given a new parent 2529 * above the node being deleted. 2530 */ 2531 static 2532 void 2533 hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain) 2534 { 2535 struct hammer2_reptrack *reptrack; 2536 2537 KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)); 2538 while (chain->core.reptrack) { 2539 hammer2_spin_ex(&parent->core.spin); 2540 hammer2_spin_ex(&chain->core.spin); 2541 reptrack = chain->core.reptrack; 2542 if (reptrack == NULL) { 2543 hammer2_spin_unex(&chain->core.spin); 2544 hammer2_spin_unex(&parent->core.spin); 2545 break; 2546 } 2547 hammer2_spin_ex(&reptrack->spin); 2548 chain->core.reptrack = reptrack->next; 2549 reptrack->chain = parent; 2550 reptrack->next = parent->core.reptrack; 2551 parent->core.reptrack = reptrack; 2552 hammer2_chain_ref(parent); /* reptrack */ 2553 2554 hammer2_spin_unex(&chain->core.spin); 2555 hammer2_spin_unex(&parent->core.spin); 2556 kprintf("hammer2: debug repchange %p %p->%p\n", 2557 reptrack, chain, parent); 2558 hammer2_chain_drop(chain); /* reptrack */ 2559 } 2560 } 2561 2562 /* 2563 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 2564 * (*parentp) typically points to an inode but can also point to a related 2565 * indirect block and this function will recurse upwards and find the inode 2566 * or the nearest undeleted indirect block covering the key range. 2567 * 2568 * This function unconditionally sets *errorp, replacing any previous value. 2569 * 2570 * (*parentp) must be exclusive or shared locked (depending on flags) and 2571 * referenced and can be an inode or an existing indirect block within the 2572 * inode. 2573 * 2574 * If (*parent) is errored out, this function will not attempt to recurse 2575 * the radix tree and will return NULL along with an appropriate *errorp. 2576 * If NULL is returned and *errorp is 0, the requested lookup could not be 2577 * located. 2578 * 2579 * On return (*parentp) will be modified to point at the deepest parent chain 2580 * element encountered during the search, as a helper for an insertion or 2581 * deletion. 2582 * 2583 * The new (*parentp) will be locked shared or exclusive (depending on flags), 2584 * and referenced, and the old will be unlocked and dereferenced (no change 2585 * if they are both the same). This is particularly important if the caller 2586 * wishes to insert a new chain, (*parentp) will be set properly even if NULL 2587 * is returned, as long as no error occurred. 2588 * 2589 * The matching chain will be returned locked according to flags. 2590 * 2591 * -- 2592 * 2593 * NULL is returned if no match was found, but (*parentp) will still 2594 * potentially be adjusted. 2595 * 2596 * On return (*key_nextp) will point to an iterative value for key_beg. 2597 * (If NULL is returned (*key_nextp) is set to (key_end + 1)). 2598 * 2599 * This function will also recurse up the chain if the key is not within the 2600 * current parent's range. (*parentp) can never be set to NULL. An iteration 2601 * can simply allow (*parentp) to float inside the loop. 2602 * 2603 * NOTE! chain->data is not always resolved. By default it will not be 2604 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 2605 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 2606 * BREF_TYPE_DATA as the device buffer can alias the logical file 2607 * buffer). 2608 */ 2609 hammer2_chain_t * 2610 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp, 2611 hammer2_key_t key_beg, hammer2_key_t key_end, 2612 int *errorp, int flags) 2613 { 2614 hammer2_chain_t *parent; 2615 hammer2_chain_t *chain; 2616 hammer2_blockref_t *base; 2617 hammer2_blockref_t *bref; 2618 hammer2_blockref_t bsave; 2619 hammer2_key_t scan_beg; 2620 hammer2_key_t scan_end; 2621 int count = 0; 2622 int how_always = HAMMER2_RESOLVE_ALWAYS; 2623 int how_maybe = HAMMER2_RESOLVE_MAYBE; 2624 int how; 2625 int generation; 2626 int maxloops = 300000; 2627 2628 if (flags & HAMMER2_LOOKUP_ALWAYS) { 2629 how_maybe = how_always; 2630 how = HAMMER2_RESOLVE_ALWAYS; 2631 } else if (flags & HAMMER2_LOOKUP_NODATA) { 2632 how = HAMMER2_RESOLVE_NEVER; 2633 } else { 2634 how = HAMMER2_RESOLVE_MAYBE; 2635 } 2636 if (flags & HAMMER2_LOOKUP_SHARED) { 2637 how_maybe |= HAMMER2_RESOLVE_SHARED; 2638 how_always |= HAMMER2_RESOLVE_SHARED; 2639 how |= HAMMER2_RESOLVE_SHARED; 2640 } 2641 2642 /* 2643 * Recurse (*parentp) upward if necessary until the parent completely 2644 * encloses the key range or we hit the inode. 2645 * 2646 * Handle races against the flusher deleting indirect nodes on its 2647 * way back up by continuing to recurse upward past the deletion. 2648 */ 2649 parent = *parentp; 2650 *errorp = 0; 2651 2652 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2653 parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2654 scan_beg = parent->bref.key; 2655 scan_end = scan_beg + 2656 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 2657 if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) { 2658 if (key_beg >= scan_beg && key_end <= scan_end) 2659 break; 2660 } 2661 parent = hammer2_chain_repparent(parentp, how_maybe); 2662 } 2663 again: 2664 if (--maxloops == 0) 2665 panic("hammer2_chain_lookup: maxloops"); 2666 2667 /* 2668 * MATCHIND case that does not require parent->data (do prior to 2669 * parent->error check). 2670 */ 2671 switch(parent->bref.type) { 2672 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2673 case HAMMER2_BREF_TYPE_INDIRECT: 2674 if (flags & HAMMER2_LOOKUP_MATCHIND) { 2675 scan_beg = parent->bref.key; 2676 scan_end = scan_beg + 2677 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 2678 if (key_beg == scan_beg && key_end == scan_end) { 2679 chain = parent; 2680 hammer2_chain_ref(chain); 2681 hammer2_chain_lock(chain, how_maybe); 2682 *key_nextp = scan_end + 1; 2683 goto done; 2684 } 2685 } 2686 break; 2687 default: 2688 break; 2689 } 2690 2691 /* 2692 * No lookup is possible if the parent is errored. We delayed 2693 * this check as long as we could to ensure that the parent backup, 2694 * embedded data, and MATCHIND code could still execute. 2695 */ 2696 if (parent->error) { 2697 *errorp = parent->error; 2698 return NULL; 2699 } 2700 2701 /* 2702 * Locate the blockref array. Currently we do a fully associative 2703 * search through the array. 2704 */ 2705 switch(parent->bref.type) { 2706 case HAMMER2_BREF_TYPE_INODE: 2707 /* 2708 * Special shortcut for embedded data returns the inode 2709 * itself. Callers must detect this condition and access 2710 * the embedded data (the strategy code does this for us). 2711 * 2712 * This is only applicable to regular files and softlinks. 2713 * 2714 * We need a second lock on parent. Since we already have 2715 * a lock we must pass LOCKAGAIN to prevent unexpected 2716 * blocking (we don't want to block on a second shared 2717 * ref if an exclusive lock is pending) 2718 */ 2719 if (parent->data->ipdata.meta.op_flags & 2720 HAMMER2_OPFLAG_DIRECTDATA) { 2721 if (flags & HAMMER2_LOOKUP_NODIRECT) { 2722 chain = NULL; 2723 *key_nextp = key_end + 1; 2724 goto done; 2725 } 2726 hammer2_chain_ref(parent); 2727 hammer2_chain_lock(parent, how_always | 2728 HAMMER2_RESOLVE_LOCKAGAIN); 2729 *key_nextp = key_end + 1; 2730 return (parent); 2731 } 2732 base = &parent->data->ipdata.u.blockset.blockref[0]; 2733 count = HAMMER2_SET_COUNT; 2734 break; 2735 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2736 case HAMMER2_BREF_TYPE_INDIRECT: 2737 /* 2738 * Optimize indirect blocks in the INITIAL state to avoid 2739 * I/O. 2740 * 2741 * Debugging: Enter permanent wait state instead of 2742 * panicing on unexpectedly NULL data for the moment. 2743 */ 2744 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2745 base = NULL; 2746 } else { 2747 if (parent->data == NULL) { 2748 kprintf("hammer2: unexpected NULL data " 2749 "on %p\n", parent); 2750 while (1) 2751 tsleep(parent, 0, "xxx", 0); 2752 } 2753 base = &parent->data->npdata[0]; 2754 } 2755 count = parent->bytes / sizeof(hammer2_blockref_t); 2756 break; 2757 case HAMMER2_BREF_TYPE_VOLUME: 2758 base = &parent->data->voldata.sroot_blockset.blockref[0]; 2759 count = HAMMER2_SET_COUNT; 2760 break; 2761 case HAMMER2_BREF_TYPE_FREEMAP: 2762 base = &parent->data->blkset.blockref[0]; 2763 count = HAMMER2_SET_COUNT; 2764 break; 2765 default: 2766 panic("hammer2_chain_lookup: unrecognized " 2767 "blockref(B) type: %d", 2768 parent->bref.type); 2769 base = NULL; /* safety */ 2770 count = 0; /* safety */ 2771 break; 2772 } 2773 2774 /* 2775 * Merged scan to find next candidate. 2776 * 2777 * hammer2_base_*() functions require the parent->core.live_* fields 2778 * to be synchronized. 2779 * 2780 * We need to hold the spinlock to access the block array and RB tree 2781 * and to interlock chain creation. 2782 */ 2783 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 2784 hammer2_chain_countbrefs(parent, base, count); 2785 2786 /* 2787 * Combined search 2788 */ 2789 hammer2_spin_ex(&parent->core.spin); 2790 chain = hammer2_combined_find(parent, base, count, 2791 key_nextp, 2792 key_beg, key_end, 2793 &bref); 2794 generation = parent->core.generation; 2795 2796 /* 2797 * Exhausted parent chain, iterate. 2798 */ 2799 if (bref == NULL) { 2800 KKASSERT(chain == NULL); 2801 hammer2_spin_unex(&parent->core.spin); 2802 if (key_beg == key_end) /* short cut single-key case */ 2803 return (NULL); 2804 2805 /* 2806 * Stop if we reached the end of the iteration. 2807 */ 2808 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 2809 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2810 return (NULL); 2811 } 2812 2813 /* 2814 * Calculate next key, stop if we reached the end of the 2815 * iteration, otherwise go up one level and loop. 2816 */ 2817 key_beg = parent->bref.key + 2818 ((hammer2_key_t)1 << parent->bref.keybits); 2819 if (key_beg == 0 || key_beg > key_end) 2820 return (NULL); 2821 parent = hammer2_chain_repparent(parentp, how_maybe); 2822 goto again; 2823 } 2824 2825 /* 2826 * Selected from blockref or in-memory chain. 2827 */ 2828 bsave = *bref; 2829 if (chain == NULL) { 2830 hammer2_spin_unex(&parent->core.spin); 2831 if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT || 2832 bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2833 chain = hammer2_chain_get(parent, generation, 2834 &bsave, how_maybe); 2835 } else { 2836 chain = hammer2_chain_get(parent, generation, 2837 &bsave, how); 2838 } 2839 if (chain == NULL) 2840 goto again; 2841 } else { 2842 hammer2_chain_ref(chain); 2843 hammer2_spin_unex(&parent->core.spin); 2844 2845 /* 2846 * chain is referenced but not locked. We must lock the 2847 * chain to obtain definitive state. 2848 */ 2849 if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT || 2850 bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2851 hammer2_chain_lock(chain, how_maybe); 2852 } else { 2853 hammer2_chain_lock(chain, how); 2854 } 2855 KKASSERT(chain->parent == parent); 2856 } 2857 if (bcmp(&bsave, &chain->bref, sizeof(bsave)) || 2858 chain->parent != parent) { 2859 hammer2_chain_unlock(chain); 2860 hammer2_chain_drop(chain); 2861 chain = NULL; /* SAFETY */ 2862 goto again; 2863 } 2864 2865 2866 /* 2867 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 2868 * 2869 * NOTE: Chain's key range is not relevant as there might be 2870 * one-offs within the range that are not deleted. 2871 * 2872 * NOTE: Lookups can race delete-duplicate because 2873 * delete-duplicate does not lock the parent's core 2874 * (they just use the spinlock on the core). 2875 */ 2876 if (chain->flags & HAMMER2_CHAIN_DELETED) { 2877 kprintf("skip deleted chain %016jx.%02x key=%016jx\n", 2878 chain->bref.data_off, chain->bref.type, 2879 chain->bref.key); 2880 hammer2_chain_unlock(chain); 2881 hammer2_chain_drop(chain); 2882 chain = NULL; /* SAFETY */ 2883 key_beg = *key_nextp; 2884 if (key_beg == 0 || key_beg > key_end) 2885 return(NULL); 2886 goto again; 2887 } 2888 2889 /* 2890 * If the chain element is an indirect block it becomes the new 2891 * parent and we loop on it. We must maintain our top-down locks 2892 * to prevent the flusher from interfering (i.e. doing a 2893 * delete-duplicate and leaving us recursing down a deleted chain). 2894 * 2895 * The parent always has to be locked with at least RESOLVE_MAYBE 2896 * so we can access its data. It might need a fixup if the caller 2897 * passed incompatible flags. Be careful not to cause a deadlock 2898 * as a data-load requires an exclusive lock. 2899 * 2900 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key 2901 * range is within the requested key range we return the indirect 2902 * block and do NOT loop. This is usually only used to acquire 2903 * freemap nodes. 2904 */ 2905 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2906 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2907 hammer2_chain_unlock(parent); 2908 hammer2_chain_drop(parent); 2909 *parentp = parent = chain; 2910 chain = NULL; /* SAFETY */ 2911 goto again; 2912 } 2913 done: 2914 /* 2915 * All done, return the locked chain. 2916 * 2917 * If the caller does not want a locked chain, replace the lock with 2918 * a ref. Perhaps this can eventually be optimized to not obtain the 2919 * lock in the first place for situations where the data does not 2920 * need to be resolved. 2921 * 2922 * NOTE! A chain->error must be tested by the caller upon return. 2923 * *errorp is only set based on issues which occur while 2924 * trying to reach the chain. 2925 */ 2926 return (chain); 2927 } 2928 2929 /* 2930 * After having issued a lookup we can iterate all matching keys. 2931 * 2932 * If chain is non-NULL we continue the iteration from just after it's index. 2933 * 2934 * If chain is NULL we assume the parent was exhausted and continue the 2935 * iteration at the next parent. 2936 * 2937 * If a fatal error occurs (typically an I/O error), a dummy chain is 2938 * returned with chain->error and error-identifying information set. This 2939 * chain will assert if you try to do anything fancy with it. 2940 * 2941 * XXX Depending on where the error occurs we should allow continued iteration. 2942 * 2943 * parent must be locked on entry and remains locked throughout. chain's 2944 * lock status must match flags. Chain is always at least referenced. 2945 * 2946 * WARNING! The MATCHIND flag does not apply to this function. 2947 */ 2948 hammer2_chain_t * 2949 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain, 2950 hammer2_key_t *key_nextp, 2951 hammer2_key_t key_beg, hammer2_key_t key_end, 2952 int *errorp, int flags) 2953 { 2954 hammer2_chain_t *parent; 2955 int how_maybe; 2956 2957 /* 2958 * Calculate locking flags for upward recursion. 2959 */ 2960 how_maybe = HAMMER2_RESOLVE_MAYBE; 2961 if (flags & HAMMER2_LOOKUP_SHARED) 2962 how_maybe |= HAMMER2_RESOLVE_SHARED; 2963 2964 parent = *parentp; 2965 *errorp = 0; 2966 2967 /* 2968 * Calculate the next index and recalculate the parent if necessary. 2969 */ 2970 if (chain) { 2971 key_beg = chain->bref.key + 2972 ((hammer2_key_t)1 << chain->bref.keybits); 2973 hammer2_chain_unlock(chain); 2974 hammer2_chain_drop(chain); 2975 2976 /* 2977 * chain invalid past this point, but we can still do a 2978 * pointer comparison w/parent. 2979 * 2980 * Any scan where the lookup returned degenerate data embedded 2981 * in the inode has an invalid index and must terminate. 2982 */ 2983 if (chain == parent) 2984 return(NULL); 2985 if (key_beg == 0 || key_beg > key_end) 2986 return(NULL); 2987 chain = NULL; 2988 } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 2989 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2990 /* 2991 * We reached the end of the iteration. 2992 */ 2993 return (NULL); 2994 } else { 2995 /* 2996 * Continue iteration with next parent unless the current 2997 * parent covers the range. 2998 * 2999 * (This also handles the case of a deleted, empty indirect 3000 * node). 3001 */ 3002 key_beg = parent->bref.key + 3003 ((hammer2_key_t)1 << parent->bref.keybits); 3004 if (key_beg == 0 || key_beg > key_end) 3005 return (NULL); 3006 parent = hammer2_chain_repparent(parentp, how_maybe); 3007 } 3008 3009 /* 3010 * And execute 3011 */ 3012 return (hammer2_chain_lookup(parentp, key_nextp, 3013 key_beg, key_end, 3014 errorp, flags)); 3015 } 3016 3017 /* 3018 * Caller wishes to iterate chains under parent, loading new chains into 3019 * chainp. Caller must initialize *chainp to NULL and *firstp to 1, and 3020 * then call hammer2_chain_scan() repeatedly until a non-zero return. 3021 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced 3022 * with the returned chain for the scan. The returned *chainp will be 3023 * locked and referenced. Any prior contents will be unlocked and dropped. 3024 * 3025 * Caller should check the return value. A normal scan EOF will return 3026 * exactly HAMMER2_ERROR_EOF. Any other non-zero value indicates an 3027 * error trying to access parent data. Any error in the returned chain 3028 * must be tested separately by the caller. 3029 * 3030 * (*chainp) is dropped on each scan, but will only be set if the returned 3031 * element itself can recurse. Leaf elements are NOT resolved, loaded, or 3032 * returned via *chainp. The caller will get their bref only. 3033 * 3034 * The raw scan function is similar to lookup/next but does not seek to a key. 3035 * Blockrefs are iterated via first_bref = (parent, NULL) and 3036 * next_chain = (parent, bref). 3037 * 3038 * The passed-in parent must be locked and its data resolved. The function 3039 * nominally returns a locked and referenced *chainp != NULL for chains 3040 * the caller might need to recurse on (and will dipose of any *chainp passed 3041 * in). The caller must check the chain->bref.type either way. 3042 */ 3043 int 3044 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp, 3045 hammer2_blockref_t *bref, int *firstp, 3046 int flags) 3047 { 3048 hammer2_blockref_t *base; 3049 hammer2_blockref_t *bref_ptr; 3050 hammer2_key_t key; 3051 hammer2_key_t next_key; 3052 hammer2_chain_t *chain = NULL; 3053 int count = 0; 3054 int how; 3055 int generation; 3056 int maxloops = 300000; 3057 int error; 3058 3059 error = 0; 3060 3061 /* 3062 * Scan flags borrowed from lookup. 3063 */ 3064 if (flags & HAMMER2_LOOKUP_ALWAYS) { 3065 how = HAMMER2_RESOLVE_ALWAYS; 3066 } else if (flags & HAMMER2_LOOKUP_NODATA) { 3067 how = HAMMER2_RESOLVE_NEVER; 3068 } else { 3069 how = HAMMER2_RESOLVE_MAYBE; 3070 } 3071 if (flags & HAMMER2_LOOKUP_SHARED) { 3072 how |= HAMMER2_RESOLVE_SHARED; 3073 } 3074 3075 /* 3076 * Calculate key to locate first/next element, unlocking the previous 3077 * element as we go. Be careful, the key calculation can overflow. 3078 * 3079 * (also reset bref to NULL) 3080 */ 3081 if (*firstp) { 3082 key = 0; 3083 *firstp = 0; 3084 } else { 3085 key = bref->key + ((hammer2_key_t)1 << bref->keybits); 3086 if ((chain = *chainp) != NULL) { 3087 *chainp = NULL; 3088 hammer2_chain_unlock(chain); 3089 hammer2_chain_drop(chain); 3090 chain = NULL; 3091 } 3092 if (key == 0) { 3093 error |= HAMMER2_ERROR_EOF; 3094 goto done; 3095 } 3096 } 3097 3098 again: 3099 if (parent->error) { 3100 error = parent->error; 3101 goto done; 3102 } 3103 if (--maxloops == 0) 3104 panic("hammer2_chain_scan: maxloops"); 3105 3106 /* 3107 * Locate the blockref array. Currently we do a fully associative 3108 * search through the array. 3109 */ 3110 switch(parent->bref.type) { 3111 case HAMMER2_BREF_TYPE_INODE: 3112 /* 3113 * An inode with embedded data has no sub-chains. 3114 * 3115 * WARNING! Bulk scan code may pass a static chain marked 3116 * as BREF_TYPE_INODE with a copy of the volume 3117 * root blockset to snapshot the volume. 3118 */ 3119 if (parent->data->ipdata.meta.op_flags & 3120 HAMMER2_OPFLAG_DIRECTDATA) { 3121 error |= HAMMER2_ERROR_EOF; 3122 goto done; 3123 } 3124 base = &parent->data->ipdata.u.blockset.blockref[0]; 3125 count = HAMMER2_SET_COUNT; 3126 break; 3127 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3128 case HAMMER2_BREF_TYPE_INDIRECT: 3129 /* 3130 * Optimize indirect blocks in the INITIAL state to avoid 3131 * I/O. 3132 */ 3133 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 3134 base = NULL; 3135 } else { 3136 if (parent->data == NULL) 3137 panic("parent->data is NULL"); 3138 base = &parent->data->npdata[0]; 3139 } 3140 count = parent->bytes / sizeof(hammer2_blockref_t); 3141 break; 3142 case HAMMER2_BREF_TYPE_VOLUME: 3143 base = &parent->data->voldata.sroot_blockset.blockref[0]; 3144 count = HAMMER2_SET_COUNT; 3145 break; 3146 case HAMMER2_BREF_TYPE_FREEMAP: 3147 base = &parent->data->blkset.blockref[0]; 3148 count = HAMMER2_SET_COUNT; 3149 break; 3150 default: 3151 panic("hammer2_chain_scan: unrecognized blockref type: %d", 3152 parent->bref.type); 3153 base = NULL; /* safety */ 3154 count = 0; /* safety */ 3155 break; 3156 } 3157 3158 /* 3159 * Merged scan to find next candidate. 3160 * 3161 * hammer2_base_*() functions require the parent->core.live_* fields 3162 * to be synchronized. 3163 * 3164 * We need to hold the spinlock to access the block array and RB tree 3165 * and to interlock chain creation. 3166 */ 3167 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 3168 hammer2_chain_countbrefs(parent, base, count); 3169 3170 next_key = 0; 3171 bref_ptr = NULL; 3172 hammer2_spin_ex(&parent->core.spin); 3173 chain = hammer2_combined_find(parent, base, count, 3174 &next_key, 3175 key, HAMMER2_KEY_MAX, 3176 &bref_ptr); 3177 generation = parent->core.generation; 3178 3179 /* 3180 * Exhausted parent chain, we're done. 3181 */ 3182 if (bref_ptr == NULL) { 3183 hammer2_spin_unex(&parent->core.spin); 3184 KKASSERT(chain == NULL); 3185 error |= HAMMER2_ERROR_EOF; 3186 goto done; 3187 } 3188 3189 /* 3190 * Copy into the supplied stack-based blockref. 3191 */ 3192 *bref = *bref_ptr; 3193 3194 /* 3195 * Selected from blockref or in-memory chain. 3196 */ 3197 if (chain == NULL) { 3198 switch(bref->type) { 3199 case HAMMER2_BREF_TYPE_INODE: 3200 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3201 case HAMMER2_BREF_TYPE_INDIRECT: 3202 case HAMMER2_BREF_TYPE_VOLUME: 3203 case HAMMER2_BREF_TYPE_FREEMAP: 3204 /* 3205 * Recursion, always get the chain 3206 */ 3207 hammer2_spin_unex(&parent->core.spin); 3208 chain = hammer2_chain_get(parent, generation, 3209 bref, how); 3210 if (chain == NULL) 3211 goto again; 3212 break; 3213 default: 3214 /* 3215 * No recursion, do not waste time instantiating 3216 * a chain, just iterate using the bref. 3217 */ 3218 hammer2_spin_unex(&parent->core.spin); 3219 break; 3220 } 3221 } else { 3222 /* 3223 * Recursion or not we need the chain in order to supply 3224 * the bref. 3225 */ 3226 hammer2_chain_ref(chain); 3227 hammer2_spin_unex(&parent->core.spin); 3228 hammer2_chain_lock(chain, how); 3229 } 3230 if (chain && 3231 (bcmp(bref, &chain->bref, sizeof(*bref)) || 3232 chain->parent != parent)) { 3233 hammer2_chain_unlock(chain); 3234 hammer2_chain_drop(chain); 3235 chain = NULL; 3236 goto again; 3237 } 3238 3239 /* 3240 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 3241 * 3242 * NOTE: chain's key range is not relevant as there might be 3243 * one-offs within the range that are not deleted. 3244 * 3245 * NOTE: XXX this could create problems with scans used in 3246 * situations other than mount-time recovery. 3247 * 3248 * NOTE: Lookups can race delete-duplicate because 3249 * delete-duplicate does not lock the parent's core 3250 * (they just use the spinlock on the core). 3251 */ 3252 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3253 hammer2_chain_unlock(chain); 3254 hammer2_chain_drop(chain); 3255 chain = NULL; 3256 3257 key = next_key; 3258 if (key == 0) { 3259 error |= HAMMER2_ERROR_EOF; 3260 goto done; 3261 } 3262 goto again; 3263 } 3264 3265 done: 3266 /* 3267 * All done, return the bref or NULL, supply chain if necessary. 3268 */ 3269 if (chain) 3270 *chainp = chain; 3271 return (error); 3272 } 3273 3274 /* 3275 * Create and return a new hammer2 system memory structure of the specified 3276 * key, type and size and insert it under (*parentp). This is a full 3277 * insertion, based on the supplied key/keybits, and may involve creating 3278 * indirect blocks and moving other chains around via delete/duplicate. 3279 * 3280 * This call can be made with parent == NULL as long as a non -1 methods 3281 * is supplied. hmp must also be supplied in this situation (otherwise 3282 * hmp is extracted from the supplied parent). The chain will be detached 3283 * from the topology. A later call with both parent and chain can be made 3284 * to attach it. 3285 * 3286 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION 3287 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 3288 * FULL. This typically means that the caller is creating the chain after 3289 * doing a hammer2_chain_lookup(). 3290 * 3291 * (*parentp) must be exclusive locked and may be replaced on return 3292 * depending on how much work the function had to do. 3293 * 3294 * (*parentp) must not be errored or this function will assert. 3295 * 3296 * (*chainp) usually starts out NULL and returns the newly created chain, 3297 * but if the caller desires the caller may allocate a disconnected chain 3298 * and pass it in instead. 3299 * 3300 * This function should NOT be used to insert INDIRECT blocks. It is 3301 * typically used to create/insert inodes and data blocks. 3302 * 3303 * Caller must pass-in an exclusively locked parent the new chain is to 3304 * be inserted under, and optionally pass-in a disconnected, exclusively 3305 * locked chain to insert (else we create a new chain). The function will 3306 * adjust (*parentp) as necessary, create or connect the chain, and 3307 * return an exclusively locked chain in *chainp. 3308 * 3309 * When creating a PFSROOT inode under the super-root, pmp is typically NULL 3310 * and will be reassigned. 3311 * 3312 * NOTE: returns HAMMER_ERROR_* flags 3313 */ 3314 int 3315 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp, 3316 hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods, 3317 hammer2_key_t key, int keybits, int type, size_t bytes, 3318 hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags) 3319 { 3320 hammer2_chain_t *chain; 3321 hammer2_chain_t *parent; 3322 hammer2_blockref_t *base; 3323 hammer2_blockref_t dummy; 3324 int allocated = 0; 3325 int error = 0; 3326 int count; 3327 int maxloops = 300000; 3328 3329 /* 3330 * Topology may be crossing a PFS boundary. 3331 */ 3332 parent = *parentp; 3333 if (parent) { 3334 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3335 KKASSERT(parent->error == 0); 3336 hmp = parent->hmp; 3337 } 3338 chain = *chainp; 3339 3340 if (chain == NULL) { 3341 /* 3342 * First allocate media space and construct the dummy bref, 3343 * then allocate the in-memory chain structure. Set the 3344 * INITIAL flag for fresh chains which do not have embedded 3345 * data. 3346 */ 3347 bzero(&dummy, sizeof(dummy)); 3348 dummy.type = type; 3349 dummy.key = key; 3350 dummy.keybits = keybits; 3351 dummy.data_off = hammer2_getradix(bytes); 3352 3353 /* 3354 * Inherit methods from parent by default. Primarily used 3355 * for BREF_TYPE_DATA. Non-data types *must* be set to 3356 * a non-NONE check algorithm. 3357 */ 3358 if (methods == HAMMER2_METH_DEFAULT) 3359 dummy.methods = parent->bref.methods; 3360 else 3361 dummy.methods = (uint8_t)methods; 3362 3363 if (type != HAMMER2_BREF_TYPE_DATA && 3364 HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) { 3365 dummy.methods |= 3366 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT); 3367 } 3368 3369 chain = hammer2_chain_alloc(hmp, pmp, &dummy); 3370 3371 /* 3372 * Lock the chain manually, chain_lock will load the chain 3373 * which we do NOT want to do. (note: chain->refs is set 3374 * to 1 by chain_alloc() for us, but lockcnt is not). 3375 */ 3376 chain->lockcnt = 1; 3377 hammer2_mtx_ex(&chain->lock); 3378 allocated = 1; 3379 3380 /* 3381 * Set INITIAL to optimize I/O. The flag will generally be 3382 * processed when we call hammer2_chain_modify(). 3383 */ 3384 switch(type) { 3385 case HAMMER2_BREF_TYPE_VOLUME: 3386 case HAMMER2_BREF_TYPE_FREEMAP: 3387 panic("hammer2_chain_create: called with volume type"); 3388 break; 3389 case HAMMER2_BREF_TYPE_INDIRECT: 3390 panic("hammer2_chain_create: cannot be used to" 3391 "create indirect block"); 3392 break; 3393 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3394 panic("hammer2_chain_create: cannot be used to" 3395 "create freemap root or node"); 3396 break; 3397 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3398 KKASSERT(bytes == sizeof(chain->data->bmdata)); 3399 /* fall through */ 3400 case HAMMER2_BREF_TYPE_DIRENT: 3401 case HAMMER2_BREF_TYPE_INODE: 3402 case HAMMER2_BREF_TYPE_DATA: 3403 default: 3404 /* 3405 * leave chain->data NULL, set INITIAL 3406 */ 3407 KKASSERT(chain->data == NULL); 3408 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 3409 break; 3410 } 3411 } else { 3412 /* 3413 * We are reattaching a previously deleted chain, possibly 3414 * under a new parent and possibly with a new key/keybits. 3415 * The chain does not have to be in a modified state. The 3416 * UPDATE flag will be set later on in this routine. 3417 * 3418 * Do NOT mess with the current state of the INITIAL flag. 3419 */ 3420 chain->bref.key = key; 3421 chain->bref.keybits = keybits; 3422 if (chain->flags & HAMMER2_CHAIN_DELETED) 3423 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3424 KKASSERT(chain->parent == NULL); 3425 } 3426 3427 /* 3428 * Set the appropriate bref flag if requested. 3429 * 3430 * NOTE! Callers can call this function to move chains without 3431 * knowing about special flags, so don't clear bref flags 3432 * here! 3433 */ 3434 if (flags & HAMMER2_INSERT_PFSROOT) 3435 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 3436 3437 if (parent == NULL) 3438 goto skip; 3439 3440 /* 3441 * Calculate how many entries we have in the blockref array and 3442 * determine if an indirect block is required when inserting into 3443 * the parent. 3444 */ 3445 again: 3446 if (--maxloops == 0) 3447 panic("hammer2_chain_create: maxloops"); 3448 3449 switch(parent->bref.type) { 3450 case HAMMER2_BREF_TYPE_INODE: 3451 if ((parent->data->ipdata.meta.op_flags & 3452 HAMMER2_OPFLAG_DIRECTDATA) != 0) { 3453 kprintf("hammer2: parent set for direct-data! " 3454 "pkey=%016jx ckey=%016jx\n", 3455 parent->bref.key, 3456 chain->bref.key); 3457 } 3458 KKASSERT((parent->data->ipdata.meta.op_flags & 3459 HAMMER2_OPFLAG_DIRECTDATA) == 0); 3460 KKASSERT(parent->data != NULL); 3461 base = &parent->data->ipdata.u.blockset.blockref[0]; 3462 count = HAMMER2_SET_COUNT; 3463 break; 3464 case HAMMER2_BREF_TYPE_INDIRECT: 3465 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3466 if (parent->flags & HAMMER2_CHAIN_INITIAL) 3467 base = NULL; 3468 else 3469 base = &parent->data->npdata[0]; 3470 count = parent->bytes / sizeof(hammer2_blockref_t); 3471 break; 3472 case HAMMER2_BREF_TYPE_VOLUME: 3473 KKASSERT(parent->data != NULL); 3474 base = &parent->data->voldata.sroot_blockset.blockref[0]; 3475 count = HAMMER2_SET_COUNT; 3476 break; 3477 case HAMMER2_BREF_TYPE_FREEMAP: 3478 KKASSERT(parent->data != NULL); 3479 base = &parent->data->blkset.blockref[0]; 3480 count = HAMMER2_SET_COUNT; 3481 break; 3482 default: 3483 panic("hammer2_chain_create: unrecognized blockref type: %d", 3484 parent->bref.type); 3485 base = NULL; 3486 count = 0; 3487 break; 3488 } 3489 3490 /* 3491 * Make sure we've counted the brefs 3492 */ 3493 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 3494 hammer2_chain_countbrefs(parent, base, count); 3495 3496 KASSERT(parent->core.live_count >= 0 && 3497 parent->core.live_count <= count, 3498 ("bad live_count %d/%d (%02x, %d)", 3499 parent->core.live_count, count, 3500 parent->bref.type, parent->bytes)); 3501 3502 /* 3503 * If no free blockref could be found we must create an indirect 3504 * block and move a number of blockrefs into it. With the parent 3505 * locked we can safely lock each child in order to delete+duplicate 3506 * it without causing a deadlock. 3507 * 3508 * This may return the new indirect block or the old parent depending 3509 * on where the key falls. NULL is returned on error. 3510 */ 3511 if (parent->core.live_count == count) { 3512 hammer2_chain_t *nparent; 3513 3514 KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0); 3515 3516 nparent = hammer2_chain_create_indirect(parent, key, keybits, 3517 mtid, type, &error); 3518 if (nparent == NULL) { 3519 if (allocated) 3520 hammer2_chain_drop(chain); 3521 chain = NULL; 3522 goto done; 3523 } 3524 if (parent != nparent) { 3525 hammer2_chain_unlock(parent); 3526 hammer2_chain_drop(parent); 3527 parent = *parentp = nparent; 3528 } 3529 goto again; 3530 } 3531 3532 /* 3533 * fall through if parent, or skip to here if no parent. 3534 */ 3535 skip: 3536 if (chain->flags & HAMMER2_CHAIN_DELETED) 3537 kprintf("Inserting deleted chain @%016jx\n", 3538 chain->bref.key); 3539 3540 /* 3541 * Link the chain into its parent. 3542 */ 3543 if (chain->parent != NULL) 3544 panic("hammer2: hammer2_chain_create: chain already connected"); 3545 KKASSERT(chain->parent == NULL); 3546 if (parent) { 3547 KKASSERT(parent->core.live_count < count); 3548 hammer2_chain_insert(parent, chain, 3549 HAMMER2_CHAIN_INSERT_SPIN | 3550 HAMMER2_CHAIN_INSERT_LIVE, 3551 0); 3552 } 3553 3554 if (allocated) { 3555 /* 3556 * Mark the newly created chain modified. This will cause 3557 * UPDATE to be set and process the INITIAL flag. 3558 * 3559 * Device buffers are not instantiated for DATA elements 3560 * as these are handled by logical buffers. 3561 * 3562 * Indirect and freemap node indirect blocks are handled 3563 * by hammer2_chain_create_indirect() and not by this 3564 * function. 3565 * 3566 * Data for all other bref types is expected to be 3567 * instantiated (INODE, LEAF). 3568 */ 3569 switch(chain->bref.type) { 3570 case HAMMER2_BREF_TYPE_DATA: 3571 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3572 case HAMMER2_BREF_TYPE_DIRENT: 3573 case HAMMER2_BREF_TYPE_INODE: 3574 error = hammer2_chain_modify(chain, mtid, dedup_off, 3575 HAMMER2_MODIFY_OPTDATA); 3576 break; 3577 default: 3578 /* 3579 * Remaining types are not supported by this function. 3580 * In particular, INDIRECT and LEAF_NODE types are 3581 * handled by create_indirect(). 3582 */ 3583 panic("hammer2_chain_create: bad type: %d", 3584 chain->bref.type); 3585 /* NOT REACHED */ 3586 break; 3587 } 3588 } else { 3589 /* 3590 * When reconnecting a chain we must set UPDATE and 3591 * setflush so the flush recognizes that it must update 3592 * the bref in the parent. 3593 */ 3594 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) 3595 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3596 } 3597 3598 /* 3599 * We must setflush(parent) to ensure that it recurses through to 3600 * chain. setflush(chain) might not work because ONFLUSH is possibly 3601 * already set in the chain (so it won't recurse up to set it in the 3602 * parent). 3603 */ 3604 if (parent) 3605 hammer2_chain_setflush(parent); 3606 3607 done: 3608 *chainp = chain; 3609 3610 return (error); 3611 } 3612 3613 /* 3614 * Move the chain from its old parent to a new parent. The chain must have 3615 * already been deleted or already disconnected (or never associated) with 3616 * a parent. The chain is reassociated with the new parent and the deleted 3617 * flag will be cleared (no longer deleted). The chain's modification state 3618 * is not altered. 3619 * 3620 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 3621 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 3622 * FULL. This typically means that the caller is creating the chain after 3623 * doing a hammer2_chain_lookup(). 3624 * 3625 * Neither (parent) or (chain) can be errored. 3626 * 3627 * If (parent) is non-NULL then the chain is inserted under the parent. 3628 * 3629 * If (parent) is NULL then the newly duplicated chain is not inserted 3630 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 3631 * passing into hammer2_chain_create() after this function returns). 3632 * 3633 * WARNING! This function calls create which means it can insert indirect 3634 * blocks. This can cause other unrelated chains in the parent to 3635 * be moved to a newly inserted indirect block in addition to the 3636 * specific chain. 3637 */ 3638 void 3639 hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain, 3640 hammer2_tid_t mtid, int flags) 3641 { 3642 hammer2_blockref_t *bref; 3643 hammer2_chain_t *parent; 3644 3645 /* 3646 * WARNING! We should never resolve DATA to device buffers 3647 * (XXX allow it if the caller did?), and since 3648 * we currently do not have the logical buffer cache 3649 * buffer in-hand to fix its cached physical offset 3650 * we also force the modify code to not COW it. XXX 3651 * 3652 * NOTE! We allow error'd chains to be renamed. The bref itself 3653 * is good and can be renamed. The content, however, may 3654 * be inaccessible. 3655 */ 3656 KKASSERT(chain->parent == NULL); 3657 /*KKASSERT(chain->error == 0); allow */ 3658 bref = &chain->bref; 3659 3660 /* 3661 * If parent is not NULL the duplicated chain will be entered under 3662 * the parent and the UPDATE bit set to tell flush to update 3663 * the blockref. 3664 * 3665 * We must setflush(parent) to ensure that it recurses through to 3666 * chain. setflush(chain) might not work because ONFLUSH is possibly 3667 * already set in the chain (so it won't recurse up to set it in the 3668 * parent). 3669 * 3670 * Having both chains locked is extremely important for atomicy. 3671 */ 3672 if (parentp && (parent = *parentp) != NULL) { 3673 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3674 KKASSERT(parent->refs > 0); 3675 KKASSERT(parent->error == 0); 3676 3677 hammer2_chain_create(parentp, &chain, NULL, chain->pmp, 3678 HAMMER2_METH_DEFAULT, 3679 bref->key, bref->keybits, bref->type, 3680 chain->bytes, mtid, 0, flags); 3681 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE); 3682 hammer2_chain_setflush(*parentp); 3683 } 3684 } 3685 3686 /* 3687 * This works in tandem with delete_obref() to install a blockref in 3688 * (typically) an indirect block that is associated with the chain being 3689 * moved to *parentp. 3690 * 3691 * The reason we need this function is that the caller needs to maintain 3692 * the blockref as it was, and not generate a new blockref for what might 3693 * be a modified chain. Otherwise stuff will leak into the flush that 3694 * the flush code's FLUSH_INODE_STOP flag is unable to catch. 3695 * 3696 * It is EXTREMELY important that we properly set CHAIN_BLKMAPUPD and 3697 * CHAIN_UPDATE. We must set BLKMAPUPD if the bref does not match, and 3698 * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if 3699 * it does. Otherwise we can end up in a situation where H2 is unable to 3700 * clean up the in-memory chain topology. 3701 * 3702 * The reason for this is that flushes do not generally flush through 3703 * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq 3704 * or sideq to properly flush and dispose of the related inode chain's flags. 3705 * Situations where the inode is not actually modified by the frontend, 3706 * but where we have to move the related chains around as we insert or cleanup 3707 * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory 3708 * inode chain that does not have a hammer2_inode_t associated with it. 3709 */ 3710 static void 3711 hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain, 3712 hammer2_tid_t mtid, int flags, 3713 hammer2_blockref_t *obref) 3714 { 3715 hammer2_chain_rename(parentp, chain, mtid, flags); 3716 3717 if (obref->type != HAMMER2_BREF_TYPE_EMPTY) { 3718 hammer2_blockref_t *tbase; 3719 int tcount; 3720 3721 KKASSERT((chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0); 3722 hammer2_chain_modify(*parentp, mtid, 0, 0); 3723 tbase = hammer2_chain_base_and_count(*parentp, &tcount); 3724 hammer2_base_insert(*parentp, tbase, tcount, chain, obref); 3725 if (bcmp(obref, &chain->bref, sizeof(chain->bref))) { 3726 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD | 3727 HAMMER2_CHAIN_UPDATE); 3728 } else { 3729 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3730 } 3731 } 3732 } 3733 3734 /* 3735 * Helper function for deleting chains. 3736 * 3737 * The chain is removed from the live view (the RBTREE) as well as the parent's 3738 * blockmap. Both chain and its parent must be locked. 3739 * 3740 * parent may not be errored. chain can be errored. 3741 */ 3742 static int 3743 _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain, 3744 hammer2_tid_t mtid, int flags, 3745 hammer2_blockref_t *obref) 3746 { 3747 int error = 0; 3748 3749 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0); 3750 KKASSERT(chain->parent == parent); 3751 3752 if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) { 3753 /* 3754 * Chain is blockmapped, so there must be a parent. 3755 * Atomically remove the chain from the parent and remove 3756 * the blockmap entry. The parent must be set modified 3757 * to remove the blockmap entry. 3758 */ 3759 hammer2_blockref_t *base; 3760 int count; 3761 3762 KKASSERT(parent != NULL); 3763 KKASSERT(parent->error == 0); 3764 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0); 3765 error = hammer2_chain_modify(parent, mtid, 0, 0); 3766 if (error) 3767 goto done; 3768 3769 /* 3770 * Calculate blockmap pointer 3771 */ 3772 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 3773 hammer2_spin_ex(&chain->core.spin); 3774 hammer2_spin_ex(&parent->core.spin); 3775 3776 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3777 atomic_add_int(&parent->core.live_count, -1); 3778 ++parent->core.generation; 3779 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 3780 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 3781 --parent->core.chain_count; 3782 chain->parent = NULL; 3783 3784 switch(parent->bref.type) { 3785 case HAMMER2_BREF_TYPE_INODE: 3786 /* 3787 * Access the inode's block array. However, there 3788 * is no block array if the inode is flagged 3789 * DIRECTDATA. 3790 */ 3791 if (parent->data && 3792 (parent->data->ipdata.meta.op_flags & 3793 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 3794 base = 3795 &parent->data->ipdata.u.blockset.blockref[0]; 3796 } else { 3797 base = NULL; 3798 } 3799 count = HAMMER2_SET_COUNT; 3800 break; 3801 case HAMMER2_BREF_TYPE_INDIRECT: 3802 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3803 if (parent->data) 3804 base = &parent->data->npdata[0]; 3805 else 3806 base = NULL; 3807 count = parent->bytes / sizeof(hammer2_blockref_t); 3808 break; 3809 case HAMMER2_BREF_TYPE_VOLUME: 3810 base = &parent->data->voldata. 3811 sroot_blockset.blockref[0]; 3812 count = HAMMER2_SET_COUNT; 3813 break; 3814 case HAMMER2_BREF_TYPE_FREEMAP: 3815 base = &parent->data->blkset.blockref[0]; 3816 count = HAMMER2_SET_COUNT; 3817 break; 3818 default: 3819 base = NULL; 3820 count = 0; 3821 panic("_hammer2_chain_delete_helper: " 3822 "unrecognized blockref type: %d", 3823 parent->bref.type); 3824 break; 3825 } 3826 3827 /* 3828 * delete blockmapped chain from its parent. 3829 * 3830 * The parent is not affected by any statistics in chain 3831 * which are pending synchronization. That is, there is 3832 * nothing to undo in the parent since they have not yet 3833 * been incorporated into the parent. 3834 * 3835 * The parent is affected by statistics stored in inodes. 3836 * Those have already been synchronized, so they must be 3837 * undone. XXX split update possible w/delete in middle? 3838 */ 3839 if (base) { 3840 hammer2_base_delete(parent, base, count, chain, obref); 3841 } 3842 hammer2_spin_unex(&parent->core.spin); 3843 hammer2_spin_unex(&chain->core.spin); 3844 } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 3845 /* 3846 * Chain is not blockmapped but a parent is present. 3847 * Atomically remove the chain from the parent. There is 3848 * no blockmap entry to remove. 3849 * 3850 * Because chain was associated with a parent but not 3851 * synchronized, the chain's *_count_up fields contain 3852 * inode adjustment statistics which must be undone. 3853 */ 3854 hammer2_spin_ex(&chain->core.spin); 3855 hammer2_spin_ex(&parent->core.spin); 3856 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3857 atomic_add_int(&parent->core.live_count, -1); 3858 ++parent->core.generation; 3859 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 3860 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 3861 --parent->core.chain_count; 3862 chain->parent = NULL; 3863 hammer2_spin_unex(&parent->core.spin); 3864 hammer2_spin_unex(&chain->core.spin); 3865 } else { 3866 /* 3867 * Chain is not blockmapped and has no parent. This 3868 * is a degenerate case. 3869 */ 3870 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3871 } 3872 done: 3873 return error; 3874 } 3875 3876 /* 3877 * Create an indirect block that covers one or more of the elements in the 3878 * current parent. Either returns the existing parent with no locking or 3879 * ref changes or returns the new indirect block locked and referenced 3880 * and leaving the original parent lock/ref intact as well. 3881 * 3882 * If an error occurs, NULL is returned and *errorp is set to the H2 error. 3883 * 3884 * The returned chain depends on where the specified key falls. 3885 * 3886 * The key/keybits for the indirect mode only needs to follow three rules: 3887 * 3888 * (1) That all elements underneath it fit within its key space and 3889 * 3890 * (2) That all elements outside it are outside its key space. 3891 * 3892 * (3) When creating the new indirect block any elements in the current 3893 * parent that fit within the new indirect block's keyspace must be 3894 * moved into the new indirect block. 3895 * 3896 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider 3897 * keyspace the the current parent, but lookup/iteration rules will 3898 * ensure (and must ensure) that rule (2) for all parents leading up 3899 * to the nearest inode or the root volume header is adhered to. This 3900 * is accomplished by always recursing through matching keyspaces in 3901 * the hammer2_chain_lookup() and hammer2_chain_next() API. 3902 * 3903 * The current implementation calculates the current worst-case keyspace by 3904 * iterating the current parent and then divides it into two halves, choosing 3905 * whichever half has the most elements (not necessarily the half containing 3906 * the requested key). 3907 * 3908 * We can also opt to use the half with the least number of elements. This 3909 * causes lower-numbered keys (aka logical file offsets) to recurse through 3910 * fewer indirect blocks and higher-numbered keys to recurse through more. 3911 * This also has the risk of not moving enough elements to the new indirect 3912 * block and being forced to create several indirect blocks before the element 3913 * can be inserted. 3914 * 3915 * Must be called with an exclusively locked parent. 3916 * 3917 * NOTE: *errorp set to HAMMER_ERROR_* flags 3918 */ 3919 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent, 3920 hammer2_key_t *keyp, int keybits, 3921 hammer2_blockref_t *base, int count); 3922 static int hammer2_chain_indkey_file(hammer2_chain_t *parent, 3923 hammer2_key_t *keyp, int keybits, 3924 hammer2_blockref_t *base, int count, 3925 int ncount); 3926 static int hammer2_chain_indkey_dir(hammer2_chain_t *parent, 3927 hammer2_key_t *keyp, int keybits, 3928 hammer2_blockref_t *base, int count, 3929 int ncount); 3930 static 3931 hammer2_chain_t * 3932 hammer2_chain_create_indirect(hammer2_chain_t *parent, 3933 hammer2_key_t create_key, int create_bits, 3934 hammer2_tid_t mtid, int for_type, int *errorp) 3935 { 3936 hammer2_dev_t *hmp; 3937 hammer2_blockref_t *base; 3938 hammer2_blockref_t *bref; 3939 hammer2_blockref_t bsave; 3940 hammer2_blockref_t dummy; 3941 hammer2_chain_t *chain; 3942 hammer2_chain_t *ichain; 3943 hammer2_key_t key = create_key; 3944 hammer2_key_t key_beg; 3945 hammer2_key_t key_end; 3946 hammer2_key_t key_next; 3947 int keybits = create_bits; 3948 int count; 3949 int ncount; 3950 int nbytes; 3951 int loops; 3952 int error; 3953 int reason; 3954 int generation; 3955 int maxloops = 300000; 3956 3957 /* 3958 * Calculate the base blockref pointer or NULL if the chain 3959 * is known to be empty. We need to calculate the array count 3960 * for RB lookups either way. 3961 */ 3962 hmp = parent->hmp; 3963 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3964 3965 /* 3966 * Pre-modify the parent now to avoid having to deal with error 3967 * processing if we tried to later (in the middle of our loop). 3968 * 3969 * We are going to be moving bref's around, the indirect blocks 3970 * cannot be in an initial state. Do not pass MODIFY_OPTDATA. 3971 */ 3972 *errorp = hammer2_chain_modify(parent, mtid, 0, 0); 3973 if (*errorp) { 3974 kprintf("hammer2_chain_create_indirect: error %08x %s\n", 3975 *errorp, hammer2_error_str(*errorp)); 3976 return NULL; 3977 } 3978 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0); 3979 3980 /*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/ 3981 base = hammer2_chain_base_and_count(parent, &count); 3982 3983 /* 3984 * How big should our new indirect block be? It has to be at least 3985 * as large as its parent for splits to work properly. 3986 * 3987 * The freemap uses a specific indirect block size. The number of 3988 * levels are built dynamically and ultimately depend on the size 3989 * volume. Because freemap blocks are taken from the reserved areas 3990 * of the volume our goal is efficiency (fewer levels) and not so 3991 * much to save disk space. 3992 * 3993 * The first indirect block level for a directory usually uses 3994 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries). Due to 3995 * the hash mechanism, this typically gives us a nominal 3996 * 32 * 4 entries with one level of indirection. 3997 * 3998 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE 3999 * indirect blocks. The initial 4 entries in the inode gives us 4000 * 256KB. Up to 4 indirect blocks gives us 32MB. Three levels 4001 * of indirection gives us 137GB, and so forth. H2 can support 4002 * huge file sizes but they are not typical, so we try to stick 4003 * with compactness and do not use a larger indirect block size. 4004 * 4005 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but 4006 * due to the way indirect blocks are created this usually winds 4007 * up being extremely inefficient for small files. Even though 4008 * 16KB requires more levels of indirection for very large files, 4009 * the 16KB records can be ganged together into 64KB DIOs. 4010 */ 4011 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 4012 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 4013 nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE; 4014 } else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) { 4015 if (parent->data->ipdata.meta.type == 4016 HAMMER2_OBJTYPE_DIRECTORY) 4017 nbytes = HAMMER2_IND_BYTES_MIN; /* 4KB = 32 entries */ 4018 else 4019 nbytes = HAMMER2_IND_BYTES_NOM; /* 16KB = ~8MB file */ 4020 4021 } else { 4022 nbytes = HAMMER2_IND_BYTES_NOM; 4023 } 4024 if (nbytes < count * sizeof(hammer2_blockref_t)) { 4025 KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE && 4026 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF); 4027 nbytes = count * sizeof(hammer2_blockref_t); 4028 } 4029 ncount = nbytes / sizeof(hammer2_blockref_t); 4030 4031 /* 4032 * When creating an indirect block for a freemap node or leaf 4033 * the key/keybits must be fitted to static radix levels because 4034 * particular radix levels use particular reserved blocks in the 4035 * related zone. 4036 * 4037 * This routine calculates the key/radix of the indirect block 4038 * we need to create, and whether it is on the high-side or the 4039 * low-side. 4040 */ 4041 switch(for_type) { 4042 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 4043 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 4044 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits, 4045 base, count); 4046 break; 4047 case HAMMER2_BREF_TYPE_DATA: 4048 keybits = hammer2_chain_indkey_file(parent, &key, keybits, 4049 base, count, ncount); 4050 break; 4051 case HAMMER2_BREF_TYPE_DIRENT: 4052 case HAMMER2_BREF_TYPE_INODE: 4053 keybits = hammer2_chain_indkey_dir(parent, &key, keybits, 4054 base, count, ncount); 4055 break; 4056 default: 4057 panic("illegal indirect block for bref type %d", for_type); 4058 break; 4059 } 4060 4061 /* 4062 * Normalize the key for the radix being represented, keeping the 4063 * high bits and throwing away the low bits. 4064 */ 4065 key &= ~(((hammer2_key_t)1 << keybits) - 1); 4066 4067 /* 4068 * Ok, create our new indirect block 4069 */ 4070 bzero(&dummy, sizeof(dummy)); 4071 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 4072 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 4073 dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE; 4074 } else { 4075 dummy.type = HAMMER2_BREF_TYPE_INDIRECT; 4076 } 4077 dummy.key = key; 4078 dummy.keybits = keybits; 4079 dummy.data_off = hammer2_getradix(nbytes); 4080 dummy.methods = 4081 HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) | 4082 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE); 4083 4084 ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy); 4085 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL); 4086 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE); 4087 /* ichain has one ref at this point */ 4088 4089 /* 4090 * We have to mark it modified to allocate its block, but use 4091 * OPTDATA to allow it to remain in the INITIAL state. Otherwise 4092 * it won't be acted upon by the flush code. 4093 * 4094 * XXX remove OPTDATA, we need a fully initialized indirect block to 4095 * be able to move the original blockref. 4096 */ 4097 *errorp = hammer2_chain_modify(ichain, mtid, 0, 0); 4098 if (*errorp) { 4099 kprintf("hammer2_chain_create_indirect: error %08x %s\n", 4100 *errorp, hammer2_error_str(*errorp)); 4101 hammer2_chain_unlock(ichain); 4102 hammer2_chain_drop(ichain); 4103 return NULL; 4104 } 4105 KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0); 4106 4107 /* 4108 * Iterate the original parent and move the matching brefs into 4109 * the new indirect block. 4110 * 4111 * XXX handle flushes. 4112 */ 4113 key_beg = 0; 4114 key_end = HAMMER2_KEY_MAX; 4115 key_next = 0; /* avoid gcc warnings */ 4116 hammer2_spin_ex(&parent->core.spin); 4117 loops = 0; 4118 reason = 0; 4119 4120 for (;;) { 4121 /* 4122 * Parent may have been modified, relocating its block array. 4123 * Reload the base pointer. 4124 */ 4125 base = hammer2_chain_base_and_count(parent, &count); 4126 4127 if (++loops > 100000) { 4128 hammer2_spin_unex(&parent->core.spin); 4129 panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n", 4130 reason, parent, base, count, key_next); 4131 } 4132 4133 /* 4134 * NOTE: spinlock stays intact, returned chain (if not NULL) 4135 * is not referenced or locked which means that we 4136 * cannot safely check its flagged / deletion status 4137 * until we lock it. 4138 */ 4139 chain = hammer2_combined_find(parent, base, count, 4140 &key_next, 4141 key_beg, key_end, 4142 &bref); 4143 generation = parent->core.generation; 4144 if (bref == NULL) 4145 break; 4146 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4147 4148 /* 4149 * Skip keys that are not within the key/radix of the new 4150 * indirect block. They stay in the parent. 4151 */ 4152 if (rounddown2(key ^ bref->key, (hammer2_key_t)1 << keybits) != 0) { 4153 goto next_key_spinlocked; 4154 } 4155 4156 /* 4157 * Load the new indirect block by acquiring the related 4158 * chains (potentially from media as it might not be 4159 * in-memory). Then move it to the new parent (ichain). 4160 * 4161 * chain is referenced but not locked. We must lock the 4162 * chain to obtain definitive state. 4163 */ 4164 bsave = *bref; 4165 if (chain) { 4166 /* 4167 * Use chain already present in the RBTREE 4168 */ 4169 hammer2_chain_ref(chain); 4170 hammer2_spin_unex(&parent->core.spin); 4171 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER); 4172 } else { 4173 /* 4174 * Get chain for blockref element. _get returns NULL 4175 * on insertion race. 4176 */ 4177 hammer2_spin_unex(&parent->core.spin); 4178 chain = hammer2_chain_get(parent, generation, &bsave, 4179 HAMMER2_RESOLVE_NEVER); 4180 if (chain == NULL) { 4181 reason = 1; 4182 hammer2_spin_ex(&parent->core.spin); 4183 continue; 4184 } 4185 } 4186 4187 /* 4188 * This is always live so if the chain has been deleted 4189 * we raced someone and we have to retry. 4190 * 4191 * NOTE: Lookups can race delete-duplicate because 4192 * delete-duplicate does not lock the parent's core 4193 * (they just use the spinlock on the core). 4194 * 4195 * (note reversed logic for this one) 4196 */ 4197 if (bcmp(&bsave, &chain->bref, sizeof(bsave)) || 4198 chain->parent != parent || 4199 (chain->flags & HAMMER2_CHAIN_DELETED)) { 4200 hammer2_chain_unlock(chain); 4201 hammer2_chain_drop(chain); 4202 if (hammer2_debug & 0x0040) { 4203 kprintf("LOST PARENT RETRY " 4204 "RETRY (%p,%p)->%p %08x\n", 4205 parent, chain->parent, chain, chain->flags); 4206 } 4207 hammer2_spin_ex(&parent->core.spin); 4208 continue; 4209 } 4210 4211 /* 4212 * Shift the chain to the indirect block. 4213 * 4214 * WARNING! No reason for us to load chain data, pass NOSTATS 4215 * to prevent delete/insert from trying to access 4216 * inode stats (and thus asserting if there is no 4217 * chain->data loaded). 4218 * 4219 * WARNING! The (parent, chain) deletion may modify the parent 4220 * and invalidate the base pointer. 4221 * 4222 * WARNING! Parent must already be marked modified, so we 4223 * can assume that chain_delete always suceeds. 4224 * 4225 * WARNING! hammer2_chain_repchange() does not have to be 4226 * called (and doesn't work anyway because we are 4227 * only doing a partial shift). A recursion that is 4228 * in-progress can continue at the current parent 4229 * and will be able to properly find its next key. 4230 */ 4231 error = hammer2_chain_delete_obref(parent, chain, mtid, 0, 4232 &bsave); 4233 KKASSERT(error == 0); 4234 hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bsave); 4235 hammer2_chain_unlock(chain); 4236 hammer2_chain_drop(chain); 4237 KKASSERT(parent->refs > 0); 4238 chain = NULL; 4239 base = NULL; /* safety */ 4240 hammer2_spin_ex(&parent->core.spin); 4241 next_key_spinlocked: 4242 if (--maxloops == 0) 4243 panic("hammer2_chain_create_indirect: maxloops"); 4244 reason = 4; 4245 if (key_next == 0 || key_next > key_end) 4246 break; 4247 key_beg = key_next; 4248 /* loop */ 4249 } 4250 hammer2_spin_unex(&parent->core.spin); 4251 4252 /* 4253 * Insert the new indirect block into the parent now that we've 4254 * cleared out some entries in the parent. We calculated a good 4255 * insertion index in the loop above (ichain->index). 4256 * 4257 * We don't have to set UPDATE here because we mark ichain 4258 * modified down below (so the normal modified -> flush -> set-moved 4259 * sequence applies). 4260 * 4261 * The insertion shouldn't race as this is a completely new block 4262 * and the parent is locked. 4263 */ 4264 base = NULL; /* safety, parent modify may change address */ 4265 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 4266 KKASSERT(parent->core.live_count < count); 4267 hammer2_chain_insert(parent, ichain, 4268 HAMMER2_CHAIN_INSERT_SPIN | 4269 HAMMER2_CHAIN_INSERT_LIVE, 4270 0); 4271 4272 /* 4273 * Make sure flushes propogate after our manual insertion. 4274 */ 4275 hammer2_chain_setflush(ichain); 4276 hammer2_chain_setflush(parent); 4277 4278 /* 4279 * Figure out what to return. 4280 */ 4281 if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits)) { 4282 /* 4283 * Key being created is outside the key range, 4284 * return the original parent. 4285 */ 4286 hammer2_chain_unlock(ichain); 4287 hammer2_chain_drop(ichain); 4288 } else { 4289 /* 4290 * Otherwise its in the range, return the new parent. 4291 * (leave both the new and old parent locked). 4292 */ 4293 parent = ichain; 4294 } 4295 4296 return(parent); 4297 } 4298 4299 /* 4300 * Do maintenance on an indirect chain. Both parent and chain are locked. 4301 * 4302 * Returns non-zero if (chain) is deleted, either due to being empty or 4303 * because its children were safely moved into the parent. 4304 */ 4305 int 4306 hammer2_chain_indirect_maintenance(hammer2_chain_t *parent, 4307 hammer2_chain_t *chain) 4308 { 4309 hammer2_blockref_t *chain_base; 4310 hammer2_blockref_t *base; 4311 hammer2_blockref_t *bref; 4312 hammer2_blockref_t bsave; 4313 hammer2_key_t key_next; 4314 hammer2_key_t key_beg; 4315 hammer2_key_t key_end; 4316 hammer2_chain_t *sub; 4317 int chain_count; 4318 int count; 4319 int error; 4320 int generation; 4321 4322 /* 4323 * Make sure we have an accurate live_count 4324 */ 4325 if ((chain->flags & (HAMMER2_CHAIN_INITIAL | 4326 HAMMER2_CHAIN_COUNTEDBREFS)) == 0) { 4327 base = &chain->data->npdata[0]; 4328 count = chain->bytes / sizeof(hammer2_blockref_t); 4329 hammer2_chain_countbrefs(chain, base, count); 4330 } 4331 4332 /* 4333 * If the indirect block is empty we can delete it. 4334 * (ignore deletion error) 4335 */ 4336 if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) { 4337 hammer2_chain_delete(parent, chain, 4338 chain->bref.modify_tid, 4339 HAMMER2_DELETE_PERMANENT); 4340 hammer2_chain_repchange(parent, chain); 4341 return 1; 4342 } 4343 4344 base = hammer2_chain_base_and_count(parent, &count); 4345 4346 if ((parent->flags & (HAMMER2_CHAIN_INITIAL | 4347 HAMMER2_CHAIN_COUNTEDBREFS)) == 0) { 4348 hammer2_chain_countbrefs(parent, base, count); 4349 } 4350 4351 /* 4352 * Determine if we can collapse chain into parent, calculate 4353 * hysteresis for chain emptiness. 4354 */ 4355 if (parent->core.live_count + chain->core.live_count - 1 > count) 4356 return 0; 4357 chain_count = chain->bytes / sizeof(hammer2_blockref_t); 4358 if (chain->core.live_count > chain_count * 3 / 4) 4359 return 0; 4360 4361 /* 4362 * Ok, theoretically we can collapse chain's contents into 4363 * parent. chain is locked, but any in-memory children of chain 4364 * are not. For this to work, we must be able to dispose of any 4365 * in-memory children of chain. 4366 * 4367 * For now require that there are no in-memory children of chain. 4368 * 4369 * WARNING! Both chain and parent must remain locked across this 4370 * entire operation. 4371 */ 4372 4373 /* 4374 * Parent must be marked modified. Don't try to collapse it if we 4375 * can't mark it modified. Once modified, destroy chain to make room 4376 * and to get rid of what will be a conflicting key (this is included 4377 * in the calculation above). Finally, move the children of chain 4378 * into chain's parent. 4379 * 4380 * This order creates an accounting problem for bref.embed.stats 4381 * because we destroy chain before we remove its children. Any 4382 * elements whos blockref is already synchronized will be counted 4383 * twice. To deal with the problem we clean out chain's stats prior 4384 * to deleting it. 4385 */ 4386 error = hammer2_chain_modify(parent, 0, 0, 0); 4387 if (error) { 4388 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n", 4389 hammer2_error_str(error)); 4390 return 0; 4391 } 4392 error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0); 4393 if (error) { 4394 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n", 4395 hammer2_error_str(error)); 4396 return 0; 4397 } 4398 4399 chain->bref.embed.stats.inode_count = 0; 4400 chain->bref.embed.stats.data_count = 0; 4401 error = hammer2_chain_delete(parent, chain, 4402 chain->bref.modify_tid, 4403 HAMMER2_DELETE_PERMANENT); 4404 KKASSERT(error == 0); 4405 4406 /* 4407 * The combined_find call requires core.spin to be held. One would 4408 * think there wouldn't be any conflicts since we hold chain 4409 * exclusively locked, but the caching mechanism for 0-ref children 4410 * does not require a chain lock. 4411 */ 4412 hammer2_spin_ex(&chain->core.spin); 4413 4414 key_next = 0; 4415 key_beg = 0; 4416 key_end = HAMMER2_KEY_MAX; 4417 for (;;) { 4418 chain_base = &chain->data->npdata[0]; 4419 chain_count = chain->bytes / sizeof(hammer2_blockref_t); 4420 sub = hammer2_combined_find(chain, chain_base, chain_count, 4421 &key_next, 4422 key_beg, key_end, 4423 &bref); 4424 generation = chain->core.generation; 4425 if (bref == NULL) 4426 break; 4427 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4428 4429 bsave = *bref; 4430 if (sub) { 4431 hammer2_chain_ref(sub); 4432 hammer2_spin_unex(&chain->core.spin); 4433 hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER); 4434 } else { 4435 hammer2_spin_unex(&chain->core.spin); 4436 sub = hammer2_chain_get(chain, generation, &bsave, 4437 HAMMER2_RESOLVE_NEVER); 4438 if (sub == NULL) { 4439 hammer2_spin_ex(&chain->core.spin); 4440 continue; 4441 } 4442 } 4443 if (bcmp(&bsave, &sub->bref, sizeof(bsave)) || 4444 sub->parent != chain || 4445 (sub->flags & HAMMER2_CHAIN_DELETED)) { 4446 hammer2_chain_unlock(sub); 4447 hammer2_chain_drop(sub); 4448 hammer2_spin_ex(&chain->core.spin); 4449 sub = NULL; /* safety */ 4450 continue; 4451 } 4452 error = hammer2_chain_delete_obref(chain, sub, 4453 sub->bref.modify_tid, 0, 4454 &bsave); 4455 KKASSERT(error == 0); 4456 hammer2_chain_rename_obref(&parent, sub, 4457 sub->bref.modify_tid, 4458 HAMMER2_INSERT_SAMEPARENT, &bsave); 4459 hammer2_chain_unlock(sub); 4460 hammer2_chain_drop(sub); 4461 hammer2_spin_ex(&chain->core.spin); 4462 4463 if (key_next == 0) 4464 break; 4465 key_beg = key_next; 4466 } 4467 hammer2_spin_unex(&chain->core.spin); 4468 4469 hammer2_chain_repchange(parent, chain); 4470 4471 return 1; 4472 } 4473 4474 /* 4475 * Freemap indirect blocks 4476 * 4477 * Calculate the keybits and highside/lowside of the freemap node the 4478 * caller is creating. 4479 * 4480 * This routine will specify the next higher-level freemap key/radix 4481 * representing the lowest-ordered set. By doing so, eventually all 4482 * low-ordered sets will be moved one level down. 4483 * 4484 * We have to be careful here because the freemap reserves a limited 4485 * number of blocks for a limited number of levels. So we can't just 4486 * push indiscriminately. 4487 */ 4488 int 4489 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp, 4490 int keybits, hammer2_blockref_t *base, int count) 4491 { 4492 hammer2_chain_t *chain; 4493 hammer2_blockref_t *bref; 4494 hammer2_key_t key; 4495 hammer2_key_t key_beg; 4496 hammer2_key_t key_end; 4497 hammer2_key_t key_next; 4498 int maxloops = 300000; 4499 4500 key = *keyp; 4501 keybits = 64; 4502 4503 /* 4504 * Calculate the range of keys in the array being careful to skip 4505 * slots which are overridden with a deletion. 4506 */ 4507 key_beg = 0; 4508 key_end = HAMMER2_KEY_MAX; 4509 hammer2_spin_ex(&parent->core.spin); 4510 4511 for (;;) { 4512 if (--maxloops == 0) { 4513 panic("indkey_freemap shit %p %p:%d\n", 4514 parent, base, count); 4515 } 4516 chain = hammer2_combined_find(parent, base, count, 4517 &key_next, 4518 key_beg, key_end, 4519 &bref); 4520 4521 /* 4522 * Exhausted search 4523 */ 4524 if (bref == NULL) 4525 break; 4526 4527 /* 4528 * Skip deleted chains. 4529 */ 4530 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4531 if (key_next == 0 || key_next > key_end) 4532 break; 4533 key_beg = key_next; 4534 continue; 4535 } 4536 4537 /* 4538 * Use the full live (not deleted) element for the scan 4539 * iteration. HAMMER2 does not allow partial replacements. 4540 * 4541 * XXX should be built into hammer2_combined_find(). 4542 */ 4543 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4544 4545 if (keybits > bref->keybits) { 4546 key = bref->key; 4547 keybits = bref->keybits; 4548 } else if (keybits == bref->keybits && bref->key < key) { 4549 key = bref->key; 4550 } 4551 if (key_next == 0) 4552 break; 4553 key_beg = key_next; 4554 } 4555 hammer2_spin_unex(&parent->core.spin); 4556 4557 /* 4558 * Return the keybits for a higher-level FREEMAP_NODE covering 4559 * this node. 4560 */ 4561 switch(keybits) { 4562 case HAMMER2_FREEMAP_LEVEL0_RADIX: 4563 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX; 4564 break; 4565 case HAMMER2_FREEMAP_LEVEL1_RADIX: 4566 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX; 4567 break; 4568 case HAMMER2_FREEMAP_LEVEL2_RADIX: 4569 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX; 4570 break; 4571 case HAMMER2_FREEMAP_LEVEL3_RADIX: 4572 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX; 4573 break; 4574 case HAMMER2_FREEMAP_LEVEL4_RADIX: 4575 keybits = HAMMER2_FREEMAP_LEVEL5_RADIX; 4576 break; 4577 case HAMMER2_FREEMAP_LEVEL5_RADIX: 4578 panic("hammer2_chain_indkey_freemap: level too high"); 4579 break; 4580 default: 4581 panic("hammer2_chain_indkey_freemap: bad radix"); 4582 break; 4583 } 4584 *keyp = key; 4585 4586 return (keybits); 4587 } 4588 4589 /* 4590 * File indirect blocks 4591 * 4592 * Calculate the key/keybits for the indirect block to create by scanning 4593 * existing keys. The key being created is also passed in *keyp and can be 4594 * inside or outside the indirect block. Regardless, the indirect block 4595 * must hold at least two keys in order to guarantee sufficient space. 4596 * 4597 * We use a modified version of the freemap's fixed radix tree, but taylored 4598 * for file data. Basically we configure an indirect block encompassing the 4599 * smallest key. 4600 */ 4601 static int 4602 hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp, 4603 int keybits, hammer2_blockref_t *base, int count, 4604 int ncount) 4605 { 4606 hammer2_chain_t *chain; 4607 hammer2_blockref_t *bref; 4608 hammer2_key_t key; 4609 hammer2_key_t key_beg; 4610 hammer2_key_t key_end; 4611 hammer2_key_t key_next; 4612 int nradix; 4613 int maxloops = 300000; 4614 4615 key = *keyp; 4616 keybits = 64; 4617 4618 /* 4619 * Calculate the range of keys in the array being careful to skip 4620 * slots which are overridden with a deletion. 4621 * 4622 * Locate the smallest key. 4623 */ 4624 key_beg = 0; 4625 key_end = HAMMER2_KEY_MAX; 4626 hammer2_spin_ex(&parent->core.spin); 4627 4628 for (;;) { 4629 if (--maxloops == 0) { 4630 panic("indkey_freemap shit %p %p:%d\n", 4631 parent, base, count); 4632 } 4633 chain = hammer2_combined_find(parent, base, count, 4634 &key_next, 4635 key_beg, key_end, 4636 &bref); 4637 4638 /* 4639 * Exhausted search 4640 */ 4641 if (bref == NULL) 4642 break; 4643 4644 /* 4645 * Skip deleted chains. 4646 */ 4647 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4648 if (key_next == 0 || key_next > key_end) 4649 break; 4650 key_beg = key_next; 4651 continue; 4652 } 4653 4654 /* 4655 * Use the full live (not deleted) element for the scan 4656 * iteration. HAMMER2 does not allow partial replacements. 4657 * 4658 * XXX should be built into hammer2_combined_find(). 4659 */ 4660 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4661 4662 if (keybits > bref->keybits) { 4663 key = bref->key; 4664 keybits = bref->keybits; 4665 } else if (keybits == bref->keybits && bref->key < key) { 4666 key = bref->key; 4667 } 4668 if (key_next == 0) 4669 break; 4670 key_beg = key_next; 4671 } 4672 hammer2_spin_unex(&parent->core.spin); 4673 4674 /* 4675 * Calculate the static keybits for a higher-level indirect block 4676 * that contains the key. 4677 */ 4678 *keyp = key; 4679 4680 switch(ncount) { 4681 case HAMMER2_IND_COUNT_MIN: 4682 nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX; 4683 break; 4684 case HAMMER2_IND_COUNT_NOM: 4685 nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX; 4686 break; 4687 case HAMMER2_IND_COUNT_MAX: 4688 nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX; 4689 break; 4690 default: 4691 panic("bad ncount %d\n", ncount); 4692 nradix = 0; 4693 break; 4694 } 4695 4696 /* 4697 * The largest radix that can be returned for an indirect block is 4698 * 63 bits. (The largest practical indirect block radix is actually 4699 * 62 bits because the top-level inode or volume root contains four 4700 * entries, but allow 63 to be returned). 4701 */ 4702 if (nradix >= 64) 4703 nradix = 63; 4704 4705 return keybits + nradix; 4706 } 4707 4708 #if 1 4709 4710 /* 4711 * Directory indirect blocks. 4712 * 4713 * Covers both the inode index (directory of inodes), and directory contents 4714 * (filenames hardlinked to inodes). 4715 * 4716 * Because directory keys are hashed we generally try to cut the space in 4717 * half. We accomodate the inode index (which tends to have linearly 4718 * increasing inode numbers) by ensuring that the keyspace is at least large 4719 * enough to fill up the indirect block being created. 4720 */ 4721 static int 4722 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp, 4723 int keybits, hammer2_blockref_t *base, int count, 4724 int ncount) 4725 { 4726 hammer2_blockref_t *bref; 4727 hammer2_chain_t *chain; 4728 hammer2_key_t key_beg; 4729 hammer2_key_t key_end; 4730 hammer2_key_t key_next; 4731 hammer2_key_t key; 4732 int nkeybits; 4733 int locount; 4734 int hicount; 4735 int maxloops = 300000; 4736 4737 /* 4738 * NOTE: We can't take a shortcut here anymore for inodes because 4739 * the root directory can contain a mix of inodes and directory 4740 * entries (we used to just return 63 if parent->bref.type was 4741 * HAMMER2_BREF_TYPE_INODE. 4742 */ 4743 key = *keyp; 4744 locount = 0; 4745 hicount = 0; 4746 4747 /* 4748 * Calculate the range of keys in the array being careful to skip 4749 * slots which are overridden with a deletion. 4750 */ 4751 key_beg = 0; 4752 key_end = HAMMER2_KEY_MAX; 4753 hammer2_spin_ex(&parent->core.spin); 4754 4755 for (;;) { 4756 if (--maxloops == 0) { 4757 panic("indkey_freemap shit %p %p:%d\n", 4758 parent, base, count); 4759 } 4760 chain = hammer2_combined_find(parent, base, count, 4761 &key_next, 4762 key_beg, key_end, 4763 &bref); 4764 4765 /* 4766 * Exhausted search 4767 */ 4768 if (bref == NULL) 4769 break; 4770 4771 /* 4772 * Deleted object 4773 */ 4774 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4775 if (key_next == 0 || key_next > key_end) 4776 break; 4777 key_beg = key_next; 4778 continue; 4779 } 4780 4781 /* 4782 * Use the full live (not deleted) element for the scan 4783 * iteration. HAMMER2 does not allow partial replacements. 4784 * 4785 * XXX should be built into hammer2_combined_find(). 4786 */ 4787 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4788 4789 /* 4790 * Expand our calculated key range (key, keybits) to fit 4791 * the scanned key. nkeybits represents the full range 4792 * that we will later cut in half (two halves @ nkeybits - 1). 4793 */ 4794 nkeybits = keybits; 4795 if (nkeybits < bref->keybits) { 4796 if (bref->keybits > 64) { 4797 kprintf("bad bref chain %p bref %p\n", 4798 chain, bref); 4799 Debugger("fubar"); 4800 } 4801 nkeybits = bref->keybits; 4802 } 4803 while (nkeybits < 64 && 4804 rounddown2(key ^ bref->key, (hammer2_key_t)1 << nkeybits) != 0) { 4805 ++nkeybits; 4806 } 4807 4808 /* 4809 * If the new key range is larger we have to determine 4810 * which side of the new key range the existing keys fall 4811 * under by checking the high bit, then collapsing the 4812 * locount into the hicount or vise-versa. 4813 */ 4814 if (keybits != nkeybits) { 4815 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 4816 hicount += locount; 4817 locount = 0; 4818 } else { 4819 locount += hicount; 4820 hicount = 0; 4821 } 4822 keybits = nkeybits; 4823 } 4824 4825 /* 4826 * The newly scanned key will be in the lower half or the 4827 * upper half of the (new) key range. 4828 */ 4829 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 4830 ++hicount; 4831 else 4832 ++locount; 4833 4834 if (key_next == 0) 4835 break; 4836 key_beg = key_next; 4837 } 4838 hammer2_spin_unex(&parent->core.spin); 4839 bref = NULL; /* now invalid (safety) */ 4840 4841 /* 4842 * Adjust keybits to represent half of the full range calculated 4843 * above (radix 63 max) for our new indirect block. 4844 */ 4845 --keybits; 4846 4847 /* 4848 * Expand keybits to hold at least ncount elements. ncount will be 4849 * a power of 2. This is to try to completely fill leaf nodes (at 4850 * least for keys which are not hashes). 4851 * 4852 * We aren't counting 'in' or 'out', we are counting 'high side' 4853 * and 'low side' based on the bit at (1LL << keybits). We want 4854 * everything to be inside in these cases so shift it all to 4855 * the low or high side depending on the new high bit. 4856 */ 4857 while (((hammer2_key_t)1 << keybits) < ncount) { 4858 ++keybits; 4859 if (key & ((hammer2_key_t)1 << keybits)) { 4860 hicount += locount; 4861 locount = 0; 4862 } else { 4863 locount += hicount; 4864 hicount = 0; 4865 } 4866 } 4867 4868 if (hicount > locount) 4869 key |= (hammer2_key_t)1 << keybits; 4870 else 4871 key &= ~(hammer2_key_t)1 << keybits; 4872 4873 *keyp = key; 4874 4875 return (keybits); 4876 } 4877 4878 #else 4879 4880 /* 4881 * Directory indirect blocks. 4882 * 4883 * Covers both the inode index (directory of inodes), and directory contents 4884 * (filenames hardlinked to inodes). 4885 * 4886 * Because directory keys are hashed we generally try to cut the space in 4887 * half. We accomodate the inode index (which tends to have linearly 4888 * increasing inode numbers) by ensuring that the keyspace is at least large 4889 * enough to fill up the indirect block being created. 4890 */ 4891 static int 4892 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp, 4893 int keybits, hammer2_blockref_t *base, int count, 4894 int ncount) 4895 { 4896 hammer2_blockref_t *bref; 4897 hammer2_chain_t *chain; 4898 hammer2_key_t key_beg; 4899 hammer2_key_t key_end; 4900 hammer2_key_t key_next; 4901 hammer2_key_t key; 4902 int nkeybits; 4903 int locount; 4904 int hicount; 4905 int maxloops = 300000; 4906 4907 /* 4908 * Shortcut if the parent is the inode. In this situation the 4909 * parent has 4+1 directory entries and we are creating an indirect 4910 * block capable of holding many more. 4911 */ 4912 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) { 4913 return 63; 4914 } 4915 4916 key = *keyp; 4917 locount = 0; 4918 hicount = 0; 4919 4920 /* 4921 * Calculate the range of keys in the array being careful to skip 4922 * slots which are overridden with a deletion. 4923 */ 4924 key_beg = 0; 4925 key_end = HAMMER2_KEY_MAX; 4926 hammer2_spin_ex(&parent->core.spin); 4927 4928 for (;;) { 4929 if (--maxloops == 0) { 4930 panic("indkey_freemap shit %p %p:%d\n", 4931 parent, base, count); 4932 } 4933 chain = hammer2_combined_find(parent, base, count, 4934 &key_next, 4935 key_beg, key_end, 4936 &bref); 4937 4938 /* 4939 * Exhausted search 4940 */ 4941 if (bref == NULL) 4942 break; 4943 4944 /* 4945 * Deleted object 4946 */ 4947 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4948 if (key_next == 0 || key_next > key_end) 4949 break; 4950 key_beg = key_next; 4951 continue; 4952 } 4953 4954 /* 4955 * Use the full live (not deleted) element for the scan 4956 * iteration. HAMMER2 does not allow partial replacements. 4957 * 4958 * XXX should be built into hammer2_combined_find(). 4959 */ 4960 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4961 4962 /* 4963 * Expand our calculated key range (key, keybits) to fit 4964 * the scanned key. nkeybits represents the full range 4965 * that we will later cut in half (two halves @ nkeybits - 1). 4966 */ 4967 nkeybits = keybits; 4968 if (nkeybits < bref->keybits) { 4969 if (bref->keybits > 64) { 4970 kprintf("bad bref chain %p bref %p\n", 4971 chain, bref); 4972 Debugger("fubar"); 4973 } 4974 nkeybits = bref->keybits; 4975 } 4976 while (nkeybits < 64 && 4977 (~(((hammer2_key_t)1 << nkeybits) - 1) & 4978 (key ^ bref->key)) != 0) { 4979 ++nkeybits; 4980 } 4981 4982 /* 4983 * If the new key range is larger we have to determine 4984 * which side of the new key range the existing keys fall 4985 * under by checking the high bit, then collapsing the 4986 * locount into the hicount or vise-versa. 4987 */ 4988 if (keybits != nkeybits) { 4989 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 4990 hicount += locount; 4991 locount = 0; 4992 } else { 4993 locount += hicount; 4994 hicount = 0; 4995 } 4996 keybits = nkeybits; 4997 } 4998 4999 /* 5000 * The newly scanned key will be in the lower half or the 5001 * upper half of the (new) key range. 5002 */ 5003 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 5004 ++hicount; 5005 else 5006 ++locount; 5007 5008 if (key_next == 0) 5009 break; 5010 key_beg = key_next; 5011 } 5012 hammer2_spin_unex(&parent->core.spin); 5013 bref = NULL; /* now invalid (safety) */ 5014 5015 /* 5016 * Adjust keybits to represent half of the full range calculated 5017 * above (radix 63 max) for our new indirect block. 5018 */ 5019 --keybits; 5020 5021 /* 5022 * Expand keybits to hold at least ncount elements. ncount will be 5023 * a power of 2. This is to try to completely fill leaf nodes (at 5024 * least for keys which are not hashes). 5025 * 5026 * We aren't counting 'in' or 'out', we are counting 'high side' 5027 * and 'low side' based on the bit at (1LL << keybits). We want 5028 * everything to be inside in these cases so shift it all to 5029 * the low or high side depending on the new high bit. 5030 */ 5031 while (((hammer2_key_t)1 << keybits) < ncount) { 5032 ++keybits; 5033 if (key & ((hammer2_key_t)1 << keybits)) { 5034 hicount += locount; 5035 locount = 0; 5036 } else { 5037 locount += hicount; 5038 hicount = 0; 5039 } 5040 } 5041 5042 if (hicount > locount) 5043 key |= (hammer2_key_t)1 << keybits; 5044 else 5045 key &= ~(hammer2_key_t)1 << keybits; 5046 5047 *keyp = key; 5048 5049 return (keybits); 5050 } 5051 5052 #endif 5053 5054 /* 5055 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if 5056 * it exists. 5057 * 5058 * Both parent and chain must be locked exclusively. 5059 * 5060 * This function will modify the parent if the blockref requires removal 5061 * from the parent's block table. 5062 * 5063 * This function is NOT recursive. Any entity already pushed into the 5064 * chain (such as an inode) may still need visibility into its contents, 5065 * as well as the ability to read and modify the contents. For example, 5066 * for an unlinked file which is still open. 5067 * 5068 * Also note that the flusher is responsible for cleaning up empty 5069 * indirect blocks. 5070 */ 5071 int 5072 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain, 5073 hammer2_tid_t mtid, int flags) 5074 { 5075 int error = 0; 5076 5077 KKASSERT(hammer2_mtx_owned(&chain->lock)); 5078 5079 /* 5080 * Nothing to do if already marked. 5081 * 5082 * We need the spinlock on the core whos RBTREE contains chain 5083 * to protect against races. 5084 */ 5085 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 5086 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 5087 chain->parent == parent); 5088 error = _hammer2_chain_delete_helper(parent, chain, 5089 mtid, flags, NULL); 5090 } 5091 5092 /* 5093 * Permanent deletions mark the chain as destroyed. 5094 * 5095 * NOTE: We do not setflush the chain unless the deletion is 5096 * permanent, since the deletion of a chain does not actually 5097 * require it to be flushed. 5098 */ 5099 if (error == 0) { 5100 if (flags & HAMMER2_DELETE_PERMANENT) { 5101 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5102 hammer2_chain_setflush(chain); 5103 } 5104 } 5105 5106 return error; 5107 } 5108 5109 static int 5110 hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain, 5111 hammer2_tid_t mtid, int flags, 5112 hammer2_blockref_t *obref) 5113 { 5114 int error = 0; 5115 5116 KKASSERT(hammer2_mtx_owned(&chain->lock)); 5117 5118 /* 5119 * Nothing to do if already marked. 5120 * 5121 * We need the spinlock on the core whos RBTREE contains chain 5122 * to protect against races. 5123 */ 5124 obref->type = HAMMER2_BREF_TYPE_EMPTY; 5125 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 5126 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 5127 chain->parent == parent); 5128 error = _hammer2_chain_delete_helper(parent, chain, 5129 mtid, flags, obref); 5130 } 5131 5132 /* 5133 * Permanent deletions mark the chain as destroyed. 5134 * 5135 * NOTE: We do not setflush the chain unless the deletion is 5136 * permanent, since the deletion of a chain does not actually 5137 * require it to be flushed. 5138 */ 5139 if (error == 0) { 5140 if (flags & HAMMER2_DELETE_PERMANENT) { 5141 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5142 hammer2_chain_setflush(chain); 5143 } 5144 } 5145 5146 return error; 5147 } 5148 5149 /* 5150 * Returns the index of the nearest element in the blockref array >= elm. 5151 * Returns (count) if no element could be found. 5152 * 5153 * Sets *key_nextp to the next key for loop purposes but does not modify 5154 * it if the next key would be higher than the current value of *key_nextp. 5155 * Note that *key_nexp can overflow to 0, which should be tested by the 5156 * caller. 5157 * 5158 * WARNING! Must be called with parent's spinlock held. Spinlock remains 5159 * held through the operation. 5160 */ 5161 static int 5162 hammer2_base_find(hammer2_chain_t *parent, 5163 hammer2_blockref_t *base, int count, 5164 hammer2_key_t *key_nextp, 5165 hammer2_key_t key_beg, hammer2_key_t key_end) 5166 { 5167 hammer2_blockref_t *scan; 5168 hammer2_key_t scan_end; 5169 int i; 5170 int limit; 5171 5172 /* 5173 * Require the live chain's already have their core's counted 5174 * so we can optimize operations. 5175 */ 5176 KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS); 5177 5178 /* 5179 * Degenerate case 5180 */ 5181 if (count == 0 || base == NULL) 5182 return(count); 5183 5184 /* 5185 * Sequential optimization using parent->cache_index. This is 5186 * the most likely scenario. 5187 * 5188 * We can avoid trailing empty entries on live chains, otherwise 5189 * we might have to check the whole block array. 5190 */ 5191 i = parent->cache_index; /* SMP RACE OK */ 5192 cpu_ccfence(); 5193 limit = parent->core.live_zero; 5194 if (i >= limit) 5195 i = limit - 1; 5196 if (i < 0) 5197 i = 0; 5198 KKASSERT(i < count); 5199 5200 /* 5201 * Search backwards 5202 */ 5203 scan = &base[i]; 5204 while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY || 5205 scan->key > key_beg)) { 5206 --scan; 5207 --i; 5208 } 5209 parent->cache_index = i; 5210 5211 /* 5212 * Search forwards, stop when we find a scan element which 5213 * encloses the key or until we know that there are no further 5214 * elements. 5215 */ 5216 while (i < count) { 5217 if (scan->type != HAMMER2_BREF_TYPE_EMPTY) { 5218 scan_end = scan->key + 5219 ((hammer2_key_t)1 << scan->keybits) - 1; 5220 if (scan->key > key_beg || scan_end >= key_beg) 5221 break; 5222 } 5223 if (i >= limit) 5224 return (count); 5225 ++scan; 5226 ++i; 5227 } 5228 if (i != count) { 5229 parent->cache_index = i; 5230 if (i >= limit) { 5231 i = count; 5232 } else { 5233 scan_end = scan->key + 5234 ((hammer2_key_t)1 << scan->keybits); 5235 if (scan_end && (*key_nextp > scan_end || 5236 *key_nextp == 0)) { 5237 *key_nextp = scan_end; 5238 } 5239 } 5240 } 5241 return (i); 5242 } 5243 5244 /* 5245 * Do a combined search and return the next match either from the blockref 5246 * array or from the in-memory chain. Sets *brefp to the returned bref in 5247 * both cases, or sets it to NULL if the search exhausted. Only returns 5248 * a non-NULL chain if the search matched from the in-memory chain. 5249 * 5250 * When no in-memory chain has been found and a non-NULL bref is returned 5251 * in *brefp. 5252 * 5253 * 5254 * The returned chain is not locked or referenced. Use the returned bref 5255 * to determine if the search exhausted or not. Iterate if the base find 5256 * is chosen but matches a deleted chain. 5257 * 5258 * WARNING! Must be called with parent's spinlock held. Spinlock remains 5259 * held through the operation. 5260 */ 5261 static hammer2_chain_t * 5262 hammer2_combined_find(hammer2_chain_t *parent, 5263 hammer2_blockref_t *base, int count, 5264 hammer2_key_t *key_nextp, 5265 hammer2_key_t key_beg, hammer2_key_t key_end, 5266 hammer2_blockref_t **brefp) 5267 { 5268 hammer2_blockref_t *bref; 5269 hammer2_chain_t *chain; 5270 int i; 5271 5272 /* 5273 * Lookup in block array and in rbtree. 5274 */ 5275 *key_nextp = key_end + 1; 5276 i = hammer2_base_find(parent, base, count, key_nextp, 5277 key_beg, key_end); 5278 chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end); 5279 5280 /* 5281 * Neither matched 5282 */ 5283 if (i == count && chain == NULL) { 5284 *brefp = NULL; 5285 return(NULL); 5286 } 5287 5288 /* 5289 * Only chain matched. 5290 */ 5291 if (i == count) { 5292 bref = &chain->bref; 5293 goto found; 5294 } 5295 5296 /* 5297 * Only blockref matched. 5298 */ 5299 if (chain == NULL) { 5300 bref = &base[i]; 5301 goto found; 5302 } 5303 5304 /* 5305 * Both in-memory and blockref matched, select the nearer element. 5306 * 5307 * If both are flush with the left-hand side or both are the 5308 * same distance away, select the chain. In this situation the 5309 * chain must have been loaded from the matching blockmap. 5310 */ 5311 if ((chain->bref.key <= key_beg && base[i].key <= key_beg) || 5312 chain->bref.key == base[i].key) { 5313 KKASSERT(chain->bref.key == base[i].key); 5314 bref = &chain->bref; 5315 goto found; 5316 } 5317 5318 /* 5319 * Select the nearer key 5320 */ 5321 if (chain->bref.key < base[i].key) { 5322 bref = &chain->bref; 5323 } else { 5324 bref = &base[i]; 5325 chain = NULL; 5326 } 5327 5328 /* 5329 * If the bref is out of bounds we've exhausted our search. 5330 */ 5331 found: 5332 if (bref->key > key_end) { 5333 *brefp = NULL; 5334 chain = NULL; 5335 } else { 5336 *brefp = bref; 5337 } 5338 return(chain); 5339 } 5340 5341 /* 5342 * Locate the specified block array element and delete it. The element 5343 * must exist. 5344 * 5345 * The spin lock on the related chain must be held. 5346 * 5347 * NOTE: live_count was adjusted when the chain was deleted, so it does not 5348 * need to be adjusted when we commit the media change. 5349 */ 5350 void 5351 hammer2_base_delete(hammer2_chain_t *parent, 5352 hammer2_blockref_t *base, int count, 5353 hammer2_chain_t *chain, 5354 hammer2_blockref_t *obref) 5355 { 5356 hammer2_blockref_t *elm = &chain->bref; 5357 hammer2_blockref_t *scan; 5358 hammer2_key_t key_next; 5359 int i; 5360 5361 /* 5362 * Delete element. Expect the element to exist. 5363 * 5364 * XXX see caller, flush code not yet sophisticated enough to prevent 5365 * re-flushed in some cases. 5366 */ 5367 key_next = 0; /* max range */ 5368 i = hammer2_base_find(parent, base, count, &key_next, 5369 elm->key, elm->key); 5370 scan = &base[i]; 5371 if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY || 5372 scan->key != elm->key || 5373 ((chain->flags & HAMMER2_CHAIN_BLKMAPUPD) == 0 && 5374 scan->keybits != elm->keybits)) { 5375 hammer2_spin_unex(&parent->core.spin); 5376 panic("delete base %p element not found at %d/%d elm %p\n", 5377 base, i, count, elm); 5378 return; 5379 } 5380 5381 /* 5382 * Update stats and zero the entry. 5383 * 5384 * NOTE: Handle radix == 0 (0 bytes) case. 5385 */ 5386 if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) { 5387 parent->bref.embed.stats.data_count -= (hammer2_off_t)1 << 5388 (int)(scan->data_off & HAMMER2_OFF_MASK_RADIX); 5389 } 5390 switch(scan->type) { 5391 case HAMMER2_BREF_TYPE_INODE: 5392 --parent->bref.embed.stats.inode_count; 5393 /* fall through */ 5394 case HAMMER2_BREF_TYPE_DATA: 5395 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) { 5396 atomic_set_int(&chain->flags, 5397 HAMMER2_CHAIN_HINT_LEAF_COUNT); 5398 } else { 5399 if (parent->bref.leaf_count) 5400 --parent->bref.leaf_count; 5401 } 5402 /* fall through */ 5403 case HAMMER2_BREF_TYPE_INDIRECT: 5404 if (scan->type != HAMMER2_BREF_TYPE_DATA) { 5405 parent->bref.embed.stats.data_count -= 5406 scan->embed.stats.data_count; 5407 parent->bref.embed.stats.inode_count -= 5408 scan->embed.stats.inode_count; 5409 } 5410 if (scan->type == HAMMER2_BREF_TYPE_INODE) 5411 break; 5412 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) { 5413 atomic_set_int(&chain->flags, 5414 HAMMER2_CHAIN_HINT_LEAF_COUNT); 5415 } else { 5416 if (parent->bref.leaf_count <= scan->leaf_count) 5417 parent->bref.leaf_count = 0; 5418 else 5419 parent->bref.leaf_count -= scan->leaf_count; 5420 } 5421 break; 5422 case HAMMER2_BREF_TYPE_DIRENT: 5423 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) { 5424 atomic_set_int(&chain->flags, 5425 HAMMER2_CHAIN_HINT_LEAF_COUNT); 5426 } else { 5427 if (parent->bref.leaf_count) 5428 --parent->bref.leaf_count; 5429 } 5430 default: 5431 break; 5432 } 5433 5434 if (obref) 5435 *obref = *scan; 5436 bzero(scan, sizeof(*scan)); 5437 5438 /* 5439 * We can only optimize parent->core.live_zero for live chains. 5440 */ 5441 if (parent->core.live_zero == i + 1) { 5442 while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY) 5443 ; 5444 parent->core.live_zero = i + 1; 5445 } 5446 5447 /* 5448 * Clear appropriate blockmap flags in chain. 5449 */ 5450 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED | 5451 HAMMER2_CHAIN_BLKMAPUPD); 5452 } 5453 5454 /* 5455 * Insert the specified element. The block array must not already have the 5456 * element and must have space available for the insertion. 5457 * 5458 * The spin lock on the related chain must be held. 5459 * 5460 * NOTE: live_count was adjusted when the chain was deleted, so it does not 5461 * need to be adjusted when we commit the media change. 5462 */ 5463 void 5464 hammer2_base_insert(hammer2_chain_t *parent, 5465 hammer2_blockref_t *base, int count, 5466 hammer2_chain_t *chain, hammer2_blockref_t *elm) 5467 { 5468 hammer2_key_t key_next; 5469 hammer2_key_t xkey; 5470 int i; 5471 int j; 5472 int k; 5473 int l; 5474 int u = 1; 5475 5476 /* 5477 * Insert new element. Expect the element to not already exist 5478 * unless we are replacing it. 5479 * 5480 * XXX see caller, flush code not yet sophisticated enough to prevent 5481 * re-flushed in some cases. 5482 */ 5483 key_next = 0; /* max range */ 5484 i = hammer2_base_find(parent, base, count, &key_next, 5485 elm->key, elm->key); 5486 5487 /* 5488 * Shortcut fill optimization, typical ordered insertion(s) may not 5489 * require a search. 5490 */ 5491 KKASSERT(i >= 0 && i <= count); 5492 5493 /* 5494 * Set appropriate blockmap flags in chain (if not NULL) 5495 */ 5496 if (chain) 5497 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED); 5498 5499 /* 5500 * Update stats and zero the entry 5501 */ 5502 if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) { 5503 parent->bref.embed.stats.data_count += (hammer2_off_t)1 << 5504 (int)(elm->data_off & HAMMER2_OFF_MASK_RADIX); 5505 } 5506 switch(elm->type) { 5507 case HAMMER2_BREF_TYPE_INODE: 5508 ++parent->bref.embed.stats.inode_count; 5509 /* fall through */ 5510 case HAMMER2_BREF_TYPE_DATA: 5511 if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX) 5512 ++parent->bref.leaf_count; 5513 /* fall through */ 5514 case HAMMER2_BREF_TYPE_INDIRECT: 5515 if (elm->type != HAMMER2_BREF_TYPE_DATA) { 5516 parent->bref.embed.stats.data_count += 5517 elm->embed.stats.data_count; 5518 parent->bref.embed.stats.inode_count += 5519 elm->embed.stats.inode_count; 5520 } 5521 if (elm->type == HAMMER2_BREF_TYPE_INODE) 5522 break; 5523 if (parent->bref.leaf_count + elm->leaf_count < 5524 HAMMER2_BLOCKREF_LEAF_MAX) { 5525 parent->bref.leaf_count += elm->leaf_count; 5526 } else { 5527 parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX; 5528 } 5529 break; 5530 case HAMMER2_BREF_TYPE_DIRENT: 5531 if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX) 5532 ++parent->bref.leaf_count; 5533 break; 5534 default: 5535 break; 5536 } 5537 5538 5539 /* 5540 * We can only optimize parent->core.live_zero for live chains. 5541 */ 5542 if (i == count && parent->core.live_zero < count) { 5543 i = parent->core.live_zero++; 5544 base[i] = *elm; 5545 return; 5546 } 5547 5548 xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1; 5549 if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) { 5550 hammer2_spin_unex(&parent->core.spin); 5551 panic("insert base %p overlapping elements at %d elm %p\n", 5552 base, i, elm); 5553 } 5554 5555 /* 5556 * Try to find an empty slot before or after. 5557 */ 5558 j = i; 5559 k = i; 5560 while (j > 0 || k < count) { 5561 --j; 5562 if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) { 5563 if (j == i - 1) { 5564 base[j] = *elm; 5565 } else { 5566 bcopy(&base[j+1], &base[j], 5567 (i - j - 1) * sizeof(*base)); 5568 base[i - 1] = *elm; 5569 } 5570 goto validate; 5571 } 5572 ++k; 5573 if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) { 5574 bcopy(&base[i], &base[i+1], 5575 (k - i) * sizeof(hammer2_blockref_t)); 5576 base[i] = *elm; 5577 5578 /* 5579 * We can only update parent->core.live_zero for live 5580 * chains. 5581 */ 5582 if (parent->core.live_zero <= k) 5583 parent->core.live_zero = k + 1; 5584 u = 2; 5585 goto validate; 5586 } 5587 } 5588 panic("hammer2_base_insert: no room!"); 5589 5590 /* 5591 * Debugging 5592 */ 5593 validate: 5594 key_next = 0; 5595 for (l = 0; l < count; ++l) { 5596 if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) { 5597 key_next = base[l].key + 5598 ((hammer2_key_t)1 << base[l].keybits) - 1; 5599 break; 5600 } 5601 } 5602 while (++l < count) { 5603 if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) { 5604 if (base[l].key <= key_next) 5605 panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l); 5606 key_next = base[l].key + 5607 ((hammer2_key_t)1 << base[l].keybits) - 1; 5608 5609 } 5610 } 5611 5612 } 5613 5614 #if 0 5615 5616 /* 5617 * Sort the blockref array for the chain. Used by the flush code to 5618 * sort the blockref[] array. 5619 * 5620 * The chain must be exclusively locked AND spin-locked. 5621 */ 5622 typedef hammer2_blockref_t *hammer2_blockref_p; 5623 5624 static 5625 int 5626 hammer2_base_sort_callback(const void *v1, const void *v2) 5627 { 5628 hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1; 5629 hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2; 5630 5631 /* 5632 * Make sure empty elements are placed at the end of the array 5633 */ 5634 if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) { 5635 if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) 5636 return(0); 5637 return(1); 5638 } else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) { 5639 return(-1); 5640 } 5641 5642 /* 5643 * Sort by key 5644 */ 5645 if (bref1->key < bref2->key) 5646 return(-1); 5647 if (bref1->key > bref2->key) 5648 return(1); 5649 return(0); 5650 } 5651 5652 void 5653 hammer2_base_sort(hammer2_chain_t *chain) 5654 { 5655 hammer2_blockref_t *base; 5656 int count; 5657 5658 switch(chain->bref.type) { 5659 case HAMMER2_BREF_TYPE_INODE: 5660 /* 5661 * Special shortcut for embedded data returns the inode 5662 * itself. Callers must detect this condition and access 5663 * the embedded data (the strategy code does this for us). 5664 * 5665 * This is only applicable to regular files and softlinks. 5666 */ 5667 if (chain->data->ipdata.meta.op_flags & 5668 HAMMER2_OPFLAG_DIRECTDATA) { 5669 return; 5670 } 5671 base = &chain->data->ipdata.u.blockset.blockref[0]; 5672 count = HAMMER2_SET_COUNT; 5673 break; 5674 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 5675 case HAMMER2_BREF_TYPE_INDIRECT: 5676 /* 5677 * Optimize indirect blocks in the INITIAL state to avoid 5678 * I/O. 5679 */ 5680 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0); 5681 base = &chain->data->npdata[0]; 5682 count = chain->bytes / sizeof(hammer2_blockref_t); 5683 break; 5684 case HAMMER2_BREF_TYPE_VOLUME: 5685 base = &chain->data->voldata.sroot_blockset.blockref[0]; 5686 count = HAMMER2_SET_COUNT; 5687 break; 5688 case HAMMER2_BREF_TYPE_FREEMAP: 5689 base = &chain->data->blkset.blockref[0]; 5690 count = HAMMER2_SET_COUNT; 5691 break; 5692 default: 5693 panic("hammer2_base_sort: unrecognized " 5694 "blockref(A) type: %d", 5695 chain->bref.type); 5696 base = NULL; /* safety */ 5697 count = 0; /* safety */ 5698 break; 5699 } 5700 kqsort(base, count, sizeof(*base), hammer2_base_sort_callback); 5701 } 5702 5703 #endif 5704 5705 /* 5706 * Set the check data for a chain. This can be a heavy-weight operation 5707 * and typically only runs on-flush. For file data check data is calculated 5708 * when the logical buffers are flushed. 5709 */ 5710 void 5711 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata) 5712 { 5713 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED); 5714 5715 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5716 case HAMMER2_CHECK_NONE: 5717 break; 5718 case HAMMER2_CHECK_DISABLED: 5719 break; 5720 case HAMMER2_CHECK_ISCSI32: 5721 chain->bref.check.iscsi32.value = 5722 hammer2_icrc32(bdata, chain->bytes); 5723 break; 5724 case HAMMER2_CHECK_XXHASH64: 5725 chain->bref.check.xxhash64.value = 5726 XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5727 break; 5728 case HAMMER2_CHECK_SHA192: 5729 { 5730 SHA256_CTX hash_ctx; 5731 union { 5732 uint8_t digest[SHA256_DIGEST_LENGTH]; 5733 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5734 } u; 5735 5736 SHA256_Init(&hash_ctx); 5737 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5738 SHA256_Final(u.digest, &hash_ctx); 5739 u.digest64[2] ^= u.digest64[3]; 5740 bcopy(u.digest, 5741 chain->bref.check.sha192.data, 5742 sizeof(chain->bref.check.sha192.data)); 5743 } 5744 break; 5745 case HAMMER2_CHECK_FREEMAP: 5746 chain->bref.check.freemap.icrc32 = 5747 hammer2_icrc32(bdata, chain->bytes); 5748 break; 5749 default: 5750 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 5751 chain->bref.methods); 5752 break; 5753 } 5754 } 5755 5756 /* 5757 * Characterize a failed check code and try to trace back to the inode. 5758 */ 5759 static void 5760 hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check, 5761 int bits) 5762 { 5763 hammer2_chain_t *lchain; 5764 hammer2_chain_t *ochain; 5765 int did; 5766 5767 did = krateprintf(&krate_h2chk, 5768 "chain %016jx.%02x (%s) meth=%02x CHECK FAIL " 5769 "(flags=%08x, bref/data ", 5770 chain->bref.data_off, 5771 chain->bref.type, 5772 hammer2_bref_type_str(chain->bref.type), 5773 chain->bref.methods, 5774 chain->flags); 5775 if (did == 0) 5776 return; 5777 5778 if (bits == 32) { 5779 kprintf("%08x/%08x)\n", 5780 chain->bref.check.iscsi32.value, 5781 (uint32_t)check); 5782 } else { 5783 kprintf("%016jx/%016jx)\n", 5784 chain->bref.check.xxhash64.value, 5785 check); 5786 } 5787 5788 /* 5789 * Run up the chains to try to find the governing inode so we 5790 * can report it. 5791 * 5792 * XXX This error reporting is not really MPSAFE 5793 */ 5794 ochain = chain; 5795 lchain = chain; 5796 while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) { 5797 lchain = chain; 5798 chain = chain->parent; 5799 } 5800 5801 if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE && 5802 ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 || 5803 (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) { 5804 kprintf(" Resides at/in inode %ld\n", 5805 chain->bref.key); 5806 } else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 5807 kprintf(" Resides in inode index - CRITICAL!!!\n"); 5808 } else { 5809 kprintf(" Resides in root index - CRITICAL!!!\n"); 5810 } 5811 if (ochain->hmp) { 5812 const char *pfsname = "UNKNOWN"; 5813 int i; 5814 5815 if (ochain->pmp) { 5816 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 5817 if (ochain->pmp->pfs_hmps[i] == ochain->hmp && 5818 ochain->pmp->pfs_names[i]) { 5819 pfsname = ochain->pmp->pfs_names[i]; 5820 break; 5821 } 5822 } 5823 } 5824 kprintf(" In pfs %s on device %s\n", 5825 pfsname, ochain->hmp->devrepname); 5826 } 5827 } 5828 5829 /* 5830 * Returns non-zero on success, 0 on failure. 5831 */ 5832 int 5833 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 5834 { 5835 uint32_t check32; 5836 uint64_t check64; 5837 int r; 5838 5839 if (chain->flags & HAMMER2_CHAIN_NOTTESTED) 5840 return 1; 5841 5842 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5843 case HAMMER2_CHECK_NONE: 5844 r = 1; 5845 break; 5846 case HAMMER2_CHECK_DISABLED: 5847 r = 1; 5848 break; 5849 case HAMMER2_CHECK_ISCSI32: 5850 check32 = hammer2_icrc32(bdata, chain->bytes); 5851 r = (chain->bref.check.iscsi32.value == check32); 5852 if (r == 0) { 5853 hammer2_characterize_failed_chain(chain, check32, 32); 5854 } 5855 hammer2_process_icrc32 += chain->bytes; 5856 break; 5857 case HAMMER2_CHECK_XXHASH64: 5858 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5859 r = (chain->bref.check.xxhash64.value == check64); 5860 if (r == 0) { 5861 hammer2_characterize_failed_chain(chain, check64, 64); 5862 } 5863 hammer2_process_xxhash64 += chain->bytes; 5864 break; 5865 case HAMMER2_CHECK_SHA192: 5866 { 5867 SHA256_CTX hash_ctx; 5868 union { 5869 uint8_t digest[SHA256_DIGEST_LENGTH]; 5870 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5871 } u; 5872 5873 SHA256_Init(&hash_ctx); 5874 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5875 SHA256_Final(u.digest, &hash_ctx); 5876 u.digest64[2] ^= u.digest64[3]; 5877 if (bcmp(u.digest, 5878 chain->bref.check.sha192.data, 5879 sizeof(chain->bref.check.sha192.data)) == 0) { 5880 r = 1; 5881 } else { 5882 r = 0; 5883 krateprintf(&krate_h2chk, 5884 "chain %016jx.%02x meth=%02x " 5885 "CHECK FAIL\n", 5886 chain->bref.data_off, 5887 chain->bref.type, 5888 chain->bref.methods); 5889 } 5890 } 5891 break; 5892 case HAMMER2_CHECK_FREEMAP: 5893 r = (chain->bref.check.freemap.icrc32 == 5894 hammer2_icrc32(bdata, chain->bytes)); 5895 if (r == 0) { 5896 int did; 5897 5898 did = krateprintf(&krate_h2chk, 5899 "chain %016jx.%02x meth=%02x " 5900 "CHECK FAIL\n", 5901 chain->bref.data_off, 5902 chain->bref.type, 5903 chain->bref.methods); 5904 if (did) { 5905 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 5906 chain->bref.check.freemap.icrc32, 5907 hammer2_icrc32(bdata, chain->bytes), 5908 chain->bytes); 5909 if (chain->dio) { 5910 kprintf("dio %p buf %016jx,%d " 5911 "bdata %p/%p\n", 5912 chain->dio, 5913 chain->dio->bp->b_loffset, 5914 chain->dio->bp->b_bufsize, 5915 bdata, 5916 chain->dio->bp->b_data); 5917 } 5918 } 5919 } 5920 break; 5921 default: 5922 kprintf("hammer2_chain_testcheck: unknown check type %02x\n", 5923 chain->bref.methods); 5924 r = 1; 5925 break; 5926 } 5927 return r; 5928 } 5929 5930 /* 5931 * Acquire the chain and parent representing the specified inode for the 5932 * device at the specified cluster index. 5933 * 5934 * The flags passed in are LOOKUP flags, not RESOLVE flags. 5935 * 5936 * If we are unable to locate the inode, HAMMER2_ERROR_EIO or HAMMER2_ERROR_CHECK 5937 * is returned. In case of error, *chainp and/or *parentp may still be returned 5938 * non-NULL. 5939 * 5940 * The caller may pass-in a locked *parentp and/or *chainp, or neither. 5941 * They will be unlocked and released by this function. The *parentp and 5942 * *chainp representing the located inode are returned locked. 5943 * 5944 * The returned error includes any error on the returned chain in addition to 5945 * errors incurred while trying to lookup the inode. However, a chain->error 5946 * might not be recognized if HAMMER2_LOOKUP_NODATA is passed. This flag may 5947 * not be passed to this function. 5948 */ 5949 int 5950 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 5951 int clindex, int flags, 5952 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 5953 { 5954 hammer2_chain_t *parent; 5955 hammer2_chain_t *rchain; 5956 hammer2_key_t key_dummy; 5957 hammer2_inode_t *ip; 5958 int resolve_flags; 5959 int error; 5960 5961 KKASSERT((flags & HAMMER2_LOOKUP_NODATA) == 0); 5962 5963 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 5964 HAMMER2_RESOLVE_SHARED : 0; 5965 5966 /* 5967 * Caller expects us to replace these. 5968 */ 5969 if (*chainp) { 5970 hammer2_chain_unlock(*chainp); 5971 hammer2_chain_drop(*chainp); 5972 *chainp = NULL; 5973 } 5974 if (*parentp) { 5975 hammer2_chain_unlock(*parentp); 5976 hammer2_chain_drop(*parentp); 5977 *parentp = NULL; 5978 } 5979 5980 /* 5981 * Be very careful, this is a backend function and we CANNOT 5982 * lock any frontend inode structure we find. But we have to 5983 * look the inode up this way first in case it exists but is 5984 * detached from the radix tree. 5985 */ 5986 ip = hammer2_inode_lookup(pmp, inum); 5987 if (ip) { 5988 *chainp = hammer2_inode_chain_and_parent(ip, clindex, 5989 parentp, 5990 resolve_flags); 5991 hammer2_inode_drop(ip); 5992 if (*chainp) 5993 return (*chainp)->error; 5994 hammer2_chain_unlock(*chainp); 5995 hammer2_chain_drop(*chainp); 5996 *chainp = NULL; 5997 if (*parentp) { 5998 hammer2_chain_unlock(*parentp); 5999 hammer2_chain_drop(*parentp); 6000 *parentp = NULL; 6001 } 6002 } 6003 6004 /* 6005 * Inodes hang off of the iroot (bit 63 is clear, differentiating 6006 * inodes from root directory entries in the key lookup). 6007 */ 6008 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 6009 rchain = NULL; 6010 if (parent) { 6011 /* 6012 * NOTE: rchain can be returned as NULL even if error == 0 6013 * (i.e. not found) 6014 */ 6015 rchain = hammer2_chain_lookup(&parent, &key_dummy, 6016 inum, inum, 6017 &error, flags); 6018 /* 6019 * Propagate a chain-specific error to caller. 6020 * 6021 * If the chain is not errored, we must still validate that the inode 6022 * number is correct, because all hell will break loose if it isn't 6023 * correct. It should always be correct so print to the console and 6024 * simulate a CHECK error if it is not. 6025 */ 6026 if (error == 0 && rchain) { 6027 error = rchain->error; 6028 if (error == 0 && rchain->data) { 6029 if (inum != rchain->data->ipdata.meta.inum) { 6030 kprintf("hammer2_chain_inode_find: lookup inum %ld, " 6031 "got valid inode but with inum %ld\n", 6032 inum, rchain->data->ipdata.meta.inum); 6033 error = HAMMER2_ERROR_CHECK; 6034 rchain->error = error; 6035 } 6036 } 6037 } 6038 } else { 6039 error = HAMMER2_ERROR_EIO; 6040 } 6041 *parentp = parent; 6042 *chainp = rchain; 6043 6044 return error; 6045 } 6046 6047 /* 6048 * Used by the bulkscan code to snapshot the synchronized storage for 6049 * a volume, allowing it to be scanned concurrently against normal 6050 * operation. 6051 */ 6052 hammer2_chain_t * 6053 hammer2_chain_bulksnap(hammer2_dev_t *hmp) 6054 { 6055 hammer2_chain_t *copy; 6056 6057 copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref); 6058 copy->data = kmalloc(sizeof(copy->data->voldata), 6059 hmp->mmsg, M_WAITOK | M_ZERO); 6060 hammer2_voldata_lock(hmp); 6061 copy->data->voldata = hmp->volsync; 6062 hammer2_voldata_unlock(hmp); 6063 6064 return copy; 6065 } 6066 6067 void 6068 hammer2_chain_bulkdrop(hammer2_chain_t *copy) 6069 { 6070 KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME); 6071 KKASSERT(copy->data); 6072 kfree(copy->data, copy->hmp->mmsg); 6073 copy->data = NULL; 6074 hammer2_chain_drop(copy); 6075 } 6076 6077 /* 6078 * Returns non-zero if the chain (INODE or DIRENT) matches the 6079 * filename. 6080 */ 6081 int 6082 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name, 6083 size_t name_len) 6084 { 6085 const hammer2_inode_data_t *ripdata; 6086 6087 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 6088 ripdata = &chain->data->ipdata; 6089 if (ripdata->meta.name_len == name_len && 6090 bcmp(ripdata->filename, name, name_len) == 0) { 6091 return 1; 6092 } 6093 } 6094 if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT && 6095 chain->bref.embed.dirent.namlen == name_len) { 6096 if (name_len > sizeof(chain->bref.check.buf) && 6097 bcmp(chain->data->buf, name, name_len) == 0) { 6098 return 1; 6099 } 6100 if (name_len <= sizeof(chain->bref.check.buf) && 6101 bcmp(chain->bref.check.buf, name, name_len) == 0) { 6102 return 1; 6103 } 6104 } 6105 return 0; 6106 } 6107