1 /* 2 * Copyright (c) 2011-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 
 *
 * Chains are the in-memory version of media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * Chains are no-longer delete-duplicated.  Instead, the original in-memory
 * chain will be moved along with its block reference (e.g. for things like
 * renames, hardlink operations, modifications, etc), and will be indexed
 * on a secondary list for flush handling instead of propagating a flag
 * upward to the root.
 *
 * Concurrent front-end operations can still run against backend flushes
 * as long as they do not cross the current flush boundary.  An operation
 * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while an operation running below the current
 * flush can become part of the next flush.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/buf.h>

#include <crypto/sha2/sha2.h>

#include "hammer2.h"

static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_chain_t *parent,
		hammer2_key_t key, int keybits,
		hammer2_tid_t mtid, int for_type, int *errorp);
static int hammer2_chain_delete_obref(hammer2_chain_t *parent,
		hammer2_chain_t *chain,
		hammer2_tid_t mtid, int flags,
		hammer2_blockref_t *obref);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **brefp);
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
		int depth);
static void hammer2_chain_lru_flush(hammer2_pfs_t *pmp);

/*
 * There are many degenerate situations where an extreme rate of console
 * output can occur from warnings and errors.  Make sure this output does
 * not impede operations.
88 */ 89 static struct krate krate_h2chk = { .freq = 5 }; 90 static struct krate krate_h2me = { .freq = 1 }; 91 static struct krate krate_h2em = { .freq = 1 }; 92 93 /* 94 * Basic RBTree for chains (core.rbtree). 95 */ 96 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 97 98 int 99 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 100 { 101 hammer2_key_t c1_beg; 102 hammer2_key_t c1_end; 103 hammer2_key_t c2_beg; 104 hammer2_key_t c2_end; 105 106 /* 107 * Compare chains. Overlaps are not supposed to happen and catch 108 * any software issues early we count overlaps as a match. 109 */ 110 c1_beg = chain1->bref.key; 111 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1; 112 c2_beg = chain2->bref.key; 113 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1; 114 115 if (c1_end < c2_beg) /* fully to the left */ 116 return(-1); 117 if (c1_beg > c2_end) /* fully to the right */ 118 return(1); 119 return(0); /* overlap (must not cross edge boundary) */ 120 } 121 122 /* 123 * Assert that a chain has no media data associated with it. 124 */ 125 static __inline void 126 hammer2_chain_assert_no_data(hammer2_chain_t *chain) 127 { 128 KKASSERT(chain->dio == NULL); 129 if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME && 130 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP && 131 chain->data) { 132 panic("hammer2_chain_assert_no_data: chain %p still has data", 133 chain); 134 } 135 } 136 137 /* 138 * Make a chain visible to the flusher. The flusher operates using a top-down 139 * recursion based on the ONFLUSH flag. It locates MODIFIED and UPDATE chains, 140 * flushes them, and updates blocks back to the volume root. 141 * 142 * This routine sets the ONFLUSH flag upward from the triggering chain until 143 * it hits an inode root or the volume root. Inode chains serve as inflection 144 * points, requiring the flusher to bridge across trees. 
 * Inodes include regular inodes, PFS roots (pmp->iroot), and the media
 * super root (spmp->iroot).
 */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			/* stop at inode inflection points */
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				break;
			if ((parent = chain->parent) == NULL)
				break;
			/*
			 * Lock coupling: take the parent's shared spinlock
			 * before releasing the child's while walking upward.
			 */
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->pmp inherits pmp unless the chain is an inode (other than the
 * super-root inode).
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
		    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes;

	/*
	 * Special case - radix of 0 indicates a chain that does not
	 * need a data reference (context is completely embedded in the
	 * bref).
	 */
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		bytes = 0;

	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_FREEMAP:
	case HAMMER2_BREF_TYPE_VOLUME:
		chain = kmalloc_obj(sizeof(*chain), hmp->mchain,
				    M_WAITOK | M_ZERO);
		atomic_add_long(&hammer2_chain_allocs, 1);
		break;
	case HAMMER2_BREF_TYPE_EMPTY:
	default:
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
		break;
	}

	/*
	 * Initialize the new chain structure.  pmp must be set to NULL for
	 * chains belonging to the super-root topology of a device mount.
	 */
	if (pmp == hmp->spmp)
		chain->pmp = NULL;
	else
		chain->pmp = pmp;

	chain->hmp = hmp;
	chain->bref = *bref;
	chain->bytes = bytes;
	chain->refs = 1;		/* returned referenced */
	chain->flags = HAMMER2_CHAIN_ALLOCATED;
	lockinit(&chain->diolk, "chdio", 0, 0);

	/*
	 * Set the PFS boundary flag if this chain represents a PFS root.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
	hammer2_chain_init(chain);

	return (chain);
}

/*
 * A common function to initialize chains including fchain and vchain.
 */
void
hammer2_chain_init(hammer2_chain_t *chain)
{
	RB_INIT(&chain->core.rbtree);	/* live chains */
	hammer2_mtx_init(&chain->lock, "h2chain");
	hammer2_spin_init(&chain->core.spin, "h2chain");
}

/*
 * Add a reference to a chain element, preventing its destruction.
255 * 256 * (can be called with spinlock held) 257 */ 258 void 259 hammer2_chain_ref(hammer2_chain_t *chain) 260 { 261 if (atomic_fetchadd_int(&chain->refs, 1) == 0) { 262 /* 263 * Just flag that the chain was used and should be recycled 264 * on the LRU if it encounters it later. 265 */ 266 if (chain->flags & HAMMER2_CHAIN_ONLRU) 267 atomic_set_int(&chain->flags, HAMMER2_CHAIN_LRUHINT); 268 269 #if 0 270 /* 271 * REMOVED - reduces contention, lru_list is more heuristical 272 * now. 273 * 274 * 0->non-zero transition must ensure that chain is removed 275 * from the LRU list. 276 * 277 * NOTE: Already holding lru_spin here so we cannot call 278 * hammer2_chain_ref() to get it off lru_list, do 279 * it manually. 280 */ 281 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 282 hammer2_pfs_t *pmp = chain->pmp; 283 hammer2_spin_ex(&pmp->lru_spin); 284 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 285 atomic_add_int(&pmp->lru_count, -1); 286 atomic_clear_int(&chain->flags, 287 HAMMER2_CHAIN_ONLRU); 288 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 289 } 290 hammer2_spin_unex(&pmp->lru_spin); 291 } 292 #endif 293 } 294 } 295 296 /* 297 * Ref a locked chain and force the data to be held across an unlock. 298 * Chain must be currently locked. The user of the chain who desires 299 * to release the hold must call hammer2_chain_lock_unhold() to relock 300 * and unhold the chain, then unlock normally, or may simply call 301 * hammer2_chain_drop_unhold() (which is safer against deadlocks). 302 */ 303 void 304 hammer2_chain_ref_hold(hammer2_chain_t *chain) 305 { 306 atomic_add_int(&chain->lockcnt, 1); 307 hammer2_chain_ref(chain); 308 } 309 310 /* 311 * Insert the chain in the core rbtree. 312 * 313 * Normal insertions are placed in the live rbtree. Insertion of a deleted 314 * chain is a special case used by the flush code that is placed on the 315 * unstaged deleted list to avoid confusing the live view. 
316 */ 317 #define HAMMER2_CHAIN_INSERT_SPIN 0x0001 318 #define HAMMER2_CHAIN_INSERT_LIVE 0x0002 319 #define HAMMER2_CHAIN_INSERT_RACE 0x0004 320 321 static 322 int 323 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain, 324 int flags, int generation) 325 { 326 hammer2_chain_t *xchain __debugvar; 327 int error = 0; 328 329 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 330 hammer2_spin_ex(&parent->core.spin); 331 332 /* 333 * Interlocked by spinlock, check for race 334 */ 335 if ((flags & HAMMER2_CHAIN_INSERT_RACE) && 336 parent->core.generation != generation) { 337 error = HAMMER2_ERROR_EAGAIN; 338 goto failed; 339 } 340 341 /* 342 * Insert chain 343 */ 344 xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain); 345 KASSERT(xchain == NULL, 346 ("hammer2_chain_insert: collision %p %p (key=%016jx)", 347 chain, xchain, chain->bref.key)); 348 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 349 chain->parent = parent; 350 ++parent->core.chain_count; 351 ++parent->core.generation; /* XXX incs for _get() too, XXX */ 352 353 /* 354 * We have to keep track of the effective live-view blockref count 355 * so the create code knows when to push an indirect block. 356 */ 357 if (flags & HAMMER2_CHAIN_INSERT_LIVE) 358 atomic_add_int(&parent->core.live_count, 1); 359 failed: 360 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 361 hammer2_spin_unex(&parent->core.spin); 362 return error; 363 } 364 365 /* 366 * Drop the caller's reference to the chain. When the ref count drops to 367 * zero this function will try to disassociate the chain from its parent and 368 * deallocate it, then recursely drop the parent using the implied ref 369 * from the chain's chain->parent. 370 * 371 * Nobody should own chain's mutex on the 1->0 transition, unless this drop 372 * races an acquisition by another cpu. Therefore we can loop if we are 373 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again 374 * race against another drop. 
375 */ 376 void 377 hammer2_chain_drop(hammer2_chain_t *chain) 378 { 379 u_int refs; 380 381 KKASSERT(chain->refs > 0); 382 383 while (chain) { 384 refs = chain->refs; 385 cpu_ccfence(); 386 KKASSERT(refs > 0); 387 388 if (refs == 1) { 389 if (hammer2_mtx_ex_try(&chain->lock) == 0) 390 chain = hammer2_chain_lastdrop(chain, 0); 391 /* retry the same chain, or chain from lastdrop */ 392 } else { 393 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 394 break; 395 /* retry the same chain */ 396 } 397 cpu_pause(); 398 } 399 } 400 401 /* 402 * Unhold a held and probably not-locked chain, ensure that the data is 403 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive 404 * lock and then simply unlocking the chain. 405 */ 406 void 407 hammer2_chain_unhold(hammer2_chain_t *chain) 408 { 409 u_int lockcnt; 410 int iter = 0; 411 412 for (;;) { 413 lockcnt = chain->lockcnt; 414 cpu_ccfence(); 415 if (lockcnt > 1) { 416 if (atomic_cmpset_int(&chain->lockcnt, 417 lockcnt, lockcnt - 1)) { 418 break; 419 } 420 } else if (hammer2_mtx_ex_try(&chain->lock) == 0) { 421 hammer2_chain_unlock(chain); 422 break; 423 } else { 424 /* 425 * This situation can easily occur on SMP due to 426 * the gap inbetween the 1->0 transition and the 427 * final unlock. We cannot safely block on the 428 * mutex because lockcnt might go above 1. 429 * 430 * XXX Sleep for one tick if it takes too long. 
431 */ 432 if (++iter > 1000) { 433 if (iter > 1000 + hz) { 434 kprintf("hammer2: h2race1 %p\n", chain); 435 iter = 1000; 436 } 437 tsleep(&iter, 0, "h2race1", 1); 438 } 439 cpu_pause(); 440 } 441 } 442 } 443 444 void 445 hammer2_chain_drop_unhold(hammer2_chain_t *chain) 446 { 447 hammer2_chain_unhold(chain); 448 hammer2_chain_drop(chain); 449 } 450 451 void 452 hammer2_chain_rehold(hammer2_chain_t *chain) 453 { 454 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED); 455 atomic_add_int(&chain->lockcnt, 1); 456 hammer2_chain_unlock(chain); 457 } 458 459 /* 460 * Handles the (potential) last drop of chain->refs from 1->0. Called with 461 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are 462 * possible against refs and lockcnt. We must dispose of the mutex on chain. 463 * 464 * This function returns an unlocked chain for recursive drop or NULL. It 465 * can return the same chain if it determines it has raced another ref. 466 * 467 * -- 468 * 469 * When two chains need to be recursively dropped we use the chain we 470 * would otherwise free to placehold the additional chain. It's a bit 471 * convoluted but we can't just recurse without potentially blowing out 472 * the kernel stack. 473 * 474 * The chain cannot be freed if it has any children. 475 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it. 476 * The chain cannot be freed if flagged UPDATE unless we can dispose of it. 477 * Any dedup registration can remain intact. 478 * 479 * The core spinlock is allowed to nest child-to-parent (not parent-to-child). 480 */ 481 static 482 hammer2_chain_t * 483 hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth) 484 { 485 hammer2_pfs_t *pmp; 486 hammer2_dev_t *hmp; 487 hammer2_chain_t *parent; 488 hammer2_chain_t *rdrop; 489 490 /* 491 * We need chain's spinlock to interlock the sub-tree test. 492 * We already have chain's mutex, protecting chain->parent. 493 * 494 * Remember that chain->refs can be in flux. 
495 */ 496 hammer2_spin_ex(&chain->core.spin); 497 498 if (chain->parent != NULL) { 499 /* 500 * If the chain has a parent the UPDATE bit prevents scrapping 501 * as the chain is needed to properly flush the parent. Try 502 * to complete the 1->0 transition and return NULL. Retry 503 * (return chain) if we are unable to complete the 1->0 504 * transition, else return NULL (nothing more to do). 505 * 506 * If the chain has a parent the MODIFIED bit prevents 507 * scrapping. 508 * 509 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 510 */ 511 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 512 HAMMER2_CHAIN_MODIFIED)) { 513 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 514 hammer2_spin_unex(&chain->core.spin); 515 hammer2_chain_assert_no_data(chain); 516 hammer2_mtx_unlock(&chain->lock); 517 chain = NULL; 518 } else { 519 hammer2_spin_unex(&chain->core.spin); 520 hammer2_mtx_unlock(&chain->lock); 521 } 522 return (chain); 523 } 524 /* spinlock still held */ 525 } else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME || 526 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) { 527 /* 528 * Retain the static vchain and fchain. Clear bits that 529 * are not relevant. Do not clear the MODIFIED bit, 530 * and certainly do not put it on the delayed-flush queue. 531 */ 532 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 533 } else { 534 /* 535 * The chain has no parent and can be flagged for destruction. 536 * Since it has no parent, UPDATE can also be cleared. 537 */ 538 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 539 if (chain->flags & HAMMER2_CHAIN_UPDATE) 540 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 541 542 /* 543 * If the chain has children we must propagate the DESTROY 544 * flag downward and rip the disconnected topology apart. 545 * This is accomplished by calling hammer2_flush() on the 546 * chain. 547 * 548 * Any dedup is already handled by the underlying DIO, so 549 * we do not have to specifically flush it here. 
550 */ 551 if (chain->core.chain_count) { 552 hammer2_spin_unex(&chain->core.spin); 553 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 554 HAMMER2_FLUSH_ALL); 555 hammer2_mtx_unlock(&chain->lock); 556 557 return(chain); /* retry drop */ 558 } 559 560 /* 561 * Otherwise we can scrap the MODIFIED bit if it is set, 562 * and continue along the freeing path. 563 * 564 * Be sure to clean-out any dedup bits. Without a parent 565 * this chain will no longer be visible to the flush code. 566 * Easy check data_off to avoid the volume root. 567 */ 568 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 569 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 570 atomic_add_long(&hammer2_count_modified_chains, -1); 571 if (chain->pmp) 572 hammer2_pfs_memory_wakeup(chain->pmp, -1); 573 } 574 /* spinlock still held */ 575 } 576 577 /* spinlock still held */ 578 579 /* 580 * If any children exist we must leave the chain intact with refs == 0. 581 * They exist because chains are retained below us which have refs or 582 * may require flushing. 583 * 584 * Retry (return chain) if we fail to transition the refs to 0, else 585 * return NULL indication nothing more to do. 586 * 587 * Chains with children are NOT put on the LRU list. 588 */ 589 if (chain->core.chain_count) { 590 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 591 hammer2_spin_unex(&chain->core.spin); 592 hammer2_chain_assert_no_data(chain); 593 hammer2_mtx_unlock(&chain->lock); 594 chain = NULL; 595 } else { 596 hammer2_spin_unex(&chain->core.spin); 597 hammer2_mtx_unlock(&chain->lock); 598 } 599 return (chain); 600 } 601 /* spinlock still held */ 602 /* no chains left under us */ 603 604 /* 605 * chain->core has no children left so no accessors can get to our 606 * chain from there. Now we have to lock the parent core to interlock 607 * remaining possible accessors that might bump chain's refs before 608 * we can safely drop chain's refs with intent to free the chain. 
609 */ 610 hmp = chain->hmp; 611 pmp = chain->pmp; /* can be NULL */ 612 rdrop = NULL; 613 614 parent = chain->parent; 615 616 /* 617 * WARNING! chain's spin lock is still held here, and other spinlocks 618 * will be acquired and released in the code below. We 619 * cannot be making fancy procedure calls! 620 */ 621 622 /* 623 * We can cache the chain if it is associated with a pmp 624 * and not flagged as being destroyed or requesting a full 625 * release. In this situation the chain is not removed 626 * from its parent, i.e. it can still be looked up. 627 * 628 * We intentionally do not cache DATA chains because these 629 * were likely used to load data into the logical buffer cache 630 * and will not be accessed again for some time. 631 */ 632 if ((chain->flags & 633 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 634 chain->pmp && 635 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 636 if (parent) 637 hammer2_spin_ex(&parent->core.spin); 638 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 639 /* 640 * 1->0 transition failed, retry. Do not drop 641 * the chain's data yet! 642 */ 643 if (parent) 644 hammer2_spin_unex(&parent->core.spin); 645 hammer2_spin_unex(&chain->core.spin); 646 hammer2_mtx_unlock(&chain->lock); 647 648 return(chain); 649 } 650 651 /* 652 * Success 653 */ 654 hammer2_chain_assert_no_data(chain); 655 656 /* 657 * Make sure we are on the LRU list, clean up excessive 658 * LRU entries. We can only really drop one but there might 659 * be other entries that we can remove from the lru_list 660 * without dropping. 661 * 662 * NOTE: HAMMER2_CHAIN_ONLRU may only be safely set when 663 * chain->core.spin AND pmp->lru_spin are held, but 664 * can be safely cleared only holding pmp->lru_spin. 
665 */ 666 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 667 hammer2_spin_ex(&pmp->lru_spin); 668 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 669 atomic_set_int(&chain->flags, 670 HAMMER2_CHAIN_ONLRU); 671 TAILQ_INSERT_TAIL(&pmp->lru_list, 672 chain, lru_node); 673 atomic_add_int(&pmp->lru_count, 1); 674 } 675 if (pmp->lru_count < HAMMER2_LRU_LIMIT) 676 depth = 1; /* disable lru_list flush */ 677 hammer2_spin_unex(&pmp->lru_spin); 678 } else { 679 /* disable lru flush */ 680 depth = 1; 681 } 682 683 if (parent) { 684 hammer2_spin_unex(&parent->core.spin); 685 parent = NULL; /* safety */ 686 } 687 hammer2_spin_unex(&chain->core.spin); 688 hammer2_mtx_unlock(&chain->lock); 689 690 /* 691 * lru_list hysteresis (see above for depth overrides). 692 * Note that depth also prevents excessive lastdrop recursion. 693 */ 694 if (depth == 0) 695 hammer2_chain_lru_flush(pmp); 696 697 return NULL; 698 /* NOT REACHED */ 699 } 700 701 /* 702 * Make sure we are not on the LRU list. 703 */ 704 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 705 hammer2_spin_ex(&pmp->lru_spin); 706 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 707 atomic_add_int(&pmp->lru_count, -1); 708 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 709 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 710 } 711 hammer2_spin_unex(&pmp->lru_spin); 712 } 713 714 /* 715 * Spinlock the parent and try to drop the last ref on chain. 716 * On success determine if we should dispose of the chain 717 * (remove the chain from its parent, etc). 718 * 719 * (normal core locks are top-down recursive but we define 720 * core spinlocks as bottom-up recursive, so this is safe). 721 */ 722 if (parent) { 723 hammer2_spin_ex(&parent->core.spin); 724 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 725 /* 726 * 1->0 transition failed, retry. 
727 */ 728 hammer2_spin_unex(&parent->core.spin); 729 hammer2_spin_unex(&chain->core.spin); 730 hammer2_mtx_unlock(&chain->lock); 731 732 return(chain); 733 } 734 735 /* 736 * 1->0 transition successful, parent spin held to prevent 737 * new lookups, chain spinlock held to protect parent field. 738 * Remove chain from the parent. 739 * 740 * If the chain is being removed from the parent's btree but 741 * is not blkmapped, we have to adjust live_count downward. If 742 * it is blkmapped then the blockref is retained in the parent 743 * as is its associated live_count. This case can occur when 744 * a chain added to the topology is unable to flush and is 745 * then later deleted. 746 */ 747 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 748 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 749 (chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0) { 750 atomic_add_int(&parent->core.live_count, -1); 751 } 752 RB_REMOVE(hammer2_chain_tree, 753 &parent->core.rbtree, chain); 754 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 755 --parent->core.chain_count; 756 chain->parent = NULL; 757 } 758 759 /* 760 * If our chain was the last chain in the parent's core the 761 * core is now empty and its parent might have to be 762 * re-dropped if it has 0 refs. 763 */ 764 if (parent->core.chain_count == 0) { 765 rdrop = parent; 766 atomic_add_int(&rdrop->refs, 1); 767 /* 768 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 769 rdrop = NULL; 770 */ 771 } 772 hammer2_spin_unex(&parent->core.spin); 773 parent = NULL; /* safety */ 774 /* FALL THROUGH */ 775 } else { 776 /* 777 * No-parent case. 778 */ 779 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 780 /* 781 * 1->0 transition failed, retry. 782 */ 783 hammer2_spin_unex(&parent->core.spin); 784 hammer2_spin_unex(&chain->core.spin); 785 hammer2_mtx_unlock(&chain->lock); 786 787 return(chain); 788 } 789 } 790 791 /* 792 * Successful 1->0 transition, no parent, no children... no way for 793 * anyone to ref this chain any more. 
We can clean-up and free it. 794 * 795 * We still have the core spinlock, and core's chain_count is 0. 796 * Any parent spinlock is gone. 797 */ 798 hammer2_spin_unex(&chain->core.spin); 799 hammer2_chain_assert_no_data(chain); 800 hammer2_mtx_unlock(&chain->lock); 801 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 802 chain->core.chain_count == 0); 803 804 /* 805 * All locks are gone, no pointers remain to the chain, finish 806 * freeing it. 807 */ 808 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 809 HAMMER2_CHAIN_MODIFIED)) == 0); 810 811 /* 812 * Once chain resources are gone we can use the now dead chain 813 * structure to placehold what might otherwise require a recursive 814 * drop, because we have potentially two things to drop and can only 815 * return one directly. 816 */ 817 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 818 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 819 chain->hmp = NULL; 820 kfree_obj(chain, hmp->mchain); 821 atomic_add_long(&hammer2_chain_allocs, -1); 822 } 823 824 /* 825 * Possible chaining loop when parent re-drop needed. 826 */ 827 return(rdrop); 828 } 829 830 /* 831 * Heuristical flush of the LRU, try to reduce the number of entries 832 * on the LRU to (HAMMER2_LRU_LIMIT * 2 / 3). This procedure is called 833 * only when lru_count exceeds HAMMER2_LRU_LIMIT. 834 */ 835 static 836 void 837 hammer2_chain_lru_flush(hammer2_pfs_t *pmp) 838 { 839 hammer2_chain_t *chain; 840 841 again: 842 chain = NULL; 843 hammer2_spin_ex(&pmp->lru_spin); 844 while (pmp->lru_count > HAMMER2_LRU_LIMIT * 2 / 3) { 845 /* 846 * Pick a chain off the lru_list, just recycle it quickly 847 * if LRUHINT is set (the chain was ref'd but left on 848 * the lru_list, so cycle to the end). 
849 */ 850 chain = TAILQ_FIRST(&pmp->lru_list); 851 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 852 853 if (chain->flags & HAMMER2_CHAIN_LRUHINT) { 854 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_LRUHINT); 855 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 856 chain = NULL; 857 continue; 858 } 859 860 /* 861 * Ok, we are off the LRU. We must adjust refs before we 862 * can safely clear the ONLRU flag. 863 */ 864 atomic_add_int(&pmp->lru_count, -1); 865 if (atomic_cmpset_int(&chain->refs, 0, 1)) { 866 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 867 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 868 break; 869 } 870 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 871 chain = NULL; 872 } 873 hammer2_spin_unex(&pmp->lru_spin); 874 if (chain == NULL) 875 return; 876 877 /* 878 * If we picked a chain off the lru list we may be able to lastdrop 879 * it. Use a depth of 1 to prevent excessive lastdrop recursion. 880 */ 881 while (chain) { 882 u_int refs; 883 884 refs = chain->refs; 885 cpu_ccfence(); 886 KKASSERT(refs > 0); 887 888 if (refs == 1) { 889 if (hammer2_mtx_ex_try(&chain->lock) == 0) 890 chain = hammer2_chain_lastdrop(chain, 1); 891 /* retry the same chain, or chain from lastdrop */ 892 } else { 893 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 894 break; 895 /* retry the same chain */ 896 } 897 cpu_pause(); 898 } 899 goto again; 900 } 901 902 /* 903 * On last lock release. 
904 */ 905 static hammer2_io_t * 906 hammer2_chain_drop_data(hammer2_chain_t *chain) 907 { 908 hammer2_io_t *dio; 909 910 if ((dio = chain->dio) != NULL) { 911 chain->dio = NULL; 912 chain->data = NULL; 913 } else { 914 switch(chain->bref.type) { 915 case HAMMER2_BREF_TYPE_VOLUME: 916 case HAMMER2_BREF_TYPE_FREEMAP: 917 break; 918 default: 919 if (chain->data != NULL) { 920 hammer2_spin_unex(&chain->core.spin); 921 panic("chain data not null: " 922 "chain %p bref %016jx.%02x " 923 "refs %d parent %p dio %p data %p", 924 chain, chain->bref.data_off, 925 chain->bref.type, chain->refs, 926 chain->parent, 927 chain->dio, chain->data); 928 } 929 KKASSERT(chain->data == NULL); 930 break; 931 } 932 } 933 return dio; 934 } 935 936 /* 937 * Lock a referenced chain element, acquiring its data with I/O if necessary, 938 * and specify how you would like the data to be resolved. 939 * 940 * If an I/O or other fatal error occurs, chain->error will be set to non-zero. 941 * 942 * The lock is allowed to recurse, multiple locking ops will aggregate 943 * the requested resolve types. Once data is assigned it will not be 944 * removed until the last unlock. 945 * 946 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 947 * (typically used to avoid device/logical buffer 948 * aliasing for data) 949 * 950 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 951 * the INITIAL-create state (indirect blocks only). 952 * 953 * Do not resolve data elements for DATA chains. 954 * (typically used to avoid device/logical buffer 955 * aliasing for data) 956 * 957 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 958 * 959 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 960 * it will be locked exclusive. 961 * 962 * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking. If 963 * the lock fails, EAGAIN is returned. 964 * 965 * NOTE: Embedded elements (volume header, inodes) are always resolved 966 * regardless. 
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * NOTE: The return value is always 0 unless NONBLOCK is specified, in which
 *	 case it can be either 0 or EAGAIN.
 *
 * WARNING! This function blocks on I/O if data needs to be fetched.  This
 *	    blocking can run concurrent with other compatible lock holders
 *	    who do not need data returning.  The lock is not upgraded to
 *	    exclusive during a data fetch, a separate bit is used to
 *	    interlock I/O.  However, an exclusive lock holder can still count
 *	    on being interlocked against an I/O fetch managed by a shared
 *	    lock holder.
 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	KKASSERT(chain->refs > 0);

	if (how & HAMMER2_RESOLVE_NONBLOCK) {
		/*
		 * We still have to bump lockcnt before acquiring the lock,
		 * even for non-blocking operation, because the unlock code
		 * live-loops on lockcnt == 1 when dropping the last lock.
		 *
		 * If the non-blocking operation fails we have to use an
		 * unhold sequence to undo the mess.
		 *
		 * NOTE: LOCKAGAIN must always succeed without blocking,
		 *	 even if NONBLOCK is specified.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				if (hammer2_mtx_sh_try(&chain->lock) != 0) {
					hammer2_chain_unhold(chain);
					return EAGAIN;
				}
			}
		} else {
			if (hammer2_mtx_ex_try(&chain->lock) != 0) {
				hammer2_chain_unhold(chain);
				return EAGAIN;
			}
		}
	} else {
		/*
		 * Get the appropriate lock.  If LOCKAGAIN is flagged with
		 * SHARED the caller expects a shared lock to already be
		 * present and we are giving it another ref.  This case must
		 * importantly not block if there is a pending exclusive lock
		 * request.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				hammer2_mtx_sh(&chain->lock);
			}
		} else {
			hammer2_mtx_ex(&chain->lock);
		}
	}

	/*
	 * If we already have a valid data pointer make sure the data is
	 * synchronized to the current cpu, and then no further action is
	 * necessary.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return 0;
	}

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expects the data to be there.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return 0;
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return 0;
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return 0;
#endif
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
	default:
		break;
	}

	/*
	 * Caller requires data
	 */
	hammer2_chain_load_data(chain);

	return 0;
}

#if 0
/*
 * Lock the chain, retain the hold, and drop the data persistence count.
 * The data should remain valid because we never transitioned lockcnt
 * through 0.
 */
void
hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
{
	hammer2_chain_lock(chain, how);
	atomic_add_int(&chain->lockcnt, -1);
}

/*
 * Downgrade an exclusive chain lock to a shared chain lock.
 *
 * NOTE: There is no upgrade equivalent due to the ease of
 *	 deadlocks in that direction.
 */
void
hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
{
	hammer2_mtx_downgrade(&chain->lock);
}
#endif

/*
 * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
 * may be of any type.
 *
 * Once chain->data is set it cannot be disposed of until all locks are
 * released.
 *
 * Make sure the data is synchronized to the current cpu.
 */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
	 *
	 * The KKASSERT verifies that some form of the chain mutex is held
	 * (shared or exclusive) by the caller.
	 */
	KKASSERT(chain->lock.mtx_lock & MTX_MASK);
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return;
	}
	if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Gain the IOINPROG bit, interlocked block.
	 *
	 * Only one thread may perform the I/O.  Other threads that find
	 * IOINPROG set request a wakeup via IOSIGNAL and sleep, then retry.
	 * This allows shared-lock holders to safely interlock the fetch.
	 */
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		cpu_ccfence();
		if (oflags & HAMMER2_CHAIN_IOINPROG) {
			nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
			tsleep_interlock(&chain->flags, 0);
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				tsleep(&chain->flags, PINTERLOCKED,
				       "h2iocw", 0);
			}
			/* retry */
		} else {
			nflags = oflags | HAMMER2_CHAIN_IOINPROG;
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				break;
			}
			/* retry */
		}
	}

	/*
	 * We own CHAIN_IOINPROG
	 *
	 * Degenerate case if we raced another load.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		goto done;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		error = hammer2_io_new(hmp, bref->type,
				       bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		error = hammer2_io_bread(hmp, bref->type,
					 bref->data_off, chain->bytes,
					 &chain->dio);
		hammer2_adjreadcounter(chain->bref.type, chain->bytes);
	}
	if (error) {
		chain->error = HAMMER2_ERROR_EIO;
		kprintf("hammer2_chain_load_data: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		goto done;
	}
	chain->error = 0;

	/*
	 * This isn't perfect and can be ignored on OSs which do not have
	 * an indication as to whether a buffer is coming from cache or
	 * if I/O was actually issued for the read.  TESTEDGOOD will work
	 * pretty well without the B_IOISSUED logic because chains are
	 * cached, but in that situation (without B_IOISSUED) it will not
	 * detect whether a re-read via I/O is corrupted verses the original
	 * read.
	 *
	 * We can't re-run the CRC on every fresh lock.  That would be
	 * insanely expensive.
	 *
	 * If the underlying kernel buffer covers the entire chain we can
	 * use the B_IOISSUED indication to determine if we have to re-run
	 * the CRC on chain data for chains that managed to stay cached
	 * across the kernel disposal of the original buffer.
	 */
	if ((dio = chain->dio) != NULL && dio->bp) {
		struct buf *bp = dio->bp;

		if (dio->psize == chain->bytes &&
		    (bp->b_flags & B_IOISSUED)) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_TESTEDGOOD);
			bp->b_flags &= ~B_IOISSUED;
		}
	}

	/*
	 * NOTE: A locked chain's data cannot be modified without first
	 *	 calling hammer2_chain_modify().
	 */

	/*
	 * NOTE: hammer2_io_data() call issues bkvasync()
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);

	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		/*
		 * Clear INITIAL.  In this case we used io_new() and the
		 * buffer has been zero'd and marked dirty.
		 *
		 * CHAIN_MODIFIED has not been set yet, and we leave it
		 * that way for now.  Set a temporary CHAIN_NOTTESTED flag
		 * to prevent hammer2_chain_testcheck() from trying to match
		 * a check code that has not yet been generated.  This bit
		 * should NOT end up on the actual media.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);
	} else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * check data not currently synchronized due to
		 * modification.  XXX assumes data stays in the buffer
		 * cache, which might not be true (need biodep on flush
		 * to calculate crc?  or simple crc?).
		 */
	} else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
		/*
		 * Clean chain whose check code has not yet been verified
		 * this time around.  Verify now and cache the result via
		 * TESTEDGOOD.
		 */
		if (hammer2_chain_testcheck(chain, bdata) == 0) {
			chain->error = HAMMER2_ERROR_CHECK;
		} else {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
		}
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 *
	 * WARNING! Other threads can start using the data the instant we
	 *	    set chain->data non-NULL.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_load_data: unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		KKASSERT(chain->bytes != 0);
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave dio intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
	 */
done:
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
				    HAMMER2_CHAIN_IOSIGNAL);
		KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
		if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
			if (oflags & HAMMER2_CHAIN_IOSIGNAL)
				wakeup(&chain->flags);
			break;
		}
	}
}

/*
 * Unlock and deref a chain element.
 *
 * Remember that the presence of children under chain prevent the chain's
 * destruction but do not add additional references, so the dio will still
 * be dropped.
 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;
	u_int lockcnt;
	int iter = 0;

	/*
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				hammer2_mtx_unlock(&chain->lock);
				return;
			}
		} else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
			/* while holding the mutex exclusively */
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race2 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race2", 1);
			}
			cpu_pause();
		}
		/* retry */
	}

	/*
	 * Last unlock / mutex upgraded to exclusive.  Drop the data
	 * reference.
	 */
	dio = hammer2_chain_drop_data(chain);
	if (dio)
		hammer2_io_bqrelse(&dio);
	hammer2_mtx_unlock(&chain->lock);
}

#if 0
/*
 * Unlock and hold chain data intact
 */
void
hammer2_chain_unlock_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}
#endif

/*
 * Helper to obtain the blockref[] array base and count for a chain.
 *
 * XXX Not widely used yet, various use cases need to be validated and
 *     converted to use this function.
1424 */ 1425 static 1426 hammer2_blockref_t * 1427 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1428 { 1429 hammer2_blockref_t *base; 1430 int count; 1431 1432 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1433 base = NULL; 1434 1435 switch(parent->bref.type) { 1436 case HAMMER2_BREF_TYPE_INODE: 1437 count = HAMMER2_SET_COUNT; 1438 break; 1439 case HAMMER2_BREF_TYPE_INDIRECT: 1440 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1441 count = parent->bytes / sizeof(hammer2_blockref_t); 1442 break; 1443 case HAMMER2_BREF_TYPE_VOLUME: 1444 count = HAMMER2_SET_COUNT; 1445 break; 1446 case HAMMER2_BREF_TYPE_FREEMAP: 1447 count = HAMMER2_SET_COUNT; 1448 break; 1449 default: 1450 panic("hammer2_chain_base_and_count: " 1451 "unrecognized blockref type: %d", 1452 parent->bref.type); 1453 count = 0; 1454 break; 1455 } 1456 } else { 1457 switch(parent->bref.type) { 1458 case HAMMER2_BREF_TYPE_INODE: 1459 base = &parent->data->ipdata.u.blockset.blockref[0]; 1460 count = HAMMER2_SET_COUNT; 1461 break; 1462 case HAMMER2_BREF_TYPE_INDIRECT: 1463 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1464 base = &parent->data->npdata[0]; 1465 count = parent->bytes / sizeof(hammer2_blockref_t); 1466 break; 1467 case HAMMER2_BREF_TYPE_VOLUME: 1468 base = &parent->data->voldata. 1469 sroot_blockset.blockref[0]; 1470 count = HAMMER2_SET_COUNT; 1471 break; 1472 case HAMMER2_BREF_TYPE_FREEMAP: 1473 base = &parent->data->blkset.blockref[0]; 1474 count = HAMMER2_SET_COUNT; 1475 break; 1476 default: 1477 panic("hammer2_chain_base_and_count: " 1478 "unrecognized blockref type: %d", 1479 parent->bref.type); 1480 base = NULL; 1481 count = 0; 1482 break; 1483 } 1484 } 1485 *countp = count; 1486 1487 return base; 1488 } 1489 1490 /* 1491 * This counts the number of live blockrefs in a block array and 1492 * also calculates the point at which all remaining blockrefs are empty. 1493 * This routine can only be called on a live chain. 
1494 * 1495 * Caller holds the chain locked, but possibly with a shared lock. We 1496 * must use an exclusive spinlock to prevent corruption. 1497 * 1498 * NOTE: Flag is not set until after the count is complete, allowing 1499 * callers to test the flag without holding the spinlock. 1500 * 1501 * NOTE: If base is NULL the related chain is still in the INITIAL 1502 * state and there are no blockrefs to count. 1503 * 1504 * NOTE: live_count may already have some counts accumulated due to 1505 * creation and deletion and could even be initially negative. 1506 */ 1507 void 1508 hammer2_chain_countbrefs(hammer2_chain_t *chain, 1509 hammer2_blockref_t *base, int count) 1510 { 1511 hammer2_spin_ex(&chain->core.spin); 1512 if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) { 1513 if (base) { 1514 while (--count >= 0) { 1515 if (base[count].type != HAMMER2_BREF_TYPE_EMPTY) 1516 break; 1517 } 1518 chain->core.live_zero = count + 1; 1519 while (count >= 0) { 1520 if (base[count].type != HAMMER2_BREF_TYPE_EMPTY) 1521 atomic_add_int(&chain->core.live_count, 1522 1); 1523 --count; 1524 } 1525 } else { 1526 chain->core.live_zero = 0; 1527 } 1528 /* else do not modify live_count */ 1529 atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS); 1530 } 1531 hammer2_spin_unex(&chain->core.spin); 1532 } 1533 1534 /* 1535 * Resize the chain's physical storage allocation in-place. This function does 1536 * not usually adjust the data pointer and must be followed by (typically) a 1537 * hammer2_chain_modify() call to copy any old data over and adjust the 1538 * data pointer. 1539 * 1540 * Chains can be resized smaller without reallocating the storage. Resizing 1541 * larger will reallocate the storage. Excess or prior storage is reclaimed 1542 * asynchronously at a later time. 1543 * 1544 * An nradix value of 0 is special-cased to mean that the storage should 1545 * be disassociated, that is the chain is being resized to 0 bytes (not 1 1546 * byte). 
 *
 * Must be passed an exclusively locked parent and chain.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode data
 * buffer.  However, because H2 can compress or encrypt data, the chain may
 * have a dio assigned to it in those situations, and they do not conflict.
 *
 * XXX return error if cannot resize.
 */
int
hammer2_chain_resize(hammer2_chain_t *chain,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off,
		     int nradix, int flags)
{
	hammer2_dev_t *hmp;
	size_t obytes;
	size_t nbytes;
	int error;

	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);

	/*
	 * Nothing to do if the element is already the proper size.
	 *
	 * nradix == 0 means a zero-byte (disassociated) allocation, not
	 * a 1-byte allocation.
	 */
	obytes = chain->bytes;
	nbytes = (nradix) ? (1U << nradix) : 0;
	if (obytes == nbytes)
		return (chain->error);

	/*
	 * Make sure the old data is instantiated so we can copy it.  If this
	 * is a data block, the device data may be superfluous since the data
	 * might be in a logical block, but compressed or encrypted data is
	 * another matter.
	 *
	 * NOTE: The modify will set BLKMAPUPD for us if BLKMAPPED is set.
	 */
	error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
	if (error)
		return error;

	/*
	 * Reallocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 *
	 * NOTE: Operation does not copy the data and may only be used
	 *	 to resize data blocks in-place, or directory entry blocks
	 *	 which are about to be modified in some manner.
	 */
	error = hammer2_freemap_alloc(chain, nbytes);
	if (error)
		return error;

	chain->bytes = nbytes;

	/*
	 * We don't want the followup chain_modify() to try to copy data
	 * from the old (wrong-sized) buffer.  It won't know how much to
	 * copy.  This case should only occur during writes when the
	 * originator already has the data to write in-hand.
	 */
	if (chain->dio) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
		hammer2_io_brelse(&chain->dio);
		chain->data = NULL;
	}
	return (chain->error);
}

/*
 * Set the chain modified so its data can be changed by the caller, or
 * install deduplicated data.  The caller must call this routine for each
 * set of modifications it makes, even if the chain is already flagged
 * MODIFIED.
 *
 * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
 * is a CLC (cluster level change) field and is not updated by parent
 * propagation during a flush.
 *
 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
 * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
 * remains unmodified with its old data ref intact and chain->error
 * unchanged.
 *
 *	Dedup Handling
 *
 * If the DEDUPABLE flag is set in the chain the storage must be reallocated
 * even if the chain is still flagged MODIFIED.  In this case the chain's
 * DEDUPABLE flag will be cleared once the new storage has been assigned.
 *
 * If the caller passes a non-zero dedup_off we will use it to assign the
 * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
 * must not modify the data content upon return.
 */
int
hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
		     hammer2_off_t dedup_off, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	int error;
	int wasinitial;
	int setmodified;
	int setupdate;
	int newmod;
	char *bdata;

	hmp = chain->hmp;
	KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE);

	/*
	 * Data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * Data must be resolved if already assigned, unless explicitly
	 * flagged otherwise.  If we cannot safety load the data the
	 * modification fails and we return early.
	 */
	if (chain->data == NULL && chain->bytes != 0 &&
	    (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_load_data(chain);
		if (chain->error)
			return (chain->error);
	}
	error = 0;

	/*
	 * Set MODIFIED to indicate that the chain has been modified.  A new
	 * allocation is required when modifying a chain.
	 *
	 * Set UPDATE to ensure that the blockref is updated in the parent.
	 *
	 * If MODIFIED is already set determine if we can reuse the assigned
	 * data block or if we need a new data block.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		/*
		 * Must set modified bit.
		 */
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(chain->pmp);	/* can be NULL */
		setmodified = 1;

		/*
		 * We may be able to avoid a copy-on-write if the chain's
		 * check mode is set to NONE and the chain's current
		 * modify_tid is beyond the last explicit snapshot tid.
		 *
		 * This implements HAMMER2's overwrite-in-place feature.
		 *
		 * NOTE! This data-block cannot be used as a de-duplication
		 *	 source when the check mode is set to NONE.
		 */
		if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		     chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
		    (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
		    (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
		    HAMMER2_DEC_CHECK(chain->bref.methods) ==
		    HAMMER2_CHECK_NONE &&
		    chain->pmp &&
		    chain->bref.modify_tid >
		     chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * Sector overwrite allowed.
			 */
			newmod = 0;
		} else if ((hmp->hflags & HMNT2_EMERG) &&
			   chain->pmp &&
			   chain->bref.modify_tid >
			    chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * If in emergency delete mode then do a modify-in-
			 * place on any chain type belonging to the PFS as
			 * long as it doesn't mess up a snapshot.  We might
			 * be forced to do this anyway a little further down
			 * in the code if the allocation fails.
			 *
			 * Also note that in emergency mode, these modify-in-
			 * place operations are NOT SAFE.  A storage failure,
			 * power failure, or panic can corrupt the filesystem.
			 */
			newmod = 0;
		} else {
			/*
			 * Sector overwrite not allowed, must copy-on-write.
			 */
			newmod = 1;
		}
	} else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
		/*
		 * If the modified chain was registered for dedup we need
		 * a new allocation.  This only happens for delayed-flush
		 * chains (i.e. which run through the front-end buffer
		 * cache).
		 */
		newmod = 1;
		setmodified = 0;
	} else {
		/*
		 * Already flagged modified, no new allocation is needed.
		 */
		newmod = 0;
		setmodified = 0;
	}

	/*
	 * Flag parent update required.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		setupdate = 1;
	} else {
		setupdate = 0;
	}

	/*
	 * The XOP code returns held but unlocked focus chains.  This
	 * prevents the chain from being destroyed but does not prevent
	 * it from being modified.  diolk is used to interlock modifications
	 * against XOP frontend accesses to the focus.
	 *
	 * This allows us to theoretically avoid deadlocking the frontend
	 * if one of the backends lock up by not formally locking the
	 * focused chain in the frontend.  In addition, the synchronization
	 * code relies on this mechanism to avoid deadlocking concurrent
	 * synchronization threads.
	 */
	lockmgr(&chain->diolk, LK_EXCLUSIVE);

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.  If an error occurs, the previous content and data
	 * reference is retained and the modification fails.
	 *
	 * If dedup_off is non-zero, the caller is requesting a deduplication
	 * rather than a modification.  The MODIFIED bit is not set and the
	 * data offset is set to the deduplication offset.  The data cannot
	 * be modified.
	 *
	 * NOTE: The dedup offset is allowed to be in a partially free state
	 *	 and we must be sure to reset it to a fully allocated state
	 *	 to force two bulkfree passes to free it again.
	 *
	 * NOTE: Only applicable when chain->bytes != 0.
	 *
	 * XXX can a chain already be marked MODIFIED without a data
	 * assignment?  If not, assert here instead of testing the case.
	 */
	if (chain != &hmp->vchain && chain != &hmp->fchain &&
	    chain->bytes) {
		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
		    newmod
		) {
			/*
			 * NOTE: We do not have to remove the dedup
			 *	 registration because the area is still
			 *	 allocated and the underlying DIO will
			 *	 still be flushed.
			 */
			if (dedup_off) {
				chain->bref.data_off = dedup_off;
				if ((int)(dedup_off & HAMMER2_OFF_MASK_RADIX))
					chain->bytes = 1 <<
					    (int)(dedup_off &
					    HAMMER2_OFF_MASK_RADIX);
				else
					chain->bytes = 0;
				chain->error = 0;
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_MODIFIED);
				atomic_add_long(&hammer2_count_modified_chains,
						-1);
				if (chain->pmp) {
					hammer2_pfs_memory_wakeup(
						chain->pmp, -1);
				}
				hammer2_freemap_adjust(hmp, &chain->bref,
						HAMMER2_FREEMAP_DORECOVER);
				atomic_set_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			} else {
				error = hammer2_freemap_alloc(chain,
							      chain->bytes);
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);

				/*
				 * If we are unable to allocate a new block
				 * but we are in emergency mode, issue a
				 * warning to the console and reuse the same
				 * block.
				 *
				 * We behave as if the allocation were
				 * successful.
				 *
				 * THIS IS IMPORTANT: These modifications
				 * are virtually guaranteed to corrupt any
				 * snapshots related to this filesystem.
				 */
				if (error && (hmp->hflags & HMNT2_EMERG)) {
					error = 0;
					chain->bref.flags |=
						HAMMER2_BREF_FLAG_EMERG_MIP;

					krateprintf(&krate_h2em,
					    "hammer2: Emergency Mode WARNING: "
					    "Operation will likely corrupt "
					    "related snapshot: "
					    "%016jx.%02x key=%016jx\n",
					    chain->bref.data_off,
					    chain->bref.type,
					    chain->bref.key);
				} else if (error == 0) {
					chain->bref.flags &=
						~HAMMER2_BREF_FLAG_EMERG_MIP;
				}
			}
		}
	}

	/*
	 * Stop here if error.  We have to undo any flag bits we might
	 * have set above.
	 */
	if (error) {
		if (setmodified) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			atomic_add_long(&hammer2_count_modified_chains, -1);
			if (chain->pmp)
				hammer2_pfs_memory_wakeup(chain->pmp, -1);
		}
		if (setupdate) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
		lockmgr(&chain->diolk, LK_RELEASE);

		return error;
	}

	/*
	 * Update mirror_tid and modify_tid.  modify_tid is only updated
	 * if not passed as zero (during flushes, parent propagation passes
	 * the value 0).
	 *
	 * NOTE: chain->pmp could be the device spmp.
	 */
	chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
	if (mtid)
		chain->bref.modify_tid = mtid;

	/*
	 * Set BLKMAPUPD to tell the flush code that an existing blockmap entry
	 * requires updating as well as to tell the delete code that the
	 * chain's blockref might not exactly match (in terms of physical size
	 * or block offset) the one in the parent's blocktable.  The base key
	 * of course will still match.
	 */
	if (chain->flags & HAMMER2_CHAIN_BLKMAPPED)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD);

	/*
	 * Short-cut data block handling when the caller does not need an
	 * actual data reference to (aka OPTDATA), as long as the chain does
	 * not already have a data pointer to the data and no de-duplication
	 * occurred.
	 *
	 * This generally means that the modifications are being done via the
	 * logical buffer cache.
	 *
	 * NOTE: If deduplication occurred we have to run through the data
	 *	 stuff to clear INITIAL, and the caller will likely want to
	 *	 assign the check code anyway.  Leaving INITIAL set on a
	 *	 dedup can be deadly (it can cause the block to be zero'd!).
	 *
	 * This code also handles bytes == 0 (most dirents).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
	    (flags & HAMMER2_MODIFY_OPTDATA) &&
	    chain->data == NULL) {
		if (dedup_off == 0) {
			KKASSERT(chain->dio == NULL);
			goto skip2;
		}
	}

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 *
	 * Clearing of NOTTESTED is allowed if the MODIFIED bit is set,
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->dio == NULL);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		/*
		 * The data might be fully embedded.
		 */
		if (chain->bytes == 0) {
			KKASSERT(chain->dio == NULL);
			break;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 *
		 * zero-fill or copy-on-write depending on whether
		 * chain->data exists or not and set the dirty state for
		 * the new buffer.  hammer2_io_new() will handle the
		 * zero-fill.
		 *
		 * If a dedup_off was supplied this is an existing block
		 * and no COW, copy, or further modification is required.
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		if (wasinitial && dedup_off == 0) {
			error = hammer2_io_new(hmp, chain->bref.type,
					       chain->bref.data_off,
					       chain->bytes, &dio);
		} else {
			error = hammer2_io_bread(hmp, chain->bref.type,
						 chain->bref.data_off,
						 chain->bytes, &dio);
		}
		hammer2_adjreadcounter(chain->bref.type, chain->bytes);

		/*
		 * If an I/O error occurs make sure callers cannot accidently
		 * modify the old buffer's contents and corrupt the filesystem.
		 *
		 * NOTE: hammer2_io_data() call issues bkvasync()
		 */
		if (error) {
			kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
				hmp);
			chain->error = HAMMER2_ERROR_EIO;
			hammer2_io_brelse(&dio);
			hammer2_io_brelse(&chain->dio);
			chain->data = NULL;
			break;
		}
		chain->error = 0;
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		if (chain->data) {
			/*
			 * COW (unless a dedup).
			 */
			KKASSERT(chain->dio != NULL);
			if (chain->data != (void *)bdata && dedup_off == 0) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial == 0 && dedup_off == 0) {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}

		/*
		 * Retire the old buffer, replace with the new.  Dirty or
		 * redirty the new buffer.
		 *
		 * WARNING! The system buffer cache may have already flushed
		 *	    the buffer, so we must be sure to [re]dirty it
		 *	    for further modification.
		 *
		 *	    If dedup_off was supplied, the caller is not
		 *	    expected to make any further modification to the
		 *	    buffer.
		 *
		 * WARNING! hammer2_get_gdata() assumes dio never transitions
		 *	    through NULL in order to optimize away unnecessary
		 *	    diolk operations.
		 */
		{
			hammer2_io_t *tio;

			if ((tio = chain->dio) != NULL)
				hammer2_io_bqrelse(&tio);
			chain->data = (void *)bdata;
			chain->dio = dio;
			if (dedup_off == 0)
				hammer2_io_setdirty(dio);
		}
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	lockmgr(&chain->diolk, LK_RELEASE);

	return (chain->error);
}

/*
 * Modify the chain associated with an inode.
2104 */ 2105 int 2106 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain, 2107 hammer2_tid_t mtid, int flags) 2108 { 2109 int error; 2110 2111 hammer2_inode_modify(ip); 2112 error = hammer2_chain_modify(chain, mtid, 0, flags); 2113 2114 return error; 2115 } 2116 2117 /* 2118 * This function returns the chain at the nearest key within the specified 2119 * range. The returned chain will be referenced but not locked. 2120 * 2121 * This function will recurse through chain->rbtree as necessary and will 2122 * return a *key_nextp suitable for iteration. *key_nextp is only set if 2123 * the iteration value is less than the current value of *key_nextp. 2124 * 2125 * The caller should use (*key_nextp) to calculate the actual range of 2126 * the returned element, which will be (key_beg to *key_nextp - 1), because 2127 * there might be another element which is superior to the returned element 2128 * and overlaps it. 2129 * 2130 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL 2131 * chains continue to be returned. On EOF (*key_nextp) may overflow since 2132 * it will wind up being (key_end + 1). 2133 * 2134 * WARNING! Must be called with child's spinlock held. Spinlock remains 2135 * held through the operation. 
 */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* best candidate so far */
	hammer2_key_t		key_beg;	/* inclusive search start */
	hammer2_key_t		key_end;	/* inclusive search end */
	hammer2_key_t		key_next;	/* in/out iteration clip point */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

/*
 * Driver for the search described in the comment above: RB_SCAN the
 * parent's in-memory child tree with the cmp/callback pair below and
 * return the best candidate, updating *key_nextp for iteration.
 */
static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
		hammer2_chain_find_cmp, hammer2_chain_find_callback,
		&info);
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN comparison function.  Classifies (child) against the search
 * range: <0 if the child's key range ends before the range begins,
 * >0 if it begins after the range ends, 0 on overlap (callback runs).
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

/*
 * RB_SCAN callback.  Runs for every child overlapping the search range,
 * tracking the nearest-key candidate in info->best and clipping
 * info->key_next so the caller can iterate without missing elements.
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 * (best still points at the previous candidate here, by
		 * design).
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  The returned chain is
 * held and locked according to (how) (HAMMER2_RESOLVE_*).  The caller must
 * handle crc-checks and so forth, and should check chain->error before
 * assuming that the data is good.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.
Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * WARNING! chain->pmp is always set to NULL for any chain representing
 *	    part of the super-root topology.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref, int how)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 *
	 * A PFS root gets a NULL pmp (super-root topology); otherwise
	 * the chain inherits the parent's pmp.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED);

	/*
	 * chain must be locked to avoid unexpected ripouts
	 */
	hammer2_chain_lock(chain, how);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		/* lost the insertion race, back everything out */
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return the new chain (ref'd and still locked per (how) above,
	 * matching the top-of-function contract), or NULL if an insertion
	 * race occurred.
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API.
 *
 * lookup_init ref's and locks (parent) (shared or exclusive per flags)
 * as the starting point of a lookup iteration; lookup_done undoes both.
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	hammer2_chain_ref(parent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	/* NULL parent is tolerated for convenience */
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
}

/*
 * Take the locked chain and return a locked parent.  The chain remains
 * locked on return, but may have to be temporarily unlocked to acquire
 * the parent.  Because of this, (chain) must be stable and cannot be
 * deleted while it was temporarily unlocked (typically means that (chain)
 * is an inode).
 *
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t *chain, int flags)
{
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first
	 * with a non-blocking lock attempt.
	 */
	parent = chain->parent;
	if (parent == NULL)
		panic("hammer2_chain_getparent: no parent");
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0)
		return parent;

	for (;;) {
		/*
		 * Lock-order reversal: release (chain), lock (parent),
		 * then re-lock (chain).
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, flags);
		hammer2_chain_lock(chain, flags);

		/*
		 * Parent relinking races are quite common.  We have to get
		 * it right or we will blow up the block table.
		 */
		if (chain->parent == parent)
			break;

		/* raced, drop the stale parent and retry with the new one */
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		cpu_ccfence();
		parent = chain->parent;
		if (parent == NULL)
			panic("hammer2_chain_getparent: no parent");
		hammer2_chain_ref(parent);
	}
	return parent;
}

/*
 * Take the locked chain and return a locked parent.  The chain is unlocked
 * and dropped.  *chainp is set to the returned parent as a convenience.
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * The chain does NOT need to be stable.  We use a tracking structure
 * to track the expected parent if the chain is deleted out from under us.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	struct hammer2_reptrack reptrack;
	struct hammer2_reptrack **repp;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first
	 * with a non-blocking lock attempt.
	 */
	chain = *chainp;
	parent = chain->parent;
	if (parent == NULL) {
		/*
		 * NOTE(review): this releases chain->core.spin before
		 * panicking; verify the caller's spinlock state on this
		 * (fatal) path.
		 */
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_repparent: no parent");
	}
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		*chainp = parent;

		return parent;
	}

	/*
	 * Ok, now it gets a bit nasty.  There are multiple situations where
	 * the parent might be in the middle of a deletion, or where the child
	 * (chain) might be deleted the instant we let go of its lock.
	 * We can potentially end up in a no-win situation!
	 *
	 * In particular, the indirect_maintenance() case can cause these
	 * situations.
	 *
	 * To deal with this we install a reptrack structure in the parent
	 * This reptrack structure 'owns' the parent ref and will automatically
	 * migrate to the parent's parent if the parent is deleted permanently.
	 */
	hammer2_spin_init(&reptrack.spin, "h2reptrk");
	reptrack.chain = parent;
	hammer2_chain_ref(parent);		/* for the reptrack */

	hammer2_spin_ex(&parent->core.spin);
	reptrack.next = parent->core.reptrack;
	parent->core.reptrack = &reptrack;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;	/* gone */

	/*
	 * At the top of this loop, chain is gone and parent is refd both
	 * by us explicitly AND via our reptrack.  We are attempting to
	 * lock parent.
	 */
	for (;;) {
		hammer2_chain_lock(parent, flags);

		if (reptrack.chain == parent)
			break;

		/*
		 * The reptrack migrated while we blocked on the lock;
		 * chase the new target.
		 */
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);

		kprintf("hammer2: debug REPTRACK %p->%p\n",
			parent, reptrack.chain);
		hammer2_spin_ex(&reptrack.spin);
		parent = reptrack.chain;
		hammer2_chain_ref(parent);
		hammer2_spin_unex(&reptrack.spin);
	}

	/*
	 * Once parent is locked and matches our reptrack, our reptrack
	 * will be stable and we have our parent.  We can unlink our
	 * reptrack.
	 *
	 * WARNING!  Remember that the chain lock might be shared.  Chains
	 *	     locked shared have stable parent linkages.
	 */
	hammer2_spin_ex(&parent->core.spin);
	repp = &parent->core.reptrack;
	while (*repp != &reptrack)
		repp = &(*repp)->next;
	*repp = reptrack.next;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_drop(parent);	/* reptrack ref */
	*chainp = parent;		/* return parent lock+ref */

	return parent;
}

/*
 * Dispose of any linked reptrack structures in (chain) by shifting them to
 * (parent).  Both (chain) and (parent) must be exclusively locked.
 *
 * This is interlocked against any children of (chain) on the other side.
 * No children may remain as-of when this is called so we can test
 * core.reptrack without holding the spin-lock.
 *
 * Used whenever the caller intends to permanently delete chains related
 * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE),
 * where the chains underneath the node being deleted are given a new parent
 * above the node being deleted.
2525 */ 2526 static 2527 void 2528 hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain) 2529 { 2530 struct hammer2_reptrack *reptrack; 2531 2532 KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)); 2533 while (chain->core.reptrack) { 2534 hammer2_spin_ex(&parent->core.spin); 2535 hammer2_spin_ex(&chain->core.spin); 2536 reptrack = chain->core.reptrack; 2537 if (reptrack == NULL) { 2538 hammer2_spin_unex(&chain->core.spin); 2539 hammer2_spin_unex(&parent->core.spin); 2540 break; 2541 } 2542 hammer2_spin_ex(&reptrack->spin); 2543 chain->core.reptrack = reptrack->next; 2544 reptrack->chain = parent; 2545 reptrack->next = parent->core.reptrack; 2546 parent->core.reptrack = reptrack; 2547 hammer2_chain_ref(parent); /* reptrack */ 2548 2549 hammer2_spin_unex(&chain->core.spin); 2550 hammer2_spin_unex(&parent->core.spin); 2551 kprintf("hammer2: debug repchange %p %p->%p\n", 2552 reptrack, chain, parent); 2553 hammer2_chain_drop(chain); /* reptrack */ 2554 } 2555 } 2556 2557 /* 2558 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 2559 * (*parentp) typically points to an inode but can also point to a related 2560 * indirect block and this function will recurse upwards and find the inode 2561 * or the nearest undeleted indirect block covering the key range. 2562 * 2563 * This function unconditionally sets *errorp, replacing any previous value. 2564 * 2565 * (*parentp) must be exclusive or shared locked (depending on flags) and 2566 * referenced and can be an inode or an existing indirect block within the 2567 * inode. 2568 * 2569 * If (*parent) is errored out, this function will not attempt to recurse 2570 * the radix tree and will return NULL along with an appropriate *errorp. 2571 * If NULL is returned and *errorp is 0, the requested lookup could not be 2572 * located. 
2573 * 2574 * On return (*parentp) will be modified to point at the deepest parent chain 2575 * element encountered during the search, as a helper for an insertion or 2576 * deletion. 2577 * 2578 * The new (*parentp) will be locked shared or exclusive (depending on flags), 2579 * and referenced, and the old will be unlocked and dereferenced (no change 2580 * if they are both the same). This is particularly important if the caller 2581 * wishes to insert a new chain, (*parentp) will be set properly even if NULL 2582 * is returned, as long as no error occurred. 2583 * 2584 * The matching chain will be returned locked according to flags. 2585 * 2586 * -- 2587 * 2588 * NULL is returned if no match was found, but (*parentp) will still 2589 * potentially be adjusted. 2590 * 2591 * On return (*key_nextp) will point to an iterative value for key_beg. 2592 * (If NULL is returned (*key_nextp) is set to (key_end + 1)). 2593 * 2594 * This function will also recurse up the chain if the key is not within the 2595 * current parent's range. (*parentp) can never be set to NULL. An iteration 2596 * can simply allow (*parentp) to float inside the loop. 2597 * 2598 * NOTE! chain->data is not always resolved. By default it will not be 2599 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 2600 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 2601 * BREF_TYPE_DATA as the device buffer can alias the logical file 2602 * buffer). 
 */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;	/* sanity cap on retry/recursion loops */

	/*
	 * Translate HAMMER2_LOOKUP_* flags into the HAMMER2_RESOLVE_*
	 * lock modes used for the various lock acquisitions below.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:
	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");

	/*
	 * MATCHIND case that does not require parent->data (do prior to
	 * parent->error check).
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				/* exact match, return the parent itself */
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}
		break;
	default:
		break;
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			hammer2_chain_lock(parent, how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 *
		 * Debugging: Enter permanent wait state instead of
		 * panicking on unexpectedly NULL data for the moment.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL) {
				kprintf("hammer2: unexpected NULL data "
					"on %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search of the on-media blockref array and the in-memory
	 * chain topology under the parent's spinlock.
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		KKASSERT(chain == NULL);
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.  Snapshot the bref
	 * (bsave) so we can re-validate after any blocking operation.
	 */
	bsave = *bref;
	if (chain == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how_maybe);
		} else {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how);
		}
		if (chain == NULL)
			goto again;
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);

		/*
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			hammer2_chain_lock(chain, how_maybe);
		} else {
			hammer2_chain_lock(chain, how);
		}
		KKASSERT(chain->parent == parent);
	}
	if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
	    chain->parent != parent) {
		/* chain moved or was modified while we blocked, retry */
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		goto again;
	}


	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		chain = NULL;	/* SAFETY */
		goto again;
	}
done:
	/*
	 * All done, return the locked chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE! A chain->error must be tested by the caller upon return.
	 *	 *errorp is only set based on issues which occur while
	 *	 trying to reach the chain.
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after its index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.  This
 * chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/* continue just past the previous chain's key range */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute the normal lookup with the recalculated range.
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}

/*
 * Caller wishes to iterate chains under parent, loading new chains into
 * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
 * then call hammer2_chain_scan() repeatedly until a non-zero return.
 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
 * with the returned chain for the scan.  The returned *chainp will be
 * locked and referenced.  Any prior contents will be unlocked and dropped.
 *
 * Caller should check the return value.  A normal scan EOF will return
 * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
 * error trying to access parent data.  Any error in the returned chain
 * must be tested separately by the caller.
 *
 * (*chainp) is dropped on each scan, but will only be set if the returned
 * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
 * returned via *chainp.  The caller will get their bref only.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_bref = (parent, NULL) and
 * next_chain = (parent, bref).
 *
 * The passed-in parent must be locked and its data resolved.  The function
 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
 * in).  The caller must check the chain->bref.type either way.
 */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how;
	int generation;
	int maxloops = 300000;	/* sanity cap on the retry loop */
	int error;

	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow
	 * (a wrap to 0 means the keyspace is exhausted -> EOF).
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_scan: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	/* snapshot generation to detect concurrent insertions below */
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation,
						  bref, how);
			if (chain == NULL)
				goto again;
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
		hammer2_chain_lock(chain, how);
	}

	/*
	 * The chain may have raced a rename/delete between dropping the
	 * spinlock and acquiring the chain lock; re-validate and retry.
	 */
	if (chain &&
	    (bcmp(bref, &chain->bref, sizeof(*bref)) ||
	     chain->parent != parent)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
		goto again;
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it under (*parentp).  This is a full
 * insertion, based on the supplied key/keybits, and may involve creating
 * indirect blocks and moving other chains around via delete/duplicate.
 *
 * This call can be made with parent == NULL as long as a non -1 methods
 * is supplied.  hmp must also be supplied in this situation (otherwise
 * hmp is extracted from the supplied parent).  The chain will be detached
 * from the topology.  A later call with both parent and chain can be made
 * to attach it.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * (*parentp) must be exclusive locked and may be replaced on return
 * depending on how much work the function had to do.
 *
 * (*parentp) must not be errored or this function will assert.
 *
 * (*chainp) usually starts out NULL and returns the newly created chain,
 * but if the caller desires the caller may allocate a disconnected chain
 * and pass it in instead.
 *
 * This function should NOT be used to insert INDIRECT blocks.  It is
 * typically used to create/insert inodes and data blocks.
3297 * 3298 * Caller must pass-in an exclusively locked parent the new chain is to 3299 * be inserted under, and optionally pass-in a disconnected, exclusively 3300 * locked chain to insert (else we create a new chain). The function will 3301 * adjust (*parentp) as necessary, create or connect the chain, and 3302 * return an exclusively locked chain in *chainp. 3303 * 3304 * When creating a PFSROOT inode under the super-root, pmp is typically NULL 3305 * and will be reassigned. 3306 * 3307 * NOTE: returns HAMMER_ERROR_* flags 3308 */ 3309 int 3310 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp, 3311 hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods, 3312 hammer2_key_t key, int keybits, int type, size_t bytes, 3313 hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags) 3314 { 3315 hammer2_chain_t *chain; 3316 hammer2_chain_t *parent; 3317 hammer2_blockref_t *base; 3318 hammer2_blockref_t dummy; 3319 int allocated = 0; 3320 int error = 0; 3321 int count; 3322 int maxloops = 300000; 3323 3324 /* 3325 * Topology may be crossing a PFS boundary. 3326 */ 3327 parent = *parentp; 3328 if (parent) { 3329 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3330 KKASSERT(parent->error == 0); 3331 hmp = parent->hmp; 3332 } 3333 chain = *chainp; 3334 3335 if (chain == NULL) { 3336 /* 3337 * First allocate media space and construct the dummy bref, 3338 * then allocate the in-memory chain structure. Set the 3339 * INITIAL flag for fresh chains which do not have embedded 3340 * data. 3341 */ 3342 bzero(&dummy, sizeof(dummy)); 3343 dummy.type = type; 3344 dummy.key = key; 3345 dummy.keybits = keybits; 3346 dummy.data_off = hammer2_getradix(bytes); 3347 3348 /* 3349 * Inherit methods from parent by default. Primarily used 3350 * for BREF_TYPE_DATA. Non-data types *must* be set to 3351 * a non-NONE check algorithm. 
3352 */ 3353 if (methods == HAMMER2_METH_DEFAULT) 3354 dummy.methods = parent->bref.methods; 3355 else 3356 dummy.methods = (uint8_t)methods; 3357 3358 if (type != HAMMER2_BREF_TYPE_DATA && 3359 HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) { 3360 dummy.methods |= 3361 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT); 3362 } 3363 3364 chain = hammer2_chain_alloc(hmp, pmp, &dummy); 3365 3366 /* 3367 * Lock the chain manually, chain_lock will load the chain 3368 * which we do NOT want to do. (note: chain->refs is set 3369 * to 1 by chain_alloc() for us, but lockcnt is not). 3370 */ 3371 chain->lockcnt = 1; 3372 hammer2_mtx_ex(&chain->lock); 3373 allocated = 1; 3374 3375 /* 3376 * Set INITIAL to optimize I/O. The flag will generally be 3377 * processed when we call hammer2_chain_modify(). 3378 */ 3379 switch(type) { 3380 case HAMMER2_BREF_TYPE_VOLUME: 3381 case HAMMER2_BREF_TYPE_FREEMAP: 3382 panic("hammer2_chain_create: called with volume type"); 3383 break; 3384 case HAMMER2_BREF_TYPE_INDIRECT: 3385 panic("hammer2_chain_create: cannot be used to" 3386 "create indirect block"); 3387 break; 3388 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3389 panic("hammer2_chain_create: cannot be used to" 3390 "create freemap root or node"); 3391 break; 3392 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3393 KKASSERT(bytes == sizeof(chain->data->bmdata)); 3394 /* fall through */ 3395 case HAMMER2_BREF_TYPE_DIRENT: 3396 case HAMMER2_BREF_TYPE_INODE: 3397 case HAMMER2_BREF_TYPE_DATA: 3398 default: 3399 /* 3400 * leave chain->data NULL, set INITIAL 3401 */ 3402 KKASSERT(chain->data == NULL); 3403 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 3404 break; 3405 } 3406 } else { 3407 /* 3408 * We are reattaching a previously deleted chain, possibly 3409 * under a new parent and possibly with a new key/keybits. 3410 * The chain does not have to be in a modified state. The 3411 * UPDATE flag will be set later on in this routine. 3412 * 3413 * Do NOT mess with the current state of the INITIAL flag. 
3414 */ 3415 chain->bref.key = key; 3416 chain->bref.keybits = keybits; 3417 if (chain->flags & HAMMER2_CHAIN_DELETED) 3418 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3419 KKASSERT(chain->parent == NULL); 3420 } 3421 3422 /* 3423 * Set the appropriate bref flag if requested. 3424 * 3425 * NOTE! Callers can call this function to move chains without 3426 * knowing about special flags, so don't clear bref flags 3427 * here! 3428 */ 3429 if (flags & HAMMER2_INSERT_PFSROOT) 3430 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 3431 3432 if (parent == NULL) 3433 goto skip; 3434 3435 /* 3436 * Calculate how many entries we have in the blockref array and 3437 * determine if an indirect block is required when inserting into 3438 * the parent. 3439 */ 3440 again: 3441 if (--maxloops == 0) 3442 panic("hammer2_chain_create: maxloops"); 3443 3444 switch(parent->bref.type) { 3445 case HAMMER2_BREF_TYPE_INODE: 3446 if ((parent->data->ipdata.meta.op_flags & 3447 HAMMER2_OPFLAG_DIRECTDATA) != 0) { 3448 kprintf("hammer2: parent set for direct-data! 
" 3449 "pkey=%016jx ckey=%016jx\n", 3450 parent->bref.key, 3451 chain->bref.key); 3452 } 3453 KKASSERT((parent->data->ipdata.meta.op_flags & 3454 HAMMER2_OPFLAG_DIRECTDATA) == 0); 3455 KKASSERT(parent->data != NULL); 3456 base = &parent->data->ipdata.u.blockset.blockref[0]; 3457 count = HAMMER2_SET_COUNT; 3458 break; 3459 case HAMMER2_BREF_TYPE_INDIRECT: 3460 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3461 if (parent->flags & HAMMER2_CHAIN_INITIAL) 3462 base = NULL; 3463 else 3464 base = &parent->data->npdata[0]; 3465 count = parent->bytes / sizeof(hammer2_blockref_t); 3466 break; 3467 case HAMMER2_BREF_TYPE_VOLUME: 3468 KKASSERT(parent->data != NULL); 3469 base = &parent->data->voldata.sroot_blockset.blockref[0]; 3470 count = HAMMER2_SET_COUNT; 3471 break; 3472 case HAMMER2_BREF_TYPE_FREEMAP: 3473 KKASSERT(parent->data != NULL); 3474 base = &parent->data->blkset.blockref[0]; 3475 count = HAMMER2_SET_COUNT; 3476 break; 3477 default: 3478 panic("hammer2_chain_create: unrecognized blockref type: %d", 3479 parent->bref.type); 3480 base = NULL; 3481 count = 0; 3482 break; 3483 } 3484 3485 /* 3486 * Make sure we've counted the brefs 3487 */ 3488 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 3489 hammer2_chain_countbrefs(parent, base, count); 3490 3491 KASSERT(parent->core.live_count >= 0 && 3492 parent->core.live_count <= count, 3493 ("bad live_count %d/%d (%02x, %d)", 3494 parent->core.live_count, count, 3495 parent->bref.type, parent->bytes)); 3496 3497 /* 3498 * If no free blockref could be found we must create an indirect 3499 * block and move a number of blockrefs into it. With the parent 3500 * locked we can safely lock each child in order to delete+duplicate 3501 * it without causing a deadlock. 3502 * 3503 * This may return the new indirect block or the old parent depending 3504 * on where the key falls. NULL is returned on error. 
3505 */ 3506 if (parent->core.live_count == count) { 3507 hammer2_chain_t *nparent; 3508 3509 KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0); 3510 3511 nparent = hammer2_chain_create_indirect(parent, key, keybits, 3512 mtid, type, &error); 3513 if (nparent == NULL) { 3514 if (allocated) 3515 hammer2_chain_drop(chain); 3516 chain = NULL; 3517 goto done; 3518 } 3519 if (parent != nparent) { 3520 hammer2_chain_unlock(parent); 3521 hammer2_chain_drop(parent); 3522 parent = *parentp = nparent; 3523 } 3524 goto again; 3525 } 3526 3527 /* 3528 * fall through if parent, or skip to here if no parent. 3529 */ 3530 skip: 3531 if (chain->flags & HAMMER2_CHAIN_DELETED) 3532 kprintf("Inserting deleted chain @%016jx\n", 3533 chain->bref.key); 3534 3535 /* 3536 * Link the chain into its parent. 3537 */ 3538 if (chain->parent != NULL) 3539 panic("hammer2: hammer2_chain_create: chain already connected"); 3540 KKASSERT(chain->parent == NULL); 3541 if (parent) { 3542 KKASSERT(parent->core.live_count < count); 3543 hammer2_chain_insert(parent, chain, 3544 HAMMER2_CHAIN_INSERT_SPIN | 3545 HAMMER2_CHAIN_INSERT_LIVE, 3546 0); 3547 } 3548 3549 if (allocated) { 3550 /* 3551 * Mark the newly created chain modified. This will cause 3552 * UPDATE to be set and process the INITIAL flag. 3553 * 3554 * Device buffers are not instantiated for DATA elements 3555 * as these are handled by logical buffers. 3556 * 3557 * Indirect and freemap node indirect blocks are handled 3558 * by hammer2_chain_create_indirect() and not by this 3559 * function. 3560 * 3561 * Data for all other bref types is expected to be 3562 * instantiated (INODE, LEAF). 
3563 */ 3564 switch(chain->bref.type) { 3565 case HAMMER2_BREF_TYPE_DATA: 3566 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3567 case HAMMER2_BREF_TYPE_DIRENT: 3568 case HAMMER2_BREF_TYPE_INODE: 3569 error = hammer2_chain_modify(chain, mtid, dedup_off, 3570 HAMMER2_MODIFY_OPTDATA); 3571 break; 3572 default: 3573 /* 3574 * Remaining types are not supported by this function. 3575 * In particular, INDIRECT and LEAF_NODE types are 3576 * handled by create_indirect(). 3577 */ 3578 panic("hammer2_chain_create: bad type: %d", 3579 chain->bref.type); 3580 /* NOT REACHED */ 3581 break; 3582 } 3583 } else { 3584 /* 3585 * When reconnecting a chain we must set UPDATE and 3586 * setflush so the flush recognizes that it must update 3587 * the bref in the parent. 3588 */ 3589 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) 3590 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3591 } 3592 3593 /* 3594 * We must setflush(parent) to ensure that it recurses through to 3595 * chain. setflush(chain) might not work because ONFLUSH is possibly 3596 * already set in the chain (so it won't recurse up to set it in the 3597 * parent). 3598 */ 3599 if (parent) 3600 hammer2_chain_setflush(parent); 3601 3602 done: 3603 *chainp = chain; 3604 3605 return (error); 3606 } 3607 3608 /* 3609 * Move the chain from its old parent to a new parent. The chain must have 3610 * already been deleted or already disconnected (or never associated) with 3611 * a parent. The chain is reassociated with the new parent and the deleted 3612 * flag will be cleared (no longer deleted). The chain's modification state 3613 * is not altered. 3614 * 3615 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 3616 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 3617 * FULL. This typically means that the caller is creating the chain after 3618 * doing a hammer2_chain_lookup(). 3619 * 3620 * Neither (parent) or (chain) can be errored. 
 *
 * If (parent) is non-NULL then the chain is inserted under the parent.
 *
 * If (parent) is NULL then the newly duplicated chain is not inserted
 * anywhere, similar to if it had just been chain_alloc()'d (suitable for
 * passing into hammer2_chain_create() after this function returns).
 *
 * WARNING! This function calls create which means it can insert indirect
 *	    blocks.  This can cause other unrelated chains in the parent to
 *	    be moved to a newly inserted indirect block in addition to the
 *	    specific chain.
 */
void
hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *parent;

	/*
	 * WARNING!  We should never resolve DATA to device buffers
	 *	     (XXX allow it if the caller did?), and since
	 *	     we currently do not have the logical buffer cache
	 *	     buffer in-hand to fix its cached physical offset
	 *	     we also force the modify code to not COW it. XXX
	 *
	 * NOTE!     We allow error'd chains to be renamed.  The bref itself
	 *	     is good and can be renamed.  The content, however, may
	 *	     be inaccessible.
	 */
	KKASSERT(chain->parent == NULL);
	/*KKASSERT(chain->error == 0); allow */
	bref = &chain->bref;

	/*
	 * If parent is not NULL the duplicated chain will be entered under
	 * the parent and the UPDATE bit set to tell flush to update
	 * the blockref.
	 *
	 * We must setflush(parent) to ensure that it recurses through to
	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
	 * already set in the chain (so it won't recurse up to set it in the
	 * parent).
	 *
	 * Having both chains locked is extremely important for atomicity.
	 */
	if (parentp && (parent = *parentp) != NULL) {
		KKASSERT(hammer2_mtx_owned(&parent->lock));
		KKASSERT(parent->refs > 0);
		KKASSERT(parent->error == 0);

		/*
		 * Re-insert the chain under the new parent using its
		 * existing bref key/keybits/type.
		 */
		hammer2_chain_create(parentp, &chain, NULL, chain->pmp,
				     HAMMER2_METH_DEFAULT,
				     bref->key, bref->keybits, bref->type,
				     chain->bytes, mtid, 0, flags);
		KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
		hammer2_chain_setflush(*parentp);
	}
}

/*
 * This works in tandem with delete_obref() to install a blockref in
 * (typically) an indirect block that is associated with the chain being
 * moved to *parentp.
 *
 * The reason we need this function is that the caller needs to maintain
 * the blockref as it was, and not generate a new blockref for what might
 * be a modified chain.  Otherwise stuff will leak into the flush that
 * the flush code's FLUSH_INODE_STOP flag is unable to catch.
 *
 * It is EXTREMELY important that we properly set CHAIN_BLKMAPUPD and
 * CHAIN_UPDATE.  We must set BLKMAPUPD if the bref does not match, and
 * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if
 * it does.  Otherwise we can end up in a situation where H2 is unable to
 * clean up the in-memory chain topology.
 *
 * The reason for this is that flushes do not generally flush through
 * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq
 * or sideq to properly flush and dispose of the related inode chain's flags.
 * Situations where the inode is not actually modified by the frontend,
 * but where we have to move the related chains around as we insert or cleanup
 * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory
 * inode chain that does not have a hammer2_inode_t associated with it.
 */
static void
hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain,
			   hammer2_tid_t mtid, int flags,
			   hammer2_blockref_t *obref)
{
	hammer2_chain_rename(parentp, chain, mtid, flags);

	if (obref->type != HAMMER2_BREF_TYPE_EMPTY) {
		hammer2_blockref_t *tbase;
		int tcount;

		KKASSERT((chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0);
		hammer2_chain_modify(*parentp, mtid, 0, 0);
		tbase = hammer2_chain_base_and_count(*parentp, &tcount);
		/* re-install the preserved (old) blockref in the new parent */
		hammer2_base_insert(*parentp, tbase, tcount, chain, obref);
		if (bcmp(obref, &chain->bref, sizeof(chain->bref))) {
			/* blockmap entry is stale vs the live bref */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD |
						      HAMMER2_CHAIN_UPDATE);
		} else {
			/* blockmap entry matches, no flush update needed */
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
	}
}

/*
 * Helper function for deleting chains.
 *
 * The chain is removed from the live view (the RBTREE) as well as the parent's
 * blockmap.  Both chain and its parent must be locked.
 *
 * parent may not be errored.  chain can be errored.
 */
static int
_hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
			     hammer2_tid_t mtid, int flags,
			     hammer2_blockref_t *obref)
{
	int error = 0;

	KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
	KKASSERT(chain->parent == parent);

	if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) {
		/*
		 * Chain is blockmapped, so there must be a parent.
		 * Atomically remove the chain from the parent and remove
		 * the blockmap entry.  The parent must be set modified
		 * to remove the blockmap entry.
		 */
		hammer2_blockref_t *base;
		int count;

		KKASSERT(parent != NULL);
		KKASSERT(parent->error == 0);
		KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
		error = hammer2_chain_modify(parent, mtid, 0, 0);
		if (error)
			goto done;

		/*
		 * Calculate blockmap pointer
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);

		/* detach from parent's live RB tree under both spinlocks */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there
			 * is no block array if the inode is flagged
			 * DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base =
				   &parent->data->ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &parent->data->voldata.
					sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->blkset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("_hammer2_chain_delete_helper: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			break;
		}

		/*
		 * delete blockmapped chain from its parent.
		 *
		 * The parent is not affected by any statistics in chain
		 * which are pending synchronization.  That is, there is
		 * nothing to undo in the parent since they have not yet
		 * been incorporated into the parent.
		 *
		 * The parent is affected by statistics stored in inodes.
		 * Those have already been synchronized, so they must be
		 * undone.  XXX split update possible w/delete in middle?
		 */
		if (base) {
			hammer2_base_delete(parent, base, count, chain, obref);
		}
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		/*
		 * Chain is not blockmapped but a parent is present.
		 * Atomically remove the chain from the parent.  There is
		 * no blockmap entry to remove.
		 *
		 * Because chain was associated with a parent but not
		 * synchronized, the chain's *_count_up fields contain
		 * inode adjustment statistics which must be undone.
		 */
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else {
		/*
		 * Chain is not blockmapped and has no parent.  This
		 * is a degenerate case.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	}
done:
	return error;
}

/*
 * Create an indirect block that covers one or more of the elements in the
 * current parent.  Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow three rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.
This
 * is accomplished by always recursing through matching keyspaces in
 * the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
 *
 * NOTE: *errorp set to HAMMER_ERROR_* flags
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count,
			int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count,
			int ncount);
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      hammer2_tid_t mtid, int for_type, int *errorp)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_blockref_t dummy;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int ncount;
	int nbytes;
	int loops;
	int error;
	int reason;
	int generation;
	int maxloops = 300000;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	hmp = parent->hmp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));

	/*
	 * Pre-modify the parent now to avoid having to deal with error
	 * processing if we tried to later (in the middle of our loop).
	 *
	 * We are going to be moving bref's around, the indirect blocks
	 * cannot be in an initial state.  Do not pass MODIFY_OPTDATA.
	 */
	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_chain_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		return NULL;
	}
	KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
	base = hammer2_chain_base_and_count(parent, &count);

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent for splits to work properly.
	 *
	 * The freemap uses a specific indirect block size.  The number of
	 * levels are built dynamically and ultimately depend on the size
	 * volume.  Because freemap blocks are taken from the reserved areas
	 * of the volume our goal is efficiency (fewer levels) and not so
	 * much to save disk space.
	 *
	 * The first indirect block level for a directory usually uses
	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
	 * the hash mechanism, this typically gives us a nominal
	 * 32 * 4 entries with one level of indirection.
	 *
	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
	 * indirect blocks.  The initial 4 entries in the inode gives us
	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
	 * of indirection gives us 137GB, and so forth.  H2 can support
	 * huge file sizes but they are not typical, so we try to stick
	 * with compactness and do not use a larger indirect block size.
	 *
	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
	 * due to the way indirect blocks are created this usually winds
	 * up being extremely inefficient for small files.  Even though
	 * 16KB requires more levels of indirection for very large files,
	 * the 16KB records can be ganged together into 64KB DIOs.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		if (parent->data->ipdata.meta.type ==
		    HAMMER2_OBJTYPE_DIRECTORY)
			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
		else
			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */

	} else {
		nbytes = HAMMER2_IND_BYTES_NOM;
	}
	/* The new indirect block must be at least as large as the parent. */
	if (nbytes < count * sizeof(hammer2_blockref_t)) {
		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
		nbytes = count * sizeof(hammer2_blockref_t);
	}
	ncount = nbytes / sizeof(hammer2_blockref_t);

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	switch(for_type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
						    base, count, ncount);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
						   base, count, ncount);
		break;
	default:
		panic("illegal indirect block for bref type %d", for_type);
		break;
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * Ok, create our new indirect block
	 */
	bzero(&dummy, sizeof(dummy));
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.key = key;
	dummy.keybits = keybits;
	dummy.data_off = hammer2_getradix(nbytes);
	/* inherit the parent's check method, but never compress indirects */
	dummy.methods =
		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	/* ichain has one ref at this point */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 *
	 * XXX remove OPTDATA, we need a fully initialized indirect block to
	 * be able to move the original blockref.
	 */
	*errorp = hammer2_chain_modify(ichain, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_chain_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
		return NULL;
	}
	KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	key_next = 0;	/* avoid gcc warnings */
	hammer2_spin_ex(&parent->core.spin);
	loops = 0;
	reason = 0;

	for (;;) {
		/*
		 * Parent may have been modified, relocating its block array.
		 * Reload the base pointer.
		 */
		base = hammer2_chain_base_and_count(parent, &count);

		if (++loops > 100000) {
		    hammer2_spin_unex(&parent->core.spin);
		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
			  reason, parent, base, count, key_next);
		}

		/*
		 * NOTE: spinlock stays intact, returned chain (if not NULL)
		 *	 is not referenced or locked which means that we
		 *	 cannot safely check its flagged / deletion status
		 *	 until we lock it.
		 */
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);
		generation = parent->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if (rounddown2(key ^ bref->key, (hammer2_key_t)1 << keybits) != 0) {
			goto next_key_spinlocked;
		}

		/*
		 * Load the new indirect block by acquiring the related
		 * chains (potentially from media as it might not be
		 * in-memory).  Then move it to the new parent (ichain).
		 *
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		bsave = *bref;
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			hammer2_spin_unex(&parent->core.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, &bsave,
						  HAMMER2_RESOLVE_NEVER);
			if (chain == NULL) {
				reason = 1;
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
		}

		/*
		 * This is always live so if the chain has been deleted
		 * we raced someone and we have to retry.
		 *
		 * NOTE: Lookups can race delete-duplicate because
		 *	 delete-duplicate does not lock the parent's core
		 *	 (they just use the spinlock on the core).
		 *
		 *	 (note reversed logic for this one)
		 */
		if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
		    chain->parent != parent ||
		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			if (hammer2_debug & 0x0040) {
				kprintf("LOST PARENT RETRY "
					"RETRY (%p,%p)->%p %08x\n",
					parent, chain->parent, chain, chain->flags);
			}
			hammer2_spin_ex(&parent->core.spin);
			continue;
		}

		/*
		 * Shift the chain to the indirect block.
		 *
		 * WARNING! No reason for us to load chain data, pass NOSTATS
		 *	    to prevent delete/insert from trying to access
		 *	    inode stats (and thus asserting if there is no
		 *	    chain->data loaded).
		 *
		 * WARNING! The (parent, chain) deletion may modify the parent
		 *	    and invalidate the base pointer.
		 *
		 * WARNING! Parent must already be marked modified, so we
		 *	    can assume that chain_delete always succeeds.
		 *
		 * WARNING! hammer2_chain_repchange() does not have to be
		 *	    called (and doesn't work anyway because we are
		 *	    only doing a partial shift).  A recursion that is
		 *	    in-progress can continue at the current parent
		 *	    and will be able to properly find its next key.
		 */
		error = hammer2_chain_delete_obref(parent, chain, mtid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bsave);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		base = NULL;	/* safety */
		hammer2_spin_ex(&parent->core.spin);
next_key_spinlocked:
		if (--maxloops == 0)
			panic("hammer2_chain_create_indirect: maxloops");
		reason = 4;
		if (key_next == 0 || key_next > key_end)
			break;
		key_beg = key_next;
		/* loop */
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set UPDATE here because we mark ichain
	 * modified down below (so the normal modified -> flush -> set-moved
	 * sequence applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	base = NULL;	/* safety, parent modify may change address */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, ichain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	/*
	 * Make sure flushes propagate after our manual insertion.
	 */
	hammer2_chain_setflush(ichain);
	hammer2_chain_setflush(parent);

	/*
	 * Figure out what to return.
	 */
	if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits)) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
	} else {
		/*
		 * Otherwise its in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}

/*
 * Do maintenance on an indirect chain.  Both parent and chain are locked.
 *
 * Returns non-zero if (chain) is deleted, either due to being empty or
 * because its children were safely moved into the parent.
 */
int
hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
				   hammer2_chain_t *chain)
{
	hammer2_blockref_t *chain_base;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t key_next;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_chain_t *sub;
	int chain_count;
	int count;
	int error;
	int generation;

	/*
	 * Make sure we have an accurate live_count
	 */
	if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		hammer2_chain_countbrefs(chain, base, count);
	}

	/*
	 * If the indirect block is empty we can delete it.
	 * (ignore deletion error)
	 */
	if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
		hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
		hammer2_chain_repchange(parent, chain);
		return 1;
	}

	base = hammer2_chain_base_and_count(parent, &count);

	if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
			      HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		hammer2_chain_countbrefs(parent, base, count);
	}

	/*
	 * Determine if we can collapse chain into parent, calculate
	 * hysteresis for chain emptiness.
	 */
	if (parent->core.live_count + chain->core.live_count - 1 > count)
		return 0;
	chain_count = chain->bytes / sizeof(hammer2_blockref_t);
	if (chain->core.live_count > chain_count * 3 / 4)
		return 0;

	/*
	 * Ok, theoretically we can collapse chain's contents into
	 * parent.  chain is locked, but any in-memory children of chain
	 * are not.  For this to work, we must be able to dispose of any
	 * in-memory children of chain.
	 *
	 * For now require that there are no in-memory children of chain.
	 *
	 * WARNING! Both chain and parent must remain locked across this
	 *	    entire operation.
	 */

	/*
	 * Parent must be marked modified.  Don't try to collapse it if we
	 * can't mark it modified.  Once modified, destroy chain to make room
	 * and to get rid of what will be a conflicting key (this is included
	 * in the calculation above).  Finally, move the children of chain
	 * into chain's parent.
	 *
	 * This order creates an accounting problem for bref.embed.stats
	 * because we destroy chain before we remove its children.  Any
	 * elements whose blockref is already synchronized will be counted
	 * twice.  To deal with the problem we clean out chain's stats prior
	 * to deleting it.
	 */
	error = hammer2_chain_modify(parent, 0, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}
	error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}

	/* zero stats first so synchronized children are not counted twice */
	chain->bref.embed.stats.inode_count = 0;
	chain->bref.embed.stats.data_count = 0;
	error = hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
	KKASSERT(error == 0);

	/*
	 * The combined_find call requires core.spin to be held.  One would
	 * think there wouldn't be any conflicts since we hold chain
	 * exclusively locked, but the caching mechanism for 0-ref children
	 * does not require a chain lock.
	 */
	hammer2_spin_ex(&chain->core.spin);

	key_next = 0;
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	for (;;) {
		chain_base = &chain->data->npdata[0];
		chain_count = chain->bytes / sizeof(hammer2_blockref_t);
		sub = hammer2_combined_find(chain, chain_base, chain_count,
					    &key_next,
					    key_beg, key_end,
					    &bref);
		generation = chain->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		bsave = *bref;
		if (sub) {
			hammer2_chain_ref(sub);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
		} else {
			hammer2_spin_unex(&chain->core.spin);
			sub = hammer2_chain_get(chain, generation, &bsave,
						HAMMER2_RESOLVE_NEVER);
			if (sub == NULL) {
				/* lost an insertion race, retry */
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
		}
		/* raced a delete-duplicate or reparent, retry */
		if (bcmp(&bsave, &sub->bref, sizeof(bsave)) ||
		    sub->parent != chain ||
		    (sub->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(sub);
			hammer2_chain_drop(sub);
			hammer2_spin_ex(&chain->core.spin);
			sub = NULL;	/* safety */
			continue;
		}
		error = hammer2_chain_delete_obref(chain, sub,
						   sub->bref.modify_tid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&parent, sub,
					   sub->bref.modify_tid,
					   HAMMER2_INSERT_SAMEPARENT, &bsave);
		hammer2_chain_unlock(sub);
		hammer2_chain_drop(sub);
		hammer2_spin_ex(&chain->core.spin);

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&chain->core.spin);

	hammer2_chain_repchange(parent, chain);

	return 1;
}

/*
 * Freemap indirect blocks
 *
 * Calculate the keybits and highside/lowside of the freemap node the
 * caller is creating.
 *
 * This routine will specify the next higher-level freemap key/radix
 * representing the lowest-ordered set.  By doing so, eventually all
 * low-ordered sets will be moved one level down.
 *
 * We have to be careful here because the freemap reserves a limited
 * number of blocks for a limited number of levels.  So we can't just
 * push indiscriminately.
 */
int
hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
			     int keybits, hammer2_blockref_t *base, int count)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int maxloops = 300000;

	key = *keyp;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/* track the smallest radix, then the smallest key at it */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Return the keybits for a higher-level FREEMAP_NODE covering
	 * this node.
	 */
	switch(keybits) {
	case HAMMER2_FREEMAP_LEVEL0_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL1_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL2_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL3_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL4_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL5_RADIX:
		panic("hammer2_chain_indkey_freemap: level too high");
		break;
	default:
		panic("hammer2_chain_indkey_freemap: bad radix");
		break;
	}
	*keyp = key;

	return (keybits);
}

/*
 * File indirect blocks
 *
 * Calculate the key/keybits for the indirect block to create by scanning
 * existing keys.  The key being created is also passed in *keyp and can be
 * inside or outside the indirect block.  Regardless, the indirect block
 * must hold at least two keys in order to guarantee sufficient space.
 *
 * We use a modified version of the freemap's fixed radix tree, but tailored
 * for file data.  Basically we configure an indirect block encompassing the
 * smallest key.
 *
 * Returns the radix (keybits) for the new indirect block.
 */
static int
hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
			  int keybits, hammer2_blockref_t *base, int count,
			  int ncount)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int nradix;
	int maxloops = 300000;

	key = *keyp;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 *
	 * Locate the smallest key.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/* track the smallest radix, then the smallest key at it */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Calculate the static keybits for a higher-level indirect block
	 * that contains the key.
	 */
	*keyp = key;

	switch(ncount) {
	case HAMMER2_IND_COUNT_MIN:
		nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_COUNT_NOM:
		nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_COUNT_MAX:
		nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
		break;
	default:
		panic("bad ncount %d\n", ncount);
		nradix = 0;
		break;
	}

	/*
	 * The largest radix that can be returned for an indirect block is
	 * 63 bits.  (The largest practical indirect block radix is actually
	 * 62 bits because the top-level inode or volume root contains four
	 * entries, but allow 63 to be returned).
	 *
	 * NOTE(review): this test caps nradix alone, yet nradix comes from
	 *	 the switch above and is far smaller than 64, so the test
	 *	 looks unreachable; the comment's stated intent appears to be
	 *	 capping the returned (keybits + nradix) at 63 — confirm
	 *	 against upstream before changing.
	 */
	if (nradix >= 64)
		nradix = 63;

	return keybits + nradix;
}

#if 1

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * NOTE: We can't take a shortcut here anymore for inodes because
	 *	 the root directory can contain a mix of inodes and directory
	 *	 entries (we used to just return 63 if parent->bref.type was
	 *	 HAMMER2_BREF_TYPE_INODE.
	 */
	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		/* widen until key and bref->key share the high bits */
		while (nkeybits < 64 &&
		       rounddown2(key ^ bref->key, (hammer2_key_t)1 << nkeybits) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	/* pick the half containing the most elements */
	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#else

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.
We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		/* widen until key and bref->key share the high bits */
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	/* pick the half containing the most elements */
	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#endif

/*
 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
 * it exists.
 *
 * Both parent and chain must be locked exclusively.
 *
 * This function will modify the parent if the blockref requires removal
 * from the parent's block table.
 *
 * This function is NOT recursive.  Any entity already pushed into the
 * chain (such as an inode) may still need visibility into its contents,
 * as well as the ability to read and modify the contents.  For example,
 * for an unlinked file which is still open.
 *
 * Also note that the flusher is responsible for cleaning up empty
 * indirect blocks.
 */
int
hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	int error = 0;

	KKASSERT(hammer2_mtx_owned(&chain->lock));

	/*
	 * Nothing to do if already marked.
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
			 chain->parent == parent);
		error = _hammer2_chain_delete_helper(parent, chain,
						     mtid, flags, NULL);
	}

	/*
	 * Permanent deletions mark the chain as destroyed.
5089 * 5090 * NOTE: We do not setflush the chain unless the deletion is 5091 * permanent, since the deletion of a chain does not actually 5092 * require it to be flushed. 5093 */ 5094 if (error == 0) { 5095 if (flags & HAMMER2_DELETE_PERMANENT) { 5096 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5097 hammer2_chain_setflush(chain); 5098 } 5099 } 5100 5101 return error; 5102 } 5103 5104 static int 5105 hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain, 5106 hammer2_tid_t mtid, int flags, 5107 hammer2_blockref_t *obref) 5108 { 5109 int error = 0; 5110 5111 KKASSERT(hammer2_mtx_owned(&chain->lock)); 5112 5113 /* 5114 * Nothing to do if already marked. 5115 * 5116 * We need the spinlock on the core whos RBTREE contains chain 5117 * to protect against races. 5118 */ 5119 obref->type = HAMMER2_BREF_TYPE_EMPTY; 5120 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 5121 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 5122 chain->parent == parent); 5123 error = _hammer2_chain_delete_helper(parent, chain, 5124 mtid, flags, obref); 5125 } 5126 5127 /* 5128 * Permanent deletions mark the chain as destroyed. 5129 * 5130 * NOTE: We do not setflush the chain unless the deletion is 5131 * permanent, since the deletion of a chain does not actually 5132 * require it to be flushed. 5133 */ 5134 if (error == 0) { 5135 if (flags & HAMMER2_DELETE_PERMANENT) { 5136 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5137 hammer2_chain_setflush(chain); 5138 } 5139 } 5140 5141 return error; 5142 } 5143 5144 /* 5145 * Returns the index of the nearest element in the blockref array >= elm. 5146 * Returns (count) if no element could be found. 5147 * 5148 * Sets *key_nextp to the next key for loop purposes but does not modify 5149 * it if the next key would be higher than the current value of *key_nextp. 5150 * Note that *key_nexp can overflow to 0, which should be tested by the 5151 * caller. 5152 * 5153 * WARNING! 
 *	    Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require the live chain's already have their core's counted
	 * so we can optimize operations.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;	/* first index past the last live elm */
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != HAMMER2_BREF_TYPE_EMPTY) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			/* element starting beyond, or overlapping, key_beg */
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			/* clamp *key_nextp to the end of this element */
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *brefp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * When no in-memory chain has been found, a non-NULL bref pointing into
 * the media block array may still be returned in *brefp.
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.  Iterate if the base find
 * is chosen but matches a deleted chain.
 *
 * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
 *	     held through the operation.
 */
static hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **brefp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*brefp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;	/* media bref chosen, no in-memory chain */
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*brefp = NULL;
		chain = NULL;
	} else {
		*brefp = bref;
	}
	return(chain);
}

/*
 * Locate the specified block array element and delete it.  The element
 * must exist.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain,
		    hammer2_blockref_t *obref)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BLKMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update the parent's aggregate statistics, then zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
			(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		--parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			/* counter saturated; flag it as a hint only */
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (scan->type != HAMMER2_BREF_TYPE_DATA) {
			/* subtract the subtree's aggregated stats */
			parent->bref.embed.stats.data_count -=
				scan->embed.stats.data_count;
			parent->bref.embed.stats.inode_count -=
				scan->embed.stats.inode_count;
		}
		if (scan->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count <= scan->leaf_count)
				parent->bref.leaf_count = 0;
			else
				parent->bref.leaf_count -= scan->leaf_count;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* FALLTHROUGH into default (no further accounting) */
	default:
		break;
	}

	/* hand back a copy of the deleted blockref if requested */
	if (obref)
		*obref = *scan;
	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (parent->core.live_zero == i + 1) {
		/* retract live_zero over any trailing empty slots */
		while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED |
					HAMMER2_CHAIN_BLKMAPUPD);
}

/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;		/* insertion index from hammer2_base_find() */
	int j;		/* backwards probe for an empty slot */
	int k;		/* forwards probe for an empty slot */
	int l;		/* validation scan index */
	int u = 1;	/* which insertion path was taken (debug aid) */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED);

	/*
	 * Update the parent's aggregate statistics to include the new
	 * element.
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
			(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			/* add the subtree's aggregated stats */
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			/* saturate rather than overflow */
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}


	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (i == count && parent->core.live_zero < count) {
		/* append past the last live element, no shifting needed */
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	/* xkey = inclusive end of the new element's key range */
	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) {
			if (j == i - 1) {
				/* empty slot immediately before, use it */
				base[j] = *elm;
			} else {
				/* shift [j+1, i) down one and insert at i-1 */
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) {
			/* shift [i, k) up one and insert at i */
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging: verify the array is still sorted with no overlaps.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}

#if 0

/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
5616 */ 5617 typedef hammer2_blockref_t *hammer2_blockref_p; 5618 5619 static 5620 int 5621 hammer2_base_sort_callback(const void *v1, const void *v2) 5622 { 5623 hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1; 5624 hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2; 5625 5626 /* 5627 * Make sure empty elements are placed at the end of the array 5628 */ 5629 if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) { 5630 if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) 5631 return(0); 5632 return(1); 5633 } else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) { 5634 return(-1); 5635 } 5636 5637 /* 5638 * Sort by key 5639 */ 5640 if (bref1->key < bref2->key) 5641 return(-1); 5642 if (bref1->key > bref2->key) 5643 return(1); 5644 return(0); 5645 } 5646 5647 void 5648 hammer2_base_sort(hammer2_chain_t *chain) 5649 { 5650 hammer2_blockref_t *base; 5651 int count; 5652 5653 switch(chain->bref.type) { 5654 case HAMMER2_BREF_TYPE_INODE: 5655 /* 5656 * Special shortcut for embedded data returns the inode 5657 * itself. Callers must detect this condition and access 5658 * the embedded data (the strategy code does this for us). 5659 * 5660 * This is only applicable to regular files and softlinks. 5661 */ 5662 if (chain->data->ipdata.meta.op_flags & 5663 HAMMER2_OPFLAG_DIRECTDATA) { 5664 return; 5665 } 5666 base = &chain->data->ipdata.u.blockset.blockref[0]; 5667 count = HAMMER2_SET_COUNT; 5668 break; 5669 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 5670 case HAMMER2_BREF_TYPE_INDIRECT: 5671 /* 5672 * Optimize indirect blocks in the INITIAL state to avoid 5673 * I/O. 
5674 */ 5675 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0); 5676 base = &chain->data->npdata[0]; 5677 count = chain->bytes / sizeof(hammer2_blockref_t); 5678 break; 5679 case HAMMER2_BREF_TYPE_VOLUME: 5680 base = &chain->data->voldata.sroot_blockset.blockref[0]; 5681 count = HAMMER2_SET_COUNT; 5682 break; 5683 case HAMMER2_BREF_TYPE_FREEMAP: 5684 base = &chain->data->blkset.blockref[0]; 5685 count = HAMMER2_SET_COUNT; 5686 break; 5687 default: 5688 panic("hammer2_base_sort: unrecognized " 5689 "blockref(A) type: %d", 5690 chain->bref.type); 5691 base = NULL; /* safety */ 5692 count = 0; /* safety */ 5693 break; 5694 } 5695 kqsort(base, count, sizeof(*base), hammer2_base_sort_callback); 5696 } 5697 5698 #endif 5699 5700 /* 5701 * Set the check data for a chain. This can be a heavy-weight operation 5702 * and typically only runs on-flush. For file data check data is calculated 5703 * when the logical buffers are flushed. 5704 */ 5705 void 5706 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata) 5707 { 5708 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED); 5709 5710 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5711 case HAMMER2_CHECK_NONE: 5712 break; 5713 case HAMMER2_CHECK_DISABLED: 5714 break; 5715 case HAMMER2_CHECK_ISCSI32: 5716 chain->bref.check.iscsi32.value = 5717 hammer2_icrc32(bdata, chain->bytes); 5718 break; 5719 case HAMMER2_CHECK_XXHASH64: 5720 chain->bref.check.xxhash64.value = 5721 XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5722 break; 5723 case HAMMER2_CHECK_SHA192: 5724 { 5725 SHA256_CTX hash_ctx; 5726 union { 5727 uint8_t digest[SHA256_DIGEST_LENGTH]; 5728 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5729 } u; 5730 5731 SHA256_Init(&hash_ctx); 5732 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5733 SHA256_Final(u.digest, &hash_ctx); 5734 u.digest64[2] ^= u.digest64[3]; 5735 bcopy(u.digest, 5736 chain->bref.check.sha192.data, 5737 sizeof(chain->bref.check.sha192.data)); 5738 } 5739 break; 5740 case HAMMER2_CHECK_FREEMAP: 
5741 chain->bref.check.freemap.icrc32 = 5742 hammer2_icrc32(bdata, chain->bytes); 5743 break; 5744 default: 5745 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 5746 chain->bref.methods); 5747 break; 5748 } 5749 } 5750 5751 /* 5752 * Characterize a failed check code and try to trace back to the inode. 5753 */ 5754 static void 5755 hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check, 5756 int bits) 5757 { 5758 hammer2_chain_t *lchain; 5759 hammer2_chain_t *ochain; 5760 int did; 5761 5762 did = krateprintf(&krate_h2chk, 5763 "chain %016jx.%02x (%s) meth=%02x CHECK FAIL " 5764 "(flags=%08x, bref/data ", 5765 chain->bref.data_off, 5766 chain->bref.type, 5767 hammer2_bref_type_str(chain->bref.type), 5768 chain->bref.methods, 5769 chain->flags); 5770 if (did == 0) 5771 return; 5772 5773 if (bits == 32) { 5774 kprintf("%08x/%08x)\n", 5775 chain->bref.check.iscsi32.value, 5776 (uint32_t)check); 5777 } else { 5778 kprintf("%016jx/%016jx)\n", 5779 chain->bref.check.xxhash64.value, 5780 check); 5781 } 5782 5783 /* 5784 * Run up the chains to try to find the governing inode so we 5785 * can report it. 
5786 * 5787 * XXX This error reporting is not really MPSAFE 5788 */ 5789 ochain = chain; 5790 lchain = chain; 5791 while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) { 5792 lchain = chain; 5793 chain = chain->parent; 5794 } 5795 5796 if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE && 5797 ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 || 5798 (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) { 5799 kprintf(" Resides at/in inode %ld\n", 5800 chain->bref.key); 5801 } else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 5802 kprintf(" Resides in inode index - CRITICAL!!!\n"); 5803 } else { 5804 kprintf(" Resides in root index - CRITICAL!!!\n"); 5805 } 5806 if (ochain->hmp) { 5807 const char *pfsname = "UNKNOWN"; 5808 int i; 5809 5810 if (ochain->pmp) { 5811 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 5812 if (ochain->pmp->pfs_hmps[i] == ochain->hmp && 5813 ochain->pmp->pfs_names[i]) { 5814 pfsname = ochain->pmp->pfs_names[i]; 5815 break; 5816 } 5817 } 5818 } 5819 kprintf(" In pfs %s on device %s\n", 5820 pfsname, ochain->hmp->devrepname); 5821 } 5822 } 5823 5824 /* 5825 * Returns non-zero on success, 0 on failure. 
5826 */ 5827 int 5828 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 5829 { 5830 uint32_t check32; 5831 uint64_t check64; 5832 int r; 5833 5834 if (chain->flags & HAMMER2_CHAIN_NOTTESTED) 5835 return 1; 5836 5837 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5838 case HAMMER2_CHECK_NONE: 5839 r = 1; 5840 break; 5841 case HAMMER2_CHECK_DISABLED: 5842 r = 1; 5843 break; 5844 case HAMMER2_CHECK_ISCSI32: 5845 check32 = hammer2_icrc32(bdata, chain->bytes); 5846 r = (chain->bref.check.iscsi32.value == check32); 5847 if (r == 0) { 5848 hammer2_characterize_failed_chain(chain, check32, 32); 5849 } 5850 hammer2_process_icrc32 += chain->bytes; 5851 break; 5852 case HAMMER2_CHECK_XXHASH64: 5853 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5854 r = (chain->bref.check.xxhash64.value == check64); 5855 if (r == 0) { 5856 hammer2_characterize_failed_chain(chain, check64, 64); 5857 } 5858 hammer2_process_xxhash64 += chain->bytes; 5859 break; 5860 case HAMMER2_CHECK_SHA192: 5861 { 5862 SHA256_CTX hash_ctx; 5863 union { 5864 uint8_t digest[SHA256_DIGEST_LENGTH]; 5865 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5866 } u; 5867 5868 SHA256_Init(&hash_ctx); 5869 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5870 SHA256_Final(u.digest, &hash_ctx); 5871 u.digest64[2] ^= u.digest64[3]; 5872 if (bcmp(u.digest, 5873 chain->bref.check.sha192.data, 5874 sizeof(chain->bref.check.sha192.data)) == 0) { 5875 r = 1; 5876 } else { 5877 r = 0; 5878 krateprintf(&krate_h2chk, 5879 "chain %016jx.%02x meth=%02x " 5880 "CHECK FAIL\n", 5881 chain->bref.data_off, 5882 chain->bref.type, 5883 chain->bref.methods); 5884 } 5885 } 5886 break; 5887 case HAMMER2_CHECK_FREEMAP: 5888 r = (chain->bref.check.freemap.icrc32 == 5889 hammer2_icrc32(bdata, chain->bytes)); 5890 if (r == 0) { 5891 int did; 5892 5893 did = krateprintf(&krate_h2chk, 5894 "chain %016jx.%02x meth=%02x " 5895 "CHECK FAIL\n", 5896 chain->bref.data_off, 5897 chain->bref.type, 5898 chain->bref.methods); 5899 if (did) { 
5900 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 5901 chain->bref.check.freemap.icrc32, 5902 hammer2_icrc32(bdata, chain->bytes), 5903 chain->bytes); 5904 if (chain->dio) { 5905 kprintf("dio %p buf %016jx,%d " 5906 "bdata %p/%p\n", 5907 chain->dio, 5908 chain->dio->bp->b_loffset, 5909 chain->dio->bp->b_bufsize, 5910 bdata, 5911 chain->dio->bp->b_data); 5912 } 5913 } 5914 } 5915 break; 5916 default: 5917 kprintf("hammer2_chain_testcheck: unknown check type %02x\n", 5918 chain->bref.methods); 5919 r = 1; 5920 break; 5921 } 5922 return r; 5923 } 5924 5925 /* 5926 * Acquire the chain and parent representing the specified inode for the 5927 * device at the specified cluster index. 5928 * 5929 * The flags passed in are LOOKUP flags, not RESOLVE flags. 5930 * 5931 * If we are unable to locate the inode, HAMMER2_ERROR_EIO or HAMMER2_ERROR_CHECK 5932 * is returned. In case of error, *chainp and/or *parentp may still be returned 5933 * non-NULL. 5934 * 5935 * The caller may pass-in a locked *parentp and/or *chainp, or neither. 5936 * They will be unlocked and released by this function. The *parentp and 5937 * *chainp representing the located inode are returned locked. 5938 * 5939 * The returned error includes any error on the returned chain in addition to 5940 * errors incurred while trying to lookup the inode. However, a chain->error 5941 * might not be recognized if HAMMER2_LOOKUP_NODATA is passed. This flag may 5942 * not be passed to this function. 5943 */ 5944 int 5945 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 5946 int clindex, int flags, 5947 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 5948 { 5949 hammer2_chain_t *parent; 5950 hammer2_chain_t *rchain; 5951 hammer2_key_t key_dummy; 5952 hammer2_inode_t *ip; 5953 int resolve_flags; 5954 int error; 5955 5956 KKASSERT((flags & HAMMER2_LOOKUP_NODATA) == 0); 5957 5958 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 
5959 HAMMER2_RESOLVE_SHARED : 0; 5960 5961 /* 5962 * Caller expects us to replace these. 5963 */ 5964 if (*chainp) { 5965 hammer2_chain_unlock(*chainp); 5966 hammer2_chain_drop(*chainp); 5967 *chainp = NULL; 5968 } 5969 if (*parentp) { 5970 hammer2_chain_unlock(*parentp); 5971 hammer2_chain_drop(*parentp); 5972 *parentp = NULL; 5973 } 5974 5975 /* 5976 * Be very careful, this is a backend function and we CANNOT 5977 * lock any frontend inode structure we find. But we have to 5978 * look the inode up this way first in case it exists but is 5979 * detached from the radix tree. 5980 */ 5981 ip = hammer2_inode_lookup(pmp, inum); 5982 if (ip) { 5983 *chainp = hammer2_inode_chain_and_parent(ip, clindex, 5984 parentp, 5985 resolve_flags); 5986 hammer2_inode_drop(ip); 5987 if (*chainp) 5988 return (*chainp)->error; 5989 hammer2_chain_unlock(*chainp); 5990 hammer2_chain_drop(*chainp); 5991 *chainp = NULL; 5992 if (*parentp) { 5993 hammer2_chain_unlock(*parentp); 5994 hammer2_chain_drop(*parentp); 5995 *parentp = NULL; 5996 } 5997 } 5998 5999 /* 6000 * Inodes hang off of the iroot (bit 63 is clear, differentiating 6001 * inodes from root directory entries in the key lookup). 6002 */ 6003 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 6004 rchain = NULL; 6005 if (parent) { 6006 /* 6007 * NOTE: rchain can be returned as NULL even if error == 0 6008 * (i.e. not found) 6009 */ 6010 rchain = hammer2_chain_lookup(&parent, &key_dummy, 6011 inum, inum, 6012 &error, flags); 6013 /* 6014 * Propagate a chain-specific error to caller. 6015 * 6016 * If the chain is not errored, we must still validate that the inode 6017 * number is correct, because all hell will break loose if it isn't 6018 * correct. It should always be correct so print to the console and 6019 * simulate a CHECK error if it is not. 
6020 */ 6021 if (error == 0 && rchain) { 6022 error = rchain->error; 6023 if (error == 0 && rchain->data) { 6024 if (inum != rchain->data->ipdata.meta.inum) { 6025 kprintf("hammer2_chain_inode_find: lookup inum %ld, " 6026 "got valid inode but with inum %ld\n", 6027 inum, rchain->data->ipdata.meta.inum); 6028 error = HAMMER2_ERROR_CHECK; 6029 rchain->error = error; 6030 } 6031 } 6032 } 6033 } else { 6034 error = HAMMER2_ERROR_EIO; 6035 } 6036 *parentp = parent; 6037 *chainp = rchain; 6038 6039 return error; 6040 } 6041 6042 /* 6043 * Used by the bulkscan code to snapshot the synchronized storage for 6044 * a volume, allowing it to be scanned concurrently against normal 6045 * operation. 6046 */ 6047 hammer2_chain_t * 6048 hammer2_chain_bulksnap(hammer2_dev_t *hmp) 6049 { 6050 hammer2_chain_t *copy; 6051 6052 copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref); 6053 copy->data = kmalloc(sizeof(copy->data->voldata), 6054 hmp->mmsg, M_WAITOK | M_ZERO); 6055 hammer2_voldata_lock(hmp); 6056 copy->data->voldata = hmp->volsync; 6057 hammer2_voldata_unlock(hmp); 6058 6059 return copy; 6060 } 6061 6062 void 6063 hammer2_chain_bulkdrop(hammer2_chain_t *copy) 6064 { 6065 KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME); 6066 KKASSERT(copy->data); 6067 kfree(copy->data, copy->hmp->mmsg); 6068 copy->data = NULL; 6069 hammer2_chain_drop(copy); 6070 } 6071 6072 /* 6073 * Returns non-zero if the chain (INODE or DIRENT) matches the 6074 * filename. 
6075 */ 6076 int 6077 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name, 6078 size_t name_len) 6079 { 6080 const hammer2_inode_data_t *ripdata; 6081 6082 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 6083 ripdata = &chain->data->ipdata; 6084 if (ripdata->meta.name_len == name_len && 6085 bcmp(ripdata->filename, name, name_len) == 0) { 6086 return 1; 6087 } 6088 } 6089 if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT && 6090 chain->bref.embed.dirent.namlen == name_len) { 6091 if (name_len > sizeof(chain->bref.check.buf) && 6092 bcmp(chain->data->buf, name, name_len) == 0) { 6093 return 1; 6094 } 6095 if (name_len <= sizeof(chain->bref.check.buf) && 6096 bcmp(chain->bref.check.buf, name, name_len) == 0) { 6097 return 1; 6098 } 6099 } 6100 return 0; 6101 } 6102 6103 /* 6104 * Debugging 6105 */ 6106 void 6107 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int bi, int *countp, 6108 char pfx, u_int flags) 6109 { 6110 hammer2_chain_t *scan; 6111 hammer2_chain_t *parent; 6112 6113 --*countp; 6114 if (*countp == 0) { 6115 kprintf("%*.*s...\n", tab, tab, ""); 6116 return; 6117 } 6118 if (*countp < 0) 6119 return; 6120 kprintf("%*.*s%c-chain %p %s.%-3d %016jx %016jx/%-2d mir=%016jx\n", 6121 tab, tab, "", pfx, chain, 6122 hammer2_bref_type_str(chain->bref.type), bi, 6123 chain->bref.data_off, chain->bref.key, chain->bref.keybits, 6124 chain->bref.mirror_tid); 6125 6126 kprintf("%*.*s [%08x] (%s) refs=%d", 6127 tab, tab, "", 6128 chain->flags, 6129 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE && 6130 chain->data) ? 
(char *)chain->data->ipdata.filename : "?"), 6131 chain->refs); 6132 6133 parent = chain->parent; 6134 if (parent) 6135 kprintf("\n%*.*s p=%p [pflags %08x prefs %d]", 6136 tab, tab, "", 6137 parent, parent->flags, parent->refs); 6138 if (RB_EMPTY(&chain->core.rbtree)) { 6139 kprintf("\n"); 6140 } else { 6141 int bi = 0; 6142 kprintf(" {\n"); 6143 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) { 6144 if ((scan->flags & flags) || flags == (u_int)-1) { 6145 hammer2_dump_chain(scan, tab + 4, bi, countp, 6146 'a', flags); 6147 } 6148 bi++; 6149 } 6150 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data) 6151 kprintf("%*.*s}(%s)\n", tab, tab, "", 6152 chain->data->ipdata.filename); 6153 else 6154 kprintf("%*.*s}\n", tab, tab, ""); 6155 } 6156 } 6157