1 /* 2 * Copyright (c) 2011-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 
38 * 39 * Chains are the in-memory version on media objects (volume header, inodes, 40 * indirect blocks, data blocks, etc). Chains represent a portion of the 41 * HAMMER2 topology. 42 * 43 * Chains are no-longer delete-duplicated. Instead, the original in-memory 44 * chain will be moved along with its block reference (e.g. for things like 45 * renames, hardlink operations, modifications, etc), and will be indexed 46 * on a secondary list for flush handling instead of propagating a flag 47 * upward to the root. 48 * 49 * Concurrent front-end operations can still run against backend flushes 50 * as long as they do not cross the current flush boundary. An operation 51 * running above the current flush (in areas not yet flushed) can become 52 * part of the current flush while ano peration running below the current 53 * flush can become part of the next flush. 54 */ 55 #include <sys/cdefs.h> 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/types.h> 59 #include <sys/lock.h> 60 #include <sys/kern_syscall.h> 61 #include <sys/uuid.h> 62 63 #include <crypto/sha2/sha2.h> 64 65 #include "hammer2.h" 66 67 static hammer2_chain_t *hammer2_chain_create_indirect( 68 hammer2_chain_t *parent, 69 hammer2_key_t key, int keybits, 70 hammer2_tid_t mtid, int for_type, int *errorp); 71 static void hammer2_chain_rename_obref(hammer2_chain_t **parentp, 72 hammer2_chain_t *chain, hammer2_tid_t mtid, 73 int flags, hammer2_blockref_t *obref); 74 static int hammer2_chain_delete_obref(hammer2_chain_t *parent, 75 hammer2_chain_t *chain, 76 hammer2_tid_t mtid, int flags, 77 hammer2_blockref_t *obref); 78 static hammer2_io_t *hammer2_chain_drop_data(hammer2_chain_t *chain); 79 static hammer2_chain_t *hammer2_combined_find( 80 hammer2_chain_t *parent, 81 hammer2_blockref_t *base, int count, 82 hammer2_key_t *key_nextp, 83 hammer2_key_t key_beg, hammer2_key_t key_end, 84 hammer2_blockref_t **bresp); 85 86 /* 87 * There are many degenerate situations where an extreme rate of 
console 88 * output can occur from warnings and errors. Make sure this output does 89 * not impede operations. 90 */ 91 static struct krate krate_h2chk = { .freq = 5 }; 92 static struct krate krate_h2me = { .freq = 1 }; 93 static struct krate krate_h2em = { .freq = 1 }; 94 95 /* 96 * Basic RBTree for chains (core.rbtree). 97 */ 98 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 99 100 int 101 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 102 { 103 hammer2_key_t c1_beg; 104 hammer2_key_t c1_end; 105 hammer2_key_t c2_beg; 106 hammer2_key_t c2_end; 107 108 /* 109 * Compare chains. Overlaps are not supposed to happen and catch 110 * any software issues early we count overlaps as a match. 111 */ 112 c1_beg = chain1->bref.key; 113 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1; 114 c2_beg = chain2->bref.key; 115 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1; 116 117 if (c1_end < c2_beg) /* fully to the left */ 118 return(-1); 119 if (c1_beg > c2_end) /* fully to the right */ 120 return(1); 121 return(0); /* overlap (must not cross edge boundary) */ 122 } 123 124 /* 125 * Assert that a chain has no media data associated with it. 126 */ 127 static __inline void 128 hammer2_chain_assert_no_data(hammer2_chain_t *chain) 129 { 130 KKASSERT(chain->dio == NULL); 131 if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME && 132 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP && 133 chain->data) { 134 panic("hammer2_chain_assert_no_data: chain %p still has data", 135 chain); 136 } 137 } 138 139 /* 140 * Make a chain visible to the flusher. The flusher operates using a top-down 141 * recursion based on the ONFLUSH flag. It locates MODIFIED and UPDATE chains, 142 * flushes them, and updates blocks back to the volume root. 143 * 144 * This routine sets the ONFLUSH flag upward from the triggering chain until 145 * it hits an inode root or the volume root. 
Inode chains serve as inflection 146 * points, requiring the flusher to bridge across trees. Inodes include 147 * regular inodes, PFS roots (pmp->iroot), and the media super root 148 * (spmp->iroot). 149 */ 150 void 151 hammer2_chain_setflush(hammer2_chain_t *chain) 152 { 153 hammer2_chain_t *parent; 154 155 if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 156 hammer2_spin_sh(&chain->core.spin); 157 while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 158 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH); 159 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) 160 break; 161 if ((parent = chain->parent) == NULL) 162 break; 163 hammer2_spin_sh(&parent->core.spin); 164 hammer2_spin_unsh(&chain->core.spin); 165 chain = parent; 166 } 167 hammer2_spin_unsh(&chain->core.spin); 168 } 169 } 170 171 /* 172 * Allocate a new disconnected chain element representing the specified 173 * bref. chain->refs is set to 1 and the passed bref is copied to 174 * chain->bref. chain->bytes is derived from the bref. 175 * 176 * chain->pmp inherits pmp unless the chain is an inode (other than the 177 * super-root inode). 178 * 179 * NOTE: Returns a referenced but unlocked (because there is no core) chain. 180 */ 181 hammer2_chain_t * 182 hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp, 183 hammer2_blockref_t *bref) 184 { 185 hammer2_chain_t *chain; 186 u_int bytes; 187 188 /* 189 * Special case - radix of 0 indicates a chain that does not 190 * need a data reference (context is completely embedded in the 191 * bref). 192 */ 193 if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX)) 194 bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 195 else 196 bytes = 0; 197 198 atomic_add_long(&hammer2_chain_allocs, 1); 199 200 /* 201 * Construct the appropriate system structure. 
202 */ 203 switch(bref->type) { 204 case HAMMER2_BREF_TYPE_DIRENT: 205 case HAMMER2_BREF_TYPE_INODE: 206 case HAMMER2_BREF_TYPE_INDIRECT: 207 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 208 case HAMMER2_BREF_TYPE_DATA: 209 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 210 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO); 211 break; 212 case HAMMER2_BREF_TYPE_VOLUME: 213 case HAMMER2_BREF_TYPE_FREEMAP: 214 /* 215 * Only hammer2_chain_bulksnap() calls this function with these 216 * types. 217 */ 218 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO); 219 break; 220 default: 221 chain = NULL; 222 panic("hammer2_chain_alloc: unrecognized blockref type: %d", 223 bref->type); 224 } 225 226 /* 227 * Initialize the new chain structure. pmp must be set to NULL for 228 * chains belonging to the super-root topology of a device mount. 229 */ 230 if (pmp == hmp->spmp) 231 chain->pmp = NULL; 232 else 233 chain->pmp = pmp; 234 235 chain->hmp = hmp; 236 chain->bref = *bref; 237 chain->bytes = bytes; 238 chain->refs = 1; 239 chain->flags = HAMMER2_CHAIN_ALLOCATED; 240 lockinit(&chain->diolk, "chdio", 0, 0); 241 242 /* 243 * Set the PFS boundary flag if this chain represents a PFS root. 244 */ 245 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT) 246 atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY); 247 hammer2_chain_core_init(chain); 248 249 return (chain); 250 } 251 252 /* 253 * Initialize a chain's core structure. This structure used to be allocated 254 * but is now embedded. 255 * 256 * The core is not locked. No additional refs on the chain are made. 257 * (trans) must not be NULL if (core) is not NULL. 258 */ 259 void 260 hammer2_chain_core_init(hammer2_chain_t *chain) 261 { 262 /* 263 * Fresh core under nchain (no multi-homing of ochain's 264 * sub-tree). 265 */ 266 RB_INIT(&chain->core.rbtree); /* live chains */ 267 hammer2_mtx_init(&chain->lock, "h2chain"); 268 } 269 270 /* 271 * Add a reference to a chain element, preventing its destruction. 
272 * 273 * (can be called with spinlock held) 274 */ 275 void 276 hammer2_chain_ref(hammer2_chain_t *chain) 277 { 278 if (atomic_fetchadd_int(&chain->refs, 1) == 0) { 279 /* 280 * Just flag that the chain was used and should be recycled 281 * on the LRU if it encounters it later. 282 */ 283 if (chain->flags & HAMMER2_CHAIN_ONLRU) 284 atomic_set_int(&chain->flags, HAMMER2_CHAIN_LRUHINT); 285 286 #if 0 287 /* 288 * REMOVED - reduces contention, lru_list is more heuristical 289 * now. 290 * 291 * 0->non-zero transition must ensure that chain is removed 292 * from the LRU list. 293 * 294 * NOTE: Already holding lru_spin here so we cannot call 295 * hammer2_chain_ref() to get it off lru_list, do 296 * it manually. 297 */ 298 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 299 hammer2_pfs_t *pmp = chain->pmp; 300 hammer2_spin_ex(&pmp->lru_spin); 301 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 302 atomic_add_int(&pmp->lru_count, -1); 303 atomic_clear_int(&chain->flags, 304 HAMMER2_CHAIN_ONLRU); 305 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 306 } 307 hammer2_spin_unex(&pmp->lru_spin); 308 } 309 #endif 310 } 311 } 312 313 /* 314 * Ref a locked chain and force the data to be held across an unlock. 315 * Chain must be currently locked. The user of the chain who desires 316 * to release the hold must call hammer2_chain_lock_unhold() to relock 317 * and unhold the chain, then unlock normally, or may simply call 318 * hammer2_chain_drop_unhold() (which is safer against deadlocks). 319 */ 320 void 321 hammer2_chain_ref_hold(hammer2_chain_t *chain) 322 { 323 atomic_add_int(&chain->lockcnt, 1); 324 hammer2_chain_ref(chain); 325 } 326 327 /* 328 * Insert the chain in the core rbtree. 329 * 330 * Normal insertions are placed in the live rbtree. Insertion of a deleted 331 * chain is a special case used by the flush code that is placed on the 332 * unstaged deleted list to avoid confusing the live view. 
333 */ 334 #define HAMMER2_CHAIN_INSERT_SPIN 0x0001 335 #define HAMMER2_CHAIN_INSERT_LIVE 0x0002 336 #define HAMMER2_CHAIN_INSERT_RACE 0x0004 337 338 static 339 int 340 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain, 341 int flags, int generation) 342 { 343 hammer2_chain_t *xchain; 344 int error = 0; 345 346 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 347 hammer2_spin_ex(&parent->core.spin); 348 349 /* 350 * Interlocked by spinlock, check for race 351 */ 352 if ((flags & HAMMER2_CHAIN_INSERT_RACE) && 353 parent->core.generation != generation) { 354 error = HAMMER2_ERROR_EAGAIN; 355 goto failed; 356 } 357 358 /* 359 * Insert chain 360 */ 361 xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain); 362 KASSERT(xchain == NULL, 363 ("hammer2_chain_insert: collision %p %p (key=%016jx)", 364 chain, xchain, chain->bref.key)); 365 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 366 chain->parent = parent; 367 ++parent->core.chain_count; 368 ++parent->core.generation; /* XXX incs for _get() too, XXX */ 369 370 /* 371 * We have to keep track of the effective live-view blockref count 372 * so the create code knows when to push an indirect block. 373 */ 374 if (flags & HAMMER2_CHAIN_INSERT_LIVE) 375 atomic_add_int(&parent->core.live_count, 1); 376 failed: 377 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 378 hammer2_spin_unex(&parent->core.spin); 379 return error; 380 } 381 382 /* 383 * Drop the caller's reference to the chain. When the ref count drops to 384 * zero this function will try to disassociate the chain from its parent and 385 * deallocate it, then recursely drop the parent using the implied ref 386 * from the chain's chain->parent. 387 * 388 * Nobody should own chain's mutex on the 1->0 transition, unless this drop 389 * races an acquisition by another cpu. Therefore we can loop if we are 390 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again 391 * race against another drop. 
392 */ 393 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain, 394 int depth); 395 static void hammer2_chain_lru_flush(hammer2_pfs_t *pmp); 396 397 void 398 hammer2_chain_drop(hammer2_chain_t *chain) 399 { 400 u_int refs; 401 402 KKASSERT(chain->refs > 0); 403 404 while (chain) { 405 refs = chain->refs; 406 cpu_ccfence(); 407 KKASSERT(refs > 0); 408 409 if (refs == 1) { 410 if (hammer2_mtx_ex_try(&chain->lock) == 0) 411 chain = hammer2_chain_lastdrop(chain, 0); 412 /* retry the same chain, or chain from lastdrop */ 413 } else { 414 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 415 break; 416 /* retry the same chain */ 417 } 418 cpu_pause(); 419 } 420 } 421 422 /* 423 * Unhold a held and probably not-locked chain, ensure that the data is 424 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive 425 * lock and then simply unlocking the chain. 426 */ 427 void 428 hammer2_chain_unhold(hammer2_chain_t *chain) 429 { 430 u_int lockcnt; 431 int iter = 0; 432 433 for (;;) { 434 lockcnt = chain->lockcnt; 435 cpu_ccfence(); 436 if (lockcnt > 1) { 437 if (atomic_cmpset_int(&chain->lockcnt, 438 lockcnt, lockcnt - 1)) { 439 break; 440 } 441 } else if (hammer2_mtx_ex_try(&chain->lock) == 0) { 442 hammer2_chain_unlock(chain); 443 break; 444 } else { 445 /* 446 * This situation can easily occur on SMP due to 447 * the gap inbetween the 1->0 transition and the 448 * final unlock. We cannot safely block on the 449 * mutex because lockcnt might go above 1. 450 * 451 * XXX Sleep for one tick if it takes too long. 
452 */ 453 if (++iter > 1000) { 454 if (iter > 1000 + hz) { 455 kprintf("hammer2: h2race1 %p\n", chain); 456 iter = 1000; 457 } 458 tsleep(&iter, 0, "h2race1", 1); 459 } 460 cpu_pause(); 461 } 462 } 463 } 464 465 void 466 hammer2_chain_drop_unhold(hammer2_chain_t *chain) 467 { 468 hammer2_chain_unhold(chain); 469 hammer2_chain_drop(chain); 470 } 471 472 void 473 hammer2_chain_rehold(hammer2_chain_t *chain) 474 { 475 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED); 476 atomic_add_int(&chain->lockcnt, 1); 477 hammer2_chain_unlock(chain); 478 } 479 480 /* 481 * Handles the (potential) last drop of chain->refs from 1->0. Called with 482 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are 483 * possible against refs and lockcnt. We must dispose of the mutex on chain. 484 * 485 * This function returns an unlocked chain for recursive drop or NULL. It 486 * can return the same chain if it determines it has raced another ref. 487 * 488 * -- 489 * 490 * When two chains need to be recursively dropped we use the chain we 491 * would otherwise free to placehold the additional chain. It's a bit 492 * convoluted but we can't just recurse without potentially blowing out 493 * the kernel stack. 494 * 495 * The chain cannot be freed if it has any children. 496 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it. 497 * The chain cannot be freed if flagged UPDATE unless we can dispose of it. 498 * Any dedup registration can remain intact. 499 * 500 * The core spinlock is allowed to nest child-to-parent (not parent-to-child). 501 */ 502 static 503 hammer2_chain_t * 504 hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth) 505 { 506 hammer2_pfs_t *pmp; 507 hammer2_dev_t *hmp; 508 hammer2_chain_t *parent; 509 hammer2_chain_t *rdrop; 510 511 /* 512 * We need chain's spinlock to interlock the sub-tree test. 513 * We already have chain's mutex, protecting chain->parent. 514 * 515 * Remember that chain->refs can be in flux. 
516 */ 517 hammer2_spin_ex(&chain->core.spin); 518 519 if (chain->parent != NULL) { 520 /* 521 * If the chain has a parent the UPDATE bit prevents scrapping 522 * as the chain is needed to properly flush the parent. Try 523 * to complete the 1->0 transition and return NULL. Retry 524 * (return chain) if we are unable to complete the 1->0 525 * transition, else return NULL (nothing more to do). 526 * 527 * If the chain has a parent the MODIFIED bit prevents 528 * scrapping. 529 * 530 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 531 */ 532 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 533 HAMMER2_CHAIN_MODIFIED)) { 534 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 535 hammer2_spin_unex(&chain->core.spin); 536 hammer2_chain_assert_no_data(chain); 537 hammer2_mtx_unlock(&chain->lock); 538 chain = NULL; 539 } else { 540 hammer2_spin_unex(&chain->core.spin); 541 hammer2_mtx_unlock(&chain->lock); 542 } 543 return (chain); 544 } 545 /* spinlock still held */ 546 } else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME || 547 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) { 548 /* 549 * Retain the static vchain and fchain. Clear bits that 550 * are not relevant. Do not clear the MODIFIED bit, 551 * and certainly do not put it on the delayed-flush queue. 552 */ 553 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 554 } else { 555 /* 556 * The chain has no parent and can be flagged for destruction. 557 * Since it has no parent, UPDATE can also be cleared. 558 */ 559 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 560 if (chain->flags & HAMMER2_CHAIN_UPDATE) 561 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 562 563 /* 564 * If the chain has children we must propagate the DESTROY 565 * flag downward and rip the disconnected topology apart. 566 * This is accomplished by calling hammer2_flush() on the 567 * chain. 568 * 569 * Any dedup is already handled by the underlying DIO, so 570 * we do not have to specifically flush it here. 
571 */ 572 if (chain->core.chain_count) { 573 hammer2_spin_unex(&chain->core.spin); 574 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 575 HAMMER2_FLUSH_ALL); 576 hammer2_mtx_unlock(&chain->lock); 577 578 return(chain); /* retry drop */ 579 } 580 581 /* 582 * Otherwise we can scrap the MODIFIED bit if it is set, 583 * and continue along the freeing path. 584 * 585 * Be sure to clean-out any dedup bits. Without a parent 586 * this chain will no longer be visible to the flush code. 587 * Easy check data_off to avoid the volume root. 588 */ 589 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 590 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 591 atomic_add_long(&hammer2_count_modified_chains, -1); 592 if (chain->pmp) 593 hammer2_pfs_memory_wakeup(chain->pmp, -1); 594 } 595 /* spinlock still held */ 596 } 597 598 /* spinlock still held */ 599 600 /* 601 * If any children exist we must leave the chain intact with refs == 0. 602 * They exist because chains are retained below us which have refs or 603 * may require flushing. 604 * 605 * Retry (return chain) if we fail to transition the refs to 0, else 606 * return NULL indication nothing more to do. 607 * 608 * Chains with children are NOT put on the LRU list. 609 */ 610 if (chain->core.chain_count) { 611 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 612 hammer2_spin_unex(&chain->core.spin); 613 hammer2_chain_assert_no_data(chain); 614 hammer2_mtx_unlock(&chain->lock); 615 chain = NULL; 616 } else { 617 hammer2_spin_unex(&chain->core.spin); 618 hammer2_mtx_unlock(&chain->lock); 619 } 620 return (chain); 621 } 622 /* spinlock still held */ 623 /* no chains left under us */ 624 625 /* 626 * chain->core has no children left so no accessors can get to our 627 * chain from there. Now we have to lock the parent core to interlock 628 * remaining possible accessors that might bump chain's refs before 629 * we can safely drop chain's refs with intent to free the chain. 
630 */ 631 hmp = chain->hmp; 632 pmp = chain->pmp; /* can be NULL */ 633 rdrop = NULL; 634 635 parent = chain->parent; 636 637 /* 638 * WARNING! chain's spin lock is still held here, and other spinlocks 639 * will be acquired and released in the code below. We 640 * cannot be making fancy procedure calls! 641 */ 642 643 /* 644 * We can cache the chain if it is associated with a pmp 645 * and not flagged as being destroyed or requesting a full 646 * release. In this situation the chain is not removed 647 * from its parent, i.e. it can still be looked up. 648 * 649 * We intentionally do not cache DATA chains because these 650 * were likely used to load data into the logical buffer cache 651 * and will not be accessed again for some time. 652 */ 653 if ((chain->flags & 654 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 655 chain->pmp && 656 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 657 if (parent) 658 hammer2_spin_ex(&parent->core.spin); 659 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 660 /* 661 * 1->0 transition failed, retry. Do not drop 662 * the chain's data yet! 663 */ 664 if (parent) 665 hammer2_spin_unex(&parent->core.spin); 666 hammer2_spin_unex(&chain->core.spin); 667 hammer2_mtx_unlock(&chain->lock); 668 669 return(chain); 670 } 671 672 /* 673 * Success 674 */ 675 hammer2_chain_assert_no_data(chain); 676 677 /* 678 * Make sure we are on the LRU list, clean up excessive 679 * LRU entries. We can only really drop one but there might 680 * be other entries that we can remove from the lru_list 681 * without dropping. 682 * 683 * NOTE: HAMMER2_CHAIN_ONLRU may only be safely set when 684 * chain->core.spin AND pmp->lru_spin are held, but 685 * can be safely cleared only holding pmp->lru_spin. 
686 */ 687 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 688 hammer2_spin_ex(&pmp->lru_spin); 689 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 690 atomic_set_int(&chain->flags, 691 HAMMER2_CHAIN_ONLRU); 692 TAILQ_INSERT_TAIL(&pmp->lru_list, 693 chain, lru_node); 694 atomic_add_int(&pmp->lru_count, 1); 695 } 696 if (pmp->lru_count < HAMMER2_LRU_LIMIT) 697 depth = 1; /* disable lru_list flush */ 698 hammer2_spin_unex(&pmp->lru_spin); 699 } else { 700 /* disable lru flush */ 701 depth = 1; 702 } 703 704 if (parent) { 705 hammer2_spin_unex(&parent->core.spin); 706 parent = NULL; /* safety */ 707 } 708 hammer2_spin_unex(&chain->core.spin); 709 hammer2_mtx_unlock(&chain->lock); 710 711 /* 712 * lru_list hysteresis (see above for depth overrides). 713 * Note that depth also prevents excessive lastdrop recursion. 714 */ 715 if (depth == 0) 716 hammer2_chain_lru_flush(pmp); 717 718 return NULL; 719 /* NOT REACHED */ 720 } 721 722 /* 723 * Make sure we are not on the LRU list. 724 */ 725 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 726 hammer2_spin_ex(&pmp->lru_spin); 727 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 728 atomic_add_int(&pmp->lru_count, -1); 729 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 730 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 731 } 732 hammer2_spin_unex(&pmp->lru_spin); 733 } 734 735 /* 736 * Spinlock the parent and try to drop the last ref on chain. 737 * On success determine if we should dispose of the chain 738 * (remove the chain from its parent, etc). 739 * 740 * (normal core locks are top-down recursive but we define 741 * core spinlocks as bottom-up recursive, so this is safe). 742 */ 743 if (parent) { 744 hammer2_spin_ex(&parent->core.spin); 745 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 746 /* 747 * 1->0 transition failed, retry. 
748 */ 749 hammer2_spin_unex(&parent->core.spin); 750 hammer2_spin_unex(&chain->core.spin); 751 hammer2_mtx_unlock(&chain->lock); 752 753 return(chain); 754 } 755 756 /* 757 * 1->0 transition successful, parent spin held to prevent 758 * new lookups, chain spinlock held to protect parent field. 759 * Remove chain from the parent. 760 * 761 * If the chain is being removed from the parent's btree but 762 * is not bmapped, we have to adjust live_count downward. If 763 * it is bmapped then the blockref is retained in the parent 764 * as is its associated live_count. This case can occur when 765 * a chain added to the topology is unable to flush and is 766 * then later deleted. 767 */ 768 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 769 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 770 (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) { 771 atomic_add_int(&parent->core.live_count, -1); 772 } 773 RB_REMOVE(hammer2_chain_tree, 774 &parent->core.rbtree, chain); 775 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 776 --parent->core.chain_count; 777 chain->parent = NULL; 778 } 779 780 /* 781 * If our chain was the last chain in the parent's core the 782 * core is now empty and its parent might have to be 783 * re-dropped if it has 0 refs. 784 */ 785 if (parent->core.chain_count == 0) { 786 rdrop = parent; 787 atomic_add_int(&rdrop->refs, 1); 788 /* 789 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 790 rdrop = NULL; 791 */ 792 } 793 hammer2_spin_unex(&parent->core.spin); 794 parent = NULL; /* safety */ 795 /* FALL THROUGH */ 796 } else { 797 /* 798 * No-parent case. 799 */ 800 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 801 /* 802 * 1->0 transition failed, retry. 803 */ 804 hammer2_spin_unex(&parent->core.spin); 805 hammer2_spin_unex(&chain->core.spin); 806 hammer2_mtx_unlock(&chain->lock); 807 808 return(chain); 809 } 810 } 811 812 /* 813 * Successful 1->0 transition, no parent, no children... no way for 814 * anyone to ref this chain any more. 
We can clean-up and free it. 815 * 816 * We still have the core spinlock, and core's chain_count is 0. 817 * Any parent spinlock is gone. 818 */ 819 hammer2_spin_unex(&chain->core.spin); 820 hammer2_chain_assert_no_data(chain); 821 hammer2_mtx_unlock(&chain->lock); 822 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 823 chain->core.chain_count == 0); 824 825 /* 826 * All locks are gone, no pointers remain to the chain, finish 827 * freeing it. 828 */ 829 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 830 HAMMER2_CHAIN_MODIFIED)) == 0); 831 832 /* 833 * Once chain resources are gone we can use the now dead chain 834 * structure to placehold what might otherwise require a recursive 835 * drop, because we have potentially two things to drop and can only 836 * return one directly. 837 */ 838 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 839 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 840 chain->hmp = NULL; 841 kfree(chain, hmp->mchain); 842 } 843 844 /* 845 * Possible chaining loop when parent re-drop needed. 846 */ 847 return(rdrop); 848 } 849 850 /* 851 * Heuristical flush of the LRU, try to reduce the number of entries 852 * on the LRU to (HAMMER2_LRU_LIMIT * 2 / 3). This procedure is called 853 * only when lru_count exceeds HAMMER2_LRU_LIMIT. 854 */ 855 static 856 void 857 hammer2_chain_lru_flush(hammer2_pfs_t *pmp) 858 { 859 hammer2_chain_t *chain; 860 861 again: 862 chain = NULL; 863 hammer2_spin_ex(&pmp->lru_spin); 864 while (pmp->lru_count > HAMMER2_LRU_LIMIT * 2 / 3) { 865 /* 866 * Pick a chain off the lru_list, just recycle it quickly 867 * if LRUHINT is set (the chain was ref'd but left on 868 * the lru_list, so cycle to the end). 
869 */ 870 chain = TAILQ_FIRST(&pmp->lru_list); 871 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 872 873 if (chain->flags & HAMMER2_CHAIN_LRUHINT) { 874 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_LRUHINT); 875 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 876 chain = NULL; 877 continue; 878 } 879 880 /* 881 * Ok, we are off the LRU. We must adjust refs before we 882 * can safely clear the ONLRU flag. 883 */ 884 atomic_add_int(&pmp->lru_count, -1); 885 if (atomic_cmpset_int(&chain->refs, 0, 1)) { 886 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 887 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 888 break; 889 } 890 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 891 chain = NULL; 892 } 893 hammer2_spin_unex(&pmp->lru_spin); 894 if (chain == NULL) 895 return; 896 897 /* 898 * If we picked a chain off the lru list we may be able to lastdrop 899 * it. Use a depth of 1 to prevent excessive lastdrop recursion. 900 */ 901 while (chain) { 902 u_int refs; 903 904 refs = chain->refs; 905 cpu_ccfence(); 906 KKASSERT(refs > 0); 907 908 if (refs == 1) { 909 if (hammer2_mtx_ex_try(&chain->lock) == 0) 910 chain = hammer2_chain_lastdrop(chain, 1); 911 /* retry the same chain, or chain from lastdrop */ 912 } else { 913 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 914 break; 915 /* retry the same chain */ 916 } 917 cpu_pause(); 918 } 919 goto again; 920 } 921 922 /* 923 * On last lock release. 
924 */ 925 static hammer2_io_t * 926 hammer2_chain_drop_data(hammer2_chain_t *chain) 927 { 928 hammer2_io_t *dio; 929 930 if ((dio = chain->dio) != NULL) { 931 chain->dio = NULL; 932 chain->data = NULL; 933 } else { 934 switch(chain->bref.type) { 935 case HAMMER2_BREF_TYPE_VOLUME: 936 case HAMMER2_BREF_TYPE_FREEMAP: 937 break; 938 default: 939 if (chain->data != NULL) { 940 hammer2_spin_unex(&chain->core.spin); 941 panic("chain data not null: " 942 "chain %p bref %016jx.%02x " 943 "refs %d parent %p dio %p data %p", 944 chain, chain->bref.data_off, 945 chain->bref.type, chain->refs, 946 chain->parent, 947 chain->dio, chain->data); 948 } 949 KKASSERT(chain->data == NULL); 950 break; 951 } 952 } 953 return dio; 954 } 955 956 /* 957 * Lock a referenced chain element, acquiring its data with I/O if necessary, 958 * and specify how you would like the data to be resolved. 959 * 960 * If an I/O or other fatal error occurs, chain->error will be set to non-zero. 961 * 962 * The lock is allowed to recurse, multiple locking ops will aggregate 963 * the requested resolve types. Once data is assigned it will not be 964 * removed until the last unlock. 965 * 966 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 967 * (typically used to avoid device/logical buffer 968 * aliasing for data) 969 * 970 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 971 * the INITIAL-create state (indirect blocks only). 972 * 973 * Do not resolve data elements for DATA chains. 974 * (typically used to avoid device/logical buffer 975 * aliasing for data) 976 * 977 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 978 * 979 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 980 * it will be locked exclusive. 981 * 982 * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking. If 983 * the lock fails, EAGAIN is returned. 984 * 985 * NOTE: Embedded elements (volume header, inodes) are always resolved 986 * regardless. 
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * NOTE: The return value is always 0 unless NONBLOCK is specified, in which
 *	 case it can be either 0 or EAGAIN.
 *
 * WARNING! This function blocks on I/O if data needs to be fetched.  This
 *	    blocking can run concurrent with other compatible lock holders
 *	    who do not need data returning.  The lock is not upgraded to
 *	    exclusive during a data fetch, a separate bit is used to
 *	    interlock I/O.  However, an exclusive lock holder can still count
 *	    on being interlocked against an I/O fetch managed by a shared
 *	    lock holder.
 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	KKASSERT(chain->refs > 0);

	if (how & HAMMER2_RESOLVE_NONBLOCK) {
		/*
		 * We still have to bump lockcnt before acquiring the lock,
		 * even for non-blocking operation, because the unlock code
		 * live-loops on lockcnt == 1 when dropping the last lock.
		 *
		 * If the non-blocking operation fails we have to use an
		 * unhold sequence to undo the mess.
		 *
		 * NOTE: LOCKAGAIN must always succeed without blocking,
		 *	 even if NONBLOCK is specified.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				if (hammer2_mtx_sh_try(&chain->lock) != 0) {
					hammer2_chain_unhold(chain);
					return EAGAIN;
				}
			}
		} else {
			if (hammer2_mtx_ex_try(&chain->lock) != 0) {
				hammer2_chain_unhold(chain);
				return EAGAIN;
			}
		}
	} else {
		/*
		 * Get the appropriate lock.  If LOCKAGAIN is flagged with
		 * SHARED the caller expects a shared lock to already be
		 * present and we are giving it another ref.  This case must
		 * importantly not block if there is a pending exclusive lock
		 * request.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				hammer2_mtx_sh(&chain->lock);
			}
		} else {
			hammer2_mtx_ex(&chain->lock);
		}
	}

	/*
	 * If we already have a valid data pointer make sure the data is
	 * synchronized to the current cpu, and then no further action is
	 * necessary.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return 0;
	}

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expect the data to be there.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return 0;
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return 0;
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return 0;
#endif
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
	default:
		break;
	}

	/*
	 * Caller requires data.  This may block on I/O (see WARNING above).
	 */
	hammer2_chain_load_data(chain);

	return 0;
}

/*
 * Lock the chain, retain the hold, and drop the data persistence count.
 * The data should remain valid because we never transitioned lockcnt
 * through 0.
 */
void
hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
{
	hammer2_chain_lock(chain, how);
	atomic_add_int(&chain->lockcnt, -1);
}

#if 0
/*
 * Downgrade an exclusive chain lock to a shared chain lock.
 *
 * NOTE: There is no upgrade equivalent due to the ease of
 *	 deadlocks in that direction.
 */
void
hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
{
	hammer2_mtx_downgrade(&chain->lock);
}
#endif

/*
 * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
 * may be of any type.
 *
 * Once chain->data is set it cannot be disposed of until all locks are
 * released.
 *
 * Make sure the data is synchronized to the current cpu.
 */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
	 */
	KKASSERT(chain->lock.mtx_lock & MTX_MASK);
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return;
	}
	if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Gain the IOINPROG bit, interlocked block.  If another thread owns
	 * IOINPROG we request a wakeup via IOSIGNAL and tsleep, then retry.
	 */
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		cpu_ccfence();
		if (oflags & HAMMER2_CHAIN_IOINPROG) {
			nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
			tsleep_interlock(&chain->flags, 0);
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				tsleep(&chain->flags, PINTERLOCKED,
				       "h2iocw", 0);
			}
			/* retry */
		} else {
			nflags = oflags | HAMMER2_CHAIN_IOINPROG;
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				break;
			}
			/* retry */
		}
	}

	/*
	 * We own CHAIN_IOINPROG
	 *
	 * Degenerate case if we raced another load.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		goto done;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		error = hammer2_io_new(hmp, bref->type,
				       bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		error = hammer2_io_bread(hmp, bref->type,
					 bref->data_off, chain->bytes,
					 &chain->dio);
		hammer2_adjreadcounter(chain->bref.type, chain->bytes);
	}
	if (error) {
		chain->error = HAMMER2_ERROR_EIO;
		kprintf("hammer2_chain_load_data: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		goto done;
	}
	chain->error = 0;

	/*
	 * This isn't perfect and can be ignored on OSs which do not have
	 * an indication as to whether a buffer is coming from cache or
	 * if I/O was actually issued for the read.  TESTEDGOOD will work
	 * pretty well without the B_IOISSUED logic because chains are
	 * cached, but in that situation (without B_IOISSUED) it will not
	 * detect whether a re-read via I/O is corrupted verses the original
	 * read.
	 *
	 * We can't re-run the CRC on every fresh lock.  That would be
	 * insanely expensive.
	 *
	 * If the underlying kernel buffer covers the entire chain we can
	 * use the B_IOISSUED indication to determine if we have to re-run
	 * the CRC on chain data for chains that managed to stay cached
	 * across the kernel disposal of the original buffer.
	 */
	if ((dio = chain->dio) != NULL && dio->bp) {
		struct buf *bp = dio->bp;

		if (dio->psize == chain->bytes &&
		    (bp->b_flags & B_IOISSUED)) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_TESTEDGOOD);
			bp->b_flags &= ~B_IOISSUED;
		}
	}

	/*
	 * NOTE: A locked chain's data cannot be modified without first
	 *	 calling hammer2_chain_modify().
	 */

	/*
	 * NOTE: hammer2_io_data() call issues bkvasync()
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);

	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		/*
		 * Clear INITIAL.  In this case we used io_new() and the
		 * buffer has been zero'd and marked dirty.
		 *
		 * CHAIN_MODIFIED has not been set yet, and we leave it
		 * that way for now.  Set a temporary CHAIN_NOTTESTED flag
		 * to prevent hammer2_chain_testcheck() from trying to match
		 * a check code that has not yet been generated.  This bit
		 * should NOT end up on the actual media.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);
	} else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * check data not currently synchronized due to
		 * modification.  XXX assumes data stays in the buffer
		 * cache, which might not be true (need biodep on flush
		 * to calculate crc?  or simple crc?).
		 */
	} else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
		if (hammer2_chain_testcheck(chain, bdata) == 0) {
			chain->error = HAMMER2_ERROR_CHECK;
		} else {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
		}
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 *
	 * WARNING! Other threads can start using the data the instant we
	 *	    set chain->data non-NULL.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_load_data: unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		KKASSERT(chain->bytes != 0);
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave dio intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
	 */
done:
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
				    HAMMER2_CHAIN_IOSIGNAL);
		KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
		if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
			if (oflags & HAMMER2_CHAIN_IOSIGNAL)
				wakeup(&chain->flags);
			break;
		}
	}
}

/*
 * Unlock and deref a chain element.
 *
 * Remember that the presence of children under chain prevents the chain's
 * destruction but does not add additional references, so the dio will still
 * be dropped.
 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;
	u_int lockcnt;
	int iter = 0;

	/*
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				hammer2_mtx_unlock(&chain->lock);
				return;
			}
		} else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
			/* while holding the mutex exclusively */
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap in between the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race2 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race2", 1);
			}
			cpu_pause();
		}
		/* retry */
	}

	/*
	 * Last unlock / mutex upgraded to exclusive.  Drop the data
	 * reference.
	 */
	dio = hammer2_chain_drop_data(chain);
	if (dio)
		hammer2_io_bqrelse(&dio);
	hammer2_mtx_unlock(&chain->lock);
}

/*
 * Unlock and hold chain data intact.  Bumping lockcnt first prevents the
 * 1->0 transition in hammer2_chain_unlock() from dropping the dio.
 */
void
hammer2_chain_unlock_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}

/*
 * Helper to obtain the blockref[] array base and count for a chain.
 *
 * XXX Not widely used yet, various use cases need to be validated and
 *     converted to use this function.
1442 */ 1443 static 1444 hammer2_blockref_t * 1445 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1446 { 1447 hammer2_blockref_t *base; 1448 int count; 1449 1450 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1451 base = NULL; 1452 1453 switch(parent->bref.type) { 1454 case HAMMER2_BREF_TYPE_INODE: 1455 count = HAMMER2_SET_COUNT; 1456 break; 1457 case HAMMER2_BREF_TYPE_INDIRECT: 1458 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1459 count = parent->bytes / sizeof(hammer2_blockref_t); 1460 break; 1461 case HAMMER2_BREF_TYPE_VOLUME: 1462 count = HAMMER2_SET_COUNT; 1463 break; 1464 case HAMMER2_BREF_TYPE_FREEMAP: 1465 count = HAMMER2_SET_COUNT; 1466 break; 1467 default: 1468 panic("hammer2_chain_base_and_count: " 1469 "unrecognized blockref type: %d", 1470 parent->bref.type); 1471 count = 0; 1472 break; 1473 } 1474 } else { 1475 switch(parent->bref.type) { 1476 case HAMMER2_BREF_TYPE_INODE: 1477 base = &parent->data->ipdata.u.blockset.blockref[0]; 1478 count = HAMMER2_SET_COUNT; 1479 break; 1480 case HAMMER2_BREF_TYPE_INDIRECT: 1481 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1482 base = &parent->data->npdata[0]; 1483 count = parent->bytes / sizeof(hammer2_blockref_t); 1484 break; 1485 case HAMMER2_BREF_TYPE_VOLUME: 1486 base = &parent->data->voldata. 1487 sroot_blockset.blockref[0]; 1488 count = HAMMER2_SET_COUNT; 1489 break; 1490 case HAMMER2_BREF_TYPE_FREEMAP: 1491 base = &parent->data->blkset.blockref[0]; 1492 count = HAMMER2_SET_COUNT; 1493 break; 1494 default: 1495 panic("hammer2_chain_base_and_count: " 1496 "unrecognized blockref type: %d", 1497 parent->bref.type); 1498 count = 0; 1499 break; 1500 } 1501 } 1502 *countp = count; 1503 1504 return base; 1505 } 1506 1507 /* 1508 * This counts the number of live blockrefs in a block array and 1509 * also calculates the point at which all remaining blockrefs are empty. 1510 * This routine can only be called on a live chain. 1511 * 1512 * Caller holds the chain locked, but possibly with a shared lock. 
We
 * must use an exclusive spinlock to prevent corruption.
 *
 * NOTE: Flag is not set until after the count is complete, allowing
 *	 callers to test the flag without holding the spinlock.
 *
 * NOTE: If base is NULL the related chain is still in the INITIAL
 *	 state and there are no blockrefs to count.
 *
 * NOTE: live_count may already have some counts accumulated due to
 *	 creation and deletion and could even be initially negative.
 */
void
hammer2_chain_countbrefs(hammer2_chain_t *chain,
			 hammer2_blockref_t *base, int count)
{
	hammer2_spin_ex(&chain->core.spin);
	if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
		if (base) {
			/*
			 * First scan backwards to locate the live_zero
			 * point (all brefs at or beyond it are empty),
			 * then count the live brefs below it.
			 */
			while (--count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					break;
			}
			chain->core.live_zero = count + 1;
			while (count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					atomic_add_int(&chain->core.live_count,
						       1);
				--count;
			}
		} else {
			chain->core.live_zero = 0;
		}
		/* else do not modify live_count */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
	}
	hammer2_spin_unex(&chain->core.spin);
}

/*
 * Resize the chain's physical storage allocation in-place.  This function does
 * not usually adjust the data pointer and must be followed by (typically) a
 * hammer2_chain_modify() call to copy any old data over and adjust the
 * data pointer.
 *
 * Chains can be resized smaller without reallocating the storage.  Resizing
 * larger will reallocate the storage.  Excess or prior storage is reclaimed
 * asynchronously at a later time.
 *
 * An nradix value of 0 is special-cased to mean that the storage should
 * be disassociated, that is the chain is being resized to 0 bytes (not 1
 * byte).
 *
 * Must be passed an exclusively locked parent and chain.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode data
 * buffer.  However, because H2 can compress or encrypt data, the chain may
 * have a dio assigned to it in those situations, and they do not conflict.
 *
 * XXX return error if cannot resize.
 */
int
hammer2_chain_resize(hammer2_chain_t *chain,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off,
		     int nradix, int flags)
{
	hammer2_dev_t *hmp;
	size_t obytes;
	size_t nbytes;
	int error;

	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volume root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);

	/*
	 * Nothing to do if the element is already the proper size.
	 */
	obytes = chain->bytes;
	nbytes = (nradix) ? (1U << nradix) : 0;
	if (obytes == nbytes)
		return (chain->error);

	/*
	 * Make sure the old data is instantiated so we can copy it.  If this
	 * is a data block, the device data may be superfluous since the data
	 * might be in a logical block, but compressed or encrypted data is
	 * another matter.
	 *
	 * NOTE: The modify will set BMAPUPD for us if BMAPPED is set.
	 */
	error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
	if (error)
		return error;

	/*
	 * Reallocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 *
	 * NOTE: Operation does not copy the data and may only be used
	 *	 to resize data blocks in-place, or directory entry blocks
	 *	 which are about to be modified in some manner.
	 */
	error = hammer2_freemap_alloc(chain, nbytes);
	if (error)
		return error;

	chain->bytes = nbytes;

	/*
	 * We don't want the followup chain_modify() to try to copy data
	 * from the old (wrong-sized) buffer.  It won't know how much to
	 * copy.  This case should only occur during writes when the
	 * originator already has the data to write in-hand.
	 */
	if (chain->dio) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
		hammer2_io_brelse(&chain->dio);
		chain->data = NULL;
	}
	return (chain->error);
}

/*
 * Set the chain modified so its data can be changed by the caller, or
 * install deduplicated data.  The caller must call this routine for each
 * set of modifications it makes, even if the chain is already flagged
 * MODIFIED.
 *
 * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
 * is a CLC (cluster level change) field and is not updated by parent
 * propagation during a flush.
 *
 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
 * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
 * remains unmodified with its old data ref intact and chain->error
 * unchanged.
 *
 *				Dedup Handling
 *
 * If the DEDUPABLE flag is set in the chain the storage must be reallocated
 * even if the chain is still flagged MODIFIED.  In this case the chain's
 * DEDUPABLE flag will be cleared once the new storage has been assigned.
 *
 * If the caller passes a non-zero dedup_off we will use it to assign the
 * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
 * must not modify the data content upon return.
 */
int
hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
		     hammer2_off_t dedup_off, int flags)
{
	hammer2_blockref_t obref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	int error;
	int wasinitial;
	int setmodified;
	int setupdate;
	int newmod;
	char *bdata;

	hmp = chain->hmp;
	obref = chain->bref;
	KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE);

	/*
	 * Data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * Data must be resolved if already assigned, unless explicitly
	 * flagged otherwise.  If we cannot safely load the data the
	 * modification fails and we return early.
	 */
	if (chain->data == NULL && chain->bytes != 0 &&
	    (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_load_data(chain);
		if (chain->error)
			return (chain->error);
	}
	error = 0;

	/*
	 * Set MODIFIED to indicate that the chain has been modified.  A new
	 * allocation is required when modifying a chain.
	 *
	 * Set UPDATE to ensure that the blockref is updated in the parent.
	 *
	 * If MODIFIED is already set determine if we can reuse the assigned
	 * data block or if we need a new data block.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		/*
		 * Must set modified bit.
		 */
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(chain->pmp);	/* can be NULL */
		setmodified = 1;

		/*
		 * We may be able to avoid a copy-on-write if the chain's
		 * check mode is set to NONE and the chain's current
		 * modify_tid is beyond the last explicit snapshot tid.
		 *
		 * This implements HAMMER2's overwrite-in-place feature.
		 *
		 * NOTE! This data-block cannot be used as a de-duplication
		 *	 source when the check mode is set to NONE.
		 */
		if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		     chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
		    (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
		    (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
		    HAMMER2_DEC_CHECK(chain->bref.methods) ==
		    HAMMER2_CHECK_NONE &&
		    chain->pmp &&
		    chain->bref.modify_tid >
		     chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * Sector overwrite allowed.
			 */
			newmod = 0;
		} else if ((hmp->hflags & HMNT2_EMERG) &&
			   chain->pmp &&
			   chain->bref.modify_tid >
			    chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * If in emergency delete mode then do a modify-in-
			 * place on any chain type belonging to the PFS as
			 * long as it doesn't mess up a snapshot.  We might
			 * be forced to do this anyway a little further down
			 * in the code if the allocation fails.
			 *
			 * Also note that in emergency mode, these modify-in-
			 * place operations are NOT SAFE.  A storage failure,
			 * power failure, or panic can corrupt the filesystem.
			 */
			newmod = 0;
		} else {
			/*
			 * Sector overwrite not allowed, must copy-on-write.
			 */
			newmod = 1;
		}
	} else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
		/*
		 * If the modified chain was registered for dedup we need
		 * a new allocation.  This only happens for delayed-flush
		 * chains (i.e. which run through the front-end buffer
		 * cache).
		 */
		newmod = 1;
		setmodified = 0;
	} else {
		/*
		 * Already flagged modified, no new allocation is needed.
		 */
		newmod = 0;
		setmodified = 0;
	}

	/*
	 * Flag parent update required.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		setupdate = 1;
	} else {
		setupdate = 0;
	}

	/*
	 * The XOP code returns held but unlocked focus chains.  This
	 * prevents the chain from being destroyed but does not prevent
	 * it from being modified.  diolk is used to interlock modifications
	 * against XOP frontend accesses to the focus.
	 *
	 * This allows us to theoretically avoid deadlocking the frontend
	 * if one of the backends lock up by not formally locking the
	 * focused chain in the frontend.  In addition, the synchronization
	 * code relies on this mechanism to avoid deadlocking concurrent
	 * synchronization threads.
	 */
	lockmgr(&chain->diolk, LK_EXCLUSIVE);

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.  If an error occurs, the previous content and data
	 * reference is retained and the modification fails.
	 *
	 * If dedup_off is non-zero, the caller is requesting a deduplication
	 * rather than a modification.  The MODIFIED bit is not set and the
	 * data offset is set to the deduplication offset.  The data cannot
	 * be modified.
	 *
	 * NOTE: The dedup offset is allowed to be in a partially free state
	 *	 and we must be sure to reset it to a fully allocated state
	 *	 to force two bulkfree passes to free it again.
	 *
	 * NOTE: Only applicable when chain->bytes != 0.
	 *
	 * XXX can a chain already be marked MODIFIED without a data
	 * assignment?  If not, assert here instead of testing the case.
	 */
	if (chain != &hmp->vchain && chain != &hmp->fchain &&
	    chain->bytes) {
		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
		     newmod
		) {
			/*
			 * NOTE: We do not have to remove the dedup
			 *	 registration because the area is still
			 *	 allocated and the underlying DIO will
			 *	 still be flushed.
			 */
			if (dedup_off) {
				chain->bref.data_off = dedup_off;
				chain->bytes = 1 << (dedup_off &
						     HAMMER2_OFF_MASK_RADIX);
				chain->error = 0;
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_MODIFIED);
				atomic_add_long(&hammer2_count_modified_chains,
						-1);
				if (chain->pmp) {
					hammer2_pfs_memory_wakeup(
						chain->pmp, -1);
				}
				hammer2_freemap_adjust(hmp, &chain->bref,
						HAMMER2_FREEMAP_DORECOVER);
				atomic_set_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			} else {
				error = hammer2_freemap_alloc(chain,
							      chain->bytes);
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);

				/*
				 * If we are unable to allocate a new block
				 * but we are in emergency mode, issue a
				 * warning to the console and reuse the same
				 * block.
				 *
				 * We behave as if the allocation were
				 * successful.
				 *
				 * THIS IS IMPORTANT: These modifications
				 * are virtually guaranteed to corrupt any
				 * snapshots related to this filesystem.
				 */
				if (error && (hmp->hflags & HMNT2_EMERG)) {
					error = 0;
					chain->bref.flags |=
						HAMMER2_BREF_FLAG_EMERG_MIP;

					krateprintf(&krate_h2em,
					    "hammer2: Emergency Mode WARNING: "
					    "Operation will likely corrupt "
					    "related snapshot: "
					    "%016jx.%02x key=%016jx\n",
					    chain->bref.data_off,
					    chain->bref.type,
					    chain->bref.key);
				} else if (error == 0) {
					chain->bref.flags &=
						~HAMMER2_BREF_FLAG_EMERG_MIP;
				}
			}
		}
	}

	/*
	 * Stop here if error.  We have to undo any flag bits we might
	 * have set above.
	 */
	if (error) {
		if (setmodified) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			atomic_add_long(&hammer2_count_modified_chains, -1);
			if (chain->pmp)
				hammer2_pfs_memory_wakeup(chain->pmp, -1);
		}
		if (setupdate) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
		lockmgr(&chain->diolk, LK_RELEASE);

		return error;
	}

	/*
	 * Update mirror_tid and modify_tid.  modify_tid is only updated
	 * if not passed as zero (during flushes, parent propagation passes
	 * the value 0).
	 *
	 * NOTE: chain->pmp could be the device spmp.
	 */
	chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
	if (mtid)
		chain->bref.modify_tid = mtid;

	/*
	 * Set BMAPUPD to tell the flush code that an existing blockmap entry
	 * requires updating as well as to tell the delete code that the
	 * chain's blockref might not exactly match (in terms of physical size
	 * or block offset) the one in the parent's blocktable.  The base key
	 * of course will still match.
	 */
	if (chain->flags & HAMMER2_CHAIN_BMAPPED)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD);

	/*
	 * Short-cut data block handling when the caller does not need an
	 * actual data reference to (aka OPTDATA), as long as the chain does
	 * not already have a data pointer to the data and no de-duplication
	 * occurred.
	 *
	 * This generally means that the modifications are being done via the
	 * logical buffer cache.
	 *
	 * NOTE: If deduplication occurred we have to run through the data
	 *	 stuff to clear INITIAL, and the caller will likely want to
	 *	 assign the check code anyway.  Leaving INITIAL set on a
	 *	 dedup can be deadly (it can cause the block to be zero'd!).
	 *
	 * This code also handles bytes == 0 (most dirents).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
	    (flags & HAMMER2_MODIFY_OPTDATA) &&
	    chain->data == NULL) {
		if (dedup_off == 0) {
			KKASSERT(chain->dio == NULL);
			goto skip2;
		}
	}

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 *
	 * Clearing of NOTTESTED is allowed if the MODIFIED bit is set.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->dio == NULL);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		/*
		 * The data might be fully embedded.
		 */
		if (chain->bytes == 0) {
			KKASSERT(chain->dio == NULL);
			break;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 *
		 * zero-fill or copy-on-write depending on whether
		 * chain->data exists or not and set the dirty state for
		 * the new buffer.  hammer2_io_new() will handle the
		 * zero-fill.
		 *
		 * If a dedup_off was supplied this is an existing block
		 * and no COW, copy, or further modification is required.
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		if (wasinitial && dedup_off == 0) {
			error = hammer2_io_new(hmp, chain->bref.type,
					       chain->bref.data_off,
					       chain->bytes, &dio);
		} else {
			error = hammer2_io_bread(hmp, chain->bref.type,
						 chain->bref.data_off,
						 chain->bytes, &dio);
		}
		hammer2_adjreadcounter(chain->bref.type, chain->bytes);

		/*
		 * If an I/O error occurs make sure callers cannot
		 * accidentally modify the old buffer's contents and corrupt
		 * the filesystem.
		 *
		 * NOTE: hammer2_io_data() call issues bkvasync()
		 */
		if (error) {
			kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
				hmp);
			chain->error = HAMMER2_ERROR_EIO;
			hammer2_io_brelse(&dio);
			hammer2_io_brelse(&chain->dio);
			chain->data = NULL;
			break;
		}
		chain->error = 0;
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		if (chain->data) {
			/*
			 * COW (unless a dedup).
			 */
			KKASSERT(chain->dio != NULL);
			if (chain->data != (void *)bdata && dedup_off == 0) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial == 0 && dedup_off == 0) {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}

		/*
		 * Retire the old buffer, replace with the new.  Dirty or
		 * redirty the new buffer.
		 *
		 * WARNING! The system buffer cache may have already flushed
		 *	    the buffer, so we must be sure to [re]dirty it
		 *	    for further modification.
		 *
		 *	    If dedup_off was supplied, the caller is not
		 *	    expected to make any further modification to the
		 *	    buffer.
		 *
		 * WARNING! hammer2_get_gdata() assumes dio never transitions
		 *	    through NULL in order to optimize away unnecessary
		 *	    diolk operations.
		 */
		{
			hammer2_io_t *tio;

			if ((tio = chain->dio) != NULL)
				hammer2_io_bqrelse(&tio);
			chain->data = (void *)bdata;
			chain->dio = dio;
			if (dedup_off == 0)
				hammer2_io_setdirty(dio);
		}
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	lockmgr(&chain->diolk, LK_RELEASE);

	return (chain->error);
}

/*
 * Modify the chain associated with an inode.  Marks the inode modified
 * first, then runs the normal chain modify path.
 */
int
hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
			hammer2_tid_t mtid, int flags)
{
	int error;

	hammer2_inode_modify(ip);
	error = hammer2_chain_modify(chain, mtid, 0, flags);

	return error;
}

/*
 * Volume header data locks
 */
void
hammer2_voldata_lock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_EXCLUSIVE);
}

void
hammer2_voldata_unlock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_RELEASE);
}

/*
 * Flag the volume chain (vchain) modified, accounting for the transition
 * exactly once (MODIFIED flag gates the counter and memory accounting).
 */
void
hammer2_voldata_modify(hammer2_dev_t *hmp)
{
	if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(hmp->vchain.pmp);
	}
}

/*
 * This function returns the chain at the nearest key within the specified
 * range.  The returned chain will be referenced but not locked.
 *
 * This function will recurse through chain->rbtree as necessary and will
 * return a *key_nextp suitable for iteration.
 * *key_nextp is only set if
 * the iteration value is less than the current value of *key_nextp.
 *
 * The caller should use (*key_nextp) to calculate the actual range of
 * the returned element, which will be (key_beg to *key_nextp - 1), because
 * there might be another element which is superior to the returned element
 * and overlaps it.
 *
 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
 * chains continue to be returned.  On EOF (*key_nextp) may overflow since
 * it will wind up being (key_end + 1).
 *
 * WARNING! Must be called with child's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
struct hammer2_chain_find_info {
	hammer2_chain_t	*best;		/* nearest matching chain so far */
	hammer2_key_t	key_beg;	/* requested range (inclusive) */
	hammer2_key_t	key_end;	/* requested range (inclusive) */
	hammer2_key_t	key_next;	/* iteration value, clipped downward */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

/*
 * Scan parent->core.rbtree for the chain nearest to (key_beg, key_end),
 * clipping *key_nextp as a side effect.  Returns the best match (not
 * referenced, not locked) or NULL.
 */
static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	/* cmp prunes non-overlapping subtrees, callback refines info.best */
	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
		hammer2_chain_find_cmp, hammer2_chain_find_callback,
		&info);
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN comparison function: -1 if the child's key range lies entirely
 * below the requested range, +1 if entirely above, 0 on overlap (in which
 * case the callback is run for the child).
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits)
		    - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

/*
 * RB_SCAN callback, run for each child overlapping the requested range.
 * Tracks the nearest-keyed child in info->best and clips info->key_next
 * so the caller can compute the effective range of the returned element.
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *	    the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	/* child_end == 0 means the range wrapped past the top of the keyspace */
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  The returned chain is
 * held and locked according to (how) (HAMMER2_RESOLVE_*).  The caller must
 * handle crc-checks and so forth, and should check chain->error before
 * assuming that the data is good.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.  Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * WARNING! chain->pmp is always set to NULL for any chain representing
 *	    part of the super-root topology.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref, int how)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 *
	 * A PFS root is not associated with the parent's pmp.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * chain must be locked to avoid unexpected ripouts
	 */
	hammer2_chain_lock(chain, how);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		/* lost the insertion race, back the chain out entirely */
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain referenced and locked (see the
	 * hammer2_chain_lock() above), or NULL if a race occurred.
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API
 *
 * Reference and lock (parent), always resolving its data, shared or
 * exclusive according to HAMMER2_LOOKUP_SHARED in flags.  Returns
 * (parent) for convenience.  Pair with hammer2_chain_lookup_done().
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	hammer2_chain_ref(parent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

/*
 * Terminate a lookup sequence, releasing the lock and ref acquired by
 * hammer2_chain_lookup_init().  NULL is a no-op.
 */
void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
}

/*
 * Take the locked chain and return a locked parent.  The chain remains
 * locked on return, but may have to be temporarily unlocked to acquire
Because of this, (chain) must be stable and cannot be 2401 * deleted while it was temporarily unlocked (typically means that (chain) 2402 * is an inode). 2403 * 2404 * Pass HAMMER2_RESOLVE_* flags in flags. 2405 * 2406 * This will work even if the chain is errored, and the caller can check 2407 * parent->error on return if desired since the parent will be locked. 2408 * 2409 * This function handles the lock order reversal. 2410 */ 2411 hammer2_chain_t * 2412 hammer2_chain_getparent(hammer2_chain_t *chain, int flags) 2413 { 2414 hammer2_chain_t *parent; 2415 2416 /* 2417 * Be careful of order, chain must be unlocked before parent 2418 * is locked below to avoid a deadlock. Try it trivially first. 2419 */ 2420 parent = chain->parent; 2421 if (parent == NULL) 2422 panic("hammer2_chain_getparent: no parent"); 2423 hammer2_chain_ref(parent); 2424 if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) 2425 return parent; 2426 2427 for (;;) { 2428 hammer2_chain_unlock(chain); 2429 hammer2_chain_lock(parent, flags); 2430 hammer2_chain_lock(chain, flags); 2431 2432 /* 2433 * Parent relinking races are quite common. We have to get 2434 * it right or we will blow up the block table. 2435 */ 2436 if (chain->parent == parent) 2437 break; 2438 hammer2_chain_unlock(parent); 2439 hammer2_chain_drop(parent); 2440 cpu_ccfence(); 2441 parent = chain->parent; 2442 if (parent == NULL) 2443 panic("hammer2_chain_getparent: no parent"); 2444 hammer2_chain_ref(parent); 2445 } 2446 return parent; 2447 } 2448 2449 /* 2450 * Take the locked chain and return a locked parent. The chain is unlocked 2451 * and dropped. *chainp is set to the returned parent as a convenience. 2452 * Pass HAMMER2_RESOLVE_* flags in flags. 2453 * 2454 * This will work even if the chain is errored, and the caller can check 2455 * parent->error on return if desired since the parent will be locked. 2456 * 2457 * The chain does NOT need to be stable. 
 * We use a tracking structure
 * to track the expected parent if the chain is deleted out from under us.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	struct hammer2_reptrack reptrack;
	struct hammer2_reptrack **repp;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first
	 * with a non-blocking lock attempt.
	 */
	chain = *chainp;
	parent = chain->parent;
	if (parent == NULL) {
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_repparent: no parent");
	}
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		*chainp = parent;

		return parent;
	}

	/*
	 * Ok, now it gets a bit nasty.  There are multiple situations where
	 * the parent might be in the middle of a deletion, or where the child
	 * (chain) might be deleted the instant we let go of its lock.
	 * We can potentially end up in a no-win situation!
	 *
	 * In particular, the indirect_maintenance() case can cause these
	 * situations.
	 *
	 * To deal with this we install a reptrack structure in the parent
	 * This reptrack structure 'owns' the parent ref and will automatically
	 * migrate to the parent's parent if the parent is deleted permanently.
	 */
	hammer2_spin_init(&reptrack.spin, "h2reptrk");
	reptrack.chain = parent;
	hammer2_chain_ref(parent);		/* for the reptrack */

	hammer2_spin_ex(&parent->core.spin);
	reptrack.next = parent->core.reptrack;
	parent->core.reptrack = &reptrack;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;	/* gone */

	/*
	 * At the top of this loop, chain is gone and parent is refd both
	 * by us explicitly AND via our reptrack.  We are attempting to
	 * lock parent.
	 */
	for (;;) {
		hammer2_chain_lock(parent, flags);

		if (reptrack.chain == parent)
			break;

		/* reptrack migrated while we blocked; chase the new parent */
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);

		kprintf("hammer2: debug REPTRACK %p->%p\n",
			parent, reptrack.chain);
		hammer2_spin_ex(&reptrack.spin);
		parent = reptrack.chain;
		hammer2_chain_ref(parent);
		hammer2_spin_unex(&reptrack.spin);
	}

	/*
	 * Once parent is locked and matches our reptrack, our reptrack
	 * will be stable and we have our parent.  We can unlink our
	 * reptrack.
	 *
	 * WARNING!  Remember that the chain lock might be shared.  Chains
	 *	     locked shared have stable parent linkages.
	 */
	hammer2_spin_ex(&parent->core.spin);
	repp = &parent->core.reptrack;
	while (*repp != &reptrack)
		repp = &(*repp)->next;
	*repp = reptrack.next;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_drop(parent);	/* reptrack ref */
	*chainp = parent;		/* return parent lock+ref */

	return parent;
}

/*
 * Dispose of any linked reptrack structures in (chain) by shifting them to
 * (parent).  Both (chain) and (parent) must be exclusively locked.
 *
 * This is interlocked against any children of (chain) on the other side.
 * No children so remain as-of when this is called so we can test
 * core.reptrack without holding the spin-lock.
 *
 * Used whenever the caller intends to permanently delete chains related
 * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE),
 * where the chains underneath the node being deleted are given a new parent
 * above the node being deleted.
 */
static
void
hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain)
{
	struct hammer2_reptrack *reptrack;

	/* no children may remain, see the interlock note above */
	KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree));
	while (chain->core.reptrack) {
		hammer2_spin_ex(&parent->core.spin);
		hammer2_spin_ex(&chain->core.spin);
		reptrack = chain->core.reptrack;
		if (reptrack == NULL) {
			/* raced an unlink, retest the while condition */
			hammer2_spin_unex(&chain->core.spin);
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
		/*
		 * Migrate the reptrack from (chain)'s list to (parent)'s
		 * list, transferring its chain pointer and adding a parent
		 * ref for the reptrack's ownership.
		 *
		 * NOTE(review): reptrack->spin is acquired here but no
		 * matching unex is visible in this function — confirm
		 * against the reptrack locking rules.
		 */
		hammer2_spin_ex(&reptrack->spin);
		chain->core.reptrack = reptrack->next;
		reptrack->chain = parent;
		reptrack->next = parent->core.reptrack;
		parent->core.reptrack = reptrack;
		hammer2_chain_ref(parent);	/* reptrack */

		hammer2_spin_unex(&chain->core.spin);
		hammer2_spin_unex(&parent->core.spin);
		kprintf("hammer2: debug repchange %p %p->%p\n",
			reptrack, chain, parent);
		hammer2_chain_drop(chain);	/* reptrack */
	}
}

/*
 * Locate the first chain whose key range overlaps (key_beg, key_end)
 * inclusive.
 * (*parentp) typically points to an inode but can also point to a related
 * indirect block and this function will recurse upwards and find the inode
 * or the nearest undeleted indirect block covering the key range.
 *
 * This function unconditionally sets *errorp, replacing any previous value.
2608 * 2609 * (*parentp) must be exclusive or shared locked (depending on flags) and 2610 * referenced and can be an inode or an existing indirect block within the 2611 * inode. 2612 * 2613 * If (*parent) is errored out, this function will not attempt to recurse 2614 * the radix tree and will return NULL along with an appropriate *errorp. 2615 * If NULL is returned and *errorp is 0, the requested lookup could not be 2616 * located. 2617 * 2618 * On return (*parentp) will be modified to point at the deepest parent chain 2619 * element encountered during the search, as a helper for an insertion or 2620 * deletion. 2621 * 2622 * The new (*parentp) will be locked shared or exclusive (depending on flags), 2623 * and referenced, and the old will be unlocked and dereferenced (no change 2624 * if they are both the same). This is particularly important if the caller 2625 * wishes to insert a new chain, (*parentp) will be set properly even if NULL 2626 * is returned, as long as no error occurred. 2627 * 2628 * The matching chain will be returned locked according to flags. 2629 * 2630 * -- 2631 * 2632 * NULL is returned if no match was found, but (*parentp) will still 2633 * potentially be adjusted. 2634 * 2635 * On return (*key_nextp) will point to an iterative value for key_beg. 2636 * (If NULL is returned (*key_nextp) is set to (key_end + 1)). 2637 * 2638 * This function will also recurse up the chain if the key is not within the 2639 * current parent's range. (*parentp) can never be set to NULL. An iteration 2640 * can simply allow (*parentp) to float inside the loop. 2641 * 2642 * NOTE! chain->data is not always resolved. By default it will not be 2643 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 2644 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 2645 * BREF_TYPE_DATA as the device buffer can alias the logical file 2646 * buffer). 
 */

hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	volatile hammer2_mtx_t save_mtx;

	/*
	 * Translate HAMMER2_LOOKUP_* flags into the HAMMER2_RESOLVE_*
	 * lock modes used for the returned chain (how) and for indirect
	 * blocks traversed along the way (how_maybe).
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	hmp = parent->hmp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:
	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");

	/*
	 * MATCHIND case that does not require parent->data (do prior to
	 * parent->error check).
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}
		break;
	default:
		break;
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			hammer2_chain_lock(parent, how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 *
		 * Debugging: Enter permanent wait state instead of
		 * panicing on unexpectedly NULL data for the moment.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL) {
				kprintf("hammer2: unexpected NULL data "
					"on %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(B) type: %d",
			parent->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search of the media blockref array and the in-memory
	 * RB tree, whichever has the nearer element.
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		KKASSERT(chain == NULL);
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.
	 */
	bsave = *bref;
	if (chain == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how_maybe);
		} else {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how);
		}
		if (chain == NULL)
			goto again;
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);

		/*
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			hammer2_chain_lock(chain, how_maybe);
		} else {
			hammer2_chain_lock(chain, how);
		}
		KKASSERT(chain->parent == parent);
	}
	/* re-validate against races that occurred while we were unlocked */
	if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
	    chain->parent != parent) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		goto again;
	}


	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		save_mtx = parent->lock;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		chain = NULL;	/* SAFETY */
		goto again;
	}
done:
	/*
	 * All done, return the locked chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE! A chain->error must be tested by the caller upon return.
	 *	 *errorp is only set based on issues which occur while
	 *	 trying to reach the chain.
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after its index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.
 * This
 * chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/* continue just past the previously returned chain's range */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}

/*
 * Caller wishes to iterate chains under parent, loading new chains into
 * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
 * then call hammer2_chain_scan() repeatedly until a non-zero return.
 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
 * with the returned chain for the scan.  The returned *chainp will be
 * locked and referenced.  Any prior contents will be unlocked and dropped.
 *
 * Caller should check the return value.  A normal scan EOF will return
 * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
 * error trying to access parent data.  Any error in the returned chain
 * must be tested separately by the caller.
 *
 * (*chainp) is dropped on each scan, but will only be set if the returned
 * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
 * returned via *chainp.  The caller will get their bref only.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_bref = (parent, NULL) and
 * next_chain = (parent, bref).
 *
 * The passed-in parent must be locked and its data resolved.  The function
 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
 * in).  The caller must check the chain->bref.type either way.
3090 */
3091 int
3092 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
3093 hammer2_blockref_t *bref, int *firstp,
3094 int flags)
3095 {
3096 hammer2_dev_t *hmp;
3097 hammer2_blockref_t *base;
3098 hammer2_blockref_t *bref_ptr;
3099 hammer2_key_t key;
3100 hammer2_key_t next_key;
3101 hammer2_chain_t *chain = NULL;
3102 int count = 0;
3103 int how_always = HAMMER2_RESOLVE_ALWAYS;
3104 int how_maybe = HAMMER2_RESOLVE_MAYBE;
3105 int how;
3106 int generation;
3107 int maxloops = 300000;	/* sanity bound on retry loops */
3108 int error;
3109 
3110 hmp = parent->hmp;
3111 error = 0;
3112 
3113 /*
3114 * Scan flags borrowed from lookup.
3115 */
3116 if (flags & HAMMER2_LOOKUP_ALWAYS) {
3117 how_maybe = how_always;
3118 how = HAMMER2_RESOLVE_ALWAYS;
3119 } else if (flags & HAMMER2_LOOKUP_NODATA) {
3120 how = HAMMER2_RESOLVE_NEVER;
3121 } else {
3122 how = HAMMER2_RESOLVE_MAYBE;
3123 }
3124 if (flags & HAMMER2_LOOKUP_SHARED) {
3125 how_maybe |= HAMMER2_RESOLVE_SHARED;
3126 how_always |= HAMMER2_RESOLVE_SHARED;
3127 how |= HAMMER2_RESOLVE_SHARED;
3128 }
3129 
3130 /*
3131 * Calculate key to locate first/next element, unlocking the previous
3132 * element as we go. Be careful, the key calculation can overflow.
3133 *
3134 * (also reset bref to NULL)
3135 */
3136 if (*firstp) {
3137 key = 0;
3138 *firstp = 0;
3139 } else {
3140 key = bref->key + ((hammer2_key_t)1 << bref->keybits);
3141 if ((chain = *chainp) != NULL) {
3142 *chainp = NULL;
3143 hammer2_chain_unlock(chain);
3144 hammer2_chain_drop(chain);
3145 chain = NULL;
3146 }
3147 if (key == 0) {	/* key wrapped, iteration is complete */
3148 error |= HAMMER2_ERROR_EOF;
3149 goto done;
3150 }
3151 }
3152 
3153 again:
3154 if (parent->error) {
3155 error = parent->error;
3156 goto done;
3157 }
3158 if (--maxloops == 0)
3159 panic("hammer2_chain_scan: maxloops");
3160 
3161 /*
3162 * Locate the blockref array. Currently we do a fully associative
3163 * search through the array.
3164 */
3165 switch(parent->bref.type) {
3166 case HAMMER2_BREF_TYPE_INODE:
3167 /*
3168 * An inode with embedded data has no sub-chains.
3169 *
3170 * WARNING! Bulk scan code may pass a static chain marked
3171 * as BREF_TYPE_INODE with a copy of the volume
3172 * root blockset to snapshot the volume.
3173 */
3174 if (parent->data->ipdata.meta.op_flags &
3175 HAMMER2_OPFLAG_DIRECTDATA) {
3176 error |= HAMMER2_ERROR_EOF;
3177 goto done;
3178 }
3179 base = &parent->data->ipdata.u.blockset.blockref[0];
3180 count = HAMMER2_SET_COUNT;
3181 break;
3182 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3183 case HAMMER2_BREF_TYPE_INDIRECT:
3184 /*
3185 * Optimize indirect blocks in the INITIAL state to avoid
3186 * I/O.
3187 */
3188 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
3189 base = NULL;
3190 } else {
3191 if (parent->data == NULL)
3192 panic("parent->data is NULL");
3193 base = &parent->data->npdata[0];
3194 }
3195 count = parent->bytes / sizeof(hammer2_blockref_t);
3196 break;
3197 case HAMMER2_BREF_TYPE_VOLUME:
3198 base = &parent->data->voldata.sroot_blockset.blockref[0];
3199 count = HAMMER2_SET_COUNT;
3200 break;
3201 case HAMMER2_BREF_TYPE_FREEMAP:
3202 base = &parent->data->blkset.blockref[0];
3203 count = HAMMER2_SET_COUNT;
3204 break;
3205 default:
3206 panic("hammer2_chain_scan: unrecognized blockref type: %d",
3207 parent->bref.type);
3208 base = NULL; /* safety */
3209 count = 0; /* safety */
3210 }
3211 
3212 /*
3213 * Merged scan to find next candidate.
3214 *
3215 * hammer2_base_*() functions require the parent->core.live_* fields
3216 * to be synchronized.
3217 *
3218 * We need to hold the spinlock to access the block array and RB tree
3219 * and to interlock chain creation.
3220 */
3221 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
3222 hammer2_chain_countbrefs(parent, base, count);
3223 
3224 next_key = 0;
3225 bref_ptr = NULL;
3226 hammer2_spin_ex(&parent->core.spin);
3227 chain = hammer2_combined_find(parent, base, count,
3228 &next_key,
3229 key, HAMMER2_KEY_MAX,
3230 &bref_ptr);
3231 generation = parent->core.generation;	/* snapshot for stale-chain detection */
3232 
3233 /*
3234 * Exhausted parent chain, we're done.
3235 */
3236 if (bref_ptr == NULL) {
3237 hammer2_spin_unex(&parent->core.spin);
3238 KKASSERT(chain == NULL);
3239 error |= HAMMER2_ERROR_EOF;
3240 goto done;
3241 }
3242 
3243 /*
3244 * Copy into the supplied stack-based blockref.
3245 */
3246 *bref = *bref_ptr;
3247 
3248 /*
3249 * Selected from blockref or in-memory chain.
3250 */
3251 if (chain == NULL) {
3252 switch(bref->type) {
3253 case HAMMER2_BREF_TYPE_INODE:
3254 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3255 case HAMMER2_BREF_TYPE_INDIRECT:
3256 case HAMMER2_BREF_TYPE_VOLUME:
3257 case HAMMER2_BREF_TYPE_FREEMAP:
3258 /*
3259 * Recursion, always get the chain
3260 */
3261 hammer2_spin_unex(&parent->core.spin);
3262 chain = hammer2_chain_get(parent, generation,
3263 bref, how);
3264 if (chain == NULL)
3265 goto again;
3266 break;
3267 default:
3268 /*
3269 * No recursion, do not waste time instantiating
3270 * a chain, just iterate using the bref.
3271 */
3272 hammer2_spin_unex(&parent->core.spin);
3273 break;
3274 }
3275 } else {
3276 /*
3277 * Recursion or not we need the chain in order to supply
3278 * the bref.
3279 */
3280 hammer2_chain_ref(chain);
3281 hammer2_spin_unex(&parent->core.spin);
3282 hammer2_chain_lock(chain, how);
3283 }
3284 if (chain &&
3285 (bcmp(bref, &chain->bref, sizeof(*bref)) ||
3286 chain->parent != parent)) {	/* raced an insert/delete/rename, retry */
3287 hammer2_chain_unlock(chain);
3288 hammer2_chain_drop(chain);
3289 chain = NULL;
3290 goto again;
3291 }
3292 
3293 /*
3294 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
3295 *
3296 * NOTE: chain's key range is not relevant as there might be
3297 * one-offs within the range that are not deleted.
3298 *
3299 * NOTE: XXX this could create problems with scans used in
3300 * situations other than mount-time recovery.
3301 *
3302 * NOTE: Lookups can race delete-duplicate because
3303 * delete-duplicate does not lock the parent's core
3304 * (they just use the spinlock on the core).
3305 */
3306 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
3307 hammer2_chain_unlock(chain);
3308 hammer2_chain_drop(chain);
3309 chain = NULL;
3310 
3311 key = next_key;
3312 if (key == 0) {	/* no further elements, key space wrapped */
3313 error |= HAMMER2_ERROR_EOF;
3314 goto done;
3315 }
3316 goto again;
3317 }
3318 
3319 done:
3320 /*
3321 * All done, return the bref or NULL, supply chain if necessary.
3322 */
3323 if (chain)
3324 *chainp = chain;
3325 return (error);
3326 }
3327 
3328 /*
3329 * Create and return a new hammer2 system memory structure of the specified
3330 * key, type and size and insert it under (*parentp). This is a full
3331 * insertion, based on the supplied key/keybits, and may involve creating
3332 * indirect blocks and moving other chains around via delete/duplicate.
3333 *
3334 * This call can be made with parent == NULL as long as a non -1 methods
3335 * is supplied. hmp must also be supplied in this situation (otherwise
3336 * hmp is extracted from the supplied parent). The chain will be detached
3337 * from the topology. A later call with both parent and chain can be made
3338 * to attach it.
3339 *
3340 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
3341 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
3342 * FULL. This typically means that the caller is creating the chain after
3343 * doing a hammer2_chain_lookup().
3344 *
3345 * (*parentp) must be exclusively locked and may be replaced on return
3346 * depending on how much work the function had to do.
3347 * 3348 * (*parentp) must not be errored or this function will assert. 3349 * 3350 * (*chainp) usually starts out NULL and returns the newly created chain, 3351 * but if the caller desires the caller may allocate a disconnected chain 3352 * and pass it in instead. 3353 * 3354 * This function should NOT be used to insert INDIRECT blocks. It is 3355 * typically used to create/insert inodes and data blocks. 3356 * 3357 * Caller must pass-in an exclusively locked parent the new chain is to 3358 * be inserted under, and optionally pass-in a disconnected, exclusively 3359 * locked chain to insert (else we create a new chain). The function will 3360 * adjust (*parentp) as necessary, create or connect the chain, and 3361 * return an exclusively locked chain in *chainp. 3362 * 3363 * When creating a PFSROOT inode under the super-root, pmp is typically NULL 3364 * and will be reassigned. 3365 * 3366 * NOTE: returns HAMMER_ERROR_* flags 3367 */ 3368 int 3369 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp, 3370 hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods, 3371 hammer2_key_t key, int keybits, int type, size_t bytes, 3372 hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags) 3373 { 3374 hammer2_chain_t *chain; 3375 hammer2_chain_t *parent; 3376 hammer2_blockref_t *base; 3377 hammer2_blockref_t dummy; 3378 int allocated = 0; 3379 int error = 0; 3380 int count; 3381 int maxloops = 300000; 3382 3383 /* 3384 * Topology may be crossing a PFS boundary. 3385 */ 3386 parent = *parentp; 3387 if (parent) { 3388 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3389 KKASSERT(parent->error == 0); 3390 hmp = parent->hmp; 3391 } 3392 chain = *chainp; 3393 3394 if (chain == NULL) { 3395 /* 3396 * First allocate media space and construct the dummy bref, 3397 * then allocate the in-memory chain structure. Set the 3398 * INITIAL flag for fresh chains which do not have embedded 3399 * data. 
3400 * 3401 * XXX for now set the check mode of the child based on 3402 * the parent or, if the parent is an inode, the 3403 * specification in the inode. 3404 */ 3405 bzero(&dummy, sizeof(dummy)); 3406 dummy.type = type; 3407 dummy.key = key; 3408 dummy.keybits = keybits; 3409 dummy.data_off = hammer2_getradix(bytes); 3410 3411 /* 3412 * Inherit methods from parent by default. Primarily used 3413 * for BREF_TYPE_DATA. Non-data types *must* be set to 3414 * a non-NONE check algorithm. 3415 */ 3416 if (methods == -1) 3417 dummy.methods = parent->bref.methods; 3418 else 3419 dummy.methods = (uint8_t)methods; 3420 3421 if (type != HAMMER2_BREF_TYPE_DATA && 3422 HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) { 3423 dummy.methods |= 3424 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT); 3425 } 3426 3427 chain = hammer2_chain_alloc(hmp, pmp, &dummy); 3428 3429 /* 3430 * Lock the chain manually, chain_lock will load the chain 3431 * which we do NOT want to do. (note: chain->refs is set 3432 * to 1 by chain_alloc() for us, but lockcnt is not). 3433 */ 3434 chain->lockcnt = 1; 3435 hammer2_mtx_ex(&chain->lock); 3436 allocated = 1; 3437 3438 /* 3439 * Set INITIAL to optimize I/O. The flag will generally be 3440 * processed when we call hammer2_chain_modify(). 3441 * 3442 * Recalculate bytes to reflect the actual media block 3443 * allocation. Handle special case radix 0 == 0 bytes. 
3444 */ 3445 bytes = (size_t)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 3446 if (bytes) 3447 bytes = (hammer2_off_t)1 << bytes; 3448 chain->bytes = bytes; 3449 3450 switch(type) { 3451 case HAMMER2_BREF_TYPE_VOLUME: 3452 case HAMMER2_BREF_TYPE_FREEMAP: 3453 panic("hammer2_chain_create: called with volume type"); 3454 break; 3455 case HAMMER2_BREF_TYPE_INDIRECT: 3456 panic("hammer2_chain_create: cannot be used to" 3457 "create indirect block"); 3458 break; 3459 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3460 panic("hammer2_chain_create: cannot be used to" 3461 "create freemap root or node"); 3462 break; 3463 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3464 KKASSERT(bytes == sizeof(chain->data->bmdata)); 3465 /* fall through */ 3466 case HAMMER2_BREF_TYPE_DIRENT: 3467 case HAMMER2_BREF_TYPE_INODE: 3468 case HAMMER2_BREF_TYPE_DATA: 3469 default: 3470 /* 3471 * leave chain->data NULL, set INITIAL 3472 */ 3473 KKASSERT(chain->data == NULL); 3474 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 3475 break; 3476 } 3477 } else { 3478 /* 3479 * We are reattaching a previously deleted chain, possibly 3480 * under a new parent and possibly with a new key/keybits. 3481 * The chain does not have to be in a modified state. The 3482 * UPDATE flag will be set later on in this routine. 3483 * 3484 * Do NOT mess with the current state of the INITIAL flag. 3485 */ 3486 chain->bref.key = key; 3487 chain->bref.keybits = keybits; 3488 if (chain->flags & HAMMER2_CHAIN_DELETED) 3489 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3490 KKASSERT(chain->parent == NULL); 3491 } 3492 3493 /* 3494 * Set the appropriate bref flag if requested. 3495 * 3496 * NOTE! Callers can call this function to move chains without 3497 * knowing about special flags, so don't clear bref flags 3498 * here! 
3499 */ 3500 if (flags & HAMMER2_INSERT_PFSROOT) 3501 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 3502 3503 if (parent == NULL) 3504 goto skip; 3505 3506 /* 3507 * Calculate how many entries we have in the blockref array and 3508 * determine if an indirect block is required when inserting into 3509 * the parent. 3510 */ 3511 again: 3512 if (--maxloops == 0) 3513 panic("hammer2_chain_create: maxloops"); 3514 3515 switch(parent->bref.type) { 3516 case HAMMER2_BREF_TYPE_INODE: 3517 if ((parent->data->ipdata.meta.op_flags & 3518 HAMMER2_OPFLAG_DIRECTDATA) != 0) { 3519 kprintf("hammer2: parent set for direct-data! " 3520 "pkey=%016jx ckey=%016jx\n", 3521 parent->bref.key, 3522 chain->bref.key); 3523 } 3524 KKASSERT((parent->data->ipdata.meta.op_flags & 3525 HAMMER2_OPFLAG_DIRECTDATA) == 0); 3526 KKASSERT(parent->data != NULL); 3527 base = &parent->data->ipdata.u.blockset.blockref[0]; 3528 count = HAMMER2_SET_COUNT; 3529 break; 3530 case HAMMER2_BREF_TYPE_INDIRECT: 3531 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3532 if (parent->flags & HAMMER2_CHAIN_INITIAL) 3533 base = NULL; 3534 else 3535 base = &parent->data->npdata[0]; 3536 count = parent->bytes / sizeof(hammer2_blockref_t); 3537 break; 3538 case HAMMER2_BREF_TYPE_VOLUME: 3539 KKASSERT(parent->data != NULL); 3540 base = &parent->data->voldata.sroot_blockset.blockref[0]; 3541 count = HAMMER2_SET_COUNT; 3542 break; 3543 case HAMMER2_BREF_TYPE_FREEMAP: 3544 KKASSERT(parent->data != NULL); 3545 base = &parent->data->blkset.blockref[0]; 3546 count = HAMMER2_SET_COUNT; 3547 break; 3548 default: 3549 panic("hammer2_chain_create: unrecognized blockref type: %d", 3550 parent->bref.type); 3551 base = NULL; 3552 count = 0; 3553 break; 3554 } 3555 3556 /* 3557 * Make sure we've counted the brefs 3558 */ 3559 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 3560 hammer2_chain_countbrefs(parent, base, count); 3561 3562 KASSERT(parent->core.live_count >= 0 && 3563 parent->core.live_count <= count, 3564 ("bad live_count %d/%d 
(%02x, %d)", 3565 parent->core.live_count, count, 3566 parent->bref.type, parent->bytes)); 3567 3568 /* 3569 * If no free blockref could be found we must create an indirect 3570 * block and move a number of blockrefs into it. With the parent 3571 * locked we can safely lock each child in order to delete+duplicate 3572 * it without causing a deadlock. 3573 * 3574 * This may return the new indirect block or the old parent depending 3575 * on where the key falls. NULL is returned on error. 3576 */ 3577 if (parent->core.live_count == count) { 3578 hammer2_chain_t *nparent; 3579 3580 KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0); 3581 3582 nparent = hammer2_chain_create_indirect(parent, key, keybits, 3583 mtid, type, &error); 3584 if (nparent == NULL) { 3585 if (allocated) 3586 hammer2_chain_drop(chain); 3587 chain = NULL; 3588 goto done; 3589 } 3590 if (parent != nparent) { 3591 hammer2_chain_unlock(parent); 3592 hammer2_chain_drop(parent); 3593 parent = *parentp = nparent; 3594 } 3595 goto again; 3596 } 3597 3598 /* 3599 * fall through if parent, or skip to here if no parent. 3600 */ 3601 skip: 3602 if (chain->flags & HAMMER2_CHAIN_DELETED) 3603 kprintf("Inserting deleted chain @%016jx\n", 3604 chain->bref.key); 3605 3606 /* 3607 * Link the chain into its parent. 3608 */ 3609 if (chain->parent != NULL) 3610 panic("hammer2: hammer2_chain_create: chain already connected"); 3611 KKASSERT(chain->parent == NULL); 3612 if (parent) { 3613 KKASSERT(parent->core.live_count < count); 3614 hammer2_chain_insert(parent, chain, 3615 HAMMER2_CHAIN_INSERT_SPIN | 3616 HAMMER2_CHAIN_INSERT_LIVE, 3617 0); 3618 } 3619 3620 if (allocated) { 3621 /* 3622 * Mark the newly created chain modified. This will cause 3623 * UPDATE to be set and process the INITIAL flag. 3624 * 3625 * Device buffers are not instantiated for DATA elements 3626 * as these are handled by logical buffers. 
3627 * 3628 * Indirect and freemap node indirect blocks are handled 3629 * by hammer2_chain_create_indirect() and not by this 3630 * function. 3631 * 3632 * Data for all other bref types is expected to be 3633 * instantiated (INODE, LEAF). 3634 */ 3635 switch(chain->bref.type) { 3636 case HAMMER2_BREF_TYPE_DATA: 3637 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3638 case HAMMER2_BREF_TYPE_DIRENT: 3639 case HAMMER2_BREF_TYPE_INODE: 3640 error = hammer2_chain_modify(chain, mtid, dedup_off, 3641 HAMMER2_MODIFY_OPTDATA); 3642 break; 3643 default: 3644 /* 3645 * Remaining types are not supported by this function. 3646 * In particular, INDIRECT and LEAF_NODE types are 3647 * handled by create_indirect(). 3648 */ 3649 panic("hammer2_chain_create: bad type: %d", 3650 chain->bref.type); 3651 /* NOT REACHED */ 3652 break; 3653 } 3654 } else { 3655 /* 3656 * When reconnecting a chain we must set UPDATE and 3657 * setflush so the flush recognizes that it must update 3658 * the bref in the parent. 3659 */ 3660 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) 3661 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3662 } 3663 3664 /* 3665 * We must setflush(parent) to ensure that it recurses through to 3666 * chain. setflush(chain) might not work because ONFLUSH is possibly 3667 * already set in the chain (so it won't recurse up to set it in the 3668 * parent). 3669 */ 3670 if (parent) 3671 hammer2_chain_setflush(parent); 3672 3673 done: 3674 *chainp = chain; 3675 3676 return (error); 3677 } 3678 3679 /* 3680 * Move the chain from its old parent to a new parent. The chain must have 3681 * already been deleted or already disconnected (or never associated) with 3682 * a parent. The chain is reassociated with the new parent and the deleted 3683 * flag will be cleared (no longer deleted). The chain's modification state 3684 * is not altered. 
3685 *
3686 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION
3687 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
3688 * FULL. This typically means that the caller is creating the chain after
3689 * doing a hammer2_chain_lookup().
3690 *
3691 * Neither (parent) or (chain) can be errored.
3692 *
3693 * If (parent) is non-NULL then the chain is inserted under the parent.
3694 *
3695 * If (parent) is NULL then the newly duplicated chain is not inserted
3696 * anywhere, similar to if it had just been chain_alloc()'d (suitable for
3697 * passing into hammer2_chain_create() after this function returns).
3698 *
3699 * WARNING! This function calls create which means it can insert indirect
3700 * blocks. This can cause other unrelated chains in the parent to
3701 * be moved to a newly inserted indirect block in addition to the
3702 * specific chain.
3703 */
3704 void
3705 hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
3706 hammer2_tid_t mtid, int flags)
3707 {
3708 hammer2_blockref_t *bref;
3709 hammer2_dev_t *hmp;
3710 hammer2_chain_t *parent;
3711 size_t bytes;
3712 
3713 /*
3714 * WARNING! We should never resolve DATA to device buffers
3715 * (XXX allow it if the caller did?), and since
3716 * we currently do not have the logical buffer cache
3717 * buffer in-hand to fix its cached physical offset
3718 * we also force the modify code to not COW it. XXX
3719 *
3720 * NOTE! We allow error'd chains to be renamed. The bref itself
3721 * is good and can be renamed. The content, however, may
3722 * be inaccessible.
3723 */
3724 hmp = chain->hmp;	/* NOTE(review): hmp is not referenced below — verify intent */
3725 KKASSERT(chain->parent == NULL);
3726 /*KKASSERT(chain->error == 0); allow */
3727 
3728 /*
3729 * Now create a duplicate of the chain structure, associating
3730 * it with the same core, making it the same size, pointing it
3731 * to the same bref (the same media block).
3732 *
3733 * NOTE: Handle special radix == 0 case (means 0 bytes).
3734 */
3735 bref = &chain->bref;
3736 bytes = (size_t)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
3737 if (bytes)
3738 bytes = (hammer2_off_t)1 << bytes;	/* NOTE(review): 'bytes' is unused below; chain->bytes is what gets passed — verify */
3739 
3740 /*
3741 * If parent is not NULL the duplicated chain will be entered under
3742 * the parent and the UPDATE bit set to tell flush to update
3743 * the blockref.
3744 *
3745 * We must setflush(parent) to ensure that it recurses through to
3746 * chain. setflush(chain) might not work because ONFLUSH is possibly
3747 * already set in the chain (so it won't recurse up to set it in the
3748 * parent).
3749 *
3750 * Having both chains locked is extremely important for atomicy.
3751 */
3752 if (parentp && (parent = *parentp) != NULL) {
3753 KKASSERT(hammer2_mtx_owned(&parent->lock));
3754 KKASSERT(parent->refs > 0);
3755 KKASSERT(parent->error == 0);
3756 
3757 hammer2_chain_create(parentp, &chain, NULL, chain->pmp,
3758 HAMMER2_METH_DEFAULT,
3759 bref->key, bref->keybits, bref->type,
3760 chain->bytes, mtid, 0, flags);
3761 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
3762 hammer2_chain_setflush(*parentp);
3763 }
3764 }
3765 
3766 /*
3767 * This works in tandem with delete_obref() to install a blockref in
3768 * (typically) an indirect block that is associated with the chain being
3769 * moved to *parentp.
3770 *
3771 * The reason we need this function is that the caller needs to maintain
3772 * the blockref as it was, and not generate a new blockref for what might
3773 * be a modified chain. Otherwise stuff will leak into the flush that
3774 * the flush code's FLUSH_INODE_STOP flag is unable to catch.
3775 *
3776 * It is EXTREMELY important that we properly set CHAIN_BMAPUPD and
3777 * CHAIN_UPDATE. We must set BMAPUPD if the bref does not match, and
3778 * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if
3779 * it does. Otherwise we can end up in a situation where H2 is unable to
3780 * clean up the in-memory chain topology.
3781 *
3782 * The reason for this is that flushes do not generally flush through
3783 * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq
3784 * or sideq to properly flush and dispose of the related inode chain's flags.
3785 * Situations where the inode is not actually modified by the frontend,
3786 * but where we have to move the related chains around as we insert or cleanup
3787 * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory
3788 * inode chain that does not have a hammer2_inode_t associated with it.
3789 */
3790 void
3791 hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain,
3792 hammer2_tid_t mtid, int flags,
3793 hammer2_blockref_t *obref)
3794 {
3795 hammer2_chain_rename(parentp, chain, mtid, flags);	/* re-insert chain under (*parentp) */
3796 
3797 if (obref->type != HAMMER2_BREF_TYPE_EMPTY) {
3798 hammer2_blockref_t *tbase;
3799 int tcount;
3800 
3801 KKASSERT((chain->flags & HAMMER2_CHAIN_BMAPPED) == 0);
3802 hammer2_chain_modify(*parentp, mtid, 0, 0);	/* NOTE(review): return value not checked — verify intent */
3803 tbase = hammer2_chain_base_and_count(*parentp, &tcount);
3804 hammer2_base_insert(*parentp, tbase, tcount, chain, obref);
3805 if (bcmp(obref, &chain->bref, sizeof(chain->bref))) {
3806 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD |
3807 HAMMER2_CHAIN_UPDATE);
3808 } else {
3809 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
3810 }
3811 }
3812 }
3813 
3814 /*
3815 * Helper function for deleting chains.
3816 *
3817 * The chain is removed from the live view (the RBTREE) as well as the parent's
3818 * blockmap. Both chain and its parent must be locked.
3819 *
3820 * parent may not be errored. chain can be errored.
3821 */
3822 static int
3823 _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
3824 hammer2_tid_t mtid, int flags,
3825 hammer2_blockref_t *obref)
3826 {
3827 hammer2_dev_t *hmp;
3828 int error = 0;
3829 
3830 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
3831 KKASSERT(chain->parent == parent);
3832 hmp = chain->hmp;	/* NOTE(review): hmp is not referenced below in this function */
3833 
3834 if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
3835 /*
3836 * Chain is blockmapped, so there must be a parent.
3837 * Atomically remove the chain from the parent and remove
3838 * the blockmap entry. The parent must be set modified
3839 * to remove the blockmap entry.
3840 */
3841 hammer2_blockref_t *base;
3842 int count;
3843 
3844 KKASSERT(parent != NULL);
3845 KKASSERT(parent->error == 0);
3846 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
3847 error = hammer2_chain_modify(parent, mtid, 0, 0);
3848 if (error)
3849 goto done;
3850 
3851 /*
3852 * Calculate blockmap pointer
3853 */
3854 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
3855 hammer2_spin_ex(&chain->core.spin);	/* lock order: chain, then parent */
3856 hammer2_spin_ex(&parent->core.spin);
3857 
3858 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3859 atomic_add_int(&parent->core.live_count, -1);
3860 ++parent->core.generation;
3861 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
3862 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
3863 --parent->core.chain_count;
3864 chain->parent = NULL;
3865 
3866 switch(parent->bref.type) {
3867 case HAMMER2_BREF_TYPE_INODE:
3868 /*
3869 * Access the inode's block array. However, there
3870 * is no block array if the inode is flagged
3871 * DIRECTDATA.
3872 */
3873 if (parent->data &&
3874 (parent->data->ipdata.meta.op_flags &
3875 HAMMER2_OPFLAG_DIRECTDATA) == 0) {
3876 base =
3877 &parent->data->ipdata.u.blockset.blockref[0];
3878 } else {
3879 base = NULL;
3880 }
3881 count = HAMMER2_SET_COUNT;
3882 break;
3883 case HAMMER2_BREF_TYPE_INDIRECT:
3884 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
3885 if (parent->data)
3886 base = &parent->data->npdata[0];
3887 else
3888 base = NULL;
3889 count = parent->bytes / sizeof(hammer2_blockref_t);
3890 break;
3891 case HAMMER2_BREF_TYPE_VOLUME:
3892 base = &parent->data->voldata.
3893 sroot_blockset.blockref[0];
3894 count = HAMMER2_SET_COUNT;
3895 break;
3896 case HAMMER2_BREF_TYPE_FREEMAP:
3897 base = &parent->data->blkset.blockref[0];
3898 count = HAMMER2_SET_COUNT;
3899 break;
3900 default:
3901 base = NULL;
3902 count = 0;
3903 panic("_hammer2_chain_delete_helper: "
3904 "unrecognized blockref type: %d",
3905 parent->bref.type);
3906 }
3907 
3908 /*
3909 * delete blockmapped chain from its parent.
3910 *
3911 * The parent is not affected by any statistics in chain
3912 * which are pending synchronization. That is, there is
3913 * nothing to undo in the parent since they have not yet
3914 * been incorporated into the parent.
3915 *
3916 * The parent is affected by statistics stored in inodes.
3917 * Those have already been synchronized, so they must be
3918 * undone. XXX split update possible w/delete in middle?
3919 */
3920 if (base) {
3921 hammer2_base_delete(parent, base, count, chain, obref);
3922 }
3923 hammer2_spin_unex(&parent->core.spin);
3924 hammer2_spin_unex(&chain->core.spin);
3925 } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
3926 /*
3927 * Chain is not blockmapped but a parent is present.
3928 * Atomically remove the chain from the parent. There is
3929 * no blockmap entry to remove.
3930 *
3931 * Because chain was associated with a parent but not
3932 * synchronized, the chain's *_count_up fields contain
3933 * inode adjustment statistics which must be undone.
3934 */
3935 hammer2_spin_ex(&chain->core.spin);	/* lock order: chain, then parent */
3936 hammer2_spin_ex(&parent->core.spin);
3937 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3938 atomic_add_int(&parent->core.live_count, -1);
3939 ++parent->core.generation;
3940 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
3941 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
3942 --parent->core.chain_count;
3943 chain->parent = NULL;
3944 hammer2_spin_unex(&parent->core.spin);
3945 hammer2_spin_unex(&chain->core.spin);
3946 } else {
3947 /*
3948 * Chain is not blockmapped and has no parent. This
3949 * is a degenerate case.
3950 */
3951 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
3952 }
3953 done:
3954 return error;
3955 }
3956 
3957 /*
3958 * Create an indirect block that covers one or more of the elements in the
3959 * current parent. Either returns the existing parent with no locking or
3960 * ref changes or returns the new indirect block locked and referenced
3961 * and leaving the original parent lock/ref intact as well.
3962 *
3963 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
3964 *
3965 * The returned chain depends on where the specified key falls.
3966 *
3967 * The key/keybits for the indirect mode only needs to follow three rules:
3968 *
3969 * (1) That all elements underneath it fit within its key space and
3970 *
3971 * (2) That all elements outside it are outside its key space.
3972 *
3973 * (3) When creating the new indirect block any elements in the current
3974 * parent that fit within the new indirect block's keyspace must be
3975 * moved into the new indirect block.
3976 *
3977 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
3978 * keyspace than the current parent, but lookup/iteration rules will
3979 * ensure (and must ensure) that rule (2) for all parents leading up
3980 * to the nearest inode or the root volume header is adhered to. This
3981 * is accomplished by always recursing through matching keyspaces in
3982 * the hammer2_chain_lookup() and hammer2_chain_next() API.
3983 *
3984 * The current implementation calculates the current worst-case keyspace by
3985 * iterating the current parent and then divides it into two halves, choosing
3986 * whichever half has the most elements (not necessarily the half containing
3987 * the requested key).
3988 *
3989 * We can also opt to use the half with the least number of elements. This
3990 * causes lower-numbered keys (aka logical file offsets) to recurse through
3991 * fewer indirect blocks and higher-numbered keys to recurse through more.
3992 * This also has the risk of not moving enough elements to the new indirect
3993 * block and being forced to create several indirect blocks before the element
3994 * can be inserted.
3995 *
3996 * Must be called with an exclusively locked parent.
3997 * 3998 * NOTE: *errorp set to HAMMER_ERROR_* flags 3999 */ 4000 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent, 4001 hammer2_key_t *keyp, int keybits, 4002 hammer2_blockref_t *base, int count); 4003 static int hammer2_chain_indkey_file(hammer2_chain_t *parent, 4004 hammer2_key_t *keyp, int keybits, 4005 hammer2_blockref_t *base, int count, 4006 int ncount); 4007 static int hammer2_chain_indkey_dir(hammer2_chain_t *parent, 4008 hammer2_key_t *keyp, int keybits, 4009 hammer2_blockref_t *base, int count, 4010 int ncount); 4011 static 4012 hammer2_chain_t * 4013 hammer2_chain_create_indirect(hammer2_chain_t *parent, 4014 hammer2_key_t create_key, int create_bits, 4015 hammer2_tid_t mtid, int for_type, int *errorp) 4016 { 4017 hammer2_dev_t *hmp; 4018 hammer2_blockref_t *base; 4019 hammer2_blockref_t *bref; 4020 hammer2_blockref_t bsave; 4021 hammer2_blockref_t dummy; 4022 hammer2_chain_t *chain; 4023 hammer2_chain_t *ichain; 4024 hammer2_key_t key = create_key; 4025 hammer2_key_t key_beg; 4026 hammer2_key_t key_end; 4027 hammer2_key_t key_next; 4028 int keybits = create_bits; 4029 int count; 4030 int ncount; 4031 int nbytes; 4032 int loops; 4033 int error; 4034 int reason; 4035 int generation; 4036 int maxloops = 300000; 4037 4038 /* 4039 * Calculate the base blockref pointer or NULL if the chain 4040 * is known to be empty. We need to calculate the array count 4041 * for RB lookups either way. 4042 */ 4043 hmp = parent->hmp; 4044 KKASSERT(hammer2_mtx_owned(&parent->lock)); 4045 4046 /* 4047 * Pre-modify the parent now to avoid having to deal with error 4048 * processing if we tried to later (in the middle of our loop). 4049 * 4050 * We are going to be moving bref's around, the indirect blocks 4051 * cannot be in an initial state. Do not pass MODIFY_OPTDATA. 
4052 */ 4053 *errorp = hammer2_chain_modify(parent, mtid, 0, 0); 4054 if (*errorp) { 4055 kprintf("hammer2_chain_create_indirect: error %08x %s\n", 4056 *errorp, hammer2_error_str(*errorp)); 4057 return NULL; 4058 } 4059 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0); 4060 4061 /*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/ 4062 base = hammer2_chain_base_and_count(parent, &count); 4063 4064 /* 4065 * How big should our new indirect block be? It has to be at least 4066 * as large as its parent for splits to work properly. 4067 * 4068 * The freemap uses a specific indirect block size. The number of 4069 * levels are built dynamically and ultimately depend on the size 4070 * volume. Because freemap blocks are taken from the reserved areas 4071 * of the volume our goal is efficiency (fewer levels) and not so 4072 * much to save disk space. 4073 * 4074 * The first indirect block level for a directory usually uses 4075 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries). Due to 4076 * the hash mechanism, this typically gives us a nominal 4077 * 32 * 4 entries with one level of indirection. 4078 * 4079 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE 4080 * indirect blocks. The initial 4 entries in the inode gives us 4081 * 256KB. Up to 4 indirect blocks gives us 32MB. Three levels 4082 * of indirection gives us 137GB, and so forth. H2 can support 4083 * huge file sizes but they are not typical, so we try to stick 4084 * with compactness and do not use a larger indirect block size. 4085 * 4086 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but 4087 * due to the way indirect blocks are created this usually winds 4088 * up being extremely inefficient for small files. Even though 4089 * 16KB requires more levels of indirection for very large files, 4090 * the 16KB records can be ganged together into 64KB DIOs. 
4091 */ 4092 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 4093 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 4094 nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE; 4095 } else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) { 4096 if (parent->data->ipdata.meta.type == 4097 HAMMER2_OBJTYPE_DIRECTORY) 4098 nbytes = HAMMER2_IND_BYTES_MIN; /* 4KB = 32 entries */ 4099 else 4100 nbytes = HAMMER2_IND_BYTES_NOM; /* 16KB = ~8MB file */ 4101 4102 } else { 4103 nbytes = HAMMER2_IND_BYTES_NOM; 4104 } 4105 if (nbytes < count * sizeof(hammer2_blockref_t)) { 4106 KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE && 4107 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF); 4108 nbytes = count * sizeof(hammer2_blockref_t); 4109 } 4110 ncount = nbytes / sizeof(hammer2_blockref_t); 4111 4112 /* 4113 * When creating an indirect block for a freemap node or leaf 4114 * the key/keybits must be fitted to static radix levels because 4115 * particular radix levels use particular reserved blocks in the 4116 * related zone. 4117 * 4118 * This routine calculates the key/radix of the indirect block 4119 * we need to create, and whether it is on the high-side or the 4120 * low-side. 4121 */ 4122 switch(for_type) { 4123 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 4124 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 4125 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits, 4126 base, count); 4127 break; 4128 case HAMMER2_BREF_TYPE_DATA: 4129 keybits = hammer2_chain_indkey_file(parent, &key, keybits, 4130 base, count, ncount); 4131 break; 4132 case HAMMER2_BREF_TYPE_DIRENT: 4133 case HAMMER2_BREF_TYPE_INODE: 4134 keybits = hammer2_chain_indkey_dir(parent, &key, keybits, 4135 base, count, ncount); 4136 break; 4137 default: 4138 panic("illegal indirect block for bref type %d", for_type); 4139 break; 4140 } 4141 4142 /* 4143 * Normalize the key for the radix being represented, keeping the 4144 * high bits and throwing away the low bits. 
4145 */ 4146 key &= ~(((hammer2_key_t)1 << keybits) - 1); 4147 4148 /* 4149 * Ok, create our new indirect block 4150 */ 4151 bzero(&dummy, sizeof(dummy)); 4152 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 4153 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 4154 dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE; 4155 } else { 4156 dummy.type = HAMMER2_BREF_TYPE_INDIRECT; 4157 } 4158 dummy.key = key; 4159 dummy.keybits = keybits; 4160 dummy.data_off = hammer2_getradix(nbytes); 4161 dummy.methods = 4162 HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) | 4163 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE); 4164 4165 ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy); 4166 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL); 4167 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE); 4168 /* ichain has one ref at this point */ 4169 4170 /* 4171 * We have to mark it modified to allocate its block, but use 4172 * OPTDATA to allow it to remain in the INITIAL state. Otherwise 4173 * it won't be acted upon by the flush code. 4174 * 4175 * XXX remove OPTDATA, we need a fully initialized indirect block to 4176 * be able to move the original blockref. 4177 */ 4178 *errorp = hammer2_chain_modify(ichain, mtid, 0, 0); 4179 if (*errorp) { 4180 kprintf("hammer2_chain_create_indirect: error %08x %s\n", 4181 *errorp, hammer2_error_str(*errorp)); 4182 hammer2_chain_unlock(ichain); 4183 hammer2_chain_drop(ichain); 4184 return NULL; 4185 } 4186 KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0); 4187 4188 /* 4189 * Iterate the original parent and move the matching brefs into 4190 * the new indirect block. 4191 * 4192 * XXX handle flushes. 4193 */ 4194 key_beg = 0; 4195 key_end = HAMMER2_KEY_MAX; 4196 key_next = 0; /* avoid gcc warnings */ 4197 hammer2_spin_ex(&parent->core.spin); 4198 loops = 0; 4199 reason = 0; 4200 4201 for (;;) { 4202 /* 4203 * Parent may have been modified, relocating its block array. 4204 * Reload the base pointer. 
4205 */ 4206 base = hammer2_chain_base_and_count(parent, &count); 4207 4208 if (++loops > 100000) { 4209 hammer2_spin_unex(&parent->core.spin); 4210 panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n", 4211 reason, parent, base, count, key_next); 4212 } 4213 4214 /* 4215 * NOTE: spinlock stays intact, returned chain (if not NULL) 4216 * is not referenced or locked which means that we 4217 * cannot safely check its flagged / deletion status 4218 * until we lock it. 4219 */ 4220 chain = hammer2_combined_find(parent, base, count, 4221 &key_next, 4222 key_beg, key_end, 4223 &bref); 4224 generation = parent->core.generation; 4225 if (bref == NULL) 4226 break; 4227 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4228 4229 /* 4230 * Skip keys that are not within the key/radix of the new 4231 * indirect block. They stay in the parent. 4232 */ 4233 if (rounddown2(key ^ bref->key, (hammer2_key_t)1 << keybits) != 0) { 4234 goto next_key_spinlocked; 4235 } 4236 4237 /* 4238 * Load the new indirect block by acquiring the related 4239 * chains (potentially from media as it might not be 4240 * in-memory). Then move it to the new parent (ichain). 4241 * 4242 * chain is referenced but not locked. We must lock the 4243 * chain to obtain definitive state. 4244 */ 4245 bsave = *bref; 4246 if (chain) { 4247 /* 4248 * Use chain already present in the RBTREE 4249 */ 4250 hammer2_chain_ref(chain); 4251 hammer2_spin_unex(&parent->core.spin); 4252 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER); 4253 } else { 4254 /* 4255 * Get chain for blockref element. _get returns NULL 4256 * on insertion race. 4257 */ 4258 hammer2_spin_unex(&parent->core.spin); 4259 chain = hammer2_chain_get(parent, generation, &bsave, 4260 HAMMER2_RESOLVE_NEVER); 4261 if (chain == NULL) { 4262 reason = 1; 4263 hammer2_spin_ex(&parent->core.spin); 4264 continue; 4265 } 4266 } 4267 4268 /* 4269 * This is always live so if the chain has been deleted 4270 * we raced someone and we have to retry. 
4271 * 4272 * NOTE: Lookups can race delete-duplicate because 4273 * delete-duplicate does not lock the parent's core 4274 * (they just use the spinlock on the core). 4275 * 4276 * (note reversed logic for this one) 4277 */ 4278 if (bcmp(&bsave, &chain->bref, sizeof(bsave)) || 4279 chain->parent != parent || 4280 (chain->flags & HAMMER2_CHAIN_DELETED)) { 4281 hammer2_chain_unlock(chain); 4282 hammer2_chain_drop(chain); 4283 if (hammer2_debug & 0x0040) { 4284 kprintf("LOST PARENT RETRY " 4285 "RETRY (%p,%p)->%p %08x\n", 4286 parent, chain->parent, chain, chain->flags); 4287 } 4288 hammer2_spin_ex(&parent->core.spin); 4289 continue; 4290 } 4291 4292 /* 4293 * Shift the chain to the indirect block. 4294 * 4295 * WARNING! No reason for us to load chain data, pass NOSTATS 4296 * to prevent delete/insert from trying to access 4297 * inode stats (and thus asserting if there is no 4298 * chain->data loaded). 4299 * 4300 * WARNING! The (parent, chain) deletion may modify the parent 4301 * and invalidate the base pointer. 4302 * 4303 * WARNING! Parent must already be marked modified, so we 4304 * can assume that chain_delete always suceeds. 4305 * 4306 * WARNING! hammer2_chain_repchange() does not have to be 4307 * called (and doesn't work anyway because we are 4308 * only doing a partial shift). A recursion that is 4309 * in-progress can continue at the current parent 4310 * and will be able to properly find its next key. 
4311 */ 4312 error = hammer2_chain_delete_obref(parent, chain, mtid, 0, 4313 &bsave); 4314 KKASSERT(error == 0); 4315 hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bsave); 4316 hammer2_chain_unlock(chain); 4317 hammer2_chain_drop(chain); 4318 KKASSERT(parent->refs > 0); 4319 chain = NULL; 4320 base = NULL; /* safety */ 4321 hammer2_spin_ex(&parent->core.spin); 4322 next_key_spinlocked: 4323 if (--maxloops == 0) 4324 panic("hammer2_chain_create_indirect: maxloops"); 4325 reason = 4; 4326 if (key_next == 0 || key_next > key_end) 4327 break; 4328 key_beg = key_next; 4329 /* loop */ 4330 } 4331 hammer2_spin_unex(&parent->core.spin); 4332 4333 /* 4334 * Insert the new indirect block into the parent now that we've 4335 * cleared out some entries in the parent. We calculated a good 4336 * insertion index in the loop above (ichain->index). 4337 * 4338 * We don't have to set UPDATE here because we mark ichain 4339 * modified down below (so the normal modified -> flush -> set-moved 4340 * sequence applies). 4341 * 4342 * The insertion shouldn't race as this is a completely new block 4343 * and the parent is locked. 4344 */ 4345 base = NULL; /* safety, parent modify may change address */ 4346 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 4347 KKASSERT(parent->core.live_count < count); 4348 hammer2_chain_insert(parent, ichain, 4349 HAMMER2_CHAIN_INSERT_SPIN | 4350 HAMMER2_CHAIN_INSERT_LIVE, 4351 0); 4352 4353 /* 4354 * Make sure flushes propogate after our manual insertion. 4355 */ 4356 hammer2_chain_setflush(ichain); 4357 hammer2_chain_setflush(parent); 4358 4359 /* 4360 * Figure out what to return. 4361 */ 4362 if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits)) { 4363 /* 4364 * Key being created is outside the key range, 4365 * return the original parent. 4366 */ 4367 hammer2_chain_unlock(ichain); 4368 hammer2_chain_drop(ichain); 4369 } else { 4370 /* 4371 * Otherwise its in the range, return the new parent. 
4372 * (leave both the new and old parent locked). 4373 */ 4374 parent = ichain; 4375 } 4376 4377 return(parent); 4378 } 4379 4380 /* 4381 * Do maintenance on an indirect chain. Both parent and chain are locked. 4382 * 4383 * Returns non-zero if (chain) is deleted, either due to being empty or 4384 * because its children were safely moved into the parent. 4385 */ 4386 int 4387 hammer2_chain_indirect_maintenance(hammer2_chain_t *parent, 4388 hammer2_chain_t *chain) 4389 { 4390 hammer2_blockref_t *chain_base; 4391 hammer2_blockref_t *base; 4392 hammer2_blockref_t *bref; 4393 hammer2_blockref_t bsave; 4394 hammer2_key_t key_next; 4395 hammer2_key_t key_beg; 4396 hammer2_key_t key_end; 4397 hammer2_chain_t *sub; 4398 int chain_count; 4399 int count; 4400 int error; 4401 int generation; 4402 4403 /* 4404 * Make sure we have an accurate live_count 4405 */ 4406 if ((chain->flags & (HAMMER2_CHAIN_INITIAL | 4407 HAMMER2_CHAIN_COUNTEDBREFS)) == 0) { 4408 base = &chain->data->npdata[0]; 4409 count = chain->bytes / sizeof(hammer2_blockref_t); 4410 hammer2_chain_countbrefs(chain, base, count); 4411 } 4412 4413 /* 4414 * If the indirect block is empty we can delete it. 4415 * (ignore deletion error) 4416 */ 4417 if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) { 4418 hammer2_chain_delete(parent, chain, 4419 chain->bref.modify_tid, 4420 HAMMER2_DELETE_PERMANENT); 4421 hammer2_chain_repchange(parent, chain); 4422 return 1; 4423 } 4424 4425 base = hammer2_chain_base_and_count(parent, &count); 4426 4427 if ((parent->flags & (HAMMER2_CHAIN_INITIAL | 4428 HAMMER2_CHAIN_COUNTEDBREFS)) == 0) { 4429 hammer2_chain_countbrefs(parent, base, count); 4430 } 4431 4432 /* 4433 * Determine if we can collapse chain into parent, calculate 4434 * hysteresis for chain emptiness. 
	 */
	if (parent->core.live_count + chain->core.live_count - 1 > count)
		return 0;
	chain_count = chain->bytes / sizeof(hammer2_blockref_t);
	if (chain->core.live_count > chain_count * 3 / 4)
		return 0;

	/*
	 * Ok, theoretically we can collapse chain's contents into
	 * parent.  chain is locked, but any in-memory children of chain
	 * are not.  For this to work, we must be able to dispose of any
	 * in-memory children of chain.
	 *
	 * For now require that there are no in-memory children of chain.
	 *
	 * WARNING! Both chain and parent must remain locked across this
	 *	    entire operation.
	 */

	/*
	 * Parent must be marked modified.  Don't try to collapse it if we
	 * can't mark it modified.  Once modified, destroy chain to make room
	 * and to get rid of what will be a conflicting key (this is included
	 * in the calculation above).  Finally, move the children of chain
	 * into chain's parent.
	 *
	 * This order creates an accounting problem for bref.embed.stats
	 * because we destroy chain before we remove its children.  Any
	 * elements whose blockref is already synchronized will be counted
	 * twice.  To deal with the problem we clean out chain's stats prior
	 * to deleting it.
	 */
	error = hammer2_chain_modify(parent, 0, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}
	error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}

	chain->bref.embed.stats.inode_count = 0;
	chain->bref.embed.stats.data_count = 0;
	error = hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
	KKASSERT(error == 0);

	/*
	 * The combined_find call requires core.spin to be held.  One would
	 * think there wouldn't be any conflicts since we hold chain
	 * exclusively locked, but the caching mechanism for 0-ref children
	 * does not require a chain lock.
	 */
	hammer2_spin_ex(&chain->core.spin);

	key_next = 0;
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	for (;;) {
		/*
		 * Reload base/count each iteration; deletions above may
		 * have relocated chain's block array.
		 */
		chain_base = &chain->data->npdata[0];
		chain_count = chain->bytes / sizeof(hammer2_blockref_t);
		sub = hammer2_combined_find(chain, chain_base, chain_count,
					    &key_next,
					    key_beg, key_end,
					    &bref);
		generation = chain->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Acquire a referenced+locked sub-chain, either from the
		 * RBTREE or from media (on an insertion race _get returns
		 * NULL and we retry).
		 */
		bsave = *bref;
		if (sub) {
			hammer2_chain_ref(sub);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
		} else {
			hammer2_spin_unex(&chain->core.spin);
			sub = hammer2_chain_get(chain, generation, &bsave,
						HAMMER2_RESOLVE_NEVER);
			if (sub == NULL) {
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
		}

		/*
		 * Raced a deletion or relocation, retry.
		 */
		if (bcmp(&bsave, &sub->bref, sizeof(bsave)) ||
		    sub->parent != chain ||
		    (sub->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(sub);
			hammer2_chain_drop(sub);
			hammer2_spin_ex(&chain->core.spin);
			sub = NULL;	/* safety */
			continue;
		}

		/*
		 * Move the sub-chain from chain to chain's parent.
		 */
		error = hammer2_chain_delete_obref(chain, sub,
						   sub->bref.modify_tid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&parent, sub,
					   sub->bref.modify_tid,
					   HAMMER2_INSERT_SAMEPARENT, &bsave);
		hammer2_chain_unlock(sub);
		hammer2_chain_drop(sub);
		hammer2_spin_ex(&chain->core.spin);

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&chain->core.spin);

	hammer2_chain_repchange(parent, chain);

	return 1;
}

/*
 * Freemap indirect blocks
 *
 * Calculate the keybits and highside/lowside of the freemap node the
 * caller is creating.
 *
 * This routine will specify the next higher-level freemap key/radix
 * representing the lowest-ordered set.  By doing so, eventually all
 * low-ordered sets will be moved one level down.
 *
 * We have to be careful here because the freemap reserves a limited
 * number of blocks for a limited number of levels.  So we can't just
 * push indiscriminately.
 */
int
hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
			     int keybits, hammer2_blockref_t *base, int count)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int locount;
	int hicount;
	int maxloops = 300000;

	key = *keyp;
	locount = 0;
	hicount = 0;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Track the smallest (key, keybits) seen; the new indirect
		 * block will cover the next radix level above it.
		 */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Return the keybits for a higher-level FREEMAP_NODE covering
	 * this node.
	 */
	switch(keybits) {
	case HAMMER2_FREEMAP_LEVEL0_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL1_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL2_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL3_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL4_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL5_RADIX:
		panic("hammer2_chain_indkey_freemap: level too high");
		break;
	default:
		panic("hammer2_chain_indkey_freemap: bad radix");
		break;
	}
	*keyp = key;

	return (keybits);
}

/*
 * File indirect blocks
 *
 * Calculate the key/keybits for the indirect block to create by scanning
 * existing keys.  The key being created is also passed in *keyp and can be
 * inside or outside the indirect block.  Regardless, the indirect block
 * must hold at least two keys in order to guarantee sufficient space.
 *
 * We use a modified version of the freemap's fixed radix tree, but tailored
 * for file data.  Basically we configure an indirect block encompassing the
 * smallest key.
 */
static int
hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
			  int keybits, hammer2_blockref_t *base, int count,
			  int ncount)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int nradix;
	int locount;
	int hicount;
	int maxloops = 300000;

	key = *keyp;
	locount = 0;
	hicount = 0;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 *
	 * Locate the smallest key.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Track the smallest (key, keybits) seen.
		 */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Calculate the static keybits for a higher-level indirect block
	 * that contains the key.
	 */
	*keyp = key;

	switch(ncount) {
	case HAMMER2_IND_BYTES_MIN / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_BYTES_NOM / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_BYTES_MAX / sizeof(hammer2_blockref_t):
		nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
		break;
	default:
		panic("bad ncount %d\n", ncount);
		nradix = 0;
		break;
	}

	/*
	 * The largest radix that can be returned for an indirect block is
	 * 63 bits.  (The largest practical indirect block radix is actually
	 * 62 bits because the top-level inode or volume root contains four
	 * entries, but allow 63 to be returned).
	 */
	if (nradix >= 64)
		nradix = 63;

	return keybits + nradix;
}

#if 1

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * NOTE: We can't take a shortcut here anymore for inodes because
	 *	 the root directory can contain a mix of inodes and directory
	 *	 entries (we used to just return 63 if parent->bref.type was
	 *	 HAMMER2_BREF_TYPE_INODE.
	 */
	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       rounddown2(key ^ bref->key,
				  (hammer2_key_t)1 << nkeybits) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	/*
	 * Pick the half with the most elements and normalize the key.
	 */
	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#else

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.
 * We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 *
 * NOTE: This is the disabled (#else) variant; the active variant above
 *	 no longer takes the INODE-parent shortcut.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).
We want 5117 * everything to be inside in these cases so shift it all to 5118 * the low or high side depending on the new high bit. 5119 */ 5120 while (((hammer2_key_t)1 << keybits) < ncount) { 5121 ++keybits; 5122 if (key & ((hammer2_key_t)1 << keybits)) { 5123 hicount += locount; 5124 locount = 0; 5125 } else { 5126 locount += hicount; 5127 hicount = 0; 5128 } 5129 } 5130 5131 if (hicount > locount) 5132 key |= (hammer2_key_t)1 << keybits; 5133 else 5134 key &= ~(hammer2_key_t)1 << keybits; 5135 5136 *keyp = key; 5137 5138 return (keybits); 5139 } 5140 5141 #endif 5142 5143 /* 5144 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if 5145 * it exists. 5146 * 5147 * Both parent and chain must be locked exclusively. 5148 * 5149 * This function will modify the parent if the blockref requires removal 5150 * from the parent's block table. 5151 * 5152 * This function is NOT recursive. Any entity already pushed into the 5153 * chain (such as an inode) may still need visibility into its contents, 5154 * as well as the ability to read and modify the contents. For example, 5155 * for an unlinked file which is still open. 5156 * 5157 * Also note that the flusher is responsible for cleaning up empty 5158 * indirect blocks. 5159 */ 5160 int 5161 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain, 5162 hammer2_tid_t mtid, int flags) 5163 { 5164 int error = 0; 5165 5166 KKASSERT(hammer2_mtx_owned(&chain->lock)); 5167 5168 /* 5169 * Nothing to do if already marked. 5170 * 5171 * We need the spinlock on the core whos RBTREE contains chain 5172 * to protect against races. 5173 */ 5174 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 5175 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 5176 chain->parent == parent); 5177 error = _hammer2_chain_delete_helper(parent, chain, 5178 mtid, flags, NULL); 5179 } 5180 5181 /* 5182 * Permanent deletions mark the chain as destroyed. 
5183 * 5184 * NOTE: We do not setflush the chain unless the deletion is 5185 * permanent, since the deletion of a chain does not actually 5186 * require it to be flushed. 5187 */ 5188 if (error == 0) { 5189 if (flags & HAMMER2_DELETE_PERMANENT) { 5190 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5191 hammer2_chain_setflush(chain); 5192 } 5193 } 5194 5195 return error; 5196 } 5197 5198 static int 5199 hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain, 5200 hammer2_tid_t mtid, int flags, 5201 hammer2_blockref_t *obref) 5202 { 5203 int error = 0; 5204 5205 KKASSERT(hammer2_mtx_owned(&chain->lock)); 5206 5207 /* 5208 * Nothing to do if already marked. 5209 * 5210 * We need the spinlock on the core whos RBTREE contains chain 5211 * to protect against races. 5212 */ 5213 obref->type = HAMMER2_BREF_TYPE_EMPTY; 5214 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 5215 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 5216 chain->parent == parent); 5217 error = _hammer2_chain_delete_helper(parent, chain, 5218 mtid, flags, obref); 5219 } 5220 5221 /* 5222 * Permanent deletions mark the chain as destroyed. 5223 * 5224 * NOTE: We do not setflush the chain unless the deletion is 5225 * permanent, since the deletion of a chain does not actually 5226 * require it to be flushed. 5227 */ 5228 if (error == 0) { 5229 if (flags & HAMMER2_DELETE_PERMANENT) { 5230 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5231 hammer2_chain_setflush(chain); 5232 } 5233 } 5234 5235 return error; 5236 } 5237 5238 /* 5239 * Returns the index of the nearest element in the blockref array >= elm. 5240 * Returns (count) if no element could be found. 5241 * 5242 * Sets *key_nextp to the next key for loop purposes but does not modify 5243 * it if the next key would be higher than the current value of *key_nextp. 5244 * Note that *key_nexp can overflow to 0, which should be tested by the 5245 * caller. 5246 * 5247 * WARNING! 
 *	    Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require that live chains have already had their brefs counted
	 * (COUNTEDBREFS) so core.live_zero is valid and can be used to
	 * bound the scan below.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards from the cached index to a slot at or before
	 * key_beg.
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	       scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != HAMMER2_BREF_TYPE_EMPTY) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			/*
			 * Clip *key_nextp to the end of the matched
			 * element, but never move it backwards.  A
			 * scan_end of 0 indicates key wrap-around and is
			 * left for the caller to detect.
			 */
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *bresp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * When no in-memory chain matched but a media blockref did, *bresp points
 * into the media blockref array and NULL is returned.
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.  Iterate if the base find
 * is chosen but matches a deleted chain.
 *
 * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
 *	     held through the operation.
 */
hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **bresp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*bresp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*bresp = NULL;
		chain = NULL;
	} else {
		*bresp = bref;
	}
	return(chain);
}

/*
 * Locate the specified block array element and delete it.  The element
 * must exist.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain,
		    hammer2_blockref_t *obref)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update stats and zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
				(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		--parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (scan->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count -=
				scan->embed.stats.data_count;
			parent->bref.embed.stats.inode_count -=
				scan->embed.stats.inode_count;
		}
		if (scan->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count <= scan->leaf_count)
				parent->bref.leaf_count = 0;
			else
				parent->bref.leaf_count -= scan->leaf_count;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through (default only breaks) */
	default:
		break;
	}

	if (obref)
		*obref = *scan;
	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (parent->core.live_zero == i + 1) {
		while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
					HAMMER2_CHAIN_BMAPUPD);
}

/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;	/* debug code path marker for the validate panic */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * Update stats for the inserted element.
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
				(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}


	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) {
			/* shift the range [j+1, i) down one slot */
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) {
			/* shift the range [i, k) up one slot */
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}

#if 0

/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
 */
typedef hammer2_blockref_t *hammer2_blockref_p;

/*
 * kqsort() comparator: empty elements sort to the end of the array,
 * everything else sorts ascending by key.
 */
static
int
hammer2_base_sort_callback(const void *v1, const void *v2)
{
	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;

	/*
	 * Make sure empty elements are placed at the end of the array
	 */
	if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) {
		if (bref2->type == HAMMER2_BREF_TYPE_EMPTY)
			return(0);
		return(1);
	} else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) {
		return(-1);
	}

	/*
	 * Sort by key
	 */
	if (bref1->key < bref2->key)
		return(-1);
	if (bref1->key > bref2->key)
		return(1);
	return(0);
}

void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;
	int count;

	/* locate this chain's blockref array and its element count */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (chain->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			return;
		}
		base = &chain->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &chain->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &chain->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		kprintf("hammer2_base_sort: unrecognized "
			"blockref(A) type: %d",
			chain->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_base_sort: unrecognized "
		      "blockref(A) type: %d",
		      chain->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}
	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
}

#endif

/*
 * Chain memory management
 */
void
hammer2_chain_wait(hammer2_chain_t *chain)
{
	tsleep(chain, 0, "chnflw", 1);
}

const hammer2_media_data_t *
hammer2_chain_rdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

hammer2_media_data_t *
hammer2_chain_wdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

/*
 * Set the check data for a chain.  This can be a heavy-weight operation
 * and typically only runs on-flush.  For file data check data is calculated
 * when the logical buffers are flushed.
 */
void
hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
{
	/* the chain now has a valid check code */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		break;
	case HAMMER2_CHECK_DISABLED:
		break;
	case HAMMER2_CHECK_ISCSI32:
		chain->bref.check.iscsi32.value =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	case HAMMER2_CHECK_XXHASH64:
		chain->bref.check.xxhash64.value =
			XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		break;
	case HAMMER2_CHECK_SHA192:
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			/*
			 * Fold the 256-bit SHA-256 digest down to the
			 * 192 bits stored in the blockref by xor'ing the
			 * top quadword into the third.
			 */
			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			bcopy(u.digest,
			      chain->bref.check.sha192.data,
			      sizeof(chain->bref.check.sha192.data));
		}
		break;
	case HAMMER2_CHECK_FREEMAP:
		chain->bref.check.freemap.icrc32 =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	default:
		kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
			chain->bref.methods);
		break;
	}
}

/*
 * Characterize a failed check code and try to trace back to the inode.
5874 */ 5875 static void 5876 hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check, 5877 int bits) 5878 { 5879 hammer2_chain_t *lchain; 5880 hammer2_chain_t *ochain; 5881 int did; 5882 5883 did = krateprintf(&krate_h2chk, 5884 "chain %016jx.%02x (%s) meth=%02x CHECK FAIL " 5885 "(flags=%08x, bref/data ", 5886 chain->bref.data_off, 5887 chain->bref.type, 5888 hammer2_bref_type_str(chain->bref.type), 5889 chain->bref.methods, 5890 chain->flags); 5891 if (did == 0) 5892 return; 5893 5894 if (bits == 32) { 5895 kprintf("%08x/%08x)\n", 5896 chain->bref.check.iscsi32.value, 5897 (uint32_t)check); 5898 } else { 5899 kprintf("%016jx/%016jx)\n", 5900 chain->bref.check.xxhash64.value, 5901 check); 5902 } 5903 5904 /* 5905 * Run up the chains to try to find the governing inode so we 5906 * can report it. 5907 * 5908 * XXX This error reporting is not really MPSAFE 5909 */ 5910 ochain = chain; 5911 lchain = chain; 5912 while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) { 5913 lchain = chain; 5914 chain = chain->parent; 5915 } 5916 5917 if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE && 5918 ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 || 5919 (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) { 5920 kprintf(" Resides at/in inode %ld\n", 5921 chain->bref.key); 5922 } else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 5923 kprintf(" Resides in inode index - CRITICAL!!!\n"); 5924 } else { 5925 kprintf(" Resides in root index - CRITICAL!!!\n"); 5926 } 5927 if (ochain->hmp) { 5928 const char *pfsname = "UNKNOWN"; 5929 int i; 5930 5931 if (ochain->pmp) { 5932 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 5933 if (ochain->pmp->pfs_hmps[i] == ochain->hmp && 5934 ochain->pmp->pfs_names[i]) { 5935 pfsname = ochain->pmp->pfs_names[i]; 5936 break; 5937 } 5938 } 5939 } 5940 kprintf(" In pfs %s on device %s\n", 5941 pfsname, ochain->hmp->devrepname); 5942 } 5943 } 5944 5945 /* 5946 * Returns non-zero on success, 0 on failure. 
5947 */ 5948 int 5949 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 5950 { 5951 uint32_t check32; 5952 uint64_t check64; 5953 int r; 5954 5955 if (chain->flags & HAMMER2_CHAIN_NOTTESTED) 5956 return 1; 5957 5958 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5959 case HAMMER2_CHECK_NONE: 5960 r = 1; 5961 break; 5962 case HAMMER2_CHECK_DISABLED: 5963 r = 1; 5964 break; 5965 case HAMMER2_CHECK_ISCSI32: 5966 check32 = hammer2_icrc32(bdata, chain->bytes); 5967 r = (chain->bref.check.iscsi32.value == check32); 5968 if (r == 0) { 5969 hammer2_characterize_failed_chain(chain, check32, 32); 5970 } 5971 hammer2_process_icrc32 += chain->bytes; 5972 break; 5973 case HAMMER2_CHECK_XXHASH64: 5974 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5975 r = (chain->bref.check.xxhash64.value == check64); 5976 if (r == 0) { 5977 hammer2_characterize_failed_chain(chain, check64, 64); 5978 } 5979 hammer2_process_xxhash64 += chain->bytes; 5980 break; 5981 case HAMMER2_CHECK_SHA192: 5982 { 5983 SHA256_CTX hash_ctx; 5984 union { 5985 uint8_t digest[SHA256_DIGEST_LENGTH]; 5986 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5987 } u; 5988 5989 SHA256_Init(&hash_ctx); 5990 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5991 SHA256_Final(u.digest, &hash_ctx); 5992 u.digest64[2] ^= u.digest64[3]; 5993 if (bcmp(u.digest, 5994 chain->bref.check.sha192.data, 5995 sizeof(chain->bref.check.sha192.data)) == 0) { 5996 r = 1; 5997 } else { 5998 r = 0; 5999 krateprintf(&krate_h2chk, 6000 "chain %016jx.%02x meth=%02x " 6001 "CHECK FAIL\n", 6002 chain->bref.data_off, 6003 chain->bref.type, 6004 chain->bref.methods); 6005 } 6006 } 6007 break; 6008 case HAMMER2_CHECK_FREEMAP: 6009 r = (chain->bref.check.freemap.icrc32 == 6010 hammer2_icrc32(bdata, chain->bytes)); 6011 if (r == 0) { 6012 int did; 6013 6014 did = krateprintf(&krate_h2chk, 6015 "chain %016jx.%02x meth=%02x " 6016 "CHECK FAIL\n", 6017 chain->bref.data_off, 6018 chain->bref.type, 6019 chain->bref.methods); 6020 if (did) { 
6021 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 6022 chain->bref.check.freemap.icrc32, 6023 hammer2_icrc32(bdata, chain->bytes), 6024 chain->bytes); 6025 if (chain->dio) { 6026 kprintf("dio %p buf %016jx,%d " 6027 "bdata %p/%p\n", 6028 chain->dio, 6029 chain->dio->bp->b_loffset, 6030 chain->dio->bp->b_bufsize, 6031 bdata, 6032 chain->dio->bp->b_data); 6033 } 6034 } 6035 } 6036 break; 6037 default: 6038 kprintf("hammer2_chain_testcheck: unknown check type %02x\n", 6039 chain->bref.methods); 6040 r = 1; 6041 break; 6042 } 6043 return r; 6044 } 6045 6046 /* 6047 * Acquire the chain and parent representing the specified inode for the 6048 * device at the specified cluster index. 6049 * 6050 * The flags passed in are LOOKUP flags, not RESOLVE flags. 6051 * 6052 * If we are unable to locate the inode, HAMMER2_ERROR_EIO is returned and 6053 * *chainp will be NULL. *parentp may still be set error or not, or NULL 6054 * if the parent itself could not be resolved. 6055 * 6056 * The caller may pass-in a locked *parentp and/or *chainp, or neither. 6057 * They will be unlocked and released by this function. The *parentp and 6058 * *chainp representing the located inode are returned locked. 6059 */ 6060 int 6061 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 6062 int clindex, int flags, 6063 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 6064 { 6065 hammer2_chain_t *parent; 6066 hammer2_chain_t *rchain; 6067 hammer2_key_t key_dummy; 6068 hammer2_inode_t *ip; 6069 int resolve_flags; 6070 int error; 6071 6072 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 6073 HAMMER2_RESOLVE_SHARED : 0; 6074 6075 /* 6076 * Caller expects us to replace these. 
6077 */ 6078 if (*chainp) { 6079 hammer2_chain_unlock(*chainp); 6080 hammer2_chain_drop(*chainp); 6081 *chainp = NULL; 6082 } 6083 if (*parentp) { 6084 hammer2_chain_unlock(*parentp); 6085 hammer2_chain_drop(*parentp); 6086 *parentp = NULL; 6087 } 6088 6089 /* 6090 * Be very careful, this is a backend function and we CANNOT 6091 * lock any frontend inode structure we find. But we have to 6092 * look the inode up this way first in case it exists but is 6093 * detached from the radix tree. 6094 */ 6095 ip = hammer2_inode_lookup(pmp, inum); 6096 if (ip) { 6097 *chainp = hammer2_inode_chain_and_parent(ip, clindex, 6098 parentp, 6099 resolve_flags); 6100 hammer2_inode_drop(ip); 6101 if (*chainp) 6102 return 0; 6103 hammer2_chain_unlock(*chainp); 6104 hammer2_chain_drop(*chainp); 6105 *chainp = NULL; 6106 if (*parentp) { 6107 hammer2_chain_unlock(*parentp); 6108 hammer2_chain_drop(*parentp); 6109 *parentp = NULL; 6110 } 6111 } 6112 6113 /* 6114 * Inodes hang off of the iroot (bit 63 is clear, differentiating 6115 * inodes from root directory entries in the key lookup). 6116 */ 6117 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 6118 rchain = NULL; 6119 if (parent) { 6120 rchain = hammer2_chain_lookup(&parent, &key_dummy, 6121 inum, inum, 6122 &error, flags); 6123 } else { 6124 error = HAMMER2_ERROR_EIO; 6125 } 6126 *parentp = parent; 6127 *chainp = rchain; 6128 6129 return error; 6130 } 6131 6132 /* 6133 * Used by the bulkscan code to snapshot the synchronized storage for 6134 * a volume, allowing it to be scanned concurrently against normal 6135 * operation. 
6136 */ 6137 hammer2_chain_t * 6138 hammer2_chain_bulksnap(hammer2_dev_t *hmp) 6139 { 6140 hammer2_chain_t *copy; 6141 6142 copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref); 6143 copy->data = kmalloc(sizeof(copy->data->voldata), 6144 hmp->mchain, 6145 M_WAITOK | M_ZERO); 6146 hammer2_voldata_lock(hmp); 6147 copy->data->voldata = hmp->volsync; 6148 hammer2_voldata_unlock(hmp); 6149 6150 return copy; 6151 } 6152 6153 void 6154 hammer2_chain_bulkdrop(hammer2_chain_t *copy) 6155 { 6156 KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME); 6157 KKASSERT(copy->data); 6158 kfree(copy->data, copy->hmp->mchain); 6159 copy->data = NULL; 6160 atomic_add_long(&hammer2_chain_allocs, -1); 6161 hammer2_chain_drop(copy); 6162 } 6163 6164 /* 6165 * Returns non-zero if the chain (INODE or DIRENT) matches the 6166 * filename. 6167 */ 6168 int 6169 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name, 6170 size_t name_len) 6171 { 6172 const hammer2_inode_data_t *ripdata; 6173 const hammer2_dirent_head_t *den; 6174 6175 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 6176 ripdata = &chain->data->ipdata; 6177 if (ripdata->meta.name_len == name_len && 6178 bcmp(ripdata->filename, name, name_len) == 0) { 6179 return 1; 6180 } 6181 } 6182 if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT && 6183 chain->bref.embed.dirent.namlen == name_len) { 6184 den = &chain->bref.embed.dirent; 6185 if (name_len > sizeof(chain->bref.check.buf) && 6186 bcmp(chain->data->buf, name, name_len) == 0) { 6187 return 1; 6188 } 6189 if (name_len <= sizeof(chain->bref.check.buf) && 6190 bcmp(chain->bref.check.buf, name, name_len) == 0) { 6191 return 1; 6192 } 6193 } 6194 return 0; 6195 } 6196