1 /* 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 2022 Tomohiro Kusumi <tkusumi@netbsd.org> 5 * Copyright (c) 2011-2022 The DragonFly Project. All rights reserved. 6 * 7 * This code is derived from software contributed to The DragonFly Project 8 * by Matthew Dillon <dillon@dragonflybsd.org> 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in 18 * the documentation and/or other materials provided with the 19 * distribution. 20 * 3. Neither the name of The DragonFly Project nor the names of its 21 * contributors may be used to endorse or promote products derived 22 * from this software without specific, prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 25 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 27 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 28 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 29 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 32 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 33 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 34 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 */ 37 /* 38 * This subsystem implements most of the core support functions for 39 * the hammer2_chain structure. 
 *
 * Chains are the in-memory version of media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * Chains are no-longer delete-duplicated.  Instead, the original in-memory
 * chain will be moved along with its block reference (e.g. for things like
 * renames, hardlink operations, modifications, etc), and will be indexed
 * on a secondary list for flush handling instead of propagating a flag
 * upward to the root.
 *
 * Concurrent front-end operations can still run against backend flushes
 * as long as they do not cross the current flush boundary.  An operation
 * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while an operation running below the current
 * flush can become part of the next flush.
 */
/*
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/buf.h>

#include <crypto/sha2/sha2.h>
*/

#include "hammer2.h"

static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_chain_t *parent,
		hammer2_key_t key, int keybits,
		hammer2_tid_t mtid, int for_type, int *errorp);
static int hammer2_chain_delete_obref(hammer2_chain_t *parent,
		hammer2_chain_t *chain,
		hammer2_tid_t mtid, int flags,
		hammer2_blockref_t *obref);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **brefp);
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
		int depth);
static void hammer2_chain_lru_flush(hammer2_pfs_t *pmp);

/*
 * There are many degenerate situations where an extreme rate of console
 * output can occur from warnings and errors.  Make sure this output does
 * not impede operations.
92 */ 93 /* 94 static struct krate krate_h2chk = { .freq = 5 }; 95 static struct krate krate_h2me = { .freq = 1 }; 96 static struct krate krate_h2em = { .freq = 1 }; 97 */ 98 99 /* 100 * Basic RBTree for chains (core.rbtree). 101 */ 102 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 103 104 int 105 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 106 { 107 hammer2_key_t c1_beg; 108 hammer2_key_t c1_end; 109 hammer2_key_t c2_beg; 110 hammer2_key_t c2_end; 111 112 /* 113 * Compare chains. Overlaps are not supposed to happen and catch 114 * any software issues early we count overlaps as a match. 115 */ 116 c1_beg = chain1->bref.key; 117 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1; 118 c2_beg = chain2->bref.key; 119 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1; 120 121 if (c1_end < c2_beg) /* fully to the left */ 122 return(-1); 123 if (c1_beg > c2_end) /* fully to the right */ 124 return(1); 125 return(0); /* overlap (must not cross edge boundary) */ 126 } 127 128 /* 129 * Assert that a chain has no media data associated with it. 130 */ 131 static __inline void 132 hammer2_chain_assert_no_data(hammer2_chain_t *chain) 133 { 134 KKASSERT(chain->dio == NULL); 135 if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME && 136 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP && 137 chain->data) { 138 panic("hammer2_chain_assert_no_data: chain %p still has data", 139 chain); 140 } 141 } 142 143 /* 144 * Make a chain visible to the flusher. The flusher operates using a top-down 145 * recursion based on the ONFLUSH flag. It locates MODIFIED and UPDATE chains, 146 * flushes them, and updates blocks back to the volume root. 147 * 148 * This routine sets the ONFLUSH flag upward from the triggering chain until 149 * it hits an inode root or the volume root. Inode chains serve as inflection 150 * points, requiring the flusher to bridge across trees. 
 * Inodes include
 * regular inodes, PFS roots (pmp->iroot), and the media super root
 * (spmp->iroot).
 */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			/* inodes are inflection points, stop the ascent */
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				break;
			if ((parent = chain->parent) == NULL)
				break;
			/*
			 * Hand-over-hand shared spinlocks while walking
			 * toward the root (child-to-parent acquisition
			 * order, which core spinlocks allow).
			 */
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->pmp inherits pmp unless the chain is an inode (other than the
 * super-root inode).
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
		    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes;

	/*
	 * Special case - radix of 0 indicates a chain that does not
	 * need a data reference (context is completely embedded in the
	 * bref).
	 */
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		bytes = 0;

	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_FREEMAP:
	case HAMMER2_BREF_TYPE_VOLUME:
		chain = kmalloc_obj(sizeof(*chain), hmp->mchain,
				    M_WAITOK | M_ZERO);
		atomic_add_long(&hammer2_chain_allocs, 1);
		break;
	case HAMMER2_BREF_TYPE_EMPTY:
	default:
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
		break;
	}

	/*
	 * Initialize the new chain structure.  pmp must be set to NULL for
	 * chains belonging to the super-root topology of a device mount.
	 */
	if (pmp == hmp->spmp)
		chain->pmp = NULL;
	else
		chain->pmp = pmp;

	chain->hmp = hmp;
	chain->bref = *bref;
	chain->bytes = bytes;
	chain->refs = 1;		/* returned referenced */
	chain->flags = HAMMER2_CHAIN_ALLOCATED;
	lockinit(&chain->diolk, "chdio", 0, 0);

	/*
	 * Set the PFS boundary flag if this chain represents a PFS root.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
	hammer2_chain_init(chain);

	return (chain);
}

/*
 * A common function to initialize chains including fchain and vchain.
 */
void
hammer2_chain_init(hammer2_chain_t *chain)
{
	RB_INIT(&chain->core.rbtree);	/* live chains */
	hammer2_mtx_init(&chain->lock, "h2chain");
	hammer2_spin_init(&chain->core.spin, "h2chain");
}

/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * (can be called with spinlock held)
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	/* 0->1 transition: chain may still be sitting on the LRU list */
	if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
		/*
		 * Just flag that the chain was used and should be recycled
		 * on the LRU if it encounters it later.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);

#if 0
		/*
		 * REMOVED - reduces contention, lru_list is more heuristical
		 * now.
		 *
		 * 0->non-zero transition must ensure that chain is removed
		 * from the LRU list.
		 *
		 * NOTE: Already holding lru_spin here so we cannot call
		 *	 hammer2_chain_ref() to get it off lru_list, do
		 *	 it manually.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			hammer2_pfs_t *pmp = chain->pmp;
			hammer2_spin_ex(&pmp->lru_spin);
			if (chain->flags & HAMMER2_CHAIN_ONLRU) {
				atomic_add_int(&pmp->lru_count, -1);
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_ONLRU);
				TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
			}
			hammer2_spin_unex(&pmp->lru_spin);
		}
#endif
	}
}

/*
 * Ref a locked chain and force the data to be held across an unlock.
 * Chain must be currently locked.  The user of the chain who desires
 * to release the hold must call hammer2_chain_lock_unhold() to relock
 * and unhold the chain, then unlock normally, or may simply call
 * hammer2_chain_drop_unhold() (which is safer against deadlocks).
 */
void
hammer2_chain_ref_hold(hammer2_chain_t *chain)
{
	/* the extra lockcnt keeps chain->data resident across the unlock */
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_ref(chain);
}

/*
 * Insert the chain in the core rbtree.
 *
 * Normal insertions are placed in the live rbtree.  Insertion of a deleted
 * chain is a special case used by the flush code that is placed on the
 * unstaged deleted list to avoid confusing the live view.
 */
#define HAMMER2_CHAIN_INSERT_SPIN	0x0001	/* acquire parent core spin */
#define HAMMER2_CHAIN_INSERT_LIVE	0x0002	/* adjust live_count */
#define HAMMER2_CHAIN_INSERT_RACE	0x0004	/* detect generation race */

/*
 * Insert chain under parent's core rbtree.  Returns 0 on success or
 * HAMMER2_ERROR_EAGAIN if INSERT_RACE is requested and the parent's
 * generation no longer matches the caller-supplied generation.
 */
static
int
hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     int flags, int generation)
{
	hammer2_chain_t *xchain __debugvar;
	int error = 0;

	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_ex(&parent->core.spin);

	/*
	 * Interlocked by spinlock, check for race
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
	    parent->core.generation != generation) {
		error = HAMMER2_ERROR_EAGAIN;
		goto failed;
	}

	/*
	 * Insert chain
	 */
	xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
	KASSERT(xchain == NULL,
		("hammer2_chain_insert: collision %p %p (key=%016jx)",
		chain, xchain, chain->bref.key));
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
	chain->parent = parent;
	++parent->core.chain_count;
	++parent->core.generation;	/* XXX incs for _get() too, XXX */

	/*
	 * We have to keep track of the effective live-view blockref count
	 * so the create code knows when to push an indirect block.
	 */
	if (flags & HAMMER2_CHAIN_INSERT_LIVE)
		atomic_add_int(&parent->core.live_count, 1);
failed:
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_unex(&parent->core.spin);
	return error;
}

/*
 * Drop the caller's reference to the chain.  When the ref count drops to
 * zero this function will try to disassociate the chain from its parent and
 * deallocate it, then recursively drop the parent using the implied ref
 * from the chain's chain->parent.
 *
 * Nobody should own chain's mutex on the 1->0 transition, unless this drop
 * races an acquisition by another cpu.  Therefore we can loop if we are
 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again
 * race against another drop.
381 */ 382 void 383 hammer2_chain_drop(hammer2_chain_t *chain) 384 { 385 u_int refs; 386 387 KKASSERT(chain->refs > 0); 388 389 while (chain) { 390 refs = chain->refs; 391 cpu_ccfence(); 392 KKASSERT(refs > 0); 393 394 if (refs == 1) { 395 if (hammer2_mtx_ex_try(&chain->lock) == 0) 396 chain = hammer2_chain_lastdrop(chain, 0); 397 /* retry the same chain, or chain from lastdrop */ 398 } else { 399 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 400 break; 401 /* retry the same chain */ 402 } 403 cpu_pause(); 404 } 405 } 406 407 /* 408 * Unhold a held and probably not-locked chain, ensure that the data is 409 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive 410 * lock and then simply unlocking the chain. 411 */ 412 void 413 hammer2_chain_unhold(hammer2_chain_t *chain) 414 { 415 u_int lockcnt; 416 int iter = 0; 417 418 for (;;) { 419 lockcnt = chain->lockcnt; 420 cpu_ccfence(); 421 if (lockcnt > 1) { 422 if (atomic_cmpset_int(&chain->lockcnt, 423 lockcnt, lockcnt - 1)) { 424 break; 425 } 426 } else if (hammer2_mtx_ex_try(&chain->lock) == 0) { 427 hammer2_chain_unlock(chain); 428 break; 429 } else { 430 /* 431 * This situation can easily occur on SMP due to 432 * the gap inbetween the 1->0 transition and the 433 * final unlock. We cannot safely block on the 434 * mutex because lockcnt might go above 1. 435 * 436 * XXX Sleep for one tick if it takes too long. 
437 */ 438 if (++iter > 1000) { 439 if (iter > 1000 + hz) { 440 kprintf("hammer2: h2race1 %p\n", chain); 441 iter = 1000; 442 } 443 tsleep(&iter, 0, "h2race1", 1); 444 } 445 cpu_pause(); 446 } 447 } 448 } 449 450 void 451 hammer2_chain_drop_unhold(hammer2_chain_t *chain) 452 { 453 hammer2_chain_unhold(chain); 454 hammer2_chain_drop(chain); 455 } 456 457 void 458 hammer2_chain_rehold(hammer2_chain_t *chain) 459 { 460 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED); 461 atomic_add_int(&chain->lockcnt, 1); 462 hammer2_chain_unlock(chain); 463 } 464 465 /* 466 * Handles the (potential) last drop of chain->refs from 1->0. Called with 467 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are 468 * possible against refs and lockcnt. We must dispose of the mutex on chain. 469 * 470 * This function returns an unlocked chain for recursive drop or NULL. It 471 * can return the same chain if it determines it has raced another ref. 472 * 473 * -- 474 * 475 * When two chains need to be recursively dropped we use the chain we 476 * would otherwise free to placehold the additional chain. It's a bit 477 * convoluted but we can't just recurse without potentially blowing out 478 * the kernel stack. 479 * 480 * The chain cannot be freed if it has any children. 481 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it. 482 * The chain cannot be freed if flagged UPDATE unless we can dispose of it. 483 * Any dedup registration can remain intact. 484 * 485 * The core spinlock is allowed to nest child-to-parent (not parent-to-child). 486 */ 487 static 488 hammer2_chain_t * 489 hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth) 490 { 491 hammer2_pfs_t *pmp; 492 hammer2_dev_t *hmp; 493 hammer2_chain_t *parent; 494 hammer2_chain_t *rdrop; 495 496 /* 497 * We need chain's spinlock to interlock the sub-tree test. 498 * We already have chain's mutex, protecting chain->parent. 499 * 500 * Remember that chain->refs can be in flux. 
501 */ 502 hammer2_spin_ex(&chain->core.spin); 503 504 if (chain->parent != NULL) { 505 /* 506 * If the chain has a parent the UPDATE bit prevents scrapping 507 * as the chain is needed to properly flush the parent. Try 508 * to complete the 1->0 transition and return NULL. Retry 509 * (return chain) if we are unable to complete the 1->0 510 * transition, else return NULL (nothing more to do). 511 * 512 * If the chain has a parent the MODIFIED bit prevents 513 * scrapping. 514 * 515 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 516 */ 517 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 518 HAMMER2_CHAIN_MODIFIED)) { 519 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 520 hammer2_spin_unex(&chain->core.spin); 521 hammer2_chain_assert_no_data(chain); 522 hammer2_mtx_unlock(&chain->lock); 523 chain = NULL; 524 } else { 525 hammer2_spin_unex(&chain->core.spin); 526 hammer2_mtx_unlock(&chain->lock); 527 } 528 return (chain); 529 } 530 /* spinlock still held */ 531 } else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME || 532 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) { 533 /* 534 * Retain the static vchain and fchain. Clear bits that 535 * are not relevant. Do not clear the MODIFIED bit, 536 * and certainly do not put it on the delayed-flush queue. 537 */ 538 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 539 } else { 540 /* 541 * The chain has no parent and can be flagged for destruction. 542 * Since it has no parent, UPDATE can also be cleared. 543 */ 544 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 545 if (chain->flags & HAMMER2_CHAIN_UPDATE) 546 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 547 548 /* 549 * If the chain has children we must propagate the DESTROY 550 * flag downward and rip the disconnected topology apart. 551 * This is accomplished by calling hammer2_flush() on the 552 * chain. 553 * 554 * Any dedup is already handled by the underlying DIO, so 555 * we do not have to specifically flush it here. 
556 */ 557 if (chain->core.chain_count) { 558 hammer2_spin_unex(&chain->core.spin); 559 hammer2_flush(chain, HAMMER2_FLUSH_TOP | 560 HAMMER2_FLUSH_ALL); 561 hammer2_mtx_unlock(&chain->lock); 562 563 return(chain); /* retry drop */ 564 } 565 566 /* 567 * Otherwise we can scrap the MODIFIED bit if it is set, 568 * and continue along the freeing path. 569 * 570 * Be sure to clean-out any dedup bits. Without a parent 571 * this chain will no longer be visible to the flush code. 572 * Easy check data_off to avoid the volume root. 573 */ 574 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 575 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 576 atomic_add_long(&hammer2_count_modified_chains, -1); 577 if (chain->pmp) 578 hammer2_pfs_memory_wakeup(chain->pmp, -1); 579 } 580 /* spinlock still held */ 581 } 582 583 /* spinlock still held */ 584 585 /* 586 * If any children exist we must leave the chain intact with refs == 0. 587 * They exist because chains are retained below us which have refs or 588 * may require flushing. 589 * 590 * Retry (return chain) if we fail to transition the refs to 0, else 591 * return NULL indication nothing more to do. 592 * 593 * Chains with children are NOT put on the LRU list. 594 */ 595 if (chain->core.chain_count) { 596 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 597 hammer2_spin_unex(&chain->core.spin); 598 hammer2_chain_assert_no_data(chain); 599 hammer2_mtx_unlock(&chain->lock); 600 chain = NULL; 601 } else { 602 hammer2_spin_unex(&chain->core.spin); 603 hammer2_mtx_unlock(&chain->lock); 604 } 605 return (chain); 606 } 607 /* spinlock still held */ 608 /* no chains left under us */ 609 610 /* 611 * chain->core has no children left so no accessors can get to our 612 * chain from there. Now we have to lock the parent core to interlock 613 * remaining possible accessors that might bump chain's refs before 614 * we can safely drop chain's refs with intent to free the chain. 
615 */ 616 hmp = chain->hmp; 617 pmp = chain->pmp; /* can be NULL */ 618 rdrop = NULL; 619 620 parent = chain->parent; 621 622 /* 623 * WARNING! chain's spin lock is still held here, and other spinlocks 624 * will be acquired and released in the code below. We 625 * cannot be making fancy procedure calls! 626 */ 627 628 /* 629 * We can cache the chain if it is associated with a pmp 630 * and not flagged as being destroyed or requesting a full 631 * release. In this situation the chain is not removed 632 * from its parent, i.e. it can still be looked up. 633 * 634 * We intentionally do not cache DATA chains because these 635 * were likely used to load data into the logical buffer cache 636 * and will not be accessed again for some time. 637 */ 638 if ((chain->flags & 639 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 640 chain->pmp && 641 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 642 if (parent) 643 hammer2_spin_ex(&parent->core.spin); 644 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 645 /* 646 * 1->0 transition failed, retry. Do not drop 647 * the chain's data yet! 648 */ 649 if (parent) 650 hammer2_spin_unex(&parent->core.spin); 651 hammer2_spin_unex(&chain->core.spin); 652 hammer2_mtx_unlock(&chain->lock); 653 654 return(chain); 655 } 656 657 /* 658 * Success 659 */ 660 hammer2_chain_assert_no_data(chain); 661 662 /* 663 * Make sure we are on the LRU list, clean up excessive 664 * LRU entries. We can only really drop one but there might 665 * be other entries that we can remove from the lru_list 666 * without dropping. 667 * 668 * NOTE: HAMMER2_CHAIN_ONLRU may only be safely set when 669 * chain->core.spin AND pmp->lru_spin are held, but 670 * can be safely cleared only holding pmp->lru_spin. 
671 */ 672 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 673 hammer2_spin_ex(&pmp->lru_spin); 674 if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) { 675 atomic_set_int(&chain->flags, 676 HAMMER2_CHAIN_ONLRU); 677 TAILQ_INSERT_TAIL(&pmp->lru_list, 678 chain, lru_node); 679 atomic_add_int(&pmp->lru_count, 1); 680 } 681 if (pmp->lru_count < HAMMER2_LRU_LIMIT) 682 depth = 1; /* disable lru_list flush */ 683 hammer2_spin_unex(&pmp->lru_spin); 684 } else { 685 /* disable lru flush */ 686 depth = 1; 687 } 688 689 if (parent) { 690 hammer2_spin_unex(&parent->core.spin); 691 parent = NULL; /* safety */ 692 } 693 hammer2_spin_unex(&chain->core.spin); 694 hammer2_mtx_unlock(&chain->lock); 695 696 /* 697 * lru_list hysteresis (see above for depth overrides). 698 * Note that depth also prevents excessive lastdrop recursion. 699 */ 700 if (depth == 0) 701 hammer2_chain_lru_flush(pmp); 702 703 return NULL; 704 /* NOT REACHED */ 705 } 706 707 /* 708 * Make sure we are not on the LRU list. 709 */ 710 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 711 hammer2_spin_ex(&pmp->lru_spin); 712 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 713 atomic_add_int(&pmp->lru_count, -1); 714 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 715 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 716 } 717 hammer2_spin_unex(&pmp->lru_spin); 718 } 719 720 /* 721 * Spinlock the parent and try to drop the last ref on chain. 722 * On success determine if we should dispose of the chain 723 * (remove the chain from its parent, etc). 724 * 725 * (normal core locks are top-down recursive but we define 726 * core spinlocks as bottom-up recursive, so this is safe). 727 */ 728 if (parent) { 729 hammer2_spin_ex(&parent->core.spin); 730 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 731 /* 732 * 1->0 transition failed, retry. 
733 */ 734 hammer2_spin_unex(&parent->core.spin); 735 hammer2_spin_unex(&chain->core.spin); 736 hammer2_mtx_unlock(&chain->lock); 737 738 return(chain); 739 } 740 741 /* 742 * 1->0 transition successful, parent spin held to prevent 743 * new lookups, chain spinlock held to protect parent field. 744 * Remove chain from the parent. 745 * 746 * If the chain is being removed from the parent's rbtree but 747 * is not blkmapped, we have to adjust live_count downward. If 748 * it is blkmapped then the blockref is retained in the parent 749 * as is its associated live_count. This case can occur when 750 * a chain added to the topology is unable to flush and is 751 * then later deleted. 752 */ 753 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 754 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 755 (chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0) { 756 atomic_add_int(&parent->core.live_count, -1); 757 } 758 RB_REMOVE(hammer2_chain_tree, 759 &parent->core.rbtree, chain); 760 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 761 --parent->core.chain_count; 762 chain->parent = NULL; 763 } 764 765 /* 766 * If our chain was the last chain in the parent's core the 767 * core is now empty and its parent might have to be 768 * re-dropped if it has 0 refs. 769 */ 770 if (parent->core.chain_count == 0) { 771 rdrop = parent; 772 atomic_add_int(&rdrop->refs, 1); 773 /* 774 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 775 rdrop = NULL; 776 */ 777 } 778 hammer2_spin_unex(&parent->core.spin); 779 parent = NULL; /* safety */ 780 /* FALL THROUGH */ 781 } else { 782 /* 783 * No-parent case. 784 */ 785 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 786 /* 787 * 1->0 transition failed, retry. 788 */ 789 hammer2_spin_unex(&parent->core.spin); 790 hammer2_spin_unex(&chain->core.spin); 791 hammer2_mtx_unlock(&chain->lock); 792 793 return(chain); 794 } 795 } 796 797 /* 798 * Successful 1->0 transition, no parent, no children... no way for 799 * anyone to ref this chain any more. 
We can clean-up and free it. 800 * 801 * We still have the core spinlock, and core's chain_count is 0. 802 * Any parent spinlock is gone. 803 */ 804 hammer2_spin_unex(&chain->core.spin); 805 hammer2_chain_assert_no_data(chain); 806 hammer2_mtx_unlock(&chain->lock); 807 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 808 chain->core.chain_count == 0); 809 810 /* 811 * All locks are gone, no pointers remain to the chain, finish 812 * freeing it. 813 */ 814 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 815 HAMMER2_CHAIN_MODIFIED)) == 0); 816 817 /* 818 * Once chain resources are gone we can use the now dead chain 819 * structure to placehold what might otherwise require a recursive 820 * drop, because we have potentially two things to drop and can only 821 * return one directly. 822 */ 823 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 824 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 825 chain->hmp = NULL; 826 kfree_obj(chain, hmp->mchain); 827 atomic_add_long(&hammer2_chain_allocs, -1); 828 } 829 830 /* 831 * Possible chaining loop when parent re-drop needed. 832 */ 833 return(rdrop); 834 } 835 836 /* 837 * Heuristical flush of the LRU, try to reduce the number of entries 838 * on the LRU to (HAMMER2_LRU_LIMIT * 2 / 3). This procedure is called 839 * only when lru_count exceeds HAMMER2_LRU_LIMIT. 840 */ 841 static 842 void 843 hammer2_chain_lru_flush(hammer2_pfs_t *pmp) 844 { 845 hammer2_chain_t *chain; 846 847 again: 848 chain = NULL; 849 hammer2_spin_ex(&pmp->lru_spin); 850 while (pmp->lru_count > HAMMER2_LRU_LIMIT * 2 / 3) { 851 /* 852 * Pick a chain off the lru_list, just recycle it quickly 853 * if LRUHINT is set (the chain was ref'd but left on 854 * the lru_list, so cycle to the end). 
855 */ 856 chain = TAILQ_FIRST(&pmp->lru_list); 857 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 858 859 if (chain->flags & HAMMER2_CHAIN_LRUHINT) { 860 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_LRUHINT); 861 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 862 chain = NULL; 863 continue; 864 } 865 866 /* 867 * Ok, we are off the LRU. We must adjust refs before we 868 * can safely clear the ONLRU flag. 869 */ 870 atomic_add_int(&pmp->lru_count, -1); 871 if (atomic_cmpset_int(&chain->refs, 0, 1)) { 872 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 873 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); 874 break; 875 } 876 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 877 chain = NULL; 878 } 879 hammer2_spin_unex(&pmp->lru_spin); 880 if (chain == NULL) 881 return; 882 883 /* 884 * If we picked a chain off the lru list we may be able to lastdrop 885 * it. Use a depth of 1 to prevent excessive lastdrop recursion. 886 */ 887 while (chain) { 888 u_int refs; 889 890 refs = chain->refs; 891 cpu_ccfence(); 892 KKASSERT(refs > 0); 893 894 if (refs == 1) { 895 if (hammer2_mtx_ex_try(&chain->lock) == 0) 896 chain = hammer2_chain_lastdrop(chain, 1); 897 /* retry the same chain, or chain from lastdrop */ 898 } else { 899 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 900 break; 901 /* retry the same chain */ 902 } 903 cpu_pause(); 904 } 905 goto again; 906 } 907 908 /* 909 * On last lock release. 
910 */ 911 static hammer2_io_t * 912 hammer2_chain_drop_data(hammer2_chain_t *chain) 913 { 914 hammer2_io_t *dio; 915 916 if ((dio = chain->dio) != NULL) { 917 chain->dio = NULL; 918 chain->data = NULL; 919 } else { 920 switch(chain->bref.type) { 921 case HAMMER2_BREF_TYPE_VOLUME: 922 case HAMMER2_BREF_TYPE_FREEMAP: 923 break; 924 default: 925 if (chain->data != NULL) { 926 hammer2_spin_unex(&chain->core.spin); 927 panic("chain data not null: " 928 "chain %p bref %016jx.%02x " 929 "refs %d parent %p dio %p data %p", 930 chain, chain->bref.data_off, 931 chain->bref.type, chain->refs, 932 chain->parent, 933 chain->dio, chain->data); 934 } 935 KKASSERT(chain->data == NULL); 936 break; 937 } 938 } 939 return dio; 940 } 941 942 /* 943 * Lock a referenced chain element, acquiring its data with I/O if necessary, 944 * and specify how you would like the data to be resolved. 945 * 946 * If an I/O or other fatal error occurs, chain->error will be set to non-zero. 947 * 948 * The lock is allowed to recurse, multiple locking ops will aggregate 949 * the requested resolve types. Once data is assigned it will not be 950 * removed until the last unlock. 951 * 952 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 953 * (typically used to avoid device/logical buffer 954 * aliasing for data) 955 * 956 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 957 * the INITIAL-create state (indirect blocks only). 958 * 959 * Do not resolve data elements for DATA chains. 960 * (typically used to avoid device/logical buffer 961 * aliasing for data) 962 * 963 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 964 * 965 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 966 * it will be locked exclusive. 967 * 968 * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking. If 969 * the lock fails, EAGAIN is returned. 970 * 971 * NOTE: Embedded elements (volume header, inodes) are always resolved 972 * regardless. 
/*
 * Lock a referenced chain element, acquiring its data with I/O if necessary,
 * and specify how you would like the data to be resolved.
 *
 * If an I/O or other fatal error occurs, chain->error will be set to non-zero.
 *
 * The lock is allowed to recurse, multiple locking ops will aggregate
 * the requested resolve types.  Once data is assigned it will not be
 * removed until the last unlock.
 *
 * HAMMER2_RESOLVE_NEVER	- Do not resolve the data element.
 *				  (typically used to avoid device/logical
 *				  buffer aliasing for data)
 *
 * HAMMER2_RESOLVE_MAYBE	- Do not resolve data elements for chains in
 *				  the INITIAL-create state (indirect blocks
 *				  only).
 *
 *				  Do not resolve data elements for DATA chains.
 *				  (typically used to avoid device/logical
 *				  buffer aliasing for data)
 *
 * HAMMER2_RESOLVE_ALWAYS	- Always resolve the data element.
 *
 * HAMMER2_RESOLVE_SHARED	- (flag) The chain is locked shared, otherwise
 *				  it will be locked exclusive.
 *
 * HAMMER2_RESOLVE_NONBLOCK	- (flag) The chain is locked non-blocking.
 *				  If the lock fails, EAGAIN is returned.
 *
 * NOTE: Embedded elements (volume header, inodes) are always resolved
 *	 regardless.
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * NOTE: The return value is always 0 unless NONBLOCK is specified, in which
 *	 case it can be either 0 or EAGAIN.
 *
 * WARNING! This function blocks on I/O if data needs to be fetched.  This
 *	    blocking can run concurrent with other compatible lock holders
 *	    who do not need data returning.  The lock is not upgraded to
 *	    exclusive during a data fetch, a separate bit is used to
 *	    interlock I/O.  However, an exclusive lock holder can still count
 *	    on being interlocked against an I/O fetch managed by a shared
 *	    lock holder.
 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	KKASSERT(chain->refs > 0);

	if (how & HAMMER2_RESOLVE_NONBLOCK) {
		/*
		 * We still have to bump lockcnt before acquiring the lock,
		 * even for non-blocking operation, because the unlock code
		 * live-loops on lockcnt == 1 when dropping the last lock.
		 *
		 * If the non-blocking operation fails we have to use an
		 * unhold sequence to undo the mess.
		 *
		 * NOTE: LOCKAGAIN must always succeed without blocking,
		 *	 even if NONBLOCK is specified.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				if (hammer2_mtx_sh_try(&chain->lock) != 0) {
					hammer2_chain_unhold(chain);
					return EAGAIN;
				}
			}
		} else {
			if (hammer2_mtx_ex_try(&chain->lock) != 0) {
				hammer2_chain_unhold(chain);
				return EAGAIN;
			}
		}
	} else {
		/*
		 * Get the appropriate lock.  If LOCKAGAIN is flagged with
		 * SHARED the caller expects a shared lock to already be
		 * present and we are giving it another ref.  This case must
		 * importantly not block if there is a pending exclusive lock
		 * request.
		 */
		atomic_add_int(&chain->lockcnt, 1);
		if (how & HAMMER2_RESOLVE_SHARED) {
			if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
				hammer2_mtx_sh_again(&chain->lock);
			} else {
				hammer2_mtx_sh(&chain->lock);
			}
		} else {
			hammer2_mtx_ex(&chain->lock);
		}
	}

	/*
	 * If we already have a valid data pointer make sure the data is
	 * synchronized to the current cpu, and then no further action is
	 * necessary.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return 0;
	}

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expect the data to be there.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return 0;
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return 0;
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return 0;
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return 0;
#endif
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
	default:
		break;
	}

	/*
	 * Caller requires data
	 */
	hammer2_chain_load_data(chain);

	return 0;
}

#if 0
/*
 * Lock the chain, retain the hold, and drop the data persistence count.
 * The data should remain valid because we never transitioned lockcnt
 * through 0.
 */
void
hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
{
	hammer2_chain_lock(chain, how);
	atomic_add_int(&chain->lockcnt, -1);
}

/*
 * Downgrade an exclusive chain lock to a shared chain lock.
 *
 * NOTE: There is no upgrade equivalent due to the ease of
 *	 deadlocks in that direction.
 */
void
hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
{
	hammer2_mtx_downgrade(&chain->lock);
}
#endif

/*
 * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
 * may be of any type.
 *
 * Once chain->data is set it cannot be disposed of until all locks are
 * released.
 *
 * Make sure the data is synchronized to the current cpu.
 */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
	 */
	KKASSERT(chain->lock.mtx_lock & MTX_MASK);
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return;
	}
	if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Gain the IOINPROG bit, interlocked block.  Only one thread at a
	 * time may perform the I/O; other threads requesting data sleep
	 * on the chain's flags word until the owner finishes and wakes
	 * them via IOSIGNAL.
	 */
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		cpu_ccfence();
		if (oflags & HAMMER2_CHAIN_IOINPROG) {
			/*
			 * Another thread owns the I/O, request a wakeup
			 * and sleep.  tsleep_interlock closes the race
			 * between the cmpset and the tsleep.
			 */
			nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
			tsleep_interlock(&chain->flags, 0);
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				tsleep(&chain->flags, PINTERLOCKED,
				       "h2iocw", 0);
			}
			/* retry */
		} else {
			nflags = oflags | HAMMER2_CHAIN_IOINPROG;
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				break;
			}
			/* retry */
		}
	}

	/*
	 * We own CHAIN_IOINPROG
	 *
	 * Degenerate case if we raced another load.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		goto done;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		/* new block, no media read needed, buffer is zero'd */
		error = hammer2_io_new(hmp, bref->type,
				       bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		error = hammer2_io_bread(hmp, bref->type,
					 bref->data_off, chain->bytes,
					 &chain->dio);
		hammer2_adjreadcounter(chain->bref.type, chain->bytes);
	}
	if (error) {
		chain->error = HAMMER2_ERROR_EIO;
		kprintf("hammer2_chain_load_data: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		goto done;
	}
	chain->error = 0;

	/*
	 * This isn't perfect and can be ignored on OSs which do not have
	 * an indication as to whether a buffer is coming from cache or
	 * if I/O was actually issued for the read.  TESTEDGOOD will work
	 * pretty well without the B_IOISSUED logic because chains are
	 * cached, but in that situation (without B_IOISSUED) it will not
	 * detect whether a re-read via I/O is corrupted verses the original
	 * read.
	 *
	 * We can't re-run the CRC on every fresh lock.  That would be
	 * insanely expensive.
	 *
	 * If the underlying kernel buffer covers the entire chain we can
	 * use the B_IOISSUED indication to determine if we have to re-run
	 * the CRC on chain data for chains that managed to stay cached
	 * across the kernel disposal of the original buffer.
	 */
	if ((dio = chain->dio) != NULL && dio->bp) {
		//struct m_buf *bp = dio->bp;

		if (dio->psize == chain->bytes //&&
		    /*(bp->b_flags & B_IOISSUED)*/) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_TESTEDGOOD);
			//bp->b_flags &= ~B_IOISSUED;
		}
	}

	/*
	 * NOTE: A locked chain's data cannot be modified without first
	 *	 calling hammer2_chain_modify().
	 */

	/*
	 * NOTE: hammer2_io_data() call issues bkvasync()
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);

	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		/*
		 * Clear INITIAL.  In this case we used io_new() and the
		 * buffer has been zero'd and marked dirty.
		 *
		 * CHAIN_MODIFIED has not been set yet, and we leave it
		 * that way for now.  Set a temporary CHAIN_NOTTESTED flag
		 * to prevent hammer2_chain_testcheck() from trying to match
		 * a check code that has not yet been generated.  This bit
		 * should NOT end up on the actual media.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);
	} else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * check data not currently synchronized due to
		 * modification.  XXX assumes data stays in the buffer
		 * cache, which might not be true (need biodep on flush
		 * to calculate crc?  or simple crc?).
		 */
	} else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
		/*
		 * Clean chain whose check code has not been verified
		 * against this buffer instance yet.  Verify once and
		 * cache the result in TESTEDGOOD.
		 */
		if (hammer2_chain_testcheck(chain, bdata) == 0) {
			chain->error = HAMMER2_ERROR_CHECK;
		} else {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
		}
	}

	/*
	 * Setup the data pointer by pointing it into the buffer.
	 * WARNING! Other threads can start using the data the instant we
	 *	    set chain->data non-NULL.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/* embedded types must never reach the I/O path */
		panic("hammer2_chain_load_data: unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		KKASSERT(chain->bytes != 0);
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave dio intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
	 */
done:
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
				    HAMMER2_CHAIN_IOSIGNAL);
		KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
		if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
			if (oflags & HAMMER2_CHAIN_IOSIGNAL)
				wakeup(&chain->flags);
			break;
		}
	}
}

/*
 * Unlock and deref a chain element.
 *
 * Remember that the presence of children under chain prevent the chain's
 * destruction but do not add additional references, so the dio will still
 * be dropped.
 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;
	u_int lockcnt;
	int iter = 0;

	/*
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				hammer2_mtx_unlock(&chain->lock);
				return;
			}
		} else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
			/* while holding the mutex exclusively */
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race2 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race2", 1);
			}
			cpu_pause();
		}
		/* retry */
	}

	/*
	 * Last unlock / mutex upgraded to exclusive.  Drop the data
	 * reference.
	 */
	dio = hammer2_chain_drop_data(chain);
	if (dio)
		hammer2_io_bqrelse(&dio);
	hammer2_mtx_unlock(&chain->lock);
}

#if 0
/*
 * Unlock and hold chain data intact
 */
void
hammer2_chain_unlock_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}
#endif
1420 */ 1421 static 1422 hammer2_blockref_t * 1423 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1424 { 1425 hammer2_blockref_t *base; 1426 int count; 1427 1428 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1429 base = NULL; 1430 1431 switch(parent->bref.type) { 1432 case HAMMER2_BREF_TYPE_INODE: 1433 count = HAMMER2_SET_COUNT; 1434 break; 1435 case HAMMER2_BREF_TYPE_INDIRECT: 1436 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1437 count = parent->bytes / sizeof(hammer2_blockref_t); 1438 break; 1439 case HAMMER2_BREF_TYPE_VOLUME: 1440 count = HAMMER2_SET_COUNT; 1441 break; 1442 case HAMMER2_BREF_TYPE_FREEMAP: 1443 count = HAMMER2_SET_COUNT; 1444 break; 1445 default: 1446 panic("hammer2_chain_base_and_count: " 1447 "unrecognized blockref type: %d", 1448 parent->bref.type); 1449 count = 0; 1450 break; 1451 } 1452 } else { 1453 switch(parent->bref.type) { 1454 case HAMMER2_BREF_TYPE_INODE: 1455 base = &parent->data->ipdata.u.blockset.blockref[0]; 1456 count = HAMMER2_SET_COUNT; 1457 break; 1458 case HAMMER2_BREF_TYPE_INDIRECT: 1459 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1460 base = &parent->data->npdata[0]; 1461 count = parent->bytes / sizeof(hammer2_blockref_t); 1462 break; 1463 case HAMMER2_BREF_TYPE_VOLUME: 1464 base = &parent->data->voldata. 1465 sroot_blockset.blockref[0]; 1466 count = HAMMER2_SET_COUNT; 1467 break; 1468 case HAMMER2_BREF_TYPE_FREEMAP: 1469 base = &parent->data->blkset.blockref[0]; 1470 count = HAMMER2_SET_COUNT; 1471 break; 1472 default: 1473 panic("hammer2_chain_base_and_count: " 1474 "unrecognized blockref type: %d", 1475 parent->bref.type); 1476 base = NULL; 1477 count = 0; 1478 break; 1479 } 1480 } 1481 *countp = count; 1482 1483 return base; 1484 } 1485 1486 /* 1487 * This counts the number of live blockrefs in a block array and 1488 * also calculates the point at which all remaining blockrefs are empty. 1489 * This routine can only be called on a live chain. 
/*
 * This counts the number of live blockrefs in a block array and
 * also calculates the point at which all remaining blockrefs are empty.
 * This routine can only be called on a live chain.
 *
 * Caller holds the chain locked, but possibly with a shared lock.  We
 * must use an exclusive spinlock to prevent corruption.
 *
 * NOTE: Flag is not set until after the count is complete, allowing
 *	 callers to test the flag without holding the spinlock.
 *
 * NOTE: If base is NULL the related chain is still in the INITIAL
 *	 state and there are no blockrefs to count.
 *
 * NOTE: live_count may already have some counts accumulated due to
 *	 creation and deletion and could even be initially negative.
 */
void
hammer2_chain_countbrefs(hammer2_chain_t *chain,
			 hammer2_blockref_t *base, int count)
{
	hammer2_spin_ex(&chain->core.spin);
	if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
		if (base) {
			/*
			 * Scan backwards from the end to locate the last
			 * non-empty entry; live_zero is one past it, i.e.
			 * all entries at index >= live_zero are empty.
			 */
			while (--count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					break;
			}
			chain->core.live_zero = count + 1;
			/*
			 * Accumulate the live entries at or below that
			 * point into live_count (which may have prior
			 * adjustments, see NOTE above).
			 */
			while (count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					atomic_add_int(&chain->core.live_count,
						       1);
				--count;
			}
		} else {
			chain->core.live_zero = 0;
		}
		/* else do not modify live_count */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
	}
	hammer2_spin_unex(&chain->core.spin);
}
/*
 * Resize the chain's physical storage allocation in-place.  This function does
 * not usually adjust the data pointer and must be followed by (typically) a
 * hammer2_chain_modify() call to copy any old data over and adjust the
 * data pointer.
 *
 * Chains can be resized smaller without reallocating the storage.  Resizing
 * larger will reallocate the storage.  Excess or prior storage is reclaimed
 * asynchronously at a later time.
 *
 * An nradix value of 0 is special-cased to mean that the storage should
 * be disassociated, that is the chain is being resized to 0 bytes (not 1
 * byte).
 *
 * Must be passed an exclusively locked parent and chain.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode data
 * buffer.  However, because H2 can compress or encrypt data, the chain may
 * have a dio assigned to it in those situations, and they do not conflict.
 *
 * XXX return error if cannot resize.
 */
int
hammer2_chain_resize(hammer2_chain_t *chain,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off,
		     int nradix, int flags)
{
	hammer2_dev_t *hmp;
	size_t obytes;
	size_t nbytes;
	int error;

	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);

	/*
	 * Nothing to do if the element is already the proper size
	 */
	obytes = chain->bytes;
	nbytes = (nradix) ? (1U << nradix) : 0;		/* nradix 0 -> 0 bytes */
	if (obytes == nbytes)
		return (chain->error);

	/*
	 * Make sure the old data is instantiated so we can copy it.  If this
	 * is a data block, the device data may be superfluous since the data
	 * might be in a logical block, but compressed or encrypted data is
	 * another matter.
	 *
	 * NOTE: The modify will set BLKMAPUPD for us if BLKMAPPED is set.
	 */
	error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
	if (error)
		return error;

	/*
	 * Reallocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 *
	 * NOTE: Operation does not copy the data and may only be used
	 *	 to resize data blocks in-place, or directory entry blocks
	 *	 which are about to be modified in some manner.
	 */
	error = hammer2_freemap_alloc(chain, nbytes);
	if (error)
		return error;

	chain->bytes = nbytes;

	/*
	 * We don't want the followup chain_modify() to try to copy data
	 * from the old (wrong-sized) buffer.  It won't know how much to
	 * copy.  This case should only occur during writes when the
	 * originator already has the data to write in-hand.
	 */
	if (chain->dio) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
		hammer2_io_brelse(&chain->dio);
		chain->data = NULL;
	}
	return (chain->error);
}
/*
 * Set the chain modified so its data can be changed by the caller, or
 * install deduplicated data.  The caller must call this routine for each
 * set of modifications it makes, even if the chain is already flagged
 * MODIFIED.
 *
 * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
 * is a CLC (cluster level change) field and is not updated by parent
 * propagation during a flush.
 *
 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
 * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
 * remains unmodified with its old data ref intact and chain->error
 * unchanged.
 *
 *				Dedup Handling
 *
 * If the DEDUPABLE flag is set in the chain the storage must be reallocated
 * even if the chain is still flagged MODIFIED.  In this case the chain's
 * DEDUPABLE flag will be cleared once the new storage has been assigned.
 *
 * If the caller passes a non-zero dedup_off we will use it to assign the
 * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
 * must not modify the data content upon return.
 */
int
hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
		     hammer2_off_t dedup_off, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	int error;
	int wasinitial;
	int setmodified;	/* MODIFIED was set here (undo on error) */
	int setupdate;		/* UPDATE was set here (undo on error) */
	int newmod;		/* non-zero if a new allocation is needed */
	char *bdata;

	hmp = chain->hmp;
	KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE);

	/*
	 * Data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * Data must be resolved if already assigned, unless explicitly
	 * flagged otherwise.  If we cannot safely load the data the
	 * modification fails and we return early.
	 */
	if (chain->data == NULL && chain->bytes != 0 &&
	    (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_load_data(chain);
		if (chain->error)
			return (chain->error);
	}
	error = 0;

	/*
	 * Set MODIFIED to indicate that the chain has been modified.  A new
	 * allocation is required when modifying a chain.
	 *
	 * Set UPDATE to ensure that the blockref is updated in the parent.
	 *
	 * If MODIFIED is already set determine if we can reuse the assigned
	 * data block or if we need a new data block.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		/*
		 * Must set modified bit.
		 */
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(chain->pmp);	/* can be NULL */
		setmodified = 1;

		/*
		 * We may be able to avoid a copy-on-write if the chain's
		 * check mode is set to NONE and the chain's current
		 * modify_tid is beyond the last explicit snapshot tid.
		 *
		 * This implements HAMMER2's overwrite-in-place feature.
		 *
		 * NOTE! This data-block cannot be used as a de-duplication
		 *	 source when the check mode is set to NONE.
		 */
		if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		     chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
		    (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
		    (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
		    HAMMER2_DEC_CHECK(chain->bref.methods) ==
		     HAMMER2_CHECK_NONE &&
		    chain->pmp &&
		    chain->bref.modify_tid >
		     chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * Sector overwrite allowed.
			 */
			newmod = 0;
		} else if ((hmp->hflags & HMNT2_EMERG) &&
			   chain->pmp &&
			   chain->bref.modify_tid >
			    chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * If in emergency delete mode then do a modify-in-
			 * place on any chain type belonging to the PFS as
			 * long as it doesn't mess up a snapshot.  We might
			 * be forced to do this anyway a little further down
			 * in the code if the allocation fails.
			 *
			 * Also note that in emergency mode, these modify-in-
			 * place operations are NOT SAFE.  A storage failure,
			 * power failure, or panic can corrupt the filesystem.
			 */
			newmod = 0;
		} else {
			/*
			 * Sector overwrite not allowed, must copy-on-write.
			 */
			newmod = 1;
		}
	} else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
		/*
		 * If the modified chain was registered for dedup we need
		 * a new allocation.  This only happens for delayed-flush
		 * chains (i.e. which run through the front-end buffer
		 * cache).
		 */
		newmod = 1;
		setmodified = 0;
	} else {
		/*
		 * Already flagged modified, no new allocation is needed.
		 */
		newmod = 0;
		setmodified = 0;
	}

	/*
	 * Flag parent update required.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		setupdate = 1;
	} else {
		setupdate = 0;
	}

	/*
	 * The XOP code returns held but unlocked focus chains.  This
	 * prevents the chain from being destroyed but does not prevent
	 * it from being modified.  diolk is used to interlock modifications
	 * against XOP frontend accesses to the focus.
	 *
	 * This allows us to theoretically avoid deadlocking the frontend
	 * if one of the backends lock up by not formally locking the
	 * focused chain in the frontend.  In addition, the synchronization
	 * code relies on this mechanism to avoid deadlocking concurrent
	 * synchronization threads.
	 */
	lockmgr(&chain->diolk, LK_EXCLUSIVE);

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.  If an error occurs, the previous content and data
	 * reference is retained and the modification fails.
	 *
	 * If dedup_off is non-zero, the caller is requesting a deduplication
	 * rather than a modification.  The MODIFIED bit is not set and the
	 * data offset is set to the deduplication offset.  The data cannot
	 * be modified.
	 *
	 * NOTE: The dedup offset is allowed to be in a partially free state
	 *	 and we must be sure to reset it to a fully allocated state
	 *	 to force two bulkfree passes to free it again.
	 *
	 * NOTE: Only applicable when chain->bytes != 0.
	 *
	 * XXX can a chain already be marked MODIFIED without a data
	 * assignment?  If not, assert here instead of testing the case.
	 */
	if (chain != &hmp->vchain && chain != &hmp->fchain &&
	    chain->bytes) {
		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
		    newmod
		) {
			/*
			 * NOTE: We do not have to remove the dedup
			 *	 registration because the area is still
			 *	 allocated and the underlying DIO will
			 *	 still be flushed.
			 */
			if (dedup_off) {
				/*
				 * Install the dedup offset as the chain's
				 * storage.  The radix portion of dedup_off
				 * encodes the block size.
				 */
				chain->bref.data_off = dedup_off;
				if ((int)(dedup_off & HAMMER2_OFF_MASK_RADIX))
					chain->bytes = 1 <<
						(int)(dedup_off &
						HAMMER2_OFF_MASK_RADIX);
				else
					chain->bytes = 0;
				chain->error = 0;
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_MODIFIED);
				atomic_add_long(&hammer2_count_modified_chains,
						-1);
				if (chain->pmp) {
					hammer2_pfs_memory_wakeup(
						chain->pmp, -1);
				}
				hammer2_freemap_adjust(hmp, &chain->bref,
						HAMMER2_FREEMAP_DORECOVER);
				atomic_set_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			} else {
				error = hammer2_freemap_alloc(chain,
							      chain->bytes);
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);

				/*
				 * If we are unable to allocate a new block
				 * but we are in emergency mode, issue a
				 * warning to the console and reuse the same
				 * block.
				 *
				 * We behave as if the allocation were
				 * successful.
				 *
				 * THIS IS IMPORTANT: These modifications
				 * are virtually guaranteed to corrupt any
				 * snapshots related to this filesystem.
				 */
				if (error && (hmp->hflags & HMNT2_EMERG)) {
					error = 0;
					chain->bref.flags |=
						HAMMER2_BREF_FLAG_EMERG_MIP;

					krateprintf(&krate_h2em,
					    "hammer2: Emergency Mode WARNING: "
					    "Operation will likely corrupt "
					    "related snapshot: "
					    "%016jx.%02x key=%016jx\n",
					    chain->bref.data_off,
					    chain->bref.type,
					    chain->bref.key);
				} else if (error == 0) {
					chain->bref.flags &=
						~HAMMER2_BREF_FLAG_EMERG_MIP;
				}
			}
		}
	}

	/*
	 * Stop here if error.  We have to undo any flag bits we might
	 * have set above.
	 */
	if (error) {
		if (setmodified) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			atomic_add_long(&hammer2_count_modified_chains, -1);
			if (chain->pmp)
				hammer2_pfs_memory_wakeup(chain->pmp, -1);
		}
		if (setupdate) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
		lockmgr(&chain->diolk, LK_RELEASE);

		return error;
	}

	/*
	 * Update mirror_tid and modify_tid.  modify_tid is only updated
	 * if not passed as zero (during flushes, parent propagation passes
	 * the value 0).
	 *
	 * NOTE: chain->pmp could be the device spmp.
	 */
	chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
	if (mtid)
		chain->bref.modify_tid = mtid;

	/*
	 * Set BLKMAPUPD to tell the flush code that an existing blockmap entry
	 * requires updating as well as to tell the delete code that the
	 * chain's blockref might not exactly match (in terms of physical size
	 * or block offset) the one in the parent's blocktable.  The base key
	 * of course will still match.
	 */
	if (chain->flags & HAMMER2_CHAIN_BLKMAPPED)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD);

	/*
	 * Short-cut data block handling when the caller does not need an
	 * actual data reference to (aka OPTDATA), as long as the chain does
	 * not already have a data pointer to the data and no de-duplication
	 * occurred.
	 *
	 * This generally means that the modifications are being done via the
	 * logical buffer cache.
	 *
	 * NOTE: If deduplication occurred we have to run through the data
	 *	 stuff to clear INITIAL, and the caller will likely want to
	 *	 assign the check code anyway.  Leaving INITIAL set on a
	 *	 dedup can be deadly (it can cause the block to be zero'd!).
	 *
	 * This code also handles bytes == 0 (most dirents).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
	    (flags & HAMMER2_MODIFY_OPTDATA) &&
	    chain->data == NULL) {
		if (dedup_off == 0) {
			KKASSERT(chain->dio == NULL);
			goto skip2;
		}
	}

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 *
	 * Clearing of NOTTESTED is allowed if the MODIFIED bit is set,
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->dio == NULL);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		/*
		 * The data might be fully embedded.
		 */
		if (chain->bytes == 0) {
			KKASSERT(chain->dio == NULL);
			break;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 *
		 * zero-fill or copy-on-write depending on whether
		 * chain->data exists or not and set the dirty state for
		 * the new buffer.  hammer2_io_new() will handle the
		 * zero-fill.
		 *
		 * If a dedup_off was supplied this is an existing block
		 * and no COW, copy, or further modification is required.
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		if (wasinitial && dedup_off == 0) {
			error = hammer2_io_new(hmp, chain->bref.type,
					       chain->bref.data_off,
					       chain->bytes, &dio);
		} else {
			error = hammer2_io_bread(hmp, chain->bref.type,
						 chain->bref.data_off,
						 chain->bytes, &dio);
		}
		hammer2_adjreadcounter(chain->bref.type, chain->bytes);

		/*
		 * If an I/O error occurs make sure callers cannot accidently
		 * modify the old buffer's contents and corrupt the filesystem.
		 *
		 * NOTE: hammer2_io_data() call issues bkvasync()
		 */
		if (error) {
			kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
				hmp);
			chain->error = HAMMER2_ERROR_EIO;
			hammer2_io_brelse(&dio);
			hammer2_io_brelse(&chain->dio);
			chain->data = NULL;
			break;
		}
		chain->error = 0;
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		if (chain->data) {
			/*
			 * COW (unless a dedup).
			 */
			KKASSERT(chain->dio != NULL);
			if (chain->data != (void *)bdata && dedup_off == 0) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial == 0 && dedup_off == 0) {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}

		/*
		 * Retire the old buffer, replace with the new.  Dirty or
		 * redirty the new buffer.
		 *
		 * WARNING! The system buffer cache may have already flushed
		 *	    the buffer, so we must be sure to [re]dirty it
		 *	    for further modification.
		 *
		 *	    If dedup_off was supplied, the caller is not
		 *	    expected to make any further modification to the
		 *	    buffer.
		 *
		 * WARNING! hammer2_get_gdata() assumes dio never transitions
		 *	    through NULL in order to optimize away unnecessary
		 *	    diolk operations.
		 */
		{
			hammer2_io_t *tio;

			if ((tio = chain->dio) != NULL)
				hammer2_io_bqrelse(&tio);
			chain->data = (void *)bdata;
			chain->dio = dio;
			if (dedup_off == 0)
				hammer2_io_setdirty(dio);
		}
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	lockmgr(&chain->diolk, LK_RELEASE);

	return (chain->error);
}
2100 */ 2101 int 2102 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain, 2103 hammer2_tid_t mtid, int flags) 2104 { 2105 int error; 2106 2107 hammer2_inode_modify(ip); 2108 error = hammer2_chain_modify(chain, mtid, 0, flags); 2109 2110 return error; 2111 } 2112 2113 /* 2114 * This function returns the chain at the nearest key within the specified 2115 * range. The returned chain will be referenced but not locked. 2116 * 2117 * This function will recurse through chain->rbtree as necessary and will 2118 * return a *key_nextp suitable for iteration. *key_nextp is only set if 2119 * the iteration value is less than the current value of *key_nextp. 2120 * 2121 * The caller should use (*key_nextp) to calculate the actual range of 2122 * the returned element, which will be (key_beg to *key_nextp - 1), because 2123 * there might be another element which is superior to the returned element 2124 * and overlaps it. 2125 * 2126 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL 2127 * chains continue to be returned. On EOF (*key_nextp) may overflow since 2128 * it will wind up being (key_end + 1). 2129 * 2130 * WARNING! Must be called with child's spinlock held. Spinlock remains 2131 * held through the operation. 
 */
struct hammer2_chain_find_info {
	hammer2_chain_t	*best;		/* best candidate found so far, or NULL */
	hammer2_key_t	key_beg;	/* inclusive start of search range */
	hammer2_key_t	key_end;	/* inclusive end of search range */
	hammer2_key_t	key_next;	/* iteration aid, only ever clipped down */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

/*
 * Scan the parent's in-memory RB tree for the chain nearest key_beg
 * within [key_beg, key_end] (see the block comment above for the full
 * contract).  Returns the best matching chain (neither referenced nor
 * locked) or NULL, and clips *key_nextp downward for iteration.
 */
static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
		hammer2_chain_find_cmp, hammer2_chain_find_callback,
		&info);
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN pre-filter.  Classify (child) against the search range:
 * return -1 if the child's entire key range lies below the range,
 * +1 if entirely above it, and 0 on overlap (callback will run).
 * The child's range is [key, key + 2^keybits - 1], inclusive.
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

/*
 * RB_SCAN callback.  Track the overlapping candidate with the lowest
 * base key and clip info->key_next so the caller knows where the
 * returned element's effective range ends.  A key_next of 0 is treated
 * as 'unset' (it can also wrap to 0 on overflow at the top of the key
 * space, which is handled the same way).
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap (two elements cannot both start at or
		 * below key_beg).
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key
		 * (the displaced candidate bounds the new best's range).
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 * (child_end here is exclusive; 0 means it wrapped at the top
	 * of the key space and imposes no limit).
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  The returned chain is
 * held and locked according to (how) (HAMMER2_RESOLVE_*).  The caller must
 * handle crc-checks and so forth, and should check chain->error before
 * assuming that the data is good.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.
 * Insert races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * WARNING! chain->pmp is always set to NULL for any chain representing
 *	    part of the super-root topology.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref, int how)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 *
	 * A PFS root is allocated with a NULL pmp (super-root topology);
	 * all other chains inherit the parent's pmp.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED);

	/*
	 * chain must be locked to avoid unexpected ripouts
	 */
	hammer2_chain_lock(chain, how);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		/* lost the insertion race, discard our chain */
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain referenced but not locked, or NULL if
	 * a race occurred.
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API.
 *
 * _init refs and locks the parent (shared or exclusive per flags) for
 * use with the lookup/next iteration API; _done undoes both.
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	hammer2_chain_ref(parent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	/* NULL parent is allowed (no-op) */
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
}

/*
 * Take the locked chain and return a locked parent.  The chain remains
 * locked on return, but may have to be temporarily unlocked to acquire
 * the parent.  Because of this, (chain) must be stable and cannot be
 * deleted while it was temporarily unlocked (typically means that (chain)
 * is an inode).
 *
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t *chain, int flags)
{
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first
	 * with a non-blocking lock attempt.
	 */
	parent = chain->parent;
	if (parent == NULL)
		panic("hammer2_chain_getparent: no parent");
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0)
		return parent;

	for (;;) {
		/*
		 * Lock-order reversal: drop the chain lock, then take
		 * parent and chain top-down.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, flags);
		hammer2_chain_lock(chain, flags);

		/*
		 * Parent relinking races are quite common.  We have to get
		 * it right or we will blow up the block table.  Retry with
		 * the chain's current parent if it moved while unlocked.
		 */
		if (chain->parent == parent)
			break;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		cpu_ccfence();
		parent = chain->parent;
		if (parent == NULL)
			panic("hammer2_chain_getparent: no parent");
		hammer2_chain_ref(parent);
	}
	return parent;
}

/*
 * Take the locked chain and return a locked parent.  The chain is unlocked
 * and dropped.  *chainp is set to the returned parent as a convenience.
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * The chain does NOT need to be stable.  We use a tracking structure
 * to track the expected parent if the chain is deleted out from under us.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	struct hammer2_reptrack reptrack;
	struct hammer2_reptrack **repp;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first
	 * with a non-blocking lock attempt.
	 */
	chain = *chainp;
	parent = chain->parent;
	if (parent == NULL) {
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_repparent: no parent");
	}
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		*chainp = parent;

		return parent;
	}

	/*
	 * Ok, now it gets a bit nasty.  There are multiple situations where
	 * the parent might be in the middle of a deletion, or where the child
	 * (chain) might be deleted the instant we let go of its lock.
	 * We can potentially end up in a no-win situation!
	 *
	 * In particular, the indirect_maintenance() case can cause these
	 * situations.
	 *
	 * To deal with this we install a reptrack structure in the parent
	 * This reptrack structure 'owns' the parent ref and will automatically
	 * migrate to the parent's parent if the parent is deleted permanently.
	 */
	hammer2_spin_init(&reptrack.spin, "h2reptrk");
	reptrack.chain = parent;
	hammer2_chain_ref(parent);		/* for the reptrack */

	hammer2_spin_ex(&parent->core.spin);
	reptrack.next = parent->core.reptrack;
	parent->core.reptrack = &reptrack;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;	/* gone */

	/*
	 * At the top of this loop, chain is gone and parent is refd both
	 * by us explicitly AND via our reptrack.  We are attempting to
	 * lock parent.
	 */
	for (;;) {
		hammer2_chain_lock(parent, flags);

		if (reptrack.chain == parent)
			break;
		/*
		 * The reptrack migrated while we blocked on the lock;
		 * chase the new target.
		 */
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);

		kprintf("hammer2: debug REPTRACK %p->%p\n",
			parent, reptrack.chain);
		hammer2_spin_ex(&reptrack.spin);
		parent = reptrack.chain;
		hammer2_chain_ref(parent);
		hammer2_spin_unex(&reptrack.spin);
	}

	/*
	 * Once parent is locked and matches our reptrack, our reptrack
	 * will be stable and we have our parent.  We can unlink our
	 * reptrack.
	 *
	 * WARNING!  Remember that the chain lock might be shared.  Chains
	 *	     locked shared have stable parent linkages.
	 */
	hammer2_spin_ex(&parent->core.spin);
	repp = &parent->core.reptrack;
	while (*repp != &reptrack)
		repp = &(*repp)->next;
	*repp = reptrack.next;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_drop(parent);	/* reptrack ref */
	*chainp = parent;		/* return parent lock+ref */

	return parent;
}

/*
 * Dispose of any linked reptrack structures in (chain) by shifting them to
 * (parent).  Both (chain) and (parent) must be exclusively locked.
 *
 * This is interlocked against any children of (chain) on the other side.
 * No children should remain as-of when this is called so we can test
 * core.reptrack without holding the spin-lock.
 *
 * Used whenever the caller intends to permanently delete chains related
 * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE),
 * where the chains underneath the node being deleted are given a new parent
 * above the node being deleted.
2521 */ 2522 static 2523 void 2524 hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain) 2525 { 2526 struct hammer2_reptrack *reptrack; 2527 2528 KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)); 2529 while (chain->core.reptrack) { 2530 hammer2_spin_ex(&parent->core.spin); 2531 hammer2_spin_ex(&chain->core.spin); 2532 reptrack = chain->core.reptrack; 2533 if (reptrack == NULL) { 2534 hammer2_spin_unex(&chain->core.spin); 2535 hammer2_spin_unex(&parent->core.spin); 2536 break; 2537 } 2538 hammer2_spin_ex(&reptrack->spin); 2539 chain->core.reptrack = reptrack->next; 2540 reptrack->chain = parent; 2541 reptrack->next = parent->core.reptrack; 2542 parent->core.reptrack = reptrack; 2543 hammer2_chain_ref(parent); /* reptrack */ 2544 2545 hammer2_spin_unex(&chain->core.spin); 2546 hammer2_spin_unex(&parent->core.spin); 2547 kprintf("hammer2: debug repchange %p %p->%p\n", 2548 reptrack, chain, parent); 2549 hammer2_chain_drop(chain); /* reptrack */ 2550 } 2551 } 2552 2553 /* 2554 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 2555 * (*parentp) typically points to an inode but can also point to a related 2556 * indirect block and this function will recurse upwards and find the inode 2557 * or the nearest undeleted indirect block covering the key range. 2558 * 2559 * This function unconditionally sets *errorp, replacing any previous value. 2560 * 2561 * (*parentp) must be exclusive or shared locked (depending on flags) and 2562 * referenced and can be an inode or an existing indirect block within the 2563 * inode. 2564 * 2565 * If (*parent) is errored out, this function will not attempt to recurse 2566 * the radix tree and will return NULL along with an appropriate *errorp. 2567 * If NULL is returned and *errorp is 0, the requested lookup could not be 2568 * located. 
2569 * 2570 * On return (*parentp) will be modified to point at the deepest parent chain 2571 * element encountered during the search, as a helper for an insertion or 2572 * deletion. 2573 * 2574 * The new (*parentp) will be locked shared or exclusive (depending on flags), 2575 * and referenced, and the old will be unlocked and dereferenced (no change 2576 * if they are both the same). This is particularly important if the caller 2577 * wishes to insert a new chain, (*parentp) will be set properly even if NULL 2578 * is returned, as long as no error occurred. 2579 * 2580 * The matching chain will be returned locked according to flags. 2581 * 2582 * -- 2583 * 2584 * NULL is returned if no match was found, but (*parentp) will still 2585 * potentially be adjusted. 2586 * 2587 * On return (*key_nextp) will point to an iterative value for key_beg. 2588 * (If NULL is returned (*key_nextp) is set to (key_end + 1)). 2589 * 2590 * This function will also recurse up the chain if the key is not within the 2591 * current parent's range. (*parentp) can never be set to NULL. An iteration 2592 * can simply allow (*parentp) to float inside the loop. 2593 * 2594 * NOTE! chain->data is not always resolved. By default it will not be 2595 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 2596 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 2597 * BREF_TYPE_DATA as the device buffer can alias the logical file 2598 * buffer). 
 */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;	/* sanity limit on (re)iterations */

	/*
	 * Translate HAMMER2_LOOKUP_* flags into the lock/resolve modes
	 * used for intermediate (how_maybe) and leaf (how) chains.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:
	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");

	/*
	 * MATCHIND case that does not require parent->data (do prior to
	 * parent->error check).
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				/* exact range match, return the parent itself */
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}
		break;
	default:
		break;
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			hammer2_chain_lock(parent, how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 *
		 * Debugging: Enter permanent wait state instead of
		 * panicing on unexpectedly NULL data for the moment.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL) {
				kprintf("hammer2: unexpected NULL data "
					"on %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search over the media blockref array and the in-memory
	 * chain topology.
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		KKASSERT(chain == NULL);
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.  Save the bref so we
	 * can re-validate it after acquiring the chain lock (it may change
	 * once the spinlock is released).
	 */
	bsave = *bref;
	if (chain == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how_maybe);
		} else {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how);
		}
		if (chain == NULL)
			goto again;
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);

		/*
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			hammer2_chain_lock(chain, how_maybe);
		} else {
			hammer2_chain_lock(chain, how);
		}
		KKASSERT(chain->parent == parent);
	}
	/* re-validate: retry if the bref or parentage changed under us */
	if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
	    chain->parent != parent) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		goto again;
	}


	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		chain = NULL;	/* SAFETY */
		goto again;
	}
done:
	/*
	 * All done, return the locked chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE! A chain->error must be tested by the caller upon return.
	 *	 *errorp is only set based on issues which occur while
	 *	 trying to reach the chain.
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after its index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.  This
 * chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/* advance past the previous chain's key range */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 * (key_beg == 0 means the key computation wrapped past the
		 * top of the key space.)
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}

/*
 * Caller wishes to iterate chains under parent, loading new chains into
 * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
 * then call hammer2_chain_scan() repeatedly until a non-zero return.
 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
 * with the returned chain for the scan.  The returned *chainp will be
 * locked and referenced.  Any prior contents will be unlocked and dropped.
 *
 * Caller should check the return value.  A normal scan EOF will return
 * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
 * error trying to access parent data.  Any error in the returned chain
 * must be tested separately by the caller.
 *
 * (*chainp) is dropped on each scan, but will only be set if the returned
 * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
 * returned via *chainp.  The caller will get their bref only.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_bref = (parent, NULL) and
 * next_chain = (parent, bref).
 *
 * The passed-in parent must be locked and its data resolved.  The function
 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
 * in).  The caller must check the chain->bref.type either way.
 */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how;
	int generation;
	int maxloops = 300000;	/* sanity bound on retry loop, panics if hit */
	int error;

	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow
	 * (wrap to 0), which terminates the scan with EOF.
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.  An INITIAL block has no media backing store yet,
		 * so there is no blockref array to scan (base == NULL).
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_scan: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	/* snapshot generation to detect racing inserts after we unlock */
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation,
						  bref, how);
			if (chain == NULL)
				goto again;	/* lost race, retry */
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
		hammer2_chain_lock(chain, how);
	}
	/*
	 * Revalidate after relocking: if the chain's bref changed or it
	 * was moved to another parent while we did not hold the spinlock,
	 * retry the whole scan step.
	 */
	if (chain &&
	    (bcmp(bref, &chain->bref, sizeof(*bref)) ||
	     chain->parent != parent)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
		goto again;
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it under (*parentp).  This is a full
 * insertion, based on the supplied key/keybits, and may involve creating
 * indirect blocks and moving other chains around via delete/duplicate.
 *
 * This call can be made with parent == NULL as long as a non -1 methods
 * is supplied.  hmp must also be supplied in this situation (otherwise
 * hmp is extracted from the supplied parent).  The chain will be detached
 * from the topology.  A later call with both parent and chain can be made
 * to attach it.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * (*parentp) must be exclusive locked and may be replaced on return
 * depending on how much work the function had to do.
 *
 * (*parentp) must not be errored or this function will assert.
 *
 * (*chainp) usually starts out NULL and returns the newly created chain,
 * but if the caller desires the caller may allocate a disconnected chain
 * and pass it in instead.
 *
 * This function should NOT be used to insert INDIRECT blocks.  It is
 * typically used to create/insert inodes and data blocks.
3293 * 3294 * Caller must pass-in an exclusively locked parent the new chain is to 3295 * be inserted under, and optionally pass-in a disconnected, exclusively 3296 * locked chain to insert (else we create a new chain). The function will 3297 * adjust (*parentp) as necessary, create or connect the chain, and 3298 * return an exclusively locked chain in *chainp. 3299 * 3300 * When creating a PFSROOT inode under the super-root, pmp is typically NULL 3301 * and will be reassigned. 3302 * 3303 * NOTE: returns HAMMER_ERROR_* flags 3304 */ 3305 int 3306 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp, 3307 hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods, 3308 hammer2_key_t key, int keybits, int type, size_t bytes, 3309 hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags) 3310 { 3311 hammer2_chain_t *chain; 3312 hammer2_chain_t *parent; 3313 hammer2_blockref_t *base; 3314 hammer2_blockref_t dummy; 3315 int allocated = 0; 3316 int error = 0; 3317 int count; 3318 int maxloops = 300000; 3319 3320 /* 3321 * Topology may be crossing a PFS boundary. 3322 */ 3323 parent = *parentp; 3324 if (parent) { 3325 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3326 KKASSERT(parent->error == 0); 3327 hmp = parent->hmp; 3328 } 3329 chain = *chainp; 3330 3331 if (chain == NULL) { 3332 /* 3333 * First allocate media space and construct the dummy bref, 3334 * then allocate the in-memory chain structure. Set the 3335 * INITIAL flag for fresh chains which do not have embedded 3336 * data. 3337 */ 3338 bzero(&dummy, sizeof(dummy)); 3339 dummy.type = type; 3340 dummy.key = key; 3341 dummy.keybits = keybits; 3342 dummy.data_off = hammer2_getradix(bytes); 3343 3344 /* 3345 * Inherit methods from parent by default. Primarily used 3346 * for BREF_TYPE_DATA. Non-data types *must* be set to 3347 * a non-NONE check algorithm. 
3348 */ 3349 if (methods == HAMMER2_METH_DEFAULT) 3350 dummy.methods = parent->bref.methods; 3351 else 3352 dummy.methods = (uint8_t)methods; 3353 3354 if (type != HAMMER2_BREF_TYPE_DATA && 3355 HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) { 3356 dummy.methods |= 3357 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT); 3358 } 3359 3360 chain = hammer2_chain_alloc(hmp, pmp, &dummy); 3361 3362 /* 3363 * Lock the chain manually, chain_lock will load the chain 3364 * which we do NOT want to do. (note: chain->refs is set 3365 * to 1 by chain_alloc() for us, but lockcnt is not). 3366 */ 3367 chain->lockcnt = 1; 3368 hammer2_mtx_ex(&chain->lock); 3369 allocated = 1; 3370 3371 /* 3372 * Set INITIAL to optimize I/O. The flag will generally be 3373 * processed when we call hammer2_chain_modify(). 3374 */ 3375 switch(type) { 3376 case HAMMER2_BREF_TYPE_VOLUME: 3377 case HAMMER2_BREF_TYPE_FREEMAP: 3378 panic("hammer2_chain_create: called with volume type"); 3379 break; 3380 case HAMMER2_BREF_TYPE_INDIRECT: 3381 panic("hammer2_chain_create: cannot be used to" 3382 "create indirect block"); 3383 break; 3384 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3385 panic("hammer2_chain_create: cannot be used to" 3386 "create freemap root or node"); 3387 break; 3388 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3389 KKASSERT(bytes == sizeof(chain->data->bmdata)); 3390 /* fall through */ 3391 case HAMMER2_BREF_TYPE_DIRENT: 3392 case HAMMER2_BREF_TYPE_INODE: 3393 case HAMMER2_BREF_TYPE_DATA: 3394 default: 3395 /* 3396 * leave chain->data NULL, set INITIAL 3397 */ 3398 KKASSERT(chain->data == NULL); 3399 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 3400 break; 3401 } 3402 } else { 3403 /* 3404 * We are reattaching a previously deleted chain, possibly 3405 * under a new parent and possibly with a new key/keybits. 3406 * The chain does not have to be in a modified state. The 3407 * UPDATE flag will be set later on in this routine. 3408 * 3409 * Do NOT mess with the current state of the INITIAL flag. 
3410 */ 3411 chain->bref.key = key; 3412 chain->bref.keybits = keybits; 3413 if (chain->flags & HAMMER2_CHAIN_DELETED) 3414 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3415 KKASSERT(chain->parent == NULL); 3416 } 3417 3418 /* 3419 * Set the appropriate bref flag if requested. 3420 * 3421 * NOTE! Callers can call this function to move chains without 3422 * knowing about special flags, so don't clear bref flags 3423 * here! 3424 */ 3425 if (flags & HAMMER2_INSERT_PFSROOT) 3426 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 3427 3428 if (parent == NULL) 3429 goto skip; 3430 3431 /* 3432 * Calculate how many entries we have in the blockref array and 3433 * determine if an indirect block is required when inserting into 3434 * the parent. 3435 */ 3436 again: 3437 if (--maxloops == 0) 3438 panic("hammer2_chain_create: maxloops"); 3439 3440 switch(parent->bref.type) { 3441 case HAMMER2_BREF_TYPE_INODE: 3442 if ((parent->data->ipdata.meta.op_flags & 3443 HAMMER2_OPFLAG_DIRECTDATA) != 0) { 3444 kprintf("hammer2: parent set for direct-data! 
" 3445 "pkey=%016jx ckey=%016jx\n", 3446 parent->bref.key, 3447 chain->bref.key); 3448 } 3449 KKASSERT((parent->data->ipdata.meta.op_flags & 3450 HAMMER2_OPFLAG_DIRECTDATA) == 0); 3451 KKASSERT(parent->data != NULL); 3452 base = &parent->data->ipdata.u.blockset.blockref[0]; 3453 count = HAMMER2_SET_COUNT; 3454 break; 3455 case HAMMER2_BREF_TYPE_INDIRECT: 3456 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3457 if (parent->flags & HAMMER2_CHAIN_INITIAL) 3458 base = NULL; 3459 else 3460 base = &parent->data->npdata[0]; 3461 count = parent->bytes / sizeof(hammer2_blockref_t); 3462 break; 3463 case HAMMER2_BREF_TYPE_VOLUME: 3464 KKASSERT(parent->data != NULL); 3465 base = &parent->data->voldata.sroot_blockset.blockref[0]; 3466 count = HAMMER2_SET_COUNT; 3467 break; 3468 case HAMMER2_BREF_TYPE_FREEMAP: 3469 KKASSERT(parent->data != NULL); 3470 base = &parent->data->blkset.blockref[0]; 3471 count = HAMMER2_SET_COUNT; 3472 break; 3473 default: 3474 panic("hammer2_chain_create: unrecognized blockref type: %d", 3475 parent->bref.type); 3476 base = NULL; 3477 count = 0; 3478 break; 3479 } 3480 3481 /* 3482 * Make sure we've counted the brefs 3483 */ 3484 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 3485 hammer2_chain_countbrefs(parent, base, count); 3486 3487 KASSERT(parent->core.live_count >= 0 && 3488 parent->core.live_count <= count, 3489 ("bad live_count %d/%d (%02x, %d)", 3490 parent->core.live_count, count, 3491 parent->bref.type, parent->bytes)); 3492 3493 /* 3494 * If no free blockref could be found we must create an indirect 3495 * block and move a number of blockrefs into it. With the parent 3496 * locked we can safely lock each child in order to delete+duplicate 3497 * it without causing a deadlock. 3498 * 3499 * This may return the new indirect block or the old parent depending 3500 * on where the key falls. NULL is returned on error. 
3501 */ 3502 if (parent->core.live_count == count) { 3503 hammer2_chain_t *nparent; 3504 3505 KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0); 3506 3507 nparent = hammer2_chain_create_indirect(parent, key, keybits, 3508 mtid, type, &error); 3509 if (nparent == NULL) { 3510 if (allocated) 3511 hammer2_chain_drop(chain); 3512 chain = NULL; 3513 goto done; 3514 } 3515 if (parent != nparent) { 3516 hammer2_chain_unlock(parent); 3517 hammer2_chain_drop(parent); 3518 parent = *parentp = nparent; 3519 } 3520 goto again; 3521 } 3522 3523 /* 3524 * fall through if parent, or skip to here if no parent. 3525 */ 3526 skip: 3527 if (chain->flags & HAMMER2_CHAIN_DELETED) 3528 kprintf("Inserting deleted chain @%016jx\n", 3529 chain->bref.key); 3530 3531 /* 3532 * Link the chain into its parent. 3533 */ 3534 if (chain->parent != NULL) 3535 panic("hammer2: hammer2_chain_create: chain already connected"); 3536 KKASSERT(chain->parent == NULL); 3537 if (parent) { 3538 KKASSERT(parent->core.live_count < count); 3539 hammer2_chain_insert(parent, chain, 3540 HAMMER2_CHAIN_INSERT_SPIN | 3541 HAMMER2_CHAIN_INSERT_LIVE, 3542 0); 3543 } 3544 3545 if (allocated) { 3546 /* 3547 * Mark the newly created chain modified. This will cause 3548 * UPDATE to be set and process the INITIAL flag. 3549 * 3550 * Device buffers are not instantiated for DATA elements 3551 * as these are handled by logical buffers. 3552 * 3553 * Indirect and freemap node indirect blocks are handled 3554 * by hammer2_chain_create_indirect() and not by this 3555 * function. 3556 * 3557 * Data for all other bref types is expected to be 3558 * instantiated (INODE, LEAF). 
3559 */ 3560 switch(chain->bref.type) { 3561 case HAMMER2_BREF_TYPE_DATA: 3562 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3563 case HAMMER2_BREF_TYPE_DIRENT: 3564 case HAMMER2_BREF_TYPE_INODE: 3565 error = hammer2_chain_modify(chain, mtid, dedup_off, 3566 HAMMER2_MODIFY_OPTDATA); 3567 break; 3568 default: 3569 /* 3570 * Remaining types are not supported by this function. 3571 * In particular, INDIRECT and LEAF_NODE types are 3572 * handled by create_indirect(). 3573 */ 3574 panic("hammer2_chain_create: bad type: %d", 3575 chain->bref.type); 3576 /* NOT REACHED */ 3577 break; 3578 } 3579 } else { 3580 /* 3581 * When reconnecting a chain we must set UPDATE and 3582 * setflush so the flush recognizes that it must update 3583 * the bref in the parent. 3584 */ 3585 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) 3586 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3587 } 3588 3589 /* 3590 * We must setflush(parent) to ensure that it recurses through to 3591 * chain. setflush(chain) might not work because ONFLUSH is possibly 3592 * already set in the chain (so it won't recurse up to set it in the 3593 * parent). 3594 */ 3595 if (parent) 3596 hammer2_chain_setflush(parent); 3597 3598 done: 3599 *chainp = chain; 3600 3601 return (error); 3602 } 3603 3604 /* 3605 * Move the chain from its old parent to a new parent. The chain must have 3606 * already been deleted or already disconnected (or never associated) with 3607 * a parent. The chain is reassociated with the new parent and the deleted 3608 * flag will be cleared (no longer deleted). The chain's modification state 3609 * is not altered. 3610 * 3611 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 3612 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 3613 * FULL. This typically means that the caller is creating the chain after 3614 * doing a hammer2_chain_lookup(). 3615 * 3616 * Neither (parent) or (chain) can be errored. 
 *
 * If (parent) is non-NULL then the chain is inserted under the parent.
 *
 * If (parent) is NULL then the newly duplicated chain is not inserted
 * anywhere, similar to if it had just been chain_alloc()'d (suitable for
 * passing into hammer2_chain_create() after this function returns).
 *
 * WARNING! This function calls create which means it can insert indirect
 *	    blocks.  This can cause other unrelated chains in the parent to
 *	    be moved to a newly inserted indirect block in addition to the
 *	    specific chain.
 */
void
hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *parent;

	/*
	 * WARNING! We should never resolve DATA to device buffers
	 *	    (XXX allow it if the caller did?), and since
	 *	    we currently do not have the logical buffer cache
	 *	    buffer in-hand to fix its cached physical offset
	 *	    we also force the modify code to not COW it. XXX
	 *
	 * NOTE! We allow error'd chains to be renamed.  The bref itself
	 *	 is good and can be renamed.  The content, however, may
	 *	 be inaccessible.
	 */
	KKASSERT(chain->parent == NULL);
	/*KKASSERT(chain->error == 0); allow */
	bref = &chain->bref;

	/*
	 * If parent is not NULL the duplicated chain will be entered under
	 * the parent and the UPDATE bit set to tell flush to update
	 * the blockref.
	 *
	 * We must setflush(parent) to ensure that it recurses through to
	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
	 * already set in the chain (so it won't recurse up to set it in the
	 * parent).
	 *
	 * Having both chains locked is extremely important for atomicity.
	 */
	if (parentp && (parent = *parentp) != NULL) {
		KKASSERT(hammer2_mtx_owned(&parent->lock));
		KKASSERT(parent->refs > 0);
		KKASSERT(parent->error == 0);

		/* re-insert under (*parentp) using the chain's existing bref */
		hammer2_chain_create(parentp, &chain, NULL, chain->pmp,
				     HAMMER2_METH_DEFAULT,
				     bref->key, bref->keybits, bref->type,
				     chain->bytes, mtid, 0, flags);
		KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
		hammer2_chain_setflush(*parentp);
	}
}

/*
 * This works in tandem with delete_obref() to install a blockref in
 * (typically) an indirect block that is associated with the chain being
 * moved to *parentp.
 *
 * The reason we need this function is that the caller needs to maintain
 * the blockref as it was, and not generate a new blockref for what might
 * be a modified chain.  Otherwise stuff will leak into the flush that
 * the flush code's FLUSH_INODE_STOP flag is unable to catch.
 *
 * It is EXTREMELY important that we properly set CHAIN_BLKMAPUPD and
 * CHAIN_UPDATE.  We must set BLKMAPUPD if the bref does not match, and
 * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if
 * it does.  Otherwise we can end up in a situation where H2 is unable to
 * clean up the in-memory chain topology.
 *
 * The reason for this is that flushes do not generally flush through
 * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq
 * or sideq to properly flush and dispose of the related inode chain's flags.
 * Situations where the inode is not actually modified by the frontend,
 * but where we have to move the related chains around as we insert or cleanup
 * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory
 * inode chain that does not have a hammer2_inode_t associated with it.
 */
static void
hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain,
			   hammer2_tid_t mtid, int flags,
			   hammer2_blockref_t *obref)
{
	hammer2_chain_rename(parentp, chain, mtid, flags);

	if (obref->type != HAMMER2_BREF_TYPE_EMPTY) {
		hammer2_blockref_t *tbase;
		int tcount;

		KKASSERT((chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0);
		hammer2_chain_modify(*parentp, mtid, 0, 0);
		tbase = hammer2_chain_base_and_count(*parentp, &tcount);
		/* install the preserved (old) blockref, not a fresh one */
		hammer2_base_insert(*parentp, tbase, tcount, chain, obref);
		if (bcmp(obref, &chain->bref, sizeof(chain->bref))) {
			/* bref differs from media copy, flush must update it */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD |
						      HAMMER2_CHAIN_UPDATE);
		} else {
			/* bref matches, no flush-side update needed */
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
	}
}

/*
 * Helper function for deleting chains.
 *
 * The chain is removed from the live view (the RBTREE) as well as the parent's
 * blockmap.  Both chain and its parent must be locked.
 *
 * parent may not be errored.  chain can be errored.
 */
static int
_hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
			     hammer2_tid_t mtid, int flags,
			     hammer2_blockref_t *obref)
{
	int error = 0;

	KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
	KKASSERT(chain->parent == parent);

	if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) {
		/*
		 * Chain is blockmapped, so there must be a parent.
		 * Atomically remove the chain from the parent and remove
		 * the blockmap entry.  The parent must be set modified
		 * to remove the blockmap entry.
		 */
		hammer2_blockref_t *base;
		int count;

		KKASSERT(parent != NULL);
		KKASSERT(parent->error == 0);
		KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
		error = hammer2_chain_modify(parent, mtid, 0, 0);
		if (error)
			goto done;

		/*
		 * Calculate blockmap pointer
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);

		/* detach chain from the parent's live RB tree */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there
			 * is no block array if the inode is flagged
			 * DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base =
				   &parent->data->ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &parent->data->voldata.
					sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->blkset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("_hammer2_chain_delete_helper: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			break;
		}

		/*
		 * delete blockmapped chain from its parent.
		 *
		 * The parent is not affected by any statistics in chain
		 * which are pending synchronization.  That is, there is
		 * nothing to undo in the parent since they have not yet
		 * been incorporated into the parent.
		 *
		 * The parent is affected by statistics stored in inodes.
		 * Those have already been synchronized, so they must be
		 * undone.  XXX split update possible w/delete in middle?
		 */
		if (base) {
			hammer2_base_delete(parent, base, count, chain, obref);
		}
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		/*
		 * Chain is not blockmapped but a parent is present.
		 * Atomically remove the chain from the parent.  There is
		 * no blockmap entry to remove.
		 *
		 * Because chain was associated with a parent but not
		 * synchronized, the chain's *_count_up fields contain
		 * inode adjustment statistics which must be undone.
		 */
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else {
		/*
		 * Chain is not blockmapped and has no parent.  This
		 * is a degenerate case.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	}
done:
	return error;
}

/*
 * Create an indirect block that covers one or more of the elements in the
 * current parent.  Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow four rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.
This
 * is accomplished by always recursing through matching keyspaces in
 * the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
 *
 * NOTE: *errorp set to HAMMER_ERROR_* flags
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      hammer2_tid_t mtid, int for_type, int *errorp)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_blockref_t dummy;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int ncount;
	int nbytes;
	int loops;
	int error;
	int reason;
	int generation;
	int maxloops = 300000;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	hmp = parent->hmp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));

	/*
	 * Pre-modify the parent now to avoid having to deal with error
	 * processing if we tried to later (in the middle of our loop).
	 *
	 * We are going to be moving bref's around, the indirect blocks
	 * cannot be in an initial state.  Do not pass MODIFY_OPTDATA.
	 */
	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_chain_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		return NULL;
	}
	KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
	base = hammer2_chain_base_and_count(parent, &count);

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent for splits to work properly.
	 *
	 * The freemap uses a specific indirect block size.  The number of
	 * levels are built dynamically and ultimately depend on the size
	 * volume.  Because freemap blocks are taken from the reserved areas
	 * of the volume our goal is efficiency (fewer levels) and not so
	 * much to save disk space.
	 *
	 * The first indirect block level for a directory usually uses
	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
	 * the hash mechanism, this typically gives us a nominal
	 * 32 * 4 entries with one level of indirection.
	 *
	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
	 * indirect blocks.  The initial 4 entries in the inode gives us
	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
	 * of indirection gives us 137GB, and so forth.  H2 can support
	 * huge file sizes but they are not typical, so we try to stick
	 * with compactness and do not use a larger indirect block size.
	 *
	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
	 * due to the way indirect blocks are created this usually winds
	 * up being extremely inefficient for small files.  Even though
	 * 16KB requires more levels of indirection for very large files,
	 * the 16KB records can be ganged together into 64KB DIOs.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		if (parent->data->ipdata.meta.type ==
		    HAMMER2_OBJTYPE_DIRECTORY)
			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
		else
			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */

	} else {
		nbytes = HAMMER2_IND_BYTES_NOM;
	}
	/*
	 * The new indirect block can never be smaller than the parent's
	 * blockref array (splits would not work); freemap sizes are fixed.
	 */
	if (nbytes < count * sizeof(hammer2_blockref_t)) {
		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
		nbytes = count * sizeof(hammer2_blockref_t);
	}
	ncount = nbytes / sizeof(hammer2_blockref_t);

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	switch(for_type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
						    base, count, ncount);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
						   base, count, ncount);
		break;
	default:
		panic("illegal indirect block for bref type %d", for_type);
		break;
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * Ok, create our new indirect block
	 */
	bzero(&dummy, sizeof(dummy));
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.key = key;
	dummy.keybits = keybits;
	dummy.data_off = hammer2_getradix(nbytes);
	/* inherit the parent's check method, never compress indirect data */
	dummy.methods =
		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	/* ichain has one ref at this point */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 *
	 * XXX remove OPTDATA, we need a fully initialized indirect block to
	 * be able to move the original blockref.
	 */
	*errorp = hammer2_chain_modify(ichain, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_chain_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
		return NULL;
	}
	KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	key_next = 0;	/* avoid gcc warnings */
	hammer2_spin_ex(&parent->core.spin);
	loops = 0;
	reason = 0;

	for (;;) {
		/*
		 * Parent may have been modified, relocating its block array.
		 * Reload the base pointer.
		 */
		base = hammer2_chain_base_and_count(parent, &count);

		if (++loops > 100000) {
		    hammer2_spin_unex(&parent->core.spin);
		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
			  reason, parent, base, count, key_next);
		}

		/*
		 * NOTE: spinlock stays intact, returned chain (if not NULL)
		 *	 is not referenced or locked which means that we
		 *	 cannot safely check its flagged / deletion status
		 *	 until we lock it.
		 */
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);
		generation = parent->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if (rounddown2(key ^ bref->key, (hammer2_key_t)1 << keybits) != 0) {
			goto next_key_spinlocked;
		}

		/*
		 * Load the new indirect block by acquiring the related
		 * chains (potentially from media as it might not be
		 * in-memory).  Then move it to the new parent (ichain).
		 *
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		bsave = *bref;	/* snapshot bref; base may be invalidated */
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			hammer2_spin_unex(&parent->core.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, &bsave,
						  HAMMER2_RESOLVE_NEVER);
			if (chain == NULL) {
				reason = 1;
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
		}

		/*
		 * This is always live so if the chain has been deleted
		 * we raced someone and we have to retry.
		 *
		 * NOTE: Lookups can race delete-duplicate because
		 *	 delete-duplicate does not lock the parent's core
		 *	 (they just use the spinlock on the core).
		 *
		 *	 (note reversed logic for this one)
		 */
		if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
		    chain->parent != parent ||
		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			if (hammer2_debug & 0x0040) {
				kprintf("LOST PARENT RETRY "
					"RETRY (%p,%p)->%p %08x\n",
					parent, chain->parent, chain, chain->flags);
			}
			hammer2_spin_ex(&parent->core.spin);
			continue;
		}

		/*
		 * Shift the chain to the indirect block.
		 *
		 * WARNING! No reason for us to load chain data, pass NOSTATS
		 *	    to prevent delete/insert from trying to access
		 *	    inode stats (and thus asserting if there is no
		 *	    chain->data loaded).
		 *
		 * WARNING! The (parent, chain) deletion may modify the parent
		 *	    and invalidate the base pointer.
		 *
		 * WARNING! Parent must already be marked modified, so we
		 *	    can assume that chain_delete always succeeds.
		 *
		 * WARNING! hammer2_chain_repchange() does not have to be
		 *	    called (and doesn't work anyway because we are
		 *	    only doing a partial shift).  A recursion that is
		 *	    in-progress can continue at the current parent
		 *	    and will be able to properly find its next key.
		 */
		error = hammer2_chain_delete_obref(parent, chain, mtid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bsave);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		base = NULL;	/* safety */
		hammer2_spin_ex(&parent->core.spin);
next_key_spinlocked:
		if (--maxloops == 0)
			panic("hammer2_chain_create_indirect: maxloops");
		reason = 4;
		if (key_next == 0 || key_next > key_end)
			break;
		key_beg = key_next;
		/* loop */
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set UPDATE here because we mark ichain
	 * modified down below (so the normal modified -> flush -> set-moved
	 * sequence applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	base = NULL;	/* safety, parent modify may change address */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, ichain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	/*
	 * Make sure flushes propagate after our manual insertion.
	 */
	hammer2_chain_setflush(ichain);
	hammer2_chain_setflush(parent);

	/*
	 * Figure out what to return.
	 */
	if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits) != 0) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
	} else {
		/*
		 * Otherwise its in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}

/*
 * Do maintenance on an indirect chain.  Both parent and chain are locked.
 *
 * Returns non-zero if (chain) is deleted, either due to being empty or
 * because its children were safely moved into the parent.
 */
int
hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
				   hammer2_chain_t *chain)
{
	hammer2_blockref_t *chain_base;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t key_next;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_chain_t *sub;
	int chain_count;
	int count;
	int error;
	int generation;

	/*
	 * Make sure we have an accurate live_count
	 */
	if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		hammer2_chain_countbrefs(chain, base, count);
	}

	/*
	 * If the indirect block is empty we can delete it.
	 * (ignore deletion error)
	 */
	if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
		hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
		hammer2_chain_repchange(parent, chain);
		return 1;
	}

	base = hammer2_chain_base_and_count(parent, &count);

	if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
			      HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		hammer2_chain_countbrefs(parent, base, count);
	}

	/*
	 * Determine if we can collapse chain into parent, calculate
	 * hysteresis for chain emptiness.
	 */
	if (parent->core.live_count + chain->core.live_count - 1 > count)
		return 0;
	chain_count = chain->bytes / sizeof(hammer2_blockref_t);
	if (chain->core.live_count > chain_count * 3 / 4)
		return 0;

	/*
	 * Ok, theoretically we can collapse chain's contents into
	 * parent.  chain is locked, but any in-memory children of chain
	 * are not.  For this to work, we must be able to dispose of any
	 * in-memory children of chain.
	 *
	 * For now require that there are no in-memory children of chain.
	 *
	 * WARNING! Both chain and parent must remain locked across this
	 *	    entire operation.
	 */

	/*
	 * Parent must be marked modified.  Don't try to collapse it if we
	 * can't mark it modified.  Once modified, destroy chain to make room
	 * and to get rid of what will be a conflicting key (this is included
	 * in the calculation above).  Finally, move the children of chain
	 * into chain's parent.
	 *
	 * This order creates an accounting problem for bref.embed.stats
	 * because we destroy chain before we remove its children.  Any
	 * elements whose blockref is already synchronized will be counted
	 * twice.  To deal with the problem we clean out chain's stats prior
	 * to deleting it.
	 */
	error = hammer2_chain_modify(parent, 0, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}
	error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}

	/* zero stats to avoid double-counting already-synchronized brefs */
	chain->bref.embed.stats.inode_count = 0;
	chain->bref.embed.stats.data_count = 0;
	error = hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
	KKASSERT(error == 0);

	/*
	 * The combined_find call requires core.spin to be held.  One would
	 * think there wouldn't be any conflicts since we hold chain
	 * exclusively locked, but the caching mechanism for 0-ref children
	 * does not require a chain lock.
	 */
	hammer2_spin_ex(&chain->core.spin);

	key_next = 0;
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	for (;;) {
		chain_base = &chain->data->npdata[0];
		chain_count = chain->bytes / sizeof(hammer2_blockref_t);
		sub = hammer2_combined_find(chain, chain_base, chain_count,
					    &key_next,
					    key_beg, key_end,
					    &bref);
		generation = chain->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		bsave = *bref;	/* snapshot bref before dropping the spinlock */
		if (sub) {
			hammer2_chain_ref(sub);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
		} else {
			hammer2_spin_unex(&chain->core.spin);
			sub = hammer2_chain_get(chain, generation, &bsave,
						HAMMER2_RESOLVE_NEVER);
			if (sub == NULL) {
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
		}
		/* raced a delete or rename while unlocked, retry this key */
		if (bcmp(&bsave, &sub->bref, sizeof(bsave)) ||
		    sub->parent != chain ||
		    (sub->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(sub);
			hammer2_chain_drop(sub);
			hammer2_spin_ex(&chain->core.spin);
			sub = NULL;	/* safety */
			continue;
		}
		error = hammer2_chain_delete_obref(chain, sub,
						   sub->bref.modify_tid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&parent, sub,
					   sub->bref.modify_tid,
					   HAMMER2_INSERT_SAMEPARENT, &bsave);
		hammer2_chain_unlock(sub);
		hammer2_chain_drop(sub);
		hammer2_spin_ex(&chain->core.spin);

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&chain->core.spin);

	hammer2_chain_repchange(parent, chain);

	return 1;
}

/*
 * Freemap indirect blocks
 *
 * Calculate the keybits and highside/lowside of the freemap node the
 * caller is creating.
 *
 * This routine will specify the next higher-level freemap key/radix
 * representing the lowest-ordered set.  By doing so, eventually all
 * low-ordered sets will be moved one level down.
 *
 * We have to be careful here because the freemap reserves a limited
 * number of blocks for a limited number of levels.  So we can't just
 * push indiscriminately.
 */
int
hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
			     int keybits, hammer2_blockref_t *base, int count)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int maxloops = 300000;

	key = *keyp;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Track the smallest (key, keybits) seen: smallest radix
		 * first, then smallest key within that radix.
		 */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Return the keybits for a higher-level FREEMAP_NODE covering
	 * this node.
	 */
	switch(keybits) {
	case HAMMER2_FREEMAP_LEVEL0_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL1_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL2_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL3_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL4_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL5_RADIX:
		panic("hammer2_chain_indkey_freemap: level too high");
		break;
	default:
		panic("hammer2_chain_indkey_freemap: bad radix");
		break;
	}
	*keyp = key;

	return (keybits);
}

/*
 * File indirect blocks
 *
 * Calculate the key/keybits for the indirect block to create by scanning
 * existing keys.  The key being created is also passed in *keyp and can be
 * inside or outside the indirect block.  Regardless, the indirect block
 * must hold at least two keys in order to guarantee sufficient space.
 *
 * We use a modified version of the freemap's fixed radix tree, but tailored
 * for file data.  Basically we configure an indirect block encompassing the
 * smallest key.
 */
static int
hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
			  int keybits, hammer2_blockref_t *base, int count,
			  int ncount)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int nradix;
	int maxloops = 300000;

	key = *keyp;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 *
	 * Locate the smallest key.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Track the smallest (key, keybits) seen: smallest radix
		 * first, then smallest key within that radix.
		 */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Calculate the static keybits for a higher-level indirect block
	 * that contains the key.
	 */
	*keyp = key;

	switch(ncount) {
	case HAMMER2_IND_COUNT_MIN:
		nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_COUNT_NOM:
		nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_COUNT_MAX:
		nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
		break;
	default:
		panic("bad ncount %d\n", ncount);
		nradix = 0;	/* NOT REACHED */
		break;
	}

	/*
	 * The largest radix that can be returned for an indirect block is
	 * 63 bits.  (The largest practical indirect block radix is actually
	 * 62 bits because the top-level inode or volume root contains four
	 * entries, but allow 63 to be returned).
	 */
	if (nradix >= 64)
		nradix = 63;

	return keybits + nradix;
}

#if 1

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * NOTE: We can't take a shortcut here anymore for inodes because
	 *	 the root directory can contain a mix of inodes and directory
	 *	 entries (we used to just return 63 if parent->bref.type was
	 *	 HAMMER2_BREF_TYPE_INODE.
	 */
	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       rounddown2(key ^ bref->key, (hammer2_key_t)1 << nkeybits) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	/* pick the half with the most elements */
	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#else

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.
 * We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).
We want 5019 * everything to be inside in these cases so shift it all to 5020 * the low or high side depending on the new high bit. 5021 */ 5022 while (((hammer2_key_t)1 << keybits) < ncount) { 5023 ++keybits; 5024 if (key & ((hammer2_key_t)1 << keybits)) { 5025 hicount += locount; 5026 locount = 0; 5027 } else { 5028 locount += hicount; 5029 hicount = 0; 5030 } 5031 } 5032 5033 if (hicount > locount) 5034 key |= (hammer2_key_t)1 << keybits; 5035 else 5036 key &= ~(hammer2_key_t)1 << keybits; 5037 5038 *keyp = key; 5039 5040 return (keybits); 5041 } 5042 5043 #endif 5044 5045 /* 5046 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if 5047 * it exists. 5048 * 5049 * Both parent and chain must be locked exclusively. 5050 * 5051 * This function will modify the parent if the blockref requires removal 5052 * from the parent's block table. 5053 * 5054 * This function is NOT recursive. Any entity already pushed into the 5055 * chain (such as an inode) may still need visibility into its contents, 5056 * as well as the ability to read and modify the contents. For example, 5057 * for an unlinked file which is still open. 5058 * 5059 * Also note that the flusher is responsible for cleaning up empty 5060 * indirect blocks. 5061 */ 5062 int 5063 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain, 5064 hammer2_tid_t mtid, int flags) 5065 { 5066 int error = 0; 5067 5068 KKASSERT(hammer2_mtx_owned(&chain->lock)); 5069 5070 /* 5071 * Nothing to do if already marked. 5072 * 5073 * We need the spinlock on the core whos RBTREE contains chain 5074 * to protect against races. 5075 */ 5076 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 5077 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 5078 chain->parent == parent); 5079 error = _hammer2_chain_delete_helper(parent, chain, 5080 mtid, flags, NULL); 5081 } 5082 5083 /* 5084 * Permanent deletions mark the chain as destroyed. 
5085 * 5086 * NOTE: We do not setflush the chain unless the deletion is 5087 * permanent, since the deletion of a chain does not actually 5088 * require it to be flushed. 5089 */ 5090 if (error == 0) { 5091 if (flags & HAMMER2_DELETE_PERMANENT) { 5092 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5093 hammer2_chain_setflush(chain); 5094 } 5095 } 5096 5097 return error; 5098 } 5099 5100 static int 5101 hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain, 5102 hammer2_tid_t mtid, int flags, 5103 hammer2_blockref_t *obref) 5104 { 5105 int error = 0; 5106 5107 KKASSERT(hammer2_mtx_owned(&chain->lock)); 5108 5109 /* 5110 * Nothing to do if already marked. 5111 * 5112 * We need the spinlock on the core whos RBTREE contains chain 5113 * to protect against races. 5114 */ 5115 obref->type = HAMMER2_BREF_TYPE_EMPTY; 5116 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 5117 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 5118 chain->parent == parent); 5119 error = _hammer2_chain_delete_helper(parent, chain, 5120 mtid, flags, obref); 5121 } 5122 5123 /* 5124 * Permanent deletions mark the chain as destroyed. 5125 * 5126 * NOTE: We do not setflush the chain unless the deletion is 5127 * permanent, since the deletion of a chain does not actually 5128 * require it to be flushed. 5129 */ 5130 if (error == 0) { 5131 if (flags & HAMMER2_DELETE_PERMANENT) { 5132 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5133 hammer2_chain_setflush(chain); 5134 } 5135 } 5136 5137 return error; 5138 } 5139 5140 /* 5141 * Returns the index of the nearest element in the blockref array >= elm. 5142 * Returns (count) if no element could be found. 5143 * 5144 * Sets *key_nextp to the next key for loop purposes but does not modify 5145 * it if the next key would be higher than the current value of *key_nextp. 5146 * Note that *key_nexp can overflow to 0, which should be tested by the 5147 * caller. 5148 * 5149 * WARNING! 
 *	    Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Live chains are required to already have their brefs counted
	 * (COUNTEDBREFS set, which maintains core.live_zero) so we can
	 * optimize operations below.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;	/* first index past the last live bref */
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards until we land on or before a bref whose key
	 * does not exceed key_beg.
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	       scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != HAMMER2_BREF_TYPE_EMPTY) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			/*
			 * Clip *key_nextp to the end of the found element
			 * (may wrap to 0 at the top of the keyspace — the
			 * caller must test for that).
			 */
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *brefp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * When no in-memory chain has been found and a non-NULL bref is returned
 * in *brefp, the match came from the media blockref array.
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.  Iterate if the base find
 * is chosen but matches a deleted chain.
 *
 * WARNING! Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **brefp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*brefp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*brefp = NULL;
		chain = NULL;
	} else {
		*brefp = bref;
	}
	return(chain);
}

/*
 * Locate the specified block array element and delete it.  The element
 * must exist.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain,
		    hammer2_blockref_t *obref)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BLKMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update stats and zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
			(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		--parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			/* saturated; exact count lost, flag for recount */
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (scan->type != HAMMER2_BREF_TYPE_DATA) {
			/* subtract the subtree's aggregated stats */
			parent->bref.embed.stats.data_count -=
				scan->embed.stats.data_count;
			parent->bref.embed.stats.inode_count -=
				scan->embed.stats.inode_count;
		}
		if (scan->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count <= scan->leaf_count)
				parent->bref.leaf_count = 0;
			else
				parent->bref.leaf_count -= scan->leaf_count;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through into default's break (no further action) */
	default:
		break;
	}

	/* return a copy of the removed bref before wiping the slot */
	if (obref)
		*obref = *scan;
	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * Pull live_zero back over any trailing EMPTY slots.
	 */
	if (parent->core.live_zero == i + 1) {
		while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED |
					HAMMER2_CHAIN_BLKMAPUPD);
}

/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;	/* which insertion path was taken (debug aid) */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED);

	/*
	 * Update stats (mirror image of the adjustments in
	 * hammer2_base_delete()).
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
			(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		/* saturate leaf_count at the representable maximum */
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}


	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * Fast path: append at live_zero when the insertion point is at
	 * the end and a free trailing slot exists.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after, shifting the
	 * intervening elements to make room at index i.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) {
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) {
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging: verify the array is still sorted with no key overlaps.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}

#if 0

/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
 */
typedef hammer2_blockref_t *hammer2_blockref_p;

/*
 * kqsort comparator: EMPTY brefs sort to the end of the array,
 * otherwise ascending by key.
 */
static
int
hammer2_base_sort_callback(const void *v1, const void *v2)
{
	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;

	/*
	 * Make sure empty elements are placed at the end of the array
	 */
	if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) {
		if (bref2->type == HAMMER2_BREF_TYPE_EMPTY)
			return(0);
		return(1);
	} else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) {
		return(-1);
	}

	/*
	 * Sort by key
	 */
	if (bref1->key < bref2->key)
		return(-1);
	if (bref1->key > bref2->key)
		return(1);
	return(0);
}

void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;
	int count;

	/* locate the embedded blockref array for this bref type */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (chain->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			return;
		}
		base = &chain->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &chain->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &chain->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_base_sort: unrecognized "
		      "blockref(A) type: %d",
		      chain->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}
	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
}

#endif

/*
 * Set the check data for a chain.  This can be a heavy-weight operation
 * and typically only runs on-flush.  For file data check data is calculated
 * when the logical buffers are flushed.
 */
void
hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
{
	/* the chain now has a valid check code */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		break;
	case HAMMER2_CHECK_DISABLED:
		break;
	case HAMMER2_CHECK_ISCSI32:
		chain->bref.check.iscsi32.value =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	case HAMMER2_CHECK_XXHASH64:
		chain->bref.check.xxhash64.value =
			XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		break;
	case HAMMER2_CHECK_SHA192:
		/* not wired up in this port; kernel version is kept below */
		assert(0); /* XXX unsupported */
		/*
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			bcopy(u.digest,
			      chain->bref.check.sha192.data,
			      sizeof(chain->bref.check.sha192.data));
		}
		*/
		break;
	case HAMMER2_CHECK_FREEMAP:
		chain->bref.check.freemap.icrc32 =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	default:
		kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
			chain->bref.methods);
		break;
	}
}

/*
 * Characterize a failed check code and try to trace back to the inode.
 */
static void
hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check,
				  int bits)
{
	hammer2_chain_t *lchain;
	hammer2_chain_t *ochain;
	int did;

	/* rate-limited so degenerate failures do not spam the console */
	did = krateprintf(&krate_h2chk,
			  "chain %016jx.%02x (%s) meth=%02x CHECK FAIL "
			  "(flags=%08x, bref/data ",
			  chain->bref.data_off,
			  chain->bref.type,
			  hammer2_bref_type_str(chain->bref.type),
			  chain->bref.methods,
			  chain->flags);
	if (did == 0)
		return;

	if (bits == 32) {
		kprintf("%08x/%08x)\n",
			chain->bref.check.iscsi32.value,
			(uint32_t)check);
	} else {
		kprintf("%016jx/%016jx)\n",
			chain->bref.check.xxhash64.value,
			check);
	}

	/*
	 * Run up the chains to try to find the governing inode so we
	 * can report it.
	 *
	 * XXX This error reporting is not really MPSAFE
	 */
	ochain = chain;
	lchain = chain;
	while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
		lchain = chain;
		chain = chain->parent;
	}

	if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
	    ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 ||
	     (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) {
		/*
		 * NOTE(review): bref.key is a 64-bit key; "%ld" assumes an
		 * LP64 target — consider %jd with an (intmax_t) cast.
		 */
		kprintf(" Resides at/in inode %ld\n",
			chain->bref.key);
	} else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		kprintf(" Resides in inode index - CRITICAL!!!\n");
	} else {
		kprintf(" Resides in root index - CRITICAL!!!\n");
	}
	if (ochain->hmp) {
		const char *pfsname = "UNKNOWN";
		int i;

		/* map the device back to a PFS name for the report */
		if (ochain->pmp) {
			for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
				if (ochain->pmp->pfs_hmps[i] == ochain->hmp &&
				    ochain->pmp->pfs_names[i]) {
					pfsname = ochain->pmp->pfs_names[i];
					break;
				}
			}
		}
		kprintf(" In pfs %s on device %s\n",
			pfsname, ochain->hmp->devrepname);
	}
}

/*
 * Returns non-zero on success, 0 on failure.
5825 */ 5826 int 5827 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 5828 { 5829 uint32_t check32; 5830 uint64_t check64; 5831 int r; 5832 5833 if (chain->flags & HAMMER2_CHAIN_NOTTESTED) 5834 return 1; 5835 5836 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5837 case HAMMER2_CHECK_NONE: 5838 r = 1; 5839 break; 5840 case HAMMER2_CHECK_DISABLED: 5841 r = 1; 5842 break; 5843 case HAMMER2_CHECK_ISCSI32: 5844 check32 = hammer2_icrc32(bdata, chain->bytes); 5845 r = (chain->bref.check.iscsi32.value == check32); 5846 if (r == 0) { 5847 hammer2_characterize_failed_chain(chain, check32, 32); 5848 } 5849 hammer2_process_icrc32 += chain->bytes; 5850 break; 5851 case HAMMER2_CHECK_XXHASH64: 5852 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5853 r = (chain->bref.check.xxhash64.value == check64); 5854 if (r == 0) { 5855 hammer2_characterize_failed_chain(chain, check64, 64); 5856 } 5857 hammer2_process_xxhash64 += chain->bytes; 5858 break; 5859 case HAMMER2_CHECK_SHA192: 5860 assert(0); /* XXX unsupported */ 5861 /* 5862 { 5863 SHA256_CTX hash_ctx; 5864 union { 5865 uint8_t digest[SHA256_DIGEST_LENGTH]; 5866 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5867 } u; 5868 5869 SHA256_Init(&hash_ctx); 5870 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5871 SHA256_Final(u.digest, &hash_ctx); 5872 u.digest64[2] ^= u.digest64[3]; 5873 if (bcmp(u.digest, 5874 chain->bref.check.sha192.data, 5875 sizeof(chain->bref.check.sha192.data)) == 0) { 5876 r = 1; 5877 } else { 5878 r = 0; 5879 krateprintf(&krate_h2chk, 5880 "chain %016jx.%02x meth=%02x " 5881 "CHECK FAIL\n", 5882 chain->bref.data_off, 5883 chain->bref.type, 5884 chain->bref.methods); 5885 } 5886 } 5887 */ 5888 break; 5889 case HAMMER2_CHECK_FREEMAP: 5890 r = (chain->bref.check.freemap.icrc32 == 5891 hammer2_icrc32(bdata, chain->bytes)); 5892 if (r == 0) { 5893 int did; 5894 5895 did = krateprintf(&krate_h2chk, 5896 "chain %016jx.%02x meth=%02x " 5897 "CHECK FAIL\n", 5898 chain->bref.data_off, 5899 
chain->bref.type, 5900 chain->bref.methods); 5901 if (did) { 5902 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 5903 chain->bref.check.freemap.icrc32, 5904 hammer2_icrc32(bdata, chain->bytes), 5905 chain->bytes); 5906 if (chain->dio) { 5907 kprintf("dio %p buf %016jx,%ld " 5908 "bdata %p/%p\n", 5909 chain->dio, 5910 (intmax_t)chain->dio->bp->b_loffset, 5911 chain->dio->bp->b_bufsize, 5912 bdata, 5913 chain->dio->bp->b_data); 5914 } 5915 } 5916 } 5917 break; 5918 default: 5919 kprintf("hammer2_chain_testcheck: unknown check type %02x\n", 5920 chain->bref.methods); 5921 r = 1; 5922 break; 5923 } 5924 return r; 5925 } 5926 5927 /* 5928 * Acquire the chain and parent representing the specified inode for the 5929 * device at the specified cluster index. 5930 * 5931 * The flags passed in are LOOKUP flags, not RESOLVE flags. 5932 * 5933 * If we are unable to locate the inode, HAMMER2_ERROR_EIO or HAMMER2_ERROR_CHECK 5934 * is returned. In case of error, *chainp and/or *parentp may still be returned 5935 * non-NULL. 5936 * 5937 * The caller may pass-in a locked *parentp and/or *chainp, or neither. 5938 * They will be unlocked and released by this function. The *parentp and 5939 * *chainp representing the located inode are returned locked. 5940 * 5941 * The returned error includes any error on the returned chain in addition to 5942 * errors incurred while trying to lookup the inode. However, a chain->error 5943 * might not be recognized if HAMMER2_LOOKUP_NODATA is passed. This flag may 5944 * not be passed to this function. 5945 */ 5946 int 5947 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 5948 int clindex, int flags, 5949 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 5950 { 5951 hammer2_chain_t *parent; 5952 hammer2_chain_t *rchain; 5953 hammer2_key_t key_dummy; 5954 hammer2_inode_t *ip; 5955 int resolve_flags; 5956 int error; 5957 5958 KKASSERT((flags & HAMMER2_LOOKUP_NODATA) == 0); 5959 5960 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 
5961 HAMMER2_RESOLVE_SHARED : 0; 5962 5963 /* 5964 * Caller expects us to replace these. 5965 */ 5966 if (*chainp) { 5967 hammer2_chain_unlock(*chainp); 5968 hammer2_chain_drop(*chainp); 5969 *chainp = NULL; 5970 } 5971 if (*parentp) { 5972 hammer2_chain_unlock(*parentp); 5973 hammer2_chain_drop(*parentp); 5974 *parentp = NULL; 5975 } 5976 5977 /* 5978 * Be very careful, this is a backend function and we CANNOT 5979 * lock any frontend inode structure we find. But we have to 5980 * look the inode up this way first in case it exists but is 5981 * detached from the radix tree. 5982 */ 5983 ip = hammer2_inode_lookup(pmp, inum); 5984 if (ip) { 5985 *chainp = hammer2_inode_chain_and_parent(ip, clindex, 5986 parentp, 5987 resolve_flags); 5988 hammer2_inode_drop(ip); 5989 if (*chainp) 5990 return (*chainp)->error; 5991 hammer2_chain_unlock(*chainp); 5992 hammer2_chain_drop(*chainp); 5993 *chainp = NULL; 5994 if (*parentp) { 5995 hammer2_chain_unlock(*parentp); 5996 hammer2_chain_drop(*parentp); 5997 *parentp = NULL; 5998 } 5999 } 6000 6001 /* 6002 * Inodes hang off of the iroot (bit 63 is clear, differentiating 6003 * inodes from root directory entries in the key lookup). 6004 */ 6005 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 6006 rchain = NULL; 6007 if (parent) { 6008 /* 6009 * NOTE: rchain can be returned as NULL even if error == 0 6010 * (i.e. not found) 6011 */ 6012 rchain = hammer2_chain_lookup(&parent, &key_dummy, 6013 inum, inum, 6014 &error, flags); 6015 /* 6016 * Propagate a chain-specific error to caller. 6017 * 6018 * If the chain is not errored, we must still validate that the inode 6019 * number is correct, because all hell will break loose if it isn't 6020 * correct. It should always be correct so print to the console and 6021 * simulate a CHECK error if it is not. 
6022 */ 6023 if (error == 0 && rchain) { 6024 error = rchain->error; 6025 if (error == 0 && rchain->data) { 6026 if (inum != rchain->data->ipdata.meta.inum) { 6027 kprintf("hammer2_chain_inode_find: lookup inum %ld, " 6028 "got valid inode but with inum %ld\n", 6029 inum, rchain->data->ipdata.meta.inum); 6030 error = HAMMER2_ERROR_CHECK; 6031 rchain->error = error; 6032 } 6033 } 6034 } 6035 } else { 6036 error = HAMMER2_ERROR_EIO; 6037 } 6038 *parentp = parent; 6039 *chainp = rchain; 6040 6041 return error; 6042 } 6043 6044 /* 6045 * Used by the bulkscan code to snapshot the synchronized storage for 6046 * a volume, allowing it to be scanned concurrently against normal 6047 * operation. 6048 */ 6049 hammer2_chain_t * 6050 hammer2_chain_bulksnap(hammer2_dev_t *hmp) 6051 { 6052 hammer2_chain_t *copy; 6053 6054 copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref); 6055 copy->data = kmalloc(sizeof(copy->data->voldata), 6056 hmp->mmsg, M_WAITOK | M_ZERO); 6057 hammer2_voldata_lock(hmp); 6058 copy->data->voldata = hmp->volsync; 6059 hammer2_voldata_unlock(hmp); 6060 6061 return copy; 6062 } 6063 6064 void 6065 hammer2_chain_bulkdrop(hammer2_chain_t *copy) 6066 { 6067 KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME); 6068 KKASSERT(copy->data); 6069 kfree(copy->data, copy->hmp->mmsg); 6070 copy->data = NULL; 6071 hammer2_chain_drop(copy); 6072 } 6073 6074 /* 6075 * Returns non-zero if the chain (INODE or DIRENT) matches the 6076 * filename. 
 */
int
hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name,
			  size_t name_len)
{
	const hammer2_inode_data_t *ripdata;

	/* inodes embed the filename in their inode data */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		ripdata = &chain->data->ipdata;
		if (ripdata->meta.name_len == name_len &&
		    bcmp(ripdata->filename, name, name_len) == 0) {
			return 1;
		}
	}
	/*
	 * Dirents store short names directly in bref.check.buf; longer
	 * names spill into the chain's data block.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT &&
	    chain->bref.embed.dirent.namlen == name_len) {
		if (name_len > sizeof(chain->bref.check.buf) &&
		    bcmp(chain->data->buf, name, name_len) == 0) {
			return 1;
		}
		if (name_len <= sizeof(chain->bref.check.buf) &&
		    bcmp(chain->bref.check.buf, name, name_len) == 0) {
			return 1;
		}
	}
	return 0;
}

/*
 * Debugging: recursively dump a chain and its in-memory children.
 * *countp limits the total number of chains printed; bi is the caller's
 * index of this chain among its siblings; pfx is a one-character tag
 * printed before each entry; flags filters which children recurse
 * ((u_int)-1 dumps everything).
 */
void
hammer2_dump_chain(hammer2_chain_t *chain, int tab, int bi, int *countp,
		   char pfx, u_int flags)
{
	hammer2_chain_t *scan;
	hammer2_chain_t *parent;

	--*countp;
	if (*countp == 0) {
		kprintf("%*.*s...\n", tab, tab, "");
		return;
	}
	if (*countp < 0)
		return;
	kprintf("%*.*s%c-chain %p %s.%-3d %016jx %016jx/%-2d mir=%016jx\n",
		tab, tab, "", pfx, chain,
		hammer2_bref_type_str(chain->bref.type), bi,
		chain->bref.data_off, chain->bref.key, chain->bref.keybits,
		chain->bref.mirror_tid);

	kprintf("%*.*s [%08x] (%s) refs=%d",
		tab, tab, "",
		chain->flags,
		((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
		chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
		chain->refs);

	parent = chain->parent;
	if (parent)
		kprintf("\n%*.*s p=%p [pflags %08x prefs %d]",
			tab, tab, "",
			parent, parent->flags, parent->refs);
	if (RB_EMPTY(&chain->core.rbtree)) {
		kprintf("\n");
	} else {
		/* NOTE(review): this local bi shadows the bi parameter */
		int bi = 0;
		kprintf(" {\n");
		RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) {
			if ((scan->flags & flags) || flags == (u_int)-1) {
				hammer2_dump_chain(scan, tab + 4, bi, countp,
						   'a', flags);
			}
			bi++;
		}
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
			kprintf("%*.*s}(%s)\n", tab, tab, "",
				chain->data->ipdata.filename);
		else
			kprintf("%*.*s}\n", tab, tab, "");
	}
}

/*
 * Debugging: dump both topology trees of a device, limiting each dump
 * to 50 chains.  vpfx/fpfx tag the volume and freemap trees respectively.
 */
void
hammer2_dump_chains(hammer2_dev_t *hmp, char vpfx, char fpfx)
{
	int dumpcnt;

	dumpcnt = 50;
	hammer2_dump_chain(&hmp->vchain, 0, 0, &dumpcnt, vpfx, (u_int)-1);

	dumpcnt = 50;
	hammer2_dump_chain(&hmp->fchain, 0, 0, &dumpcnt, fpfx, (u_int)-1);
}