1 /* 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 2022 Tomohiro Kusumi <tkusumi@netbsd.org> 5 * Copyright (c) 2011-2022 The DragonFly Project. All rights reserved. 6 * 7 * This code is derived from software contributed to The DragonFly Project 8 * by Matthew Dillon <dillon@dragonflybsd.org> 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in 18 * the documentation and/or other materials provided with the 19 * distribution. 20 * 3. Neither the name of The DragonFly Project nor the names of its 21 * contributors may be used to endorse or promote products derived 22 * from this software without specific, prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 25 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 27 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 28 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 29 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 32 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 33 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 34 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 */ 37 /* 38 * This subsystem implements most of the core support functions for 39 * the hammer2_chain structure. 
 *
 * Chains are the in-memory version on media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * Chains are no-longer delete-duplicated.  Instead, the original in-memory
 * chain will be moved along with its block reference (e.g. for things like
 * renames, hardlink operations, modifications, etc), and will be indexed
 * on a secondary list for flush handling instead of propagating a flag
 * upward to the root.
 *
 * Concurrent front-end operations can still run against backend flushes
 * as long as they do not cross the current flush boundary.  An operation
 * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while an operation running below the current
 * flush can become part of the next flush.
 */
/*
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/buf.h>

#include <crypto/sha2/sha2.h>
*/

#include "hammer2.h"

/* Forward declarations for static helpers defined later in this file. */
static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_chain_t *parent,
		hammer2_key_t key, int keybits,
		hammer2_tid_t mtid, int for_type, int *errorp);
static int hammer2_chain_delete_obref(hammer2_chain_t *parent,
		hammer2_chain_t *chain,
		hammer2_tid_t mtid, int flags,
		hammer2_blockref_t *obref);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **brefp);
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
		int depth);
static void hammer2_chain_lru_flush(hammer2_pfs_t *pmp);

/*
 * There are many degenerate situations where an extreme rate of console
 * output can occur from warnings and errors.  Make sure this output does
 * not impede operations.
 */
/*
static struct krate krate_h2chk = { .freq = 5 };
static struct krate krate_h2me = { .freq = 1 };
static struct krate krate_h2em = { .freq = 1 };
*/

/*
 * Basic RBTree for chains (core.rbtree).
 */
RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);

int
hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
{
	hammer2_key_t c1_beg;
	hammer2_key_t c1_end;
	hammer2_key_t c2_beg;
	hammer2_key_t c2_end;

	/*
	 * Compare chains.  Overlaps are not supposed to happen, but to
	 * catch any software issues early we count overlaps as a match.
	 */
	c1_beg = chain1->bref.key;
	c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
	c2_beg = chain2->bref.key;
	c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;

	if (c1_end < c2_beg)		/* fully to the left */
		return(-1);
	if (c1_beg > c2_end)		/* fully to the right */
		return(1);
	return(0);			/* overlap (must not cross edge boundary) */
}

/*
 * Assert that a chain has no media data associated with it.
 *
 * The volume header and freemap roots embed their data, so they are the
 * only types allowed to retain a data pointer without a dio.
 */
static __inline void
hammer2_chain_assert_no_data(hammer2_chain_t *chain)
{
	KKASSERT(chain->dio == NULL);
	if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME &&
	    chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP &&
	    chain->data) {
		panic("hammer2_chain_assert_no_data: chain %p still has data",
		    chain);
	}
}

/*
 * Make a chain visible to the flusher.  The flusher operates using a top-down
 * recursion based on the ONFLUSH flag.  It locates MODIFIED and UPDATE chains,
 * flushes them, and updates blocks back to the volume root.
 *
 * This routine sets the ONFLUSH flag upward from the triggering chain until
 * it hits an inode root or the volume root.  Inode chains serve as inflection
 * points, requiring the flusher to bridge across trees.
 * Inodes include
 * regular inodes, PFS roots (pmp->iroot), and the media super root
 * (spmp->iroot).
 */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/* unlocked pre-test avoids the spinlock in the common case */
	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				break;
			if ((parent = chain->parent) == NULL)
				break;
			/* lock-coupled walk upward: take parent before
			 * releasing child */
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->pmp inherits pmp unless the chain is an inode (other than the
 * super-root inode).
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes;

	/*
	 * Special case - radix of 0 indicates a chain that does not
	 * need a data reference (context is completely embedded in the
	 * bref).
	 */
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		bytes = 0;

	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_FREEMAP:
	case HAMMER2_BREF_TYPE_VOLUME:
		chain = kmalloc_obj(sizeof(*chain), hmp->mchain,
		    M_WAITOK | M_ZERO);
		atomic_add_long(&hammer2_chain_allocs, 1);
		break;
	case HAMMER2_BREF_TYPE_EMPTY:
	default:
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		    bref->type);
		break;
	}

	/*
	 * Initialize the new chain structure.  pmp must be set to NULL for
	 * chains belonging to the super-root topology of a device mount.
	 */
	if (pmp == hmp->spmp)
		chain->pmp = NULL;
	else
		chain->pmp = pmp;

	chain->hmp = hmp;
	chain->bref = *bref;
	chain->bytes = bytes;
	chain->refs = 1;
	chain->flags = HAMMER2_CHAIN_ALLOCATED;
	lockinit(&chain->diolk, "chdio", 0, 0);

	/*
	 * Set the PFS boundary flag if this chain represents a PFS root.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
	hammer2_chain_core_init(chain);

	return (chain);
}

/*
 * Initialize a chain's core structure.  This structure used to be allocated
 * but is now embedded.
 *
 * The core is not locked.  No additional refs on the chain are made.
 */
void
hammer2_chain_core_init(hammer2_chain_t *chain)
{
	RB_INIT(&chain->core.rbtree);	/* live chains */
	hammer2_mtx_init(&chain->lock, "h2chain");
	hammer2_spin_init(&chain->core.spin, "h2chain");
}

/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * (can be called with spinlock held)
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	/* 0->1 transition: chain may still be sitting on the LRU list */
	if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
		/*
		 * Just flag that the chain was used and should be recycled
		 * on the LRU if the LRU scan encounters it later.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);

#if 0
		/*
		 * REMOVED - reduces contention, lru_list is more heuristical
		 * now.
		 *
		 * 0->non-zero transition must ensure that chain is removed
		 * from the LRU list.
		 *
		 * NOTE: Already holding lru_spin here so we cannot call
		 *	 hammer2_chain_ref() to get it off lru_list, do
		 *	 it manually.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			hammer2_pfs_t *pmp = chain->pmp;
			hammer2_spin_ex(&pmp->lru_spin);
			if (chain->flags & HAMMER2_CHAIN_ONLRU) {
				atomic_add_int(&pmp->lru_count, -1);
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_ONLRU);
				TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
			}
			hammer2_spin_unex(&pmp->lru_spin);
		}
#endif
	}
}

/*
 * Ref a locked chain and force the data to be held across an unlock.
 * Chain must be currently locked.  The user of the chain who desires
 * to release the hold must call hammer2_chain_lock_unhold() to relock
 * and unhold the chain, then unlock normally, or may simply call
 * hammer2_chain_drop_unhold() (which is safer against deadlocks).
 */
void
hammer2_chain_ref_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_ref(chain);
}

/*
 * Insert the chain in the core rbtree.
 *
 * Normal insertions are placed in the live rbtree.  Insertion of a deleted
 * chain is a special case used by the flush code that is placed on the
 * unstaged deleted list to avoid confusing the live view.
 */
#define HAMMER2_CHAIN_INSERT_SPIN	0x0001	/* acquire parent core spinlock */
#define HAMMER2_CHAIN_INSERT_LIVE	0x0002	/* account in live_count */
#define HAMMER2_CHAIN_INSERT_RACE	0x0004	/* fail on generation mismatch */

static
int
hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
    int flags, int generation)
{
	hammer2_chain_t *xchain;
	int error = 0;

	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_ex(&parent->core.spin);

	/*
	 * Interlocked by spinlock, check for race
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
	    parent->core.generation != generation) {
		error = HAMMER2_ERROR_EAGAIN;
		goto failed;
	}

	/*
	 * Insert chain
	 */
	xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
	KASSERT(xchain == NULL,
	    ("hammer2_chain_insert: collision %p %p (key=%016jx)",
	    chain, xchain, chain->bref.key));
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
	chain->parent = parent;
	++parent->core.chain_count;
	++parent->core.generation;	/* XXX incs for _get() too, XXX */

	/*
	 * We have to keep track of the effective live-view blockref count
	 * so the create code knows when to push an indirect block.
	 */
	if (flags & HAMMER2_CHAIN_INSERT_LIVE)
		atomic_add_int(&parent->core.live_count, 1);
failed:
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		hammer2_spin_unex(&parent->core.spin);
	return error;
}

/*
 * Drop the caller's reference to the chain.  When the ref count drops to
 * zero this function will try to disassociate the chain from its parent and
 * deallocate it, then recursively drop the parent using the implied ref
 * from the chain's chain->parent.
 *
 * Nobody should own chain's mutex on the 1->0 transition, unless this drop
 * races an acquisition by another cpu.  Therefore we can loop if we are
 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again
 * race against another drop.
 */
void
hammer2_chain_drop(hammer2_chain_t *chain)
{
	u_int refs;

	KKASSERT(chain->refs > 0);

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);

		if (refs == 1) {
			/* potential 1->0 transition, needs the mutex */
			if (hammer2_mtx_ex_try(&chain->lock) == 0)
				chain = hammer2_chain_lastdrop(chain, 0);
			/* retry the same chain, or chain from lastdrop */
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}
		cpu_pause();
	}
}

/*
 * Unhold a held and probably not-locked chain, ensure that the data is
 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive
 * lock and then simply unlocking the chain.
 */
void
hammer2_chain_unhold(hammer2_chain_t *chain)
{
	u_int lockcnt;
	int iter = 0;

	for (;;) {
		lockcnt = chain->lockcnt;
		cpu_ccfence();
		if (lockcnt > 1) {
			/* not the last hold, just decrement */
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				break;
			}
		} else if (hammer2_mtx_ex_try(&chain->lock) == 0) {
			/* last hold: unlock path drops the data */
			hammer2_chain_unlock(chain);
			break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap in between the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race1 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race1", 1);
			}
			cpu_pause();
		}
	}
}

/*
 * Release both the data hold and the caller's reference on the chain.
 */
void
hammer2_chain_drop_unhold(hammer2_chain_t *chain)
{
	hammer2_chain_unhold(chain);
	hammer2_chain_drop(chain);
}

/*
 * Re-establish a data hold on a chain by briefly locking it shared,
 * bumping lockcnt, and unlocking it again.
 */
void
hammer2_chain_rehold(hammer2_chain_t *chain)
{
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED);
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}

/*
 * Handles the (potential) last drop of chain->refs from 1->0.  Called with
 * the mutex exclusively locked, refs == 1, and lockcnt 0.  SMP races are
 * possible against refs and lockcnt.  We must dispose of the mutex on chain.
 *
 * This function returns an unlocked chain for recursive drop or NULL.  It
 * can return the same chain if it determines it has raced another ref.
 *
 * --
 *
 * When two chains need to be recursively dropped we use the chain we
 * would otherwise free to placehold the additional chain.  It's a bit
 * convoluted but we can't just recurse without potentially blowing out
 * the kernel stack.
 *
 * The chain cannot be freed if it has any children.
 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it.
 * The chain cannot be freed if flagged UPDATE unless we can dispose of it.
 * Any dedup registration can remain intact.
 *
 * The core spinlock is allowed to nest child-to-parent (not parent-to-child).
 */
static
hammer2_chain_t *
hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
{
	hammer2_pfs_t *pmp;
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *rdrop;

	/*
	 * We need chain's spinlock to interlock the sub-tree test.
	 * We already have chain's mutex, protecting chain->parent.
	 *
	 * Remember that chain->refs can be in flux.
	 */
	hammer2_spin_ex(&chain->core.spin);

	if (chain->parent != NULL) {
		/*
		 * If the chain has a parent the UPDATE bit prevents scrapping
		 * as the chain is needed to properly flush the parent.  Try
		 * to complete the 1->0 transition and return NULL.  Retry
		 * (return chain) if we are unable to complete the 1->0
		 * transition, else return NULL (nothing more to do).
		 *
		 * If the chain has a parent the MODIFIED bit prevents
		 * scrapping.
		 *
		 * Chains with UPDATE/MODIFIED are *not* put on the LRU list!
		 */
		if (chain->flags & (HAMMER2_CHAIN_UPDATE |
				    HAMMER2_CHAIN_MODIFIED)) {
			if (atomic_cmpset_int(&chain->refs, 1, 0)) {
				hammer2_spin_unex(&chain->core.spin);
				hammer2_chain_assert_no_data(chain);
				hammer2_mtx_unlock(&chain->lock);
				chain = NULL;
			} else {
				hammer2_spin_unex(&chain->core.spin);
				hammer2_mtx_unlock(&chain->lock);
			}
			return (chain);
		}
		/* spinlock still held */
	} else if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
		   chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP) {
		/*
		 * Retain the static vchain and fchain.  Clear bits that
		 * are not relevant.  Do not clear the MODIFIED bit,
		 * and certainly do not put it on the delayed-flush queue.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
	} else {
		/*
		 * The chain has no parent and can be flagged for destruction.
		 * Since it has no parent, UPDATE can also be cleared.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
		if (chain->flags & HAMMER2_CHAIN_UPDATE)
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);

		/*
		 * If the chain has children we must propagate the DESTROY
		 * flag downward and rip the disconnected topology apart.
		 * This is accomplished by calling hammer2_flush() on the
		 * chain.
		 *
		 * Any dedup is already handled by the underlying DIO, so
		 * we do not have to specifically flush it here.
		 */
		if (chain->core.chain_count) {
			hammer2_spin_unex(&chain->core.spin);
			hammer2_flush(chain, HAMMER2_FLUSH_TOP |
				      HAMMER2_FLUSH_ALL);
			hammer2_mtx_unlock(&chain->lock);

			return(chain);	/* retry drop */
		}

		/*
		 * Otherwise we can scrap the MODIFIED bit if it is set,
		 * and continue along the freeing path.
		 *
		 * Be sure to clean-out any dedup bits.  Without a parent
		 * this chain will no longer be visible to the flush code.
		 * Easy check data_off to avoid the volume root.
		 */
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			atomic_add_long(&hammer2_count_modified_chains, -1);
			if (chain->pmp)
				hammer2_pfs_memory_wakeup(chain->pmp, -1);
		}
		/* spinlock still held */
	}

	/* spinlock still held */

	/*
	 * If any children exist we must leave the chain intact with refs == 0.
	 * They exist because chains are retained below us which have refs or
	 * may require flushing.
	 *
	 * Retry (return chain) if we fail to transition the refs to 0, else
	 * return NULL indicating nothing more to do.
	 *
	 * Chains with children are NOT put on the LRU list.
	 */
	if (chain->core.chain_count) {
		if (atomic_cmpset_int(&chain->refs, 1, 0)) {
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_assert_no_data(chain);
			hammer2_mtx_unlock(&chain->lock);
			chain = NULL;
		} else {
			hammer2_spin_unex(&chain->core.spin);
			hammer2_mtx_unlock(&chain->lock);
		}
		return (chain);
	}
	/* spinlock still held */
	/* no chains left under us */

	/*
	 * chain->core has no children left so no accessors can get to our
	 * chain from there.  Now we have to lock the parent core to interlock
	 * remaining possible accessors that might bump chain's refs before
	 * we can safely drop chain's refs with intent to free the chain.
	 */
	hmp = chain->hmp;
	pmp = chain->pmp;	/* can be NULL */
	rdrop = NULL;

	parent = chain->parent;

	/*
	 * WARNING! chain's spin lock is still held here, and other spinlocks
	 *	    will be acquired and released in the code below.  We
	 *	    cannot be making fancy procedure calls!
	 */

	/*
	 * We can cache the chain if it is associated with a pmp
	 * and not flagged as being destroyed or requesting a full
	 * release.  In this situation the chain is not removed
	 * from its parent, i.e. it can still be looked up.
	 *
	 * We intentionally do not cache DATA chains because these
	 * were likely used to load data into the logical buffer cache
	 * and will not be accessed again for some time.
	 */
	if ((chain->flags &
	     (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 &&
	    chain->pmp &&
	    chain->bref.type != HAMMER2_BREF_TYPE_DATA) {
		if (parent)
			hammer2_spin_ex(&parent->core.spin);
		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
			/*
			 * 1->0 transition failed, retry.  Do not drop
			 * the chain's data yet!
			 */
			if (parent)
				hammer2_spin_unex(&parent->core.spin);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_mtx_unlock(&chain->lock);

			return(chain);
		}

		/*
		 * Success
		 */
		hammer2_chain_assert_no_data(chain);

		/*
		 * Make sure we are on the LRU list, clean up excessive
		 * LRU entries.  We can only really drop one but there might
		 * be other entries that we can remove from the lru_list
		 * without dropping.
		 *
		 * NOTE: HAMMER2_CHAIN_ONLRU may only be safely set when
		 *	 chain->core.spin AND pmp->lru_spin are held, but
		 *	 can be safely cleared only holding pmp->lru_spin.
		 */
		if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) {
			hammer2_spin_ex(&pmp->lru_spin);
			if ((chain->flags & HAMMER2_CHAIN_ONLRU) == 0) {
				atomic_set_int(&chain->flags,
					       HAMMER2_CHAIN_ONLRU);
				TAILQ_INSERT_TAIL(&pmp->lru_list,
						  chain, lru_node);
				atomic_add_int(&pmp->lru_count, 1);
			}
			if (pmp->lru_count < HAMMER2_LRU_LIMIT)
				depth = 1;	/* disable lru_list flush */
			hammer2_spin_unex(&pmp->lru_spin);
		} else {
			/* disable lru flush */
			depth = 1;
		}

		if (parent) {
			hammer2_spin_unex(&parent->core.spin);
			parent = NULL;	/* safety */
		}
		hammer2_spin_unex(&chain->core.spin);
		hammer2_mtx_unlock(&chain->lock);

		/*
		 * lru_list hysteresis (see above for depth overrides).
		 * Note that depth also prevents excessive lastdrop recursion.
		 */
		if (depth == 0)
			hammer2_chain_lru_flush(pmp);

		return NULL;
		/* NOT REACHED */
	}

	/*
	 * Make sure we are not on the LRU list.
	 */
	if (chain->flags & HAMMER2_CHAIN_ONLRU) {
		hammer2_spin_ex(&pmp->lru_spin);
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			atomic_add_int(&pmp->lru_count, -1);
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
			TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
		}
		hammer2_spin_unex(&pmp->lru_spin);
	}

	/*
	 * Spinlock the parent and try to drop the last ref on chain.
	 * On success determine if we should dispose of the chain
	 * (remove the chain from its parent, etc).
	 *
	 * (normal core locks are top-down recursive but we define
	 * core spinlocks as bottom-up recursive, so this is safe).
	 */
	if (parent) {
		hammer2_spin_ex(&parent->core.spin);
		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
			/*
			 * 1->0 transition failed, retry.
736 */ 737 hammer2_spin_unex(&parent->core.spin); 738 hammer2_spin_unex(&chain->core.spin); 739 hammer2_mtx_unlock(&chain->lock); 740 741 return(chain); 742 } 743 744 /* 745 * 1->0 transition successful, parent spin held to prevent 746 * new lookups, chain spinlock held to protect parent field. 747 * Remove chain from the parent. 748 * 749 * If the chain is being removed from the parent's btree but 750 * is not blkmapped, we have to adjust live_count downward. If 751 * it is blkmapped then the blockref is retained in the parent 752 * as is its associated live_count. This case can occur when 753 * a chain added to the topology is unable to flush and is 754 * then later deleted. 755 */ 756 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 757 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 758 (chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0) { 759 atomic_add_int(&parent->core.live_count, -1); 760 } 761 RB_REMOVE(hammer2_chain_tree, 762 &parent->core.rbtree, chain); 763 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 764 --parent->core.chain_count; 765 chain->parent = NULL; 766 } 767 768 /* 769 * If our chain was the last chain in the parent's core the 770 * core is now empty and its parent might have to be 771 * re-dropped if it has 0 refs. 772 */ 773 if (parent->core.chain_count == 0) { 774 rdrop = parent; 775 atomic_add_int(&rdrop->refs, 1); 776 /* 777 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 778 rdrop = NULL; 779 */ 780 } 781 hammer2_spin_unex(&parent->core.spin); 782 parent = NULL; /* safety */ 783 /* FALL THROUGH */ 784 } else { 785 /* 786 * No-parent case. 787 */ 788 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 789 /* 790 * 1->0 transition failed, retry. 791 */ 792 hammer2_spin_unex(&parent->core.spin); 793 hammer2_spin_unex(&chain->core.spin); 794 hammer2_mtx_unlock(&chain->lock); 795 796 return(chain); 797 } 798 } 799 800 /* 801 * Successful 1->0 transition, no parent, no children... no way for 802 * anyone to ref this chain any more. 
	 * We can clean-up and free it.
	 *
	 * We still have the core spinlock, and core's chain_count is 0.
	 * Any parent spinlock is gone.
	 */
	hammer2_spin_unex(&chain->core.spin);
	hammer2_chain_assert_no_data(chain);
	hammer2_mtx_unlock(&chain->lock);
	KKASSERT(RB_EMPTY(&chain->core.rbtree) &&
		 chain->core.chain_count == 0);

	/*
	 * All locks are gone, no pointers remain to the chain, finish
	 * freeing it.
	 */
	KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE |
				  HAMMER2_CHAIN_MODIFIED)) == 0);

	/*
	 * Once chain resources are gone we can use the now dead chain
	 * structure to placehold what might otherwise require a recursive
	 * drop, because we have potentially two things to drop and can only
	 * return one directly.
	 */
	if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED);
		chain->hmp = NULL;
		kfree_obj(chain, hmp->mchain);
		atomic_add_long(&hammer2_chain_allocs, -1);
	}

	/*
	 * Possible chaining loop when parent re-drop needed.
	 */
	return(rdrop);
}

/*
 * Heuristical flush of the LRU, try to reduce the number of entries
 * on the LRU to (HAMMER2_LRU_LIMIT * 2 / 3).  This procedure is called
 * only when lru_count exceeds HAMMER2_LRU_LIMIT.
 */
static
void
hammer2_chain_lru_flush(hammer2_pfs_t *pmp)
{
	hammer2_chain_t *chain;

again:
	chain = NULL;
	hammer2_spin_ex(&pmp->lru_spin);
	while (pmp->lru_count > HAMMER2_LRU_LIMIT * 2 / 3) {
		/*
		 * Pick a chain off the lru_list, just recycle it quickly
		 * if LRUHINT is set (the chain was ref'd but left on
		 * the lru_list, so cycle to the end).
		 */
		chain = TAILQ_FIRST(&pmp->lru_list);
		TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);

		if (chain->flags & HAMMER2_CHAIN_LRUHINT) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_LRUHINT);
			TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node);
			chain = NULL;
			continue;
		}

		/*
		 * Ok, we are off the LRU.  We must adjust refs before we
		 * can safely clear the ONLRU flag.
		 */
		atomic_add_int(&pmp->lru_count, -1);
		if (atomic_cmpset_int(&chain->refs, 0, 1)) {
			/* we own the 0->1 transition, dispose of it below */
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
			break;
		}
		/* someone else refs it, just take it off the list */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
		chain = NULL;
	}
	hammer2_spin_unex(&pmp->lru_spin);
	if (chain == NULL)
		return;

	/*
	 * If we picked a chain off the lru list we may be able to lastdrop
	 * it.  Use a depth of 1 to prevent excessive lastdrop recursion.
	 */
	while (chain) {
		u_int refs;

		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);

		if (refs == 1) {
			if (hammer2_mtx_ex_try(&chain->lock) == 0)
				chain = hammer2_chain_lastdrop(chain, 1);
			/* retry the same chain, or chain from lastdrop */
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}
		cpu_pause();
	}
	goto again;
}

/*
 * On last lock release.
 * Detach and return the chain's dio (the caller disposes of it); assert
 * that no stale data pointer remains for non-embedded chain types.
 */
static hammer2_io_t *
hammer2_chain_drop_data(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;

	if ((dio = chain->dio) != NULL) {
		chain->dio = NULL;
		chain->data = NULL;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/* embedded data, no dio backing */
			break;
		default:
			if (chain->data != NULL) {
				hammer2_spin_unex(&chain->core.spin);
				panic("chain data not null: "
				      "chain %p bref %016jx.%02x "
				      "refs %d parent %p dio %p data %p",
				      chain, chain->bref.data_off,
				      chain->bref.type, chain->refs,
				      chain->parent,
				      chain->dio, chain->data);
			}
			KKASSERT(chain->data == NULL);
			break;
		}
	}
	return dio;
}

/*
 * Lock a referenced chain element, acquiring its data with I/O if necessary,
 * and specify how you would like the data to be resolved.
 *
 * If an I/O or other fatal error occurs, chain->error will be set to non-zero.
 *
 * The lock is allowed to recurse, multiple locking ops will aggregate
 * the requested resolve types.  Once data is assigned it will not be
 * removed until the last unlock.
 *
 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
 *			   the INITIAL-create state (indirect blocks only).
 *
 *			   Do not resolve data elements for DATA chains.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
 *
 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
 *			   it will be locked exclusive.
 *
 * HAMMER2_RESOLVE_NONBLOCK- (flag) The chain is locked non-blocking.  If
 *			   the lock fails, EAGAIN is returned.
 *
 * NOTE: Embedded elements (volume header, inodes) are always resolved
 *	 regardless.
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * NOTE: The return value is always 0 unless NONBLOCK is specified, in which
 *	 case it can be either 0 or EAGAIN.
 *
 * WARNING! This function blocks on I/O if data needs to be fetched.  This
 *	    blocking can run concurrent with other compatible lock holders
 *	    who do not need data returning.  The lock is not upgraded to
 *	    exclusive during a data fetch, a separate bit is used to
 *	    interlock I/O.  However, an exclusive lock holder can still count
 *	    on being interlocked against an I/O fetch managed by a shared
 *	    lock holder.
 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	KKASSERT(chain->refs > 0);

	if (how & HAMMER2_RESOLVE_NONBLOCK) {
		/*
		 * We still have to bump lockcnt before acquiring the lock,
		 * even for non-blocking operation, because the unlock code
		 * live-loops on lockcnt == 1 when dropping the last lock.
		 *
		 * If the non-blocking operation fails we have to use an
		 * unhold sequence to undo the mess.
		 *
		 * NOTE: LOCKAGAIN must always succeed without blocking,
		 *	 even if NONBLOCK is specified.
1013 */ 1014 atomic_add_int(&chain->lockcnt, 1); 1015 if (how & HAMMER2_RESOLVE_SHARED) { 1016 if (how & HAMMER2_RESOLVE_LOCKAGAIN) { 1017 hammer2_mtx_sh_again(&chain->lock); 1018 } else { 1019 if (hammer2_mtx_sh_try(&chain->lock) != 0) { 1020 hammer2_chain_unhold(chain); 1021 return EAGAIN; 1022 } 1023 } 1024 } else { 1025 if (hammer2_mtx_ex_try(&chain->lock) != 0) { 1026 hammer2_chain_unhold(chain); 1027 return EAGAIN; 1028 } 1029 } 1030 } else { 1031 /* 1032 * Get the appropriate lock. If LOCKAGAIN is flagged with 1033 * SHARED the caller expects a shared lock to already be 1034 * present and we are giving it another ref. This case must 1035 * importantly not block if there is a pending exclusive lock 1036 * request. 1037 */ 1038 atomic_add_int(&chain->lockcnt, 1); 1039 if (how & HAMMER2_RESOLVE_SHARED) { 1040 if (how & HAMMER2_RESOLVE_LOCKAGAIN) { 1041 hammer2_mtx_sh_again(&chain->lock); 1042 } else { 1043 hammer2_mtx_sh(&chain->lock); 1044 } 1045 } else { 1046 hammer2_mtx_ex(&chain->lock); 1047 } 1048 } 1049 1050 /* 1051 * If we already have a valid data pointer make sure the data is 1052 * synchronized to the current cpu, and then no further action is 1053 * necessary. 1054 */ 1055 if (chain->data) { 1056 if (chain->dio) 1057 hammer2_io_bkvasync(chain->dio); 1058 return 0; 1059 } 1060 1061 /* 1062 * Do we have to resolve the data? This is generally only 1063 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased. 1064 * Other BREF types expects the data to be there. 
1065 */ 1066 switch(how & HAMMER2_RESOLVE_MASK) { 1067 case HAMMER2_RESOLVE_NEVER: 1068 return 0; 1069 case HAMMER2_RESOLVE_MAYBE: 1070 if (chain->flags & HAMMER2_CHAIN_INITIAL) 1071 return 0; 1072 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) 1073 return 0; 1074 #if 0 1075 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) 1076 return 0; 1077 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) 1078 return 0; 1079 #endif 1080 /* fall through */ 1081 case HAMMER2_RESOLVE_ALWAYS: 1082 default: 1083 break; 1084 } 1085 1086 /* 1087 * Caller requires data 1088 */ 1089 hammer2_chain_load_data(chain); 1090 1091 return 0; 1092 } 1093 1094 /* 1095 * Lock the chain, retain the hold, and drop the data persistence count. 1096 * The data should remain valid because we never transitioned lockcnt 1097 * through 0. 1098 */ 1099 void 1100 hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how) 1101 { 1102 hammer2_chain_lock(chain, how); 1103 atomic_add_int(&chain->lockcnt, -1); 1104 } 1105 1106 #if 0 1107 /* 1108 * Downgrade an exclusive chain lock to a shared chain lock. 1109 * 1110 * NOTE: There is no upgrade equivalent due to the ease of 1111 * deadlocks in that direction. 1112 */ 1113 void 1114 hammer2_chain_lock_downgrade(hammer2_chain_t *chain) 1115 { 1116 hammer2_mtx_downgrade(&chain->lock); 1117 } 1118 #endif 1119 1120 /* 1121 * Issue I/O and install chain->data. Caller must hold a chain lock, lock 1122 * may be of any type. 1123 * 1124 * Once chain->data is set it cannot be disposed of until all locks are 1125 * released. 1126 * 1127 * Make sure the data is synchronized to the current cpu. 1128 */ 1129 void 1130 hammer2_chain_load_data(hammer2_chain_t *chain) 1131 { 1132 hammer2_blockref_t *bref; 1133 hammer2_dev_t *hmp; 1134 hammer2_io_t *dio; 1135 char *bdata; 1136 int error; 1137 1138 /* 1139 * Degenerate case, data already present, or chain has no media 1140 * reference to load. 
	 */
	KKASSERT(chain->lock.mtx_lock & MTX_MASK);
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return;
	}
	if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Gain the IOINPROG bit, interlocked block.  Only one thread may
	 * issue the I/O; others sleep on chain->flags until signalled.
	 */
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		cpu_ccfence();
		if (oflags & HAMMER2_CHAIN_IOINPROG) {
			/* another thread owns the I/O, wait for a wakeup */
			nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
			tsleep_interlock(&chain->flags, 0);
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				tsleep(&chain->flags, PINTERLOCKED,
				       "h2iocw", 0);
			}
			/* retry */
		} else {
			nflags = oflags | HAMMER2_CHAIN_IOINPROG;
			if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
				break;
			}
			/* retry */
		}
	}

	/*
	 * We own CHAIN_IOINPROG
	 *
	 * Degenerate case if we raced another load.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		goto done;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		error = hammer2_io_new(hmp, bref->type,
				       bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		error = hammer2_io_bread(hmp, bref->type,
					 bref->data_off, chain->bytes,
					 &chain->dio);
		hammer2_adjreadcounter(chain->bref.type, chain->bytes);
	}
	if (error) {
		chain->error = HAMMER2_ERROR_EIO;
		kprintf("hammer2_chain_load_data: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		goto done;
	}
	chain->error = 0;

	/*
	 * This isn't perfect and can be ignored on OSs which do not have
	 * an indication as to whether a buffer is coming from cache or
	 * if I/O was actually issued for the read.  TESTEDGOOD will work
	 * pretty well without the B_IOISSUED logic because chains are
	 * cached, but in that situation (without B_IOISSUED) it will not
	 * detect whether a re-read via I/O is corrupted verses the original
	 * read.
	 *
	 * We can't re-run the CRC on every fresh lock.  That would be
	 * insanely expensive.
	 *
	 * If the underlying kernel buffer covers the entire chain we can
	 * use the B_IOISSUED indication to determine if we have to re-run
	 * the CRC on chain data for chains that managed to stay cached
	 * across the kernel disposal of the original buffer.
	 */
	if ((dio = chain->dio) != NULL && dio->bp) {
		//struct buf *bp = dio->bp;

		if (dio->psize == chain->bytes //&&
		    /*(bp->b_flags & B_IOISSUED)*/) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_TESTEDGOOD);
			//bp->b_flags &= ~B_IOISSUED;
		}
	}

	/*
	 * NOTE: A locked chain's data cannot be modified without first
	 *	 calling hammer2_chain_modify().
	 */

	/*
	 * NOTE: hammer2_io_data() call issues bkvasync()
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);

	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		/*
		 * Clear INITIAL.  In this case we used io_new() and the
		 * buffer has been zero'd and marked dirty.
		 *
		 * CHAIN_MODIFIED has not been set yet, and we leave it
		 * that way for now.  Set a temporary CHAIN_NOTTESTED flag
		 * to prevent hammer2_chain_testcheck() from trying to match
		 * a check code that has not yet been generated.  This bit
		 * should NOT end up on the actual media.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED);
	} else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * check data not currently synchronized due to
		 * modification.  XXX assumes data stays in the buffer
		 * cache, which might not be true (need biodep on flush
		 * to calculate crc?  or simple crc?).
		 */
	} else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
		/* verify the check code once, then cache the result */
		if (hammer2_chain_testcheck(chain, bdata) == 0) {
			chain->error = HAMMER2_ERROR_CHECK;
		} else {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
		}
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 *
	 * WARNING! Other threads can start using the data the instant we
	 *	    set chain->data non-NULL.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_load_data: unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		KKASSERT(chain->bytes != 0);
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave dio intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
	 */
done:
	for (;;) {
		u_int oflags;
		u_int nflags;

		oflags = chain->flags;
		nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
				    HAMMER2_CHAIN_IOSIGNAL);
		KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
		if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
			if (oflags & HAMMER2_CHAIN_IOSIGNAL)
				wakeup(&chain->flags);
			break;
		}
	}
}

/*
 * Unlock and deref a chain element.
 *
 * Remember that the presence of children under chain prevent the chain's
 * destruction but do not add additional references, so the dio will still
 * be dropped.
 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_io_t *dio;
	u_int lockcnt;
	int iter = 0;

	/*
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			/* not the last lock, simple decrement + unlock */
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				hammer2_mtx_unlock(&chain->lock);
				return;
			}
		} else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
			/* while holding the mutex exclusively */
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race2 %p\n", chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race2", 1);
			}
			cpu_pause();
		}
		/* retry */
	}

	/*
	 * Last unlock / mutex upgraded to exclusive.  Drop the data
	 * reference.
	 */
	dio = hammer2_chain_drop_data(chain);
	if (dio)
		hammer2_io_bqrelse(&dio);
	hammer2_mtx_unlock(&chain->lock);
}

/*
 * Unlock and hold chain data intact.  The extra lockcnt ref keeps the
 * data/dio assignment from being dropped on the 1->0 transition.
 */
void
hammer2_chain_unlock_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_unlock(chain);
}

/*
 * Helper to obtain the blockref[] array base and count for a chain.
 *
 * XXX Not widely used yet, various use cases need to be validated and
 *     converted to use this function.
1431 */ 1432 static 1433 hammer2_blockref_t * 1434 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1435 { 1436 hammer2_blockref_t *base; 1437 int count; 1438 1439 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1440 base = NULL; 1441 1442 switch(parent->bref.type) { 1443 case HAMMER2_BREF_TYPE_INODE: 1444 count = HAMMER2_SET_COUNT; 1445 break; 1446 case HAMMER2_BREF_TYPE_INDIRECT: 1447 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1448 count = parent->bytes / sizeof(hammer2_blockref_t); 1449 break; 1450 case HAMMER2_BREF_TYPE_VOLUME: 1451 count = HAMMER2_SET_COUNT; 1452 break; 1453 case HAMMER2_BREF_TYPE_FREEMAP: 1454 count = HAMMER2_SET_COUNT; 1455 break; 1456 default: 1457 panic("hammer2_chain_base_and_count: " 1458 "unrecognized blockref type: %d", 1459 parent->bref.type); 1460 count = 0; 1461 break; 1462 } 1463 } else { 1464 switch(parent->bref.type) { 1465 case HAMMER2_BREF_TYPE_INODE: 1466 base = &parent->data->ipdata.u.blockset.blockref[0]; 1467 count = HAMMER2_SET_COUNT; 1468 break; 1469 case HAMMER2_BREF_TYPE_INDIRECT: 1470 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1471 base = &parent->data->npdata[0]; 1472 count = parent->bytes / sizeof(hammer2_blockref_t); 1473 break; 1474 case HAMMER2_BREF_TYPE_VOLUME: 1475 base = &parent->data->voldata. 1476 sroot_blockset.blockref[0]; 1477 count = HAMMER2_SET_COUNT; 1478 break; 1479 case HAMMER2_BREF_TYPE_FREEMAP: 1480 base = &parent->data->blkset.blockref[0]; 1481 count = HAMMER2_SET_COUNT; 1482 break; 1483 default: 1484 panic("hammer2_chain_base_and_count: " 1485 "unrecognized blockref type: %d", 1486 parent->bref.type); 1487 base = NULL; 1488 count = 0; 1489 break; 1490 } 1491 } 1492 *countp = count; 1493 1494 return base; 1495 } 1496 1497 /* 1498 * This counts the number of live blockrefs in a block array and 1499 * also calculates the point at which all remaining blockrefs are empty. 1500 * This routine can only be called on a live chain. 
 *
 * Caller holds the chain locked, but possibly with a shared lock.  We
 * must use an exclusive spinlock to prevent corruption.
 *
 * NOTE: Flag is not set until after the count is complete, allowing
 *	 callers to test the flag without holding the spinlock.
 *
 * NOTE: If base is NULL the related chain is still in the INITIAL
 *	 state and there are no blockrefs to count.
 *
 * NOTE: live_count may already have some counts accumulated due to
 *	 creation and deletion and could even be initially negative.
 */
void
hammer2_chain_countbrefs(hammer2_chain_t *chain,
			 hammer2_blockref_t *base, int count)
{
	hammer2_spin_ex(&chain->core.spin);
	if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
		if (base) {
			/*
			 * Scan backwards to find live_zero: the index one
			 * past the last non-empty blockref.
			 */
			while (--count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					break;
			}
			chain->core.live_zero = count + 1;
			/*
			 * Accumulate the live (non-empty) entries at or
			 * below live_zero into live_count.
			 */
			while (count >= 0) {
				if (base[count].type != HAMMER2_BREF_TYPE_EMPTY)
					atomic_add_int(&chain->core.live_count,
						       1);
				--count;
			}
		} else {
			chain->core.live_zero = 0;
		}
		/* else do not modify live_count */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
	}
	hammer2_spin_unex(&chain->core.spin);
}

/*
 * Resize the chain's physical storage allocation in-place.  This function does
 * not usually adjust the data pointer and must be followed by (typically) a
 * hammer2_chain_modify() call to copy any old data over and adjust the
 * data pointer.
 *
 * Chains can be resized smaller without reallocating the storage.  Resizing
 * larger will reallocate the storage.  Excess or prior storage is reclaimed
 * asynchronously at a later time.
 *
 * An nradix value of 0 is special-cased to mean that the storage should
 * be disassociated, that is the chain is being resized to 0 bytes (not 1
 * byte).
 *
 * Must be passed an exclusively locked parent and chain.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode data
 * buffer.  However, because H2 can compress or encrypt data, the chain may
 * have a dio assigned to it in those situations, and they do not conflict.
 *
 * XXX return error if cannot resize.
 */
int
hammer2_chain_resize(hammer2_chain_t *chain,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off,
		     int nradix, int flags)
{
	hammer2_dev_t *hmp;
	size_t obytes;
	size_t nbytes;
	int error;

	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volume root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);

	/*
	 * Nothing to do if the element is already the proper size
	 */
	obytes = chain->bytes;
	nbytes = (nradix) ? (1U << nradix) : 0;
	if (obytes == nbytes)
		return (chain->error);

	/*
	 * Make sure the old data is instantiated so we can copy it.  If this
	 * is a data block, the device data may be superfluous since the data
	 * might be in a logical block, but compressed or encrypted data is
	 * another matter.
	 *
	 * NOTE: The modify will set BLKMAPUPD for us if BLKMAPPED is set.
	 */
	error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
	if (error)
		return error;

	/*
	 * Reallocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 *
	 * NOTE: Operation does not copy the data and may only be used
	 *	 to resize data blocks in-place, or directory entry blocks
	 *	 which are about to be modified in some manner.
	 */
	error = hammer2_freemap_alloc(chain, nbytes);
	if (error)
		return error;

	chain->bytes = nbytes;

	/*
	 * We don't want the followup chain_modify() to try to copy data
	 * from the old (wrong-sized) buffer.  It won't know how much to
	 * copy.  This case should only occur during writes when the
	 * originator already has the data to write in-hand.
	 */
	if (chain->dio) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
		hammer2_io_brelse(&chain->dio);
		chain->data = NULL;
	}
	return (chain->error);
}

/*
 * Set the chain modified so its data can be changed by the caller, or
 * install deduplicated data.  The caller must call this routine for each
 * set of modifications it makes, even if the chain is already flagged
 * MODIFIED.
 *
 * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
 * is a CLC (cluster level change) field and is not updated by parent
 * propagation during a flush.
 *
 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
 * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
 * remains unmodified with its old data ref intact and chain->error
 * unchanged.
 *
 *				Dedup Handling
 *
 * If the DEDUPABLE flag is set in the chain the storage must be reallocated
 * even if the chain is still flagged MODIFIED.  In this case the chain's
 * DEDUPABLE flag will be cleared once the new storage has been assigned.
 *
 * If the caller passes a non-zero dedup_off we will use it to assign the
 * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
 * must not modify the data content upon return.
 */
int
hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
		     hammer2_off_t dedup_off, int flags)
{
	hammer2_blockref_t obref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	int error;
	int wasinitial;
	int setmodified;
	int setupdate;
	int newmod;
	char *bdata;

	hmp = chain->hmp;
	obref = chain->bref;	/* XXX saved but not used below; verify */
	KKASSERT(chain->lock.mtx_lock & MTX_EXCLUSIVE);

	/*
	 * Data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * Data must be resolved if already assigned, unless explicitly
	 * flagged otherwise.  If we cannot safely load the data the
	 * modification fails and we return early.
	 */
	if (chain->data == NULL && chain->bytes != 0 &&
	    (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_load_data(chain);
		if (chain->error)
			return (chain->error);
	}
	error = 0;

	/*
	 * Set MODIFIED to indicate that the chain has been modified.  A new
	 * allocation is required when modifying a chain.
	 *
	 * Set UPDATE to ensure that the blockref is updated in the parent.
	 *
	 * If MODIFIED is already set determine if we can reuse the assigned
	 * data block or if we need a new data block.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		/*
		 * Must set modified bit.
		 */
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(chain->pmp);	/* can be NULL */
		setmodified = 1;

		/*
		 * We may be able to avoid a copy-on-write if the chain's
		 * check mode is set to NONE and the chain's current
		 * modify_tid is beyond the last explicit snapshot tid.
		 *
		 * This implements HAMMER2's overwrite-in-place feature.
		 *
		 * NOTE! This data-block cannot be used as a de-duplication
		 *	 source when the check mode is set to NONE.
		 */
		if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		     chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
		    (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
		    (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
		    HAMMER2_DEC_CHECK(chain->bref.methods) ==
		    HAMMER2_CHECK_NONE &&
		    chain->pmp &&
		    chain->bref.modify_tid >
		     chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * Sector overwrite allowed.
			 */
			newmod = 0;
		} else if ((hmp->hflags & HMNT2_EMERG) &&
			   chain->pmp &&
			   chain->bref.modify_tid >
			    chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * If in emergency delete mode then do a modify-in-
			 * place on any chain type belonging to the PFS as
			 * long as it doesn't mess up a snapshot.  We might
			 * be forced to do this anyway a little further down
			 * in the code if the allocation fails.
			 *
			 * Also note that in emergency mode, these modify-in-
			 * place operations are NOT SAFE.  A storage failure,
			 * power failure, or panic can corrupt the filesystem.
			 */
			newmod = 0;
		} else {
			/*
			 * Sector overwrite not allowed, must copy-on-write.
			 */
			newmod = 1;
		}
	} else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
		/*
		 * If the modified chain was registered for dedup we need
		 * a new allocation.  This only happens for delayed-flush
		 * chains (i.e. which run through the front-end buffer
		 * cache).
		 */
		newmod = 1;
		setmodified = 0;
	} else {
		/*
		 * Already flagged modified, no new allocation is needed.
		 */
		newmod = 0;
		setmodified = 0;
	}

	/*
	 * Flag parent update required.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		setupdate = 1;
	} else {
		setupdate = 0;
	}

	/*
	 * The XOP code returns held but unlocked focus chains.  This
	 * prevents the chain from being destroyed but does not prevent
	 * it from being modified.  diolk is used to interlock modifications
	 * against XOP frontend accesses to the focus.
	 *
	 * This allows us to theoretically avoid deadlocking the frontend
	 * if one of the backends lock up by not formally locking the
	 * focused chain in the frontend.  In addition, the synchronization
	 * code relies on this mechanism to avoid deadlocking concurrent
	 * synchronization threads.
	 */
	lockmgr(&chain->diolk, LK_EXCLUSIVE);

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.  If an error occurs, the previous content and data
	 * reference is retained and the modification fails.
	 *
	 * If dedup_off is non-zero, the caller is requesting a deduplication
	 * rather than a modification.  The MODIFIED bit is not set and the
	 * data offset is set to the deduplication offset.  The data cannot
	 * be modified.
	 *
	 * NOTE: The dedup offset is allowed to be in a partially free state
	 *	 and we must be sure to reset it to a fully allocated state
	 *	 to force two bulkfree passes to free it again.
	 *
	 * NOTE: Only applicable when chain->bytes != 0.
	 *
	 * XXX can a chain already be marked MODIFIED without a data
	 * assignment?  If not, assert here instead of testing the case.
	 */
	if (chain != &hmp->vchain && chain != &hmp->fchain &&
	    chain->bytes) {
		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
		    newmod
		) {
			/*
			 * NOTE: We do not have to remove the dedup
			 *	 registration because the area is still
			 *	 allocated and the underlying DIO will
			 *	 still be flushed.
			 */
			if (dedup_off) {
				chain->bref.data_off = dedup_off;
				if ((int)(dedup_off & HAMMER2_OFF_MASK_RADIX))
					chain->bytes = 1 <<
					    (int)(dedup_off &
					    HAMMER2_OFF_MASK_RADIX);
				else
					chain->bytes = 0;
				chain->error = 0;
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_MODIFIED);
				atomic_add_long(&hammer2_count_modified_chains,
						-1);
				if (chain->pmp) {
					hammer2_pfs_memory_wakeup(
						chain->pmp, -1);
				}
				hammer2_freemap_adjust(hmp, &chain->bref,
						HAMMER2_FREEMAP_DORECOVER);
				atomic_set_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			} else {
				error = hammer2_freemap_alloc(chain,
							      chain->bytes);
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);

				/*
				 * If we are unable to allocate a new block
				 * but we are in emergency mode, issue a
				 * warning to the console and reuse the same
				 * block.
				 *
				 * We behave as if the allocation were
				 * successful.
				 *
				 * THIS IS IMPORTANT: These modifications
				 * are virtually guaranteed to corrupt any
				 * snapshots related to this filesystem.
				 */
				if (error && (hmp->hflags & HMNT2_EMERG)) {
					error = 0;
					chain->bref.flags |=
						HAMMER2_BREF_FLAG_EMERG_MIP;

					krateprintf(&krate_h2em,
					    "hammer2: Emergency Mode WARNING: "
					    "Operation will likely corrupt "
					    "related snapshot: "
					    "%016jx.%02x key=%016jx\n",
					    chain->bref.data_off,
					    chain->bref.type,
					    chain->bref.key);
				} else if (error == 0) {
					chain->bref.flags &=
						~HAMMER2_BREF_FLAG_EMERG_MIP;
				}
			}
		}
	}

	/*
	 * Stop here if error.  We have to undo any flag bits we might
	 * have set above.
	 */
	if (error) {
		if (setmodified) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			atomic_add_long(&hammer2_count_modified_chains, -1);
			if (chain->pmp)
				hammer2_pfs_memory_wakeup(chain->pmp, -1);
		}
		if (setupdate) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
		lockmgr(&chain->diolk, LK_RELEASE);

		return error;
	}

	/*
	 * Update mirror_tid and modify_tid.  modify_tid is only updated
	 * if not passed as zero (during flushes, parent propagation passes
	 * the value 0).
	 *
	 * NOTE: chain->pmp could be the device spmp.
	 */
	chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
	if (mtid)
		chain->bref.modify_tid = mtid;

	/*
	 * Set BLKMAPUPD to tell the flush code that an existing blockmap entry
	 * requires updating as well as to tell the delete code that the
	 * chain's blockref might not exactly match (in terms of physical size
	 * or block offset) the one in the parent's blocktable.  The base key
	 * of course will still match.
	 */
	if (chain->flags & HAMMER2_CHAIN_BLKMAPPED)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD);

	/*
	 * Short-cut data block handling when the caller does not need an
	 * actual data reference to (aka OPTDATA), as long as the chain does
	 * not already have a data pointer to the data and no de-duplication
	 * occurred.
	 *
	 * This generally means that the modifications are being done via the
	 * logical buffer cache.
	 *
	 * NOTE: If deduplication occurred we have to run through the data
	 *	 stuff to clear INITIAL, and the caller will likely want to
	 *	 assign the check code anyway.  Leaving INITIAL set on a
	 *	 dedup can be deadly (it can cause the block to be zero'd!).
	 *
	 * This code also handles bytes == 0 (most dirents).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
	    (flags & HAMMER2_MODIFY_OPTDATA) &&
	    chain->data == NULL) {
		if (dedup_off == 0) {
			KKASSERT(chain->dio == NULL);
			goto skip2;
		}
	}

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 *
	 * Clearing of NOTTESTED is allowed if the MODIFIED bit is set,
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->dio == NULL);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		/*
		 * The data might be fully embedded.
		 */
		if (chain->bytes == 0) {
			KKASSERT(chain->dio == NULL);
			break;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 *
		 * zero-fill or copy-on-write depending on whether
		 * chain->data exists or not and set the dirty state for
		 * the new buffer.  hammer2_io_new() will handle the
		 * zero-fill.
		 *
		 * If a dedup_off was supplied this is an existing block
		 * and no COW, copy, or further modification is required.
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		if (wasinitial && dedup_off == 0) {
			error = hammer2_io_new(hmp, chain->bref.type,
					       chain->bref.data_off,
					       chain->bytes, &dio);
		} else {
			error = hammer2_io_bread(hmp, chain->bref.type,
						 chain->bref.data_off,
						 chain->bytes, &dio);
		}
		hammer2_adjreadcounter(chain->bref.type, chain->bytes);

		/*
		 * If an I/O error occurs make sure callers cannot
		 * accidentally modify the old buffer's contents and corrupt
		 * the filesystem.
		 *
		 * NOTE: hammer2_io_data() call issues bkvasync()
		 */
		if (error) {
			kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
				hmp);
			chain->error = HAMMER2_ERROR_EIO;
			hammer2_io_brelse(&dio);
			hammer2_io_brelse(&chain->dio);
			chain->data = NULL;
			break;
		}
		chain->error = 0;
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		if (chain->data) {
			/*
			 * COW (unless a dedup).
			 */
			KKASSERT(chain->dio != NULL);
			if (chain->data != (void *)bdata && dedup_off == 0) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial == 0 && dedup_off == 0) {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}

		/*
		 * Retire the old buffer, replace with the new.  Dirty or
		 * redirty the new buffer.
		 *
		 * WARNING! The system buffer cache may have already flushed
		 *	    the buffer, so we must be sure to [re]dirty it
		 *	    for further modification.
		 *
		 *	    If dedup_off was supplied, the caller is not
		 *	    expected to make any further modification to the
		 *	    buffer.
		 *
		 * WARNING! hammer2_get_gdata() assumes dio never transitions
		 *	    through NULL in order to optimize away unnecessary
		 *	    diolk operations.
		 */
		{
			hammer2_io_t *tio;

			if ((tio = chain->dio) != NULL)
				hammer2_io_bqrelse(&tio);
			chain->data = (void *)bdata;
			chain->dio = dio;
			if (dedup_off == 0)
				hammer2_io_setdirty(dio);
		}
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	lockmgr(&chain->diolk, LK_RELEASE);

	return (chain->error);
}

/*
 * Modify the chain associated with an inode.
2113 */ 2114 int 2115 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain, 2116 hammer2_tid_t mtid, int flags) 2117 { 2118 int error; 2119 2120 hammer2_inode_modify(ip); 2121 error = hammer2_chain_modify(chain, mtid, 0, flags); 2122 2123 return error; 2124 } 2125 2126 /* 2127 * This function returns the chain at the nearest key within the specified 2128 * range. The returned chain will be referenced but not locked. 2129 * 2130 * This function will recurse through chain->rbtree as necessary and will 2131 * return a *key_nextp suitable for iteration. *key_nextp is only set if 2132 * the iteration value is less than the current value of *key_nextp. 2133 * 2134 * The caller should use (*key_nextp) to calculate the actual range of 2135 * the returned element, which will be (key_beg to *key_nextp - 1), because 2136 * there might be another element which is superior to the returned element 2137 * and overlaps it. 2138 * 2139 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL 2140 * chains continue to be returned. On EOF (*key_nextp) may overflow since 2141 * it will wind up being (key_end + 1). 2142 * 2143 * WARNING! Must be called with child's spinlock held. Spinlock remains 2144 * held through the operation. 
 */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* best candidate so far */
	hammer2_key_t		key_beg;	/* inclusive search start */
	hammer2_key_t		key_end;	/* inclusive search end */
	hammer2_key_t		key_next;	/* iteration helper (see above) */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	/*
	 * Scan the parent's in-memory children; the cmp function prunes
	 * the RB tree to chains overlapping [key_beg, key_end] and the
	 * callback selects the best candidate and clips key_next.
	 */
	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
		hammer2_chain_find_cmp, hammer2_chain_find_callback,
		&info);
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN comparison: -1 if the child lies entirely below the search
 * range, +1 if entirely above, 0 if it overlaps (callback will run).
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

/*
 * RB_SCAN callback for each overlapping child.  Tracks the chain with
 * the nearest base key in info->best and truncates info->key_next so
 * the caller can compute the effective range of the returned element.
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *	    the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 * (child_end == 0 means the range wrapped past the top of the
	 * key space; leave key_next alone in that case).
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  The returned chain is
 * held and locked according to (how) (HAMMER2_RESOLVE_*).  The caller must
 * handle crc-checks and so forth, and should check chain->error before
 * assuming that the data is good.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.  Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * WARNING! chain->pmp is always set to NULL for any chain representing
 *	    part of the super-root topology.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref, int how)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 *
	 * A PFS root gets a NULL pmp here; it is assigned its own pmp
	 * elsewhere rather than inheriting the parent's.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED);

	/*
	 * chain must be locked to avoid unexpected ripouts
	 */
	hammer2_chain_lock(chain, how);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		/* Lost the insertion race; discard our chain entirely. */
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain referenced and locked (per how), or NULL
	 * if a race occurred.
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API
 *
 * lookup_init refs and locks (parent) for use as the starting point of a
 * hammer2_chain_lookup() iteration; lookup_done releases whatever parent
 * the iteration ended on (NULL is tolerated).
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	hammer2_chain_ref(parent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
}

/*
 * Take the locked chain and return a locked parent.  The chain remains
 * locked on return, but may have to be temporarily unlocked to acquire
 * the parent.  Because of this, (chain) must be stable and cannot be
 * deleted while it was temporarily unlocked (typically means that (chain)
 * is an inode).
 *
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t *chain, int flags)
{
	hammer2_chain_t *parent;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first.
	 * A successful non-blocking lock avoids the unlock/relock dance
	 * entirely.
	 */
	parent = chain->parent;
	if (parent == NULL)
		panic("hammer2_chain_getparent: no parent");
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0)
		return parent;

	for (;;) {
		/*
		 * Proper top-down order: release chain, lock parent, then
		 * re-lock chain.  The parent linkage can change while chain
		 * is unlocked, so it must be re-verified below.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, flags);
		hammer2_chain_lock(chain, flags);

		/*
		 * Parent relinking races are quite common.  We have to get
		 * it right or we will blow up the block table.
		 */
		if (chain->parent == parent)
			break;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		cpu_ccfence();
		parent = chain->parent;
		if (parent == NULL)
			panic("hammer2_chain_getparent: no parent");
		hammer2_chain_ref(parent);
	}
	return parent;
}

/*
 * Take the locked chain and return a locked parent.  The chain is unlocked
 * and dropped.  *chainp is set to the returned parent as a convenience.
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * The chain does NOT need to be stable.  We use a tracking structure
 * to track the expected parent if the chain is deleted out from under us.
 *
 * This function handles the lock order reversal.
 */
hammer2_chain_t *
hammer2_chain_repparent(hammer2_chain_t **chainp, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	struct hammer2_reptrack reptrack;
	struct hammer2_reptrack **repp;

	/*
	 * Be careful of order, chain must be unlocked before parent
	 * is locked below to avoid a deadlock.  Try it trivially first.
	 */
	chain = *chainp;
	parent = chain->parent;
	if (parent == NULL) {
		/*
		 * NOTE(review): this unlocks a spinlock this function never
		 * acquired before panicing — presumably a leftover from an
		 * earlier version that entered with the spinlock held;
		 * confirm before touching.
		 */
		hammer2_spin_unex(&chain->core.spin);
		panic("hammer2_chain_repparent: no parent");
	}
	hammer2_chain_ref(parent);
	if (hammer2_chain_lock(parent, flags|HAMMER2_RESOLVE_NONBLOCK) == 0) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		*chainp = parent;

		return parent;
	}

	/*
	 * Ok, now it gets a bit nasty.  There are multiple situations where
	 * the parent might be in the middle of a deletion, or where the child
	 * (chain) might be deleted the instant we let go of its lock.
	 * We can potentially end up in a no-win situation!
	 *
	 * In particular, the indirect_maintenance() case can cause these
	 * situations.
	 *
	 * To deal with this we install a reptrack structure in the parent
	 * This reptrack structure 'owns' the parent ref and will automatically
	 * migrate to the parent's parent if the parent is deleted permanently.
	 */
	hammer2_spin_init(&reptrack.spin, "h2reptrk");
	reptrack.chain = parent;
	hammer2_chain_ref(parent);		/* for the reptrack */

	hammer2_spin_ex(&parent->core.spin);
	reptrack.next = parent->core.reptrack;
	parent->core.reptrack = &reptrack;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(chain);
	hammer2_chain_drop(chain);
	chain = NULL;	/* gone */

	/*
	 * At the top of this loop, chain is gone and parent is refd both
	 * by us explicitly AND via our reptrack.  We are attempting to
	 * lock parent.
	 */
	for (;;) {
		hammer2_chain_lock(parent, flags);

		if (reptrack.chain == parent)
			break;

		/*
		 * The reptrack migrated while we blocked on the lock;
		 * chase it to the new parent (reptrack holds a ref on it,
		 * we take our own explicit ref under reptrack.spin).
		 */
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);

		kprintf("hammer2: debug REPTRACK %p->%p\n",
			parent, reptrack.chain);
		hammer2_spin_ex(&reptrack.spin);
		parent = reptrack.chain;
		hammer2_chain_ref(parent);
		hammer2_spin_unex(&reptrack.spin);
	}

	/*
	 * Once parent is locked and matches our reptrack, our reptrack
	 * will be stable and we have our parent.  We can unlink our
	 * reptrack.
	 *
	 * WARNING!  Remember that the chain lock might be shared.  Chains
	 *	     locked shared have stable parent linkages.
	 */
	hammer2_spin_ex(&parent->core.spin);
	repp = &parent->core.reptrack;
	while (*repp != &reptrack)
		repp = &(*repp)->next;
	*repp = reptrack.next;
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_drop(parent);		/* reptrack ref */
	*chainp = parent;			/* return parent lock+ref */

	return parent;
}

/*
 * Dispose of any linked reptrack structures in (chain) by shifting them to
 * (parent).  Both (chain) and (parent) must be exclusively locked.
 *
 * This is interlocked against any children of (chain) on the other side.
 * No children remain as-of when this is called so we can test
 * core.reptrack without holding the spin-lock.
 *
 * Used whenever the caller intends to permanently delete chains related
 * to topological recursions (BREF_TYPE_INDIRECT, BREF_TYPE_FREEMAP_NODE),
 * where the chains underneath the node being deleted are given a new parent
 * above the node being deleted.
 */
static
void
hammer2_chain_repchange(hammer2_chain_t *parent, hammer2_chain_t *chain)
{
	struct hammer2_reptrack *reptrack;

	KKASSERT(chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree));
	while (chain->core.reptrack) {
		/*
		 * Lock order: parent's spin before chain's spin.  The
		 * reptrack list must be re-sampled under the spinlocks
		 * since it may have drained while we were acquiring them.
		 */
		hammer2_spin_ex(&parent->core.spin);
		hammer2_spin_ex(&chain->core.spin);
		reptrack = chain->core.reptrack;
		if (reptrack == NULL) {
			hammer2_spin_unex(&chain->core.spin);
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
		/*
		 * Move one reptrack from chain's list to parent's list,
		 * transferring the ref it holds: ref the new owner (parent)
		 * and drop the old owner (chain) once outside the spinlocks.
		 */
		hammer2_spin_ex(&reptrack->spin);
		chain->core.reptrack = reptrack->next;
		reptrack->chain = parent;
		reptrack->next = parent->core.reptrack;
		parent->core.reptrack = reptrack;
		hammer2_chain_ref(parent);		/* reptrack */

		hammer2_spin_unex(&chain->core.spin);
		hammer2_spin_unex(&parent->core.spin);
		kprintf("hammer2: debug repchange %p %p->%p\n",
			reptrack, chain, parent);
		hammer2_chain_drop(chain);		/* reptrack */
	}
}

/*
 * Locate the first chain whose key range overlaps (key_beg, key_end)
 * inclusive.  (*parentp) typically points to an inode but can also point to
 * a related indirect block and this function will recurse upwards and find
 * the inode or the nearest undeleted indirect block covering the key range.
 *
 * This function unconditionally sets *errorp, replacing any previous value.
 *
 * (*parentp) must be exclusive or shared locked (depending on flags) and
 * referenced and can be an inode or an existing indirect block within the
 * inode.
 *
 * If (*parentp) is errored out, this function will not attempt to recurse
 * the radix tree and will return NULL along with an appropriate *errorp.
 * If NULL is returned and *errorp is 0, the requested lookup could not be
 * located.
2586 * 2587 * On return (*parentp) will be modified to point at the deepest parent chain 2588 * element encountered during the search, as a helper for an insertion or 2589 * deletion. 2590 * 2591 * The new (*parentp) will be locked shared or exclusive (depending on flags), 2592 * and referenced, and the old will be unlocked and dereferenced (no change 2593 * if they are both the same). This is particularly important if the caller 2594 * wishes to insert a new chain, (*parentp) will be set properly even if NULL 2595 * is returned, as long as no error occurred. 2596 * 2597 * The matching chain will be returned locked according to flags. 2598 * 2599 * -- 2600 * 2601 * NULL is returned if no match was found, but (*parentp) will still 2602 * potentially be adjusted. 2603 * 2604 * On return (*key_nextp) will point to an iterative value for key_beg. 2605 * (If NULL is returned (*key_nextp) is set to (key_end + 1)). 2606 * 2607 * This function will also recurse up the chain if the key is not within the 2608 * current parent's range. (*parentp) can never be set to NULL. An iteration 2609 * can simply allow (*parentp) to float inside the loop. 2610 * 2611 * NOTE! chain->data is not always resolved. By default it will not be 2612 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 2613 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 2614 * BREF_TYPE_DATA as the device buffer can alias the logical file 2615 * buffer). 
 */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	volatile hammer2_mtx_t save_mtx;

	/*
	 * Translate HAMMER2_LOOKUP_* flags into the three lock/resolve
	 * modes used below: 'how' for leaf chains, 'how_maybe' for
	 * topological (indirect/freemap-node) chains, 'how_always' for
	 * chains whose data must always be resolved.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	hmp = parent->hmp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:
	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");

	/*
	 * MATCHIND case that does not require parent->data (do prior to
	 * parent->error check).
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}
		break;
	default:
		break;
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			hammer2_chain_lock(parent, how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 *
		 * Debugging: Enter permanent wait state instead of
		 * panicing on unexpectedly NULL data for the moment.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL) {
				kprintf("hammer2: unexpected NULL data "
					"on %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		KKASSERT(chain == NULL);
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.
	 */
	bsave = *bref;
	if (chain == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how_maybe);
		} else {
			chain = hammer2_chain_get(parent, generation,
						  &bsave, how);
		}
		if (chain == NULL)
			goto again;
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);

		/*
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (bsave.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    bsave.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			hammer2_chain_lock(chain, how_maybe);
		} else {
			hammer2_chain_lock(chain, how);
		}
		KKASSERT(chain->parent == parent);
	}
	/*
	 * Retry the whole lookup if the chain's bref or linkage changed
	 * while we were acquiring the lock.
	 */
	if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
	    chain->parent != parent) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		goto again;
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * NOTE(review): save_mtx is a write-only (volatile) copy
		 * of the parent lock; its value is never read — presumably
		 * a debugging aid, confirm before removing.
		 */
		save_mtx = parent->lock;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		chain = NULL;	/* SAFETY */
		goto again;
	}
done:
	/*
	 * All done, return the locked chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE! A chain->error must be tested by the caller upon return.
	 *	 *errorp is only set based on issues which occur while
	 *	 trying to reach the chain.
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after it's index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.  This
 * chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
2959 */ 2960 hammer2_chain_t * 2961 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain, 2962 hammer2_key_t *key_nextp, 2963 hammer2_key_t key_beg, hammer2_key_t key_end, 2964 int *errorp, int flags) 2965 { 2966 hammer2_chain_t *parent; 2967 int how_maybe; 2968 2969 /* 2970 * Calculate locking flags for upward recursion. 2971 */ 2972 how_maybe = HAMMER2_RESOLVE_MAYBE; 2973 if (flags & HAMMER2_LOOKUP_SHARED) 2974 how_maybe |= HAMMER2_RESOLVE_SHARED; 2975 2976 parent = *parentp; 2977 *errorp = 0; 2978 2979 /* 2980 * Calculate the next index and recalculate the parent if necessary. 2981 */ 2982 if (chain) { 2983 key_beg = chain->bref.key + 2984 ((hammer2_key_t)1 << chain->bref.keybits); 2985 hammer2_chain_unlock(chain); 2986 hammer2_chain_drop(chain); 2987 2988 /* 2989 * chain invalid past this point, but we can still do a 2990 * pointer comparison w/parent. 2991 * 2992 * Any scan where the lookup returned degenerate data embedded 2993 * in the inode has an invalid index and must terminate. 2994 */ 2995 if (chain == parent) 2996 return(NULL); 2997 if (key_beg == 0 || key_beg > key_end) 2998 return(NULL); 2999 chain = NULL; 3000 } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 3001 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 3002 /* 3003 * We reached the end of the iteration. 3004 */ 3005 return (NULL); 3006 } else { 3007 /* 3008 * Continue iteration with next parent unless the current 3009 * parent covers the range. 3010 * 3011 * (This also handles the case of a deleted, empty indirect 3012 * node). 
3013 */ 3014 key_beg = parent->bref.key + 3015 ((hammer2_key_t)1 << parent->bref.keybits); 3016 if (key_beg == 0 || key_beg > key_end) 3017 return (NULL); 3018 parent = hammer2_chain_repparent(parentp, how_maybe); 3019 } 3020 3021 /* 3022 * And execute 3023 */ 3024 return (hammer2_chain_lookup(parentp, key_nextp, 3025 key_beg, key_end, 3026 errorp, flags)); 3027 } 3028 3029 /* 3030 * Caller wishes to iterate chains under parent, loading new chains into 3031 * chainp. Caller must initialize *chainp to NULL and *firstp to 1, and 3032 * then call hammer2_chain_scan() repeatedly until a non-zero return. 3033 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced 3034 * with the returned chain for the scan. The returned *chainp will be 3035 * locked and referenced. Any prior contents will be unlocked and dropped. 3036 * 3037 * Caller should check the return value. A normal scan EOF will return 3038 * exactly HAMMER2_ERROR_EOF. Any other non-zero value indicates an 3039 * error trying to access parent data. Any error in the returned chain 3040 * must be tested separately by the caller. 3041 * 3042 * (*chainp) is dropped on each scan, but will only be set if the returned 3043 * element itself can recurse. Leaf elements are NOT resolved, loaded, or 3044 * returned via *chainp. The caller will get their bref only. 3045 * 3046 * The raw scan function is similar to lookup/next but does not seek to a key. 3047 * Blockrefs are iterated via first_bref = (parent, NULL) and 3048 * next_chain = (parent, bref). 3049 * 3050 * The passed-in parent must be locked and its data resolved. The function 3051 * nominally returns a locked and referenced *chainp != NULL for chains 3052 * the caller might need to recurse on (and will dipose of any *chainp passed 3053 * in). The caller must check the chain->bref.type either way. 
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;	/* parent's media blockref array */
	hammer2_blockref_t *bref_ptr;	/* winning bref from combined find */
	hammer2_key_t key;		/* current scan position */
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;		/* sanity limit on retry loop */
	int error;

	hmp = parent->hmp;
	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & HAMMER2_LOOKUP_NODATA) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow.
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		if (key == 0) {
			/* key space wrapped, scan is complete */
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_scan: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	/* snapshot generation while holding the spinlock, for chain_get */
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation,
						  bref, how);
			if (chain == NULL)	/* lost a race, rescan */
				goto again;
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
		hammer2_chain_lock(chain, how);
	}
	/*
	 * Re-validate after the blocking lock.  If the chain no longer
	 * matches the copied bref or was reparented, start over.
	 */
	if (chain &&
	    (bcmp(bref, &chain->bref, sizeof(*bref)) ||
	     chain->parent != parent)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;
		goto again;
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			/* key space wrapped, scan is complete */
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it under (*parentp).  This is a full
 * insertion, based on the supplied key/keybits, and may involve creating
 * indirect blocks and moving other chains around via delete/duplicate.
 *
 * This call can be made with parent == NULL as long as a non -1 methods
 * is supplied.  hmp must also be supplied in this situation (otherwise
 * hmp is extracted from the supplied parent).  The chain will be detached
 * from the topology.  A later call with both parent and chain can be made
 * to attach it.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * (*parentp) must be exclusive locked and may be replaced on return
 *
 * (*parentp) must not be errored or this function will assert.
 *
 * (*chainp) usually starts out NULL and returns the newly created chain,
 * but if the caller desires the caller may allocate a disconnected chain
 * and pass it in instead.
 *
 * This function should NOT be used to insert INDIRECT blocks.  It is
 * typically used to create/insert inodes and data blocks.
 *
 * Caller must pass-in an exclusively locked parent the new chain is to
 * be inserted under, and optionally pass-in a disconnected, exclusively
 * locked chain to insert (else we create a new chain).  The function will
 * adjust (*parentp) as necessary, create or connect the chain, and
 * return an exclusively locked chain in *chainp.
 *
 * When creating a PFSROOT inode under the super-root, pmp is typically NULL
 * and will be reassigned.
 *
 * NOTE: returns HAMMER2_ERROR_* flags
 */
int
hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
		     hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods,
		     hammer2_key_t key, int keybits, int type, size_t bytes,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
{
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	hammer2_blockref_t *base;
	hammer2_blockref_t dummy;	/* template bref for a new chain */
	int allocated = 0;		/* non-zero if we allocated chain */
	int count;
	int error = 0;
	int maxloops = 300000;		/* sanity limit on retry loop */

	/*
	 * Topology may be crossing a PFS boundary.
	 */
	parent = *parentp;
	if (parent) {
		KKASSERT(hammer2_mtx_owned(&parent->lock));
		KKASSERT(parent->error == 0);
		hmp = parent->hmp;
	}
	chain = *chainp;

	if (chain == NULL) {
		/*
		 * First allocate media space and construct the dummy bref,
		 * then allocate the in-memory chain structure.  Set the
		 * INITIAL flag for fresh chains which do not have embedded
		 * data.
		 */
		bzero(&dummy, sizeof(dummy));
		dummy.type = type;
		dummy.key = key;
		dummy.keybits = keybits;
		dummy.data_off = hammer2_getradix(bytes);

		/*
		 * Inherit methods from parent by default.  Primarily used
		 * for BREF_TYPE_DATA.  Non-data types *must* be set to
		 * a non-NONE check algorithm.
		 *
		 * NOTE(review): the DEFAULT case dereferences parent, so
		 * parent == NULL callers must supply explicit methods.
		 */
		if (methods == HAMMER2_METH_DEFAULT)
			dummy.methods = parent->bref.methods;
		else
			dummy.methods = (uint8_t)methods;

		if (type != HAMMER2_BREF_TYPE_DATA &&
		    HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) {
			dummy.methods |=
				HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT);
		}

		chain = hammer2_chain_alloc(hmp, pmp, &dummy);

		/*
		 * Lock the chain manually, chain_lock will load the chain
		 * which we do NOT want to do.  (note: chain->refs is set
		 * to 1 by chain_alloc() for us, but lockcnt is not).
		 */
		chain->lockcnt = 1;
		hammer2_mtx_ex(&chain->lock);
		allocated = 1;

		/*
		 * Set INITIAL to optimize I/O.  The flag will generally be
		 * processed when we call hammer2_chain_modify().
		 */
		switch(type) {
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			panic("hammer2_chain_create: called with volume type");
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			panic("hammer2_chain_create: cannot be used to"
			      "create indirect block");
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			panic("hammer2_chain_create: cannot be used to"
			      "create freemap root or node");
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			KKASSERT(bytes == sizeof(chain->data->bmdata));
			/* fall through */
		case HAMMER2_BREF_TYPE_DIRENT:
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_DATA:
		default:
			/*
			 * leave chain->data NULL, set INITIAL
			 */
			KKASSERT(chain->data == NULL);
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
			break;
		}
	} else {
		/*
		 * We are reattaching a previously deleted chain, possibly
		 * under a new parent and possibly with a new key/keybits.
		 * The chain does not have to be in a modified state.  The
		 * UPDATE flag will be set later on in this routine.
		 *
		 * Do NOT mess with the current state of the INITIAL flag.
		 */
		chain->bref.key = key;
		chain->bref.keybits = keybits;
		if (chain->flags & HAMMER2_CHAIN_DELETED)
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		KKASSERT(chain->parent == NULL);
	}

	/*
	 * Set the appropriate bref flag if requested.
	 *
	 * NOTE! Callers can call this function to move chains without
	 *	 knowing about special flags, so don't clear bref flags
	 *	 here!
	 */
	if (flags & HAMMER2_INSERT_PFSROOT)
		chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;

	if (parent == NULL)
		goto skip;

	/*
	 * Calculate how many entries we have in the blockref array and
	 * determine if an indirect block is required when inserting into
	 * the parent.
	 */
again:
	if (--maxloops == 0)
		panic("hammer2_chain_create: maxloops");

	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		if ((parent->data->ipdata.meta.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) != 0) {
			kprintf("hammer2: parent set for direct-data! "
				"pkey=%016jx ckey=%016jx\n",
				parent->bref.key,
				chain->bref.key);
	        }
		KKASSERT((parent->data->ipdata.meta.op_flags &
			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
		/*
		 * NOTE(review): this assert runs after parent->data has
		 * already been dereferenced above — it documents the
		 * invariant rather than guarding the access.
		 */
		KKASSERT(parent->data != NULL);
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		if (parent->flags & HAMMER2_CHAIN_INITIAL)
			base = NULL;
		else
			base = &parent->data->npdata[0];
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		KKASSERT(parent->data != NULL);
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		KKASSERT(parent->data != NULL);
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_create: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;
		count = 0;
		break;
	}

	/*
	 * Make sure we've counted the brefs
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	KASSERT(parent->core.live_count >= 0 &&
		parent->core.live_count <= count,
		("bad live_count %d/%d (%02x, %d)",
		 parent->core.live_count, count,
		 parent->bref.type, parent->bytes));

	/*
	 * If no free blockref could be found we must create an indirect
	 * block and move a number of blockrefs into it.  With the parent
	 * locked we can safely lock each child in order to delete+duplicate
	 * it without causing a deadlock.
	 *
	 * This may return the new indirect block or the old parent depending
	 * on where the key falls.  NULL is returned on error.
	 */
	if (parent->core.live_count == count) {
		hammer2_chain_t *nparent;

		KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0);

		nparent = hammer2_chain_create_indirect(parent, key, keybits,
							mtid, type, &error);
		if (nparent == NULL) {
			if (allocated)
				hammer2_chain_drop(chain);
			chain = NULL;
			goto done;
		}
		if (parent != nparent) {
			hammer2_chain_unlock(parent);
			hammer2_chain_drop(parent);
			parent = *parentp = nparent;
		}
		goto again;
	}

	/*
	 * fall through if parent, or skip to here if no parent.
	 */
skip:
	if (chain->flags & HAMMER2_CHAIN_DELETED)
		kprintf("Inserting deleted chain @%016jx\n",
			chain->bref.key);

	/*
	 * Link the chain into its parent.
	 */
	if (chain->parent != NULL)
		panic("hammer2: hammer2_chain_create: chain already connected");
	KKASSERT(chain->parent == NULL);
	if (parent) {
		KKASSERT(parent->core.live_count < count);
		hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_LIVE,
				     0);
	}

	if (allocated) {
		/*
		 * Mark the newly created chain modified.  This will cause
		 * UPDATE to be set and process the INITIAL flag.
		 *
		 * Device buffers are not instantiated for DATA elements
		 * as these are handled by logical buffers.
		 *
		 * Indirect and freemap node indirect blocks are handled
		 * by hammer2_chain_create_indirect() and not by this
		 * function.
		 *
		 * Data for all other bref types is expected to be
		 * instantiated (INODE, LEAF).
		 */
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_DIRENT:
		case HAMMER2_BREF_TYPE_INODE:
			error = hammer2_chain_modify(chain, mtid, dedup_off,
						     HAMMER2_MODIFY_OPTDATA);
			break;
		default:
			/*
			 * Remaining types are not supported by this function.
			 * In particular, INDIRECT and LEAF_NODE types are
			 * handled by create_indirect().
			 */
			panic("hammer2_chain_create: bad type: %d",
			      chain->bref.type);
			/* NOT REACHED */
			break;
		}
	} else {
		/*
		 * When reconnecting a chain we must set UPDATE and
		 * setflush so the flush recognizes that it must update
		 * the bref in the parent.
		 */
		if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
	}

	/*
	 * We must setflush(parent) to ensure that it recurses through to
	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
	 * already set in the chain (so it won't recurse up to set it in the
	 * parent).
	 */
	if (parent)
		hammer2_chain_setflush(parent);

done:
	*chainp = chain;

	return (error);
}

/*
 * Move the chain from its old parent to a new parent.  The chain must have
 * already been deleted or already disconnected (or never associated) with
 * a parent.  The chain is reassociated with the new parent and the deleted
 * flag will be cleared (no longer deleted).  The chain's modification state
 * is not altered.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * Neither (parent) nor (chain) can be errored.
 *
 * If (parent) is non-NULL then the chain is inserted under the parent.
 *
 * If (parent) is NULL then the newly duplicated chain is not inserted
 * anywhere, similar to if it had just been chain_alloc()'d (suitable for
 * passing into hammer2_chain_create() after this function returns).
 *
 * WARNING! This function calls create which means it can insert indirect
 *	    blocks.  This can cause other unrelated chains in the parent to
 *	    be moved to a newly inserted indirect block in addition to the
 *	    specific chain.
 */
void
hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;	/* NOTE(review): unused in the visible body */
	hammer2_chain_t *parent;

	/*
	 * WARNING!  We should never resolve DATA to device buffers
	 *	     (XXX allow it if the caller did?), and since
	 *	     we currently do not have the logical buffer cache
	 *	     buffer in-hand to fix its cached physical offset
	 *	     we also force the modify code to not COW it. XXX
	 *
	 * NOTE!     We allow error'd chains to be renamed.  The bref itself
	 *	     is good and can be renamed.  The content, however, may
	 *	     be inaccessible.
	 */
	hmp = chain->hmp;
	KKASSERT(chain->parent == NULL);
	/*KKASSERT(chain->error == 0); allow*/
	bref = &chain->bref;

	/*
	 * If parent is not NULL the duplicated chain will be entered under
	 * the parent and the UPDATE bit set to tell flush to update
	 * the blockref.
	 *
	 * We must setflush(parent) to ensure that it recurses through to
	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
	 * already set in the chain (so it won't recurse up to set it in the
	 * parent).
	 *
	 * Having both chains locked is extremely important for atomicity.
	 */
	if (parentp && (parent = *parentp) != NULL) {
		KKASSERT(hammer2_mtx_owned(&parent->lock));
		KKASSERT(parent->refs > 0);
		KKASSERT(parent->error == 0);

		hammer2_chain_create(parentp, &chain, NULL, chain->pmp,
				     HAMMER2_METH_DEFAULT,
				     bref->key, bref->keybits, bref->type,
				     chain->bytes, mtid, 0, flags);
		KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
		hammer2_chain_setflush(*parentp);
	}
}

/*
 * This works in tandem with delete_obref() to install a blockref in
 * (typically) an indirect block that is associated with the chain being
 * moved to *parentp.
 *
 * The reason we need this function is that the caller needs to maintain
 * the blockref as it was, and not generate a new blockref for what might
 * be a modified chain.  Otherwise stuff will leak into the flush that
 * the flush code's FLUSH_INODE_STOP flag is unable to catch.
 *
 * It is EXTREMELY important that we properly set CHAIN_BLKMAPUPD and
 * CHAIN_UPDATE.  We must set BLKMAPUPD if the bref does not match, and
 * we must clear CHAIN_UPDATE (that was likely set by the chain_rename) if
 * it does.  Otherwise we can end up in a situation where H2 is unable to
 * clean up the in-memory chain topology.
 *
 * The reason for this is that flushes do not generally flush through
 * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq
 * or sideq to properly flush and dispose of the related inode chain's flags.
 * Situations where the inode is not actually modified by the frontend,
 * but where we have to move the related chains around as we insert or cleanup
 * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory
 * inode chain that does not have a hammer2_inode_t associated with it.
 */
static void
hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain,
			   hammer2_tid_t mtid, int flags,
			   hammer2_blockref_t *obref)
{
	hammer2_chain_rename(parentp, chain, mtid, flags);

	if (obref->type != HAMMER2_BREF_TYPE_EMPTY) {
		hammer2_blockref_t *tbase;
		int tcount;

		KKASSERT((chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0);
		hammer2_chain_modify(*parentp, mtid, 0, 0);
		tbase = hammer2_chain_base_and_count(*parentp, &tcount);
		/* re-install the preserved (old) blockref in the new parent */
		hammer2_base_insert(*parentp, tbase, tcount, chain, obref);
		if (bcmp(obref, &chain->bref, sizeof(chain->bref))) {
			/* bref differs, flush must update the blockmap */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPUPD |
						      HAMMER2_CHAIN_UPDATE);
		} else {
			/* bref matches, undo the UPDATE set by chain_rename */
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
	}
}

/*
 * Helper function for deleting chains.
 *
 * The chain is removed from the live view (the RBTREE) as well as the parent's
 * blockmap.  Both chain and its parent must be locked.
 *
 * parent may not be errored.  chain can be errored.
 */
static int
_hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
			     hammer2_tid_t mtid, int flags,
			     hammer2_blockref_t *obref)
{
	hammer2_dev_t *hmp;
	int error = 0;

	KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
	KKASSERT(chain->parent == parent);
	hmp = chain->hmp;

	if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) {
		/*
		 * Chain is blockmapped, so there must be a parent.
		 * Atomically remove the chain from the parent and remove
		 * the blockmap entry.  The parent must be set modified
		 * to remove the blockmap entry.
		 */
		hammer2_blockref_t *base;
		int count;

		KKASSERT(parent != NULL);
		KKASSERT(parent->error == 0);
		KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
		error = hammer2_chain_modify(parent, mtid, 0, 0);
		if (error)
			goto done;

		/*
		 * Calculate blockmap pointer
		 *
		 * (lock order: chain's spinlock before the parent's, and
		 * released in the reverse order below)
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);

		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there
			 * is no block array if the inode is flagged
			 * DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base =
				   &parent->data->ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &parent->data->voldata.
					sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->blkset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("_hammer2_chain_delete_helper: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
			break;
		}

		/*
		 * delete blockmapped chain from its parent.
		 *
		 * The parent is not affected by any statistics in chain
		 * which are pending synchronization.  That is, there is
		 * nothing to undo in the parent since they have not yet
		 * been incorporated into the parent.
		 *
		 * The parent is affected by statistics stored in inodes.
		 * Those have already been synchronized, so they must be
		 * undone.  XXX split update possible w/delete in middle?
		 */
		if (base) {
			hammer2_base_delete(parent, base, count, chain, obref);
		}
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		/*
		 * Chain is not blockmapped but a parent is present.
		 * Atomically remove the chain from the parent.  There is
		 * no blockmap entry to remove.
		 *
		 * Because chain was associated with a parent but not
		 * synchronized, the chain's *_count_up fields contain
		 * inode adjustment statistics which must be undone.
		 */
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else {
		/*
		 * Chain is not blockmapped and has no parent.  This
		 * is a degenerate case.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	}
done:
	return error;
}

/*
 * Create an indirect block that covers one or more of the elements in the
 * current parent.  Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow four rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.
This
 * is accomplished by always recursing through matching keyspaces in
 * the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
 *
 * NOTE: *errorp set to HAMMER_ERROR_* flags
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count,
			int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count,
			int ncount);
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      hammer2_tid_t mtid, int for_type, int *errorp)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_blockref_t dummy;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int ncount;
	int nbytes;
	int loops;
	int error;
	int reason;
	int generation;
	int maxloops = 300000;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	hmp = parent->hmp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));

	/*
	 * Pre-modify the parent now to avoid having to deal with error
	 * processing if we tried to later (in the middle of our loop).
	 *
	 * We are going to be moving bref's around, the indirect blocks
	 * cannot be in an initial state.  Do not pass MODIFY_OPTDATA.
	 */
	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_chain_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		return NULL;
	}
	KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
	base = hammer2_chain_base_and_count(parent, &count);

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent for splits to work properly.
	 *
	 * The freemap uses a specific indirect block size.  The number of
	 * levels are built dynamically and ultimately depend on the size
	 * of the volume.  Because freemap blocks are taken from the reserved
	 * areas of the volume our goal is efficiency (fewer levels) and not
	 * so much to save disk space.
	 *
	 * The first indirect block level for a directory usually uses
	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
	 * the hash mechanism, this typically gives us a nominal
	 * 32 * 4 entries with one level of indirection.
	 *
	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
	 * indirect blocks.  The initial 4 entries in the inode gives us
	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
	 * of indirection gives us 137GB, and so forth.  H2 can support
	 * huge file sizes but they are not typical, so we try to stick
	 * with compactness and do not use a larger indirect block size.
	 *
	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
	 * due to the way indirect blocks are created this usually winds
	 * up being extremely inefficient for small files.  Even though
	 * 16KB requires more levels of indirection for very large files,
	 * the 16KB records can be ganged together into 64KB DIOs.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		if (parent->data->ipdata.meta.type ==
		    HAMMER2_OBJTYPE_DIRECTORY)
			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
		else
			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */

	} else {
		nbytes = HAMMER2_IND_BYTES_NOM;
	}
	/* An indirect block must hold at least as many brefs as its parent. */
	if (nbytes < count * sizeof(hammer2_blockref_t)) {
		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
		nbytes = count * sizeof(hammer2_blockref_t);
	}
	ncount = nbytes / sizeof(hammer2_blockref_t);

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	switch(for_type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
						    base, count, ncount);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
						   base, count, ncount);
		break;
	default:
		panic("illegal indirect block for bref type %d", for_type);
		break;
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * Ok, create our new indirect block
	 */
	bzero(&dummy, sizeof(dummy));
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.key = key;
	dummy.keybits = keybits;
	dummy.data_off = hammer2_getradix(nbytes);

	/*
	 * Inherit the check method from the parent but force no compression
	 * for indirect blocks (they store raw blockref arrays).
	 */
	dummy.methods =
		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	/* ichain has one ref at this point */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 *
	 * XXX remove OPTDATA, we need a fully initialized indirect block to
	 * be able to move the original blockref.
	 */
	*errorp = hammer2_chain_modify(ichain, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_chain_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
		return NULL;
	}
	KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0);

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	key_next = 0;	/* avoid gcc warnings */
	hammer2_spin_ex(&parent->core.spin);
	loops = 0;
	reason = 0;

	for (;;) {
		/*
		 * Parent may have been modified, relocating its block array.
		 * Reload the base pointer.
		 */
		base = hammer2_chain_base_and_count(parent, &count);

		if (++loops > 100000) {
		    hammer2_spin_unex(&parent->core.spin);
		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
			  reason, parent, base, count, key_next);
		}

		/*
		 * NOTE: spinlock stays intact, returned chain (if not NULL)
		 *	 is not referenced or locked which means that we
		 *	 cannot safely check its flagged / deletion status
		 *	 until we lock it.
		 */
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);
		generation = parent->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if (rounddown2(key ^ bref->key, (hammer2_key_t)1 << keybits) != 0) {
			goto next_key_spinlocked;
		}

		/*
		 * Load the new indirect block by acquiring the related
		 * chains (potentially from media as it might not be
		 * in-memory).  Then move it to the new parent (ichain).
		 *
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		bsave = *bref;
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			hammer2_spin_unex(&parent->core.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, &bsave,
						  HAMMER2_RESOLVE_NEVER);
			if (chain == NULL) {
				reason = 1;
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
		}

		/*
		 * This is always live so if the chain has been deleted
		 * we raced someone and we have to retry.
		 *
		 * NOTE: Lookups can race delete-duplicate because
		 *	 delete-duplicate does not lock the parent's core
		 *	 (they just use the spinlock on the core).
		 *
		 *	 (note reversed logic for this one)
		 */
		if (bcmp(&bsave, &chain->bref, sizeof(bsave)) ||
		    chain->parent != parent ||
		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			if (hammer2_debug & 0x0040) {
				kprintf("LOST PARENT RETRY "
				"RETRY (%p,%p)->%p %08x\n",
				parent, chain->parent, chain, chain->flags);
			}
			hammer2_spin_ex(&parent->core.spin);
			continue;
		}

		/*
		 * Shift the chain to the indirect block.
		 *
		 * WARNING! No reason for us to load chain data, pass NOSTATS
		 *	    to prevent delete/insert from trying to access
		 *	    inode stats (and thus asserting if there is no
		 *	    chain->data loaded).
		 *
		 * WARNING! The (parent, chain) deletion may modify the parent
		 *	    and invalidate the base pointer.
		 *
		 * WARNING! Parent must already be marked modified, so we
		 *	    can assume that chain_delete always succeeds.
		 *
		 * WARNING! hammer2_chain_repchange() does not have to be
		 *	    called (and doesn't work anyway because we are
		 *	    only doing a partial shift).  A recursion that is
		 *	    in-progress can continue at the current parent
		 *	    and will be able to properly find its next key.
		 */
		error = hammer2_chain_delete_obref(parent, chain, mtid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bsave);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		base = NULL;	/* safety */
		hammer2_spin_ex(&parent->core.spin);
next_key_spinlocked:
		if (--maxloops == 0)
			panic("hammer2_chain_create_indirect: maxloops");
		reason = 4;
		if (key_next == 0 || key_next > key_end)
			break;
		key_beg = key_next;
		/* loop */
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set UPDATE here because we mark ichain
	 * modified down below (so the normal modified -> flush -> set-moved
	 * sequence applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	base = NULL;	/* safety, parent modify may change address */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, ichain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	/*
	 * Make sure flushes propagate after our manual insertion.
	 */
	hammer2_chain_setflush(ichain);
	hammer2_chain_setflush(parent);

	/*
	 * Figure out what to return.
	 */
	if (rounddown2(create_key ^ key, (hammer2_key_t)1 << keybits)) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
	} else {
		/*
		 * Otherwise its in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}

/*
 * Do maintenance on an indirect chain.  Both parent and chain are locked.
 *
 * Returns non-zero if (chain) is deleted, either due to being empty or
 * because its children were safely moved into the parent.
 */
int
hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
				   hammer2_chain_t *chain)
{
	hammer2_blockref_t *chain_base;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bsave;
	hammer2_key_t key_next;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_chain_t *sub;
	int chain_count;
	int count;
	int error;
	int generation;

	/*
	 * Make sure we have an accurate live_count
	 */
	if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		hammer2_chain_countbrefs(chain, base, count);
	}

	/*
	 * If the indirect block is empty we can delete it.
	 * (ignore deletion error)
	 */
	if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
		hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
		hammer2_chain_repchange(parent, chain);
		return 1;
	}

	base = hammer2_chain_base_and_count(parent, &count);

	if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
			      HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		hammer2_chain_countbrefs(parent, base, count);
	}

	/*
	 * Determine if we can collapse chain into parent, calculate
	 * hysteresis for chain emptiness.
	 */
	if (parent->core.live_count + chain->core.live_count - 1 > count)
		return 0;
	chain_count = chain->bytes / sizeof(hammer2_blockref_t);
	if (chain->core.live_count > chain_count * 3 / 4)
		return 0;

	/*
	 * Ok, theoretically we can collapse chain's contents into
	 * parent.  chain is locked, but any in-memory children of chain
	 * are not.  For this to work, we must be able to dispose of any
	 * in-memory children of chain.
	 *
	 * For now require that there are no in-memory children of chain.
	 *
	 * WARNING! Both chain and parent must remain locked across this
	 *	    entire operation.
	 */

	/*
	 * Parent must be marked modified.  Don't try to collapse it if we
	 * can't mark it modified.  Once modified, destroy chain to make room
	 * and to get rid of what will be a conflicting key (this is included
	 * in the calculation above).  Finally, move the children of chain
	 * into chain's parent.
	 *
	 * This order creates an accounting problem for bref.embed.stats
	 * because we destroy chain before we remove its children.  Any
	 * elements whose blockref is already synchronized will be counted
	 * twice.  To deal with the problem we clean out chain's stats prior
	 * to deleting it.
	 */
	error = hammer2_chain_modify(parent, 0, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}
	error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0);
	if (error) {
		krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n",
			    hammer2_error_str(error));
		return 0;
	}

	/* Zero stats before deletion to avoid double-counting (see above). */
	chain->bref.embed.stats.inode_count = 0;
	chain->bref.embed.stats.data_count = 0;
	error = hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
	KKASSERT(error == 0);

	/*
	 * The combined_find call requires core.spin to be held.  One would
	 * think there wouldn't be any conflicts since we hold chain
	 * exclusively locked, but the caching mechanism for 0-ref children
	 * does not require a chain lock.
	 */
	hammer2_spin_ex(&chain->core.spin);

	key_next = 0;
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	for (;;) {
		chain_base = &chain->data->npdata[0];
		chain_count = chain->bytes / sizeof(hammer2_blockref_t);
		sub = hammer2_combined_find(chain, chain_base, chain_count,
					    &key_next,
					    key_beg, key_end,
					    &bref);
		generation = chain->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		bsave = *bref;
		if (sub) {
			hammer2_chain_ref(sub);
			hammer2_spin_unex(&chain->core.spin);
			hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
		} else {
			hammer2_spin_unex(&chain->core.spin);
			sub = hammer2_chain_get(chain, generation, &bsave,
						HAMMER2_RESOLVE_NEVER);
			if (sub == NULL) {
				/* insertion race, retry this key */
				hammer2_spin_ex(&chain->core.spin);
				continue;
			}
		}
		/* Raced a deletion or reparent; drop and retry. */
		if (bcmp(&bsave, &sub->bref, sizeof(bsave)) ||
		    sub->parent != chain ||
		    (sub->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(sub);
			hammer2_chain_drop(sub);
			hammer2_spin_ex(&chain->core.spin);
			sub = NULL;	/* safety */
			continue;
		}
		error = hammer2_chain_delete_obref(chain, sub,
						   sub->bref.modify_tid, 0,
						   &bsave);
		KKASSERT(error == 0);
		hammer2_chain_rename_obref(&parent, sub,
					   sub->bref.modify_tid,
					   HAMMER2_INSERT_SAMEPARENT, &bsave);
		hammer2_chain_unlock(sub);
		hammer2_chain_drop(sub);
		hammer2_spin_ex(&chain->core.spin);

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&chain->core.spin);

	hammer2_chain_repchange(parent, chain);

	return 1;
}

/*
 * Freemap indirect blocks
 *
 * Calculate the keybits and highside/lowside of the freemap node the
 * caller is creating.
 *
 * This routine will specify the next higher-level freemap key/radix
 * representing the lowest-ordered set.  By doing so, eventually all
 * low-ordered sets will be moved one level down.
 *
 * We have to be careful here because the freemap reserves a limited
 * number of blocks for a limited number of levels.  So we can't just
 * push indiscriminately.
 */
int
hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
			     int keybits, hammer2_blockref_t *base, int count)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int locount;
	int hicount;
	int maxloops = 300000;

	key = *keyp;
	locount = 0;
	hicount = 0;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Track the smallest radix seen; on ties prefer the
		 * lowest key (lowest-ordered set).
		 */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Return the keybits for a higher-level FREEMAP_NODE covering
	 * this node.
	 */
	switch(keybits) {
	case HAMMER2_FREEMAP_LEVEL0_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL1_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL1_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL2_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL2_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL3_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL3_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL4_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL4_RADIX:
		keybits = HAMMER2_FREEMAP_LEVEL5_RADIX;
		break;
	case HAMMER2_FREEMAP_LEVEL5_RADIX:
		panic("hammer2_chain_indkey_freemap: level too high");
		break;
	default:
		panic("hammer2_chain_indkey_freemap: bad radix");
		break;
	}
	*keyp = key;

	return (keybits);
}

/*
 * File indirect blocks
 *
 * Calculate the key/keybits for the indirect block to create by scanning
 * existing keys.  The key being created is also passed in *keyp and can be
 * inside or outside the indirect block.  Regardless, the indirect block
 * must hold at least two keys in order to guarantee sufficient space.
 *
 * We use a modified version of the freemap's fixed radix tree, but tailored
 * for file data.  Basically we configure an indirect block encompassing the
 * smallest key.
 */
static int
hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp,
			  int keybits, hammer2_blockref_t *base, int count,
			  int ncount)
{
	hammer2_chain_t *chain;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int nradix;
	int locount;
	int hicount;
	int maxloops = 300000;

	key = *keyp;
	locount = 0;
	hicount = 0;
	keybits = 64;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 *
	 * Locate the smallest key.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Skip deleted chains.
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Track the smallest radix seen; on ties prefer the
		 * lowest key.
		 */
		if (keybits > bref->keybits) {
			key = bref->key;
			keybits = bref->keybits;
		} else if (keybits == bref->keybits && bref->key < key) {
			key = bref->key;
		}
		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Calculate the static keybits for a higher-level indirect block
	 * that contains the key.
	 */
	*keyp = key;

	switch(ncount) {
	case HAMMER2_IND_COUNT_MIN:
		nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_COUNT_NOM:
		nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX;
		break;
	case HAMMER2_IND_COUNT_MAX:
		nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX;
		break;
	default:
		panic("bad ncount %d\n", ncount);
		nradix = 0;
		break;
	}

	/*
	 * The largest radix that can be returned for an indirect block is
	 * 63 bits.  (The largest practical indirect block radix is actually
	 * 62 bits because the top-level inode or volume root contains four
	 * entries, but allow 63 to be returned).
	 *
	 * NOTE(review): this clamps nradix alone, not (keybits + nradix),
	 * even though the sum is what is returned below — confirm whether
	 * the combination can exceed 63 in practice.
	 */
	if (nradix >= 64)
		nradix = 63;

	return keybits + nradix;
}

#if 1

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.  We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * NOTE: We can't take a shortcut here anymore for inodes because
	 *	 the root directory can contain a mix of inodes and directory
	 *	 entries (we used to just return 63 if parent->bref.type was
	 *	 HAMMER2_BREF_TYPE_INODE).
	 */
	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       rounddown2(key ^ bref->key, (hammer2_key_t)1 << nkeybits) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;
		/*
		 * NOTE(review): precedence makes this (~1 << keybits),
		 * clearing bit keybits AND all bits below it, rather
		 * than ~(1 << keybits).  Harmless here because the
		 * caller masks off the low bits anyway, but confirm
		 * ~((hammer2_key_t)1 << keybits) wasn't intended.
		 */

	*keyp = key;

	return (keybits);
}

#else

/*
 * Directory indirect blocks.
 *
 * Covers both the inode index (directory of inodes), and directory contents
 * (filenames hardlinked to inodes).
 *
 * Because directory keys are hashed we generally try to cut the space in
 * half.
We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t	*chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#endif

/*
 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
 * it exists.
 *
 * Both parent and chain must be locked exclusively.
 *
 * This function will modify the parent if the blockref requires removal
 * from the parent's block table.
 *
 * This function is NOT recursive.  Any entity already pushed into the
 * chain (such as an inode) may still need visibility into its contents,
 * as well as the ability to read and modify the contents.  For example,
 * for an unlinked file which is still open.
 *
 * Also note that the flusher is responsible for cleaning up empty
 * indirect blocks.
 */
int
hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	int error = 0;

	KKASSERT(hammer2_mtx_owned(&chain->lock));

	/*
	 * Nothing to do if already marked.
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
			 chain->parent == parent);
		error = _hammer2_chain_delete_helper(parent, chain,
						     mtid, flags, NULL);
	}

	/*
	 * Permanent deletions mark the chain as destroyed.
5125 * 5126 * NOTE: We do not setflush the chain unless the deletion is 5127 * permanent, since the deletion of a chain does not actually 5128 * require it to be flushed. 5129 */ 5130 if (error == 0) { 5131 if (flags & HAMMER2_DELETE_PERMANENT) { 5132 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5133 hammer2_chain_setflush(chain); 5134 } 5135 } 5136 5137 return error; 5138 } 5139 5140 static int 5141 hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain, 5142 hammer2_tid_t mtid, int flags, 5143 hammer2_blockref_t *obref) 5144 { 5145 int error = 0; 5146 5147 KKASSERT(hammer2_mtx_owned(&chain->lock)); 5148 5149 /* 5150 * Nothing to do if already marked. 5151 * 5152 * We need the spinlock on the core whos RBTREE contains chain 5153 * to protect against races. 5154 */ 5155 obref->type = HAMMER2_BREF_TYPE_EMPTY; 5156 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 5157 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 5158 chain->parent == parent); 5159 error = _hammer2_chain_delete_helper(parent, chain, 5160 mtid, flags, obref); 5161 } 5162 5163 /* 5164 * Permanent deletions mark the chain as destroyed. 5165 * 5166 * NOTE: We do not setflush the chain unless the deletion is 5167 * permanent, since the deletion of a chain does not actually 5168 * require it to be flushed. 5169 */ 5170 if (error == 0) { 5171 if (flags & HAMMER2_DELETE_PERMANENT) { 5172 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 5173 hammer2_chain_setflush(chain); 5174 } 5175 } 5176 5177 return error; 5178 } 5179 5180 /* 5181 * Returns the index of the nearest element in the blockref array >= elm. 5182 * Returns (count) if no element could be found. 5183 * 5184 * Sets *key_nextp to the next key for loop purposes but does not modify 5185 * it if the next key would be higher than the current value of *key_nextp. 5186 * Note that *key_nexp can overflow to 0, which should be tested by the 5187 * caller. 5188 * 5189 * WARNING! 
Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require that the live chains already have their cores counted
	 * so we can optimize operations.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards from the cached index until we are at or
	 * before the first element that could contain key_beg.
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == HAMMER2_BREF_TYPE_EMPTY ||
	    scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != HAMMER2_BREF_TYPE_EMPTY) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			/* clamp *key_nextp to the end of the found element */
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *brefp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * When no in-memory chain has been found and a non-NULL bref is returned
 * in *brefp, the bref points into the caller-supplied on-media blockref
 * array; otherwise it points at the matching chain's embedded bref.
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.  Iterate if the base find
 * is chosen but matches a deleted chain.
 *
 * WARNING!  Must be called with parent's spinlock held.  Spinlock remains
 *	     held through the operation.
 */
static hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **brefp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*brefp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*brefp = NULL;
		chain = NULL;
	} else {
		*brefp = bref;
	}
	return(chain);
}

/*
 * Locate the specified block array element and delete it.  The element
 * must exist.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
5380 */ 5381 void 5382 hammer2_base_delete(hammer2_chain_t *parent, 5383 hammer2_blockref_t *base, int count, 5384 hammer2_chain_t *chain, 5385 hammer2_blockref_t *obref) 5386 { 5387 hammer2_blockref_t *elm = &chain->bref; 5388 hammer2_blockref_t *scan; 5389 hammer2_key_t key_next; 5390 int i; 5391 5392 /* 5393 * Delete element. Expect the element to exist. 5394 * 5395 * XXX see caller, flush code not yet sophisticated enough to prevent 5396 * re-flushed in some cases. 5397 */ 5398 key_next = 0; /* max range */ 5399 i = hammer2_base_find(parent, base, count, &key_next, 5400 elm->key, elm->key); 5401 scan = &base[i]; 5402 if (i == count || scan->type == HAMMER2_BREF_TYPE_EMPTY || 5403 scan->key != elm->key || 5404 ((chain->flags & HAMMER2_CHAIN_BLKMAPUPD) == 0 && 5405 scan->keybits != elm->keybits)) { 5406 hammer2_spin_unex(&parent->core.spin); 5407 panic("delete base %p element not found at %d/%d elm %p\n", 5408 base, i, count, elm); 5409 return; 5410 } 5411 5412 /* 5413 * Update stats and zero the entry. 5414 * 5415 * NOTE: Handle radix == 0 (0 bytes) case. 
5416 */ 5417 if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) { 5418 parent->bref.embed.stats.data_count -= (hammer2_off_t)1 << 5419 (int)(scan->data_off & HAMMER2_OFF_MASK_RADIX); 5420 } 5421 switch(scan->type) { 5422 case HAMMER2_BREF_TYPE_INODE: 5423 --parent->bref.embed.stats.inode_count; 5424 /* fall through */ 5425 case HAMMER2_BREF_TYPE_DATA: 5426 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) { 5427 atomic_set_int(&chain->flags, 5428 HAMMER2_CHAIN_HINT_LEAF_COUNT); 5429 } else { 5430 if (parent->bref.leaf_count) 5431 --parent->bref.leaf_count; 5432 } 5433 /* fall through */ 5434 case HAMMER2_BREF_TYPE_INDIRECT: 5435 if (scan->type != HAMMER2_BREF_TYPE_DATA) { 5436 parent->bref.embed.stats.data_count -= 5437 scan->embed.stats.data_count; 5438 parent->bref.embed.stats.inode_count -= 5439 scan->embed.stats.inode_count; 5440 } 5441 if (scan->type == HAMMER2_BREF_TYPE_INODE) 5442 break; 5443 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) { 5444 atomic_set_int(&chain->flags, 5445 HAMMER2_CHAIN_HINT_LEAF_COUNT); 5446 } else { 5447 if (parent->bref.leaf_count <= scan->leaf_count) 5448 parent->bref.leaf_count = 0; 5449 else 5450 parent->bref.leaf_count -= scan->leaf_count; 5451 } 5452 break; 5453 case HAMMER2_BREF_TYPE_DIRENT: 5454 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) { 5455 atomic_set_int(&chain->flags, 5456 HAMMER2_CHAIN_HINT_LEAF_COUNT); 5457 } else { 5458 if (parent->bref.leaf_count) 5459 --parent->bref.leaf_count; 5460 } 5461 default: 5462 break; 5463 } 5464 5465 if (obref) 5466 *obref = *scan; 5467 bzero(scan, sizeof(*scan)); 5468 5469 /* 5470 * We can only optimize parent->core.live_zero for live chains. 5471 */ 5472 if (parent->core.live_zero == i + 1) { 5473 while (--i >= 0 && base[i].type == HAMMER2_BREF_TYPE_EMPTY) 5474 ; 5475 parent->core.live_zero = i + 1; 5476 } 5477 5478 /* 5479 * Clear appropriate blockmap flags in chain. 
5480 */ 5481 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED | 5482 HAMMER2_CHAIN_BLKMAPUPD); 5483 } 5484 5485 /* 5486 * Insert the specified element. The block array must not already have the 5487 * element and must have space available for the insertion. 5488 * 5489 * The spin lock on the related chain must be held. 5490 * 5491 * NOTE: live_count was adjusted when the chain was deleted, so it does not 5492 * need to be adjusted when we commit the media change. 5493 */ 5494 void 5495 hammer2_base_insert(hammer2_chain_t *parent, 5496 hammer2_blockref_t *base, int count, 5497 hammer2_chain_t *chain, hammer2_blockref_t *elm) 5498 { 5499 hammer2_key_t key_next; 5500 hammer2_key_t xkey; 5501 int i; 5502 int j; 5503 int k; 5504 int l; 5505 int u = 1; 5506 5507 /* 5508 * Insert new element. Expect the element to not already exist 5509 * unless we are replacing it. 5510 * 5511 * XXX see caller, flush code not yet sophisticated enough to prevent 5512 * re-flushed in some cases. 5513 */ 5514 key_next = 0; /* max range */ 5515 i = hammer2_base_find(parent, base, count, &key_next, 5516 elm->key, elm->key); 5517 5518 /* 5519 * Shortcut fill optimization, typical ordered insertion(s) may not 5520 * require a search. 
5521 */ 5522 KKASSERT(i >= 0 && i <= count); 5523 5524 /* 5525 * Set appropriate blockmap flags in chain (if not NULL) 5526 */ 5527 if (chain) 5528 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED); 5529 5530 /* 5531 * Update stats and zero the entry 5532 */ 5533 if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) { 5534 parent->bref.embed.stats.data_count += (hammer2_off_t)1 << 5535 (int)(elm->data_off & HAMMER2_OFF_MASK_RADIX); 5536 } 5537 switch(elm->type) { 5538 case HAMMER2_BREF_TYPE_INODE: 5539 ++parent->bref.embed.stats.inode_count; 5540 /* fall through */ 5541 case HAMMER2_BREF_TYPE_DATA: 5542 if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX) 5543 ++parent->bref.leaf_count; 5544 /* fall through */ 5545 case HAMMER2_BREF_TYPE_INDIRECT: 5546 if (elm->type != HAMMER2_BREF_TYPE_DATA) { 5547 parent->bref.embed.stats.data_count += 5548 elm->embed.stats.data_count; 5549 parent->bref.embed.stats.inode_count += 5550 elm->embed.stats.inode_count; 5551 } 5552 if (elm->type == HAMMER2_BREF_TYPE_INODE) 5553 break; 5554 if (parent->bref.leaf_count + elm->leaf_count < 5555 HAMMER2_BLOCKREF_LEAF_MAX) { 5556 parent->bref.leaf_count += elm->leaf_count; 5557 } else { 5558 parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX; 5559 } 5560 break; 5561 case HAMMER2_BREF_TYPE_DIRENT: 5562 if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX) 5563 ++parent->bref.leaf_count; 5564 break; 5565 default: 5566 break; 5567 } 5568 5569 5570 /* 5571 * We can only optimize parent->core.live_zero for live chains. 
5572 */ 5573 if (i == count && parent->core.live_zero < count) { 5574 i = parent->core.live_zero++; 5575 base[i] = *elm; 5576 return; 5577 } 5578 5579 xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1; 5580 if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) { 5581 hammer2_spin_unex(&parent->core.spin); 5582 panic("insert base %p overlapping elements at %d elm %p\n", 5583 base, i, elm); 5584 } 5585 5586 /* 5587 * Try to find an empty slot before or after. 5588 */ 5589 j = i; 5590 k = i; 5591 while (j > 0 || k < count) { 5592 --j; 5593 if (j >= 0 && base[j].type == HAMMER2_BREF_TYPE_EMPTY) { 5594 if (j == i - 1) { 5595 base[j] = *elm; 5596 } else { 5597 bcopy(&base[j+1], &base[j], 5598 (i - j - 1) * sizeof(*base)); 5599 base[i - 1] = *elm; 5600 } 5601 goto validate; 5602 } 5603 ++k; 5604 if (k < count && base[k].type == HAMMER2_BREF_TYPE_EMPTY) { 5605 bcopy(&base[i], &base[i+1], 5606 (k - i) * sizeof(hammer2_blockref_t)); 5607 base[i] = *elm; 5608 5609 /* 5610 * We can only update parent->core.live_zero for live 5611 * chains. 5612 */ 5613 if (parent->core.live_zero <= k) 5614 parent->core.live_zero = k + 1; 5615 u = 2; 5616 goto validate; 5617 } 5618 } 5619 panic("hammer2_base_insert: no room!"); 5620 5621 /* 5622 * Debugging 5623 */ 5624 validate: 5625 key_next = 0; 5626 for (l = 0; l < count; ++l) { 5627 if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) { 5628 key_next = base[l].key + 5629 ((hammer2_key_t)1 << base[l].keybits) - 1; 5630 break; 5631 } 5632 } 5633 while (++l < count) { 5634 if (base[l].type != HAMMER2_BREF_TYPE_EMPTY) { 5635 if (base[l].key <= key_next) 5636 panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l); 5637 key_next = base[l].key + 5638 ((hammer2_key_t)1 << base[l].keybits) - 1; 5639 5640 } 5641 } 5642 5643 } 5644 5645 #if 0 5646 5647 /* 5648 * Sort the blockref array for the chain. Used by the flush code to 5649 * sort the blockref[] array. 5650 * 5651 * The chain must be exclusively locked AND spin-locked. 
5652 */ 5653 typedef hammer2_blockref_t *hammer2_blockref_p; 5654 5655 static 5656 int 5657 hammer2_base_sort_callback(const void *v1, const void *v2) 5658 { 5659 hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1; 5660 hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2; 5661 5662 /* 5663 * Make sure empty elements are placed at the end of the array 5664 */ 5665 if (bref1->type == HAMMER2_BREF_TYPE_EMPTY) { 5666 if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) 5667 return(0); 5668 return(1); 5669 } else if (bref2->type == HAMMER2_BREF_TYPE_EMPTY) { 5670 return(-1); 5671 } 5672 5673 /* 5674 * Sort by key 5675 */ 5676 if (bref1->key < bref2->key) 5677 return(-1); 5678 if (bref1->key > bref2->key) 5679 return(1); 5680 return(0); 5681 } 5682 5683 void 5684 hammer2_base_sort(hammer2_chain_t *chain) 5685 { 5686 hammer2_blockref_t *base; 5687 int count; 5688 5689 switch(chain->bref.type) { 5690 case HAMMER2_BREF_TYPE_INODE: 5691 /* 5692 * Special shortcut for embedded data returns the inode 5693 * itself. Callers must detect this condition and access 5694 * the embedded data (the strategy code does this for us). 5695 * 5696 * This is only applicable to regular files and softlinks. 5697 */ 5698 if (chain->data->ipdata.meta.op_flags & 5699 HAMMER2_OPFLAG_DIRECTDATA) { 5700 return; 5701 } 5702 base = &chain->data->ipdata.u.blockset.blockref[0]; 5703 count = HAMMER2_SET_COUNT; 5704 break; 5705 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 5706 case HAMMER2_BREF_TYPE_INDIRECT: 5707 /* 5708 * Optimize indirect blocks in the INITIAL state to avoid 5709 * I/O. 
5710 */ 5711 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0); 5712 base = &chain->data->npdata[0]; 5713 count = chain->bytes / sizeof(hammer2_blockref_t); 5714 break; 5715 case HAMMER2_BREF_TYPE_VOLUME: 5716 base = &chain->data->voldata.sroot_blockset.blockref[0]; 5717 count = HAMMER2_SET_COUNT; 5718 break; 5719 case HAMMER2_BREF_TYPE_FREEMAP: 5720 base = &chain->data->blkset.blockref[0]; 5721 count = HAMMER2_SET_COUNT; 5722 break; 5723 default: 5724 panic("hammer2_base_sort: unrecognized " 5725 "blockref(A) type: %d", 5726 chain->bref.type); 5727 base = NULL; /* safety */ 5728 count = 0; /* safety */ 5729 break; 5730 } 5731 kqsort(base, count, sizeof(*base), hammer2_base_sort_callback); 5732 } 5733 5734 #endif 5735 5736 /* 5737 * Set the check data for a chain. This can be a heavy-weight operation 5738 * and typically only runs on-flush. For file data check data is calculated 5739 * when the logical buffers are flushed. 5740 */ 5741 void 5742 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata) 5743 { 5744 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_NOTTESTED); 5745 5746 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5747 case HAMMER2_CHECK_NONE: 5748 break; 5749 case HAMMER2_CHECK_DISABLED: 5750 break; 5751 case HAMMER2_CHECK_ISCSI32: 5752 chain->bref.check.iscsi32.value = 5753 hammer2_icrc32(bdata, chain->bytes); 5754 break; 5755 case HAMMER2_CHECK_XXHASH64: 5756 chain->bref.check.xxhash64.value = 5757 XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5758 break; 5759 case HAMMER2_CHECK_SHA192: 5760 assert(0); /* XXX unsupported */ 5761 /* 5762 { 5763 SHA256_CTX hash_ctx; 5764 union { 5765 uint8_t digest[SHA256_DIGEST_LENGTH]; 5766 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5767 } u; 5768 5769 SHA256_Init(&hash_ctx); 5770 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5771 SHA256_Final(u.digest, &hash_ctx); 5772 u.digest64[2] ^= u.digest64[3]; 5773 bcopy(u.digest, 5774 chain->bref.check.sha192.data, 5775 sizeof(chain->bref.check.sha192.data)); 5776 
} 5777 */ 5778 break; 5779 case HAMMER2_CHECK_FREEMAP: 5780 chain->bref.check.freemap.icrc32 = 5781 hammer2_icrc32(bdata, chain->bytes); 5782 break; 5783 default: 5784 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 5785 chain->bref.methods); 5786 break; 5787 } 5788 } 5789 5790 /* 5791 * Characterize a failed check code and try to trace back to the inode. 5792 */ 5793 static void 5794 hammer2_characterize_failed_chain(hammer2_chain_t *chain, uint64_t check, 5795 int bits) 5796 { 5797 hammer2_chain_t *lchain; 5798 hammer2_chain_t *ochain; 5799 int did; 5800 5801 did = krateprintf(&krate_h2chk, 5802 "chain %016jx.%02x (%s) meth=%02x CHECK FAIL " 5803 "(flags=%08x, bref/data ", 5804 chain->bref.data_off, 5805 chain->bref.type, 5806 hammer2_bref_type_str(chain->bref.type), 5807 chain->bref.methods, 5808 chain->flags); 5809 if (did == 0) 5810 return; 5811 5812 if (bits == 32) { 5813 kprintf("%08x/%08x)\n", 5814 chain->bref.check.iscsi32.value, 5815 (uint32_t)check); 5816 } else { 5817 kprintf("%016jx/%016jx)\n", 5818 chain->bref.check.xxhash64.value, 5819 check); 5820 } 5821 5822 /* 5823 * Run up the chains to try to find the governing inode so we 5824 * can report it. 
5825 * 5826 * XXX This error reporting is not really MPSAFE 5827 */ 5828 ochain = chain; 5829 lchain = chain; 5830 while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) { 5831 lchain = chain; 5832 chain = chain->parent; 5833 } 5834 5835 if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE && 5836 ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) == 0 || 5837 (lchain->bref.key & HAMMER2_DIRHASH_VISIBLE))) { 5838 kprintf(" Resides at/in inode %ld\n", 5839 chain->bref.key); 5840 } else if (chain && chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 5841 kprintf(" Resides in inode index - CRITICAL!!!\n"); 5842 } else { 5843 kprintf(" Resides in root index - CRITICAL!!!\n"); 5844 } 5845 if (ochain->hmp) { 5846 const char *pfsname = "UNKNOWN"; 5847 int i; 5848 5849 if (ochain->pmp) { 5850 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) { 5851 if (ochain->pmp->pfs_hmps[i] == ochain->hmp && 5852 ochain->pmp->pfs_names[i]) { 5853 pfsname = ochain->pmp->pfs_names[i]; 5854 break; 5855 } 5856 } 5857 } 5858 kprintf(" In pfs %s on device %s\n", 5859 pfsname, ochain->hmp->devrepname); 5860 } 5861 } 5862 5863 /* 5864 * Returns non-zero on success, 0 on failure. 
5865 */ 5866 int 5867 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 5868 { 5869 uint32_t check32; 5870 uint64_t check64; 5871 int r; 5872 5873 if (chain->flags & HAMMER2_CHAIN_NOTTESTED) 5874 return 1; 5875 5876 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5877 case HAMMER2_CHECK_NONE: 5878 r = 1; 5879 break; 5880 case HAMMER2_CHECK_DISABLED: 5881 r = 1; 5882 break; 5883 case HAMMER2_CHECK_ISCSI32: 5884 check32 = hammer2_icrc32(bdata, chain->bytes); 5885 r = (chain->bref.check.iscsi32.value == check32); 5886 if (r == 0) { 5887 hammer2_characterize_failed_chain(chain, check32, 32); 5888 } 5889 hammer2_process_icrc32 += chain->bytes; 5890 break; 5891 case HAMMER2_CHECK_XXHASH64: 5892 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5893 r = (chain->bref.check.xxhash64.value == check64); 5894 if (r == 0) { 5895 hammer2_characterize_failed_chain(chain, check64, 64); 5896 } 5897 hammer2_process_xxhash64 += chain->bytes; 5898 break; 5899 case HAMMER2_CHECK_SHA192: 5900 assert(0); /* XXX unsupported */ 5901 /* 5902 { 5903 SHA256_CTX hash_ctx; 5904 union { 5905 uint8_t digest[SHA256_DIGEST_LENGTH]; 5906 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5907 } u; 5908 5909 SHA256_Init(&hash_ctx); 5910 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5911 SHA256_Final(u.digest, &hash_ctx); 5912 u.digest64[2] ^= u.digest64[3]; 5913 if (bcmp(u.digest, 5914 chain->bref.check.sha192.data, 5915 sizeof(chain->bref.check.sha192.data)) == 0) { 5916 r = 1; 5917 } else { 5918 r = 0; 5919 krateprintf(&krate_h2chk, 5920 "chain %016jx.%02x meth=%02x " 5921 "CHECK FAIL\n", 5922 chain->bref.data_off, 5923 chain->bref.type, 5924 chain->bref.methods); 5925 } 5926 } 5927 */ 5928 break; 5929 case HAMMER2_CHECK_FREEMAP: 5930 r = (chain->bref.check.freemap.icrc32 == 5931 hammer2_icrc32(bdata, chain->bytes)); 5932 if (r == 0) { 5933 int did; 5934 5935 did = krateprintf(&krate_h2chk, 5936 "chain %016jx.%02x meth=%02x " 5937 "CHECK FAIL\n", 5938 chain->bref.data_off, 5939 
chain->bref.type, 5940 chain->bref.methods); 5941 if (did) { 5942 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 5943 chain->bref.check.freemap.icrc32, 5944 hammer2_icrc32(bdata, chain->bytes), 5945 chain->bytes); 5946 if (chain->dio) { 5947 kprintf("dio %p buf %016jx,%ld " 5948 "bdata %p/%p\n", 5949 chain->dio, 5950 chain->dio->bp->b_loffset, 5951 chain->dio->bp->b_bufsize, 5952 bdata, 5953 chain->dio->bp->b_data); 5954 } 5955 } 5956 } 5957 break; 5958 default: 5959 kprintf("hammer2_chain_testcheck: unknown check type %02x\n", 5960 chain->bref.methods); 5961 r = 1; 5962 break; 5963 } 5964 return r; 5965 } 5966 5967 /* 5968 * Acquire the chain and parent representing the specified inode for the 5969 * device at the specified cluster index. 5970 * 5971 * The flags passed in are LOOKUP flags, not RESOLVE flags. 5972 * 5973 * If we are unable to locate the inode, HAMMER2_ERROR_EIO or HAMMER2_ERROR_CHECK 5974 * is returned. In case of error, *chainp and/or *parentp may still be returned 5975 * non-NULL. 5976 * 5977 * The caller may pass-in a locked *parentp and/or *chainp, or neither. 5978 * They will be unlocked and released by this function. The *parentp and 5979 * *chainp representing the located inode are returned locked. 5980 * 5981 * The returned error includes any error on the returned chain in addition to 5982 * errors incurred while trying to lookup the inode. However, a chain->error 5983 * might not be recognized if HAMMER2_LOOKUP_NODATA is passed. This flag may 5984 * not be passed to this function. 5985 */ 5986 int 5987 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 5988 int clindex, int flags, 5989 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 5990 { 5991 hammer2_chain_t *parent; 5992 hammer2_chain_t *rchain; 5993 hammer2_key_t key_dummy; 5994 hammer2_inode_t *ip; 5995 int resolve_flags; 5996 int error; 5997 5998 KKASSERT((flags & HAMMER2_LOOKUP_NODATA) == 0); 5999 6000 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 
6001 HAMMER2_RESOLVE_SHARED : 0; 6002 6003 /* 6004 * Caller expects us to replace these. 6005 */ 6006 if (*chainp) { 6007 hammer2_chain_unlock(*chainp); 6008 hammer2_chain_drop(*chainp); 6009 *chainp = NULL; 6010 } 6011 if (*parentp) { 6012 hammer2_chain_unlock(*parentp); 6013 hammer2_chain_drop(*parentp); 6014 *parentp = NULL; 6015 } 6016 6017 /* 6018 * Be very careful, this is a backend function and we CANNOT 6019 * lock any frontend inode structure we find. But we have to 6020 * look the inode up this way first in case it exists but is 6021 * detached from the radix tree. 6022 */ 6023 ip = hammer2_inode_lookup(pmp, inum); 6024 if (ip) { 6025 *chainp = hammer2_inode_chain_and_parent(ip, clindex, 6026 parentp, 6027 resolve_flags); 6028 hammer2_inode_drop(ip); 6029 if (*chainp) 6030 return (*chainp)->error; 6031 hammer2_chain_unlock(*chainp); 6032 hammer2_chain_drop(*chainp); 6033 *chainp = NULL; 6034 if (*parentp) { 6035 hammer2_chain_unlock(*parentp); 6036 hammer2_chain_drop(*parentp); 6037 *parentp = NULL; 6038 } 6039 } 6040 6041 /* 6042 * Inodes hang off of the iroot (bit 63 is clear, differentiating 6043 * inodes from root directory entries in the key lookup). 6044 */ 6045 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 6046 rchain = NULL; 6047 if (parent) { 6048 /* 6049 * NOTE: rchain can be returned as NULL even if error == 0 6050 * (i.e. not found) 6051 */ 6052 rchain = hammer2_chain_lookup(&parent, &key_dummy, 6053 inum, inum, 6054 &error, flags); 6055 /* 6056 * Propagate a chain-specific error to caller. 6057 * 6058 * If the chain is not errored, we must still validate that the inode 6059 * number is correct, because all hell will break loose if it isn't 6060 * correct. It should always be correct so print to the console and 6061 * simulate a CHECK error if it is not. 
6062 */ 6063 if (error == 0 && rchain) { 6064 error = rchain->error; 6065 if (error == 0 && rchain->data) { 6066 if (inum != rchain->data->ipdata.meta.inum) { 6067 kprintf("hammer2_chain_inode_find: lookup inum %ld, " 6068 "got valid inode but with inum %ld\n", 6069 inum, rchain->data->ipdata.meta.inum); 6070 error = HAMMER2_ERROR_CHECK; 6071 rchain->error = error; 6072 } 6073 } 6074 } 6075 } else { 6076 error = HAMMER2_ERROR_EIO; 6077 } 6078 *parentp = parent; 6079 *chainp = rchain; 6080 6081 return error; 6082 } 6083 6084 /* 6085 * Used by the bulkscan code to snapshot the synchronized storage for 6086 * a volume, allowing it to be scanned concurrently against normal 6087 * operation. 6088 */ 6089 hammer2_chain_t * 6090 hammer2_chain_bulksnap(hammer2_dev_t *hmp) 6091 { 6092 hammer2_chain_t *copy; 6093 6094 copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref); 6095 copy->data = kmalloc(sizeof(copy->data->voldata), 6096 hmp->mmsg, M_WAITOK | M_ZERO); 6097 hammer2_voldata_lock(hmp); 6098 copy->data->voldata = hmp->volsync; 6099 hammer2_voldata_unlock(hmp); 6100 6101 return copy; 6102 } 6103 6104 void 6105 hammer2_chain_bulkdrop(hammer2_chain_t *copy) 6106 { 6107 KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME); 6108 KKASSERT(copy->data); 6109 kfree(copy->data, copy->hmp->mmsg); 6110 copy->data = NULL; 6111 hammer2_chain_drop(copy); 6112 } 6113 6114 /* 6115 * Returns non-zero if the chain (INODE or DIRENT) matches the 6116 * filename. 
6117 */ 6118 int 6119 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name, 6120 size_t name_len) 6121 { 6122 const hammer2_inode_data_t *ripdata; 6123 6124 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 6125 ripdata = &chain->data->ipdata; 6126 if (ripdata->meta.name_len == name_len && 6127 bcmp(ripdata->filename, name, name_len) == 0) { 6128 return 1; 6129 } 6130 } 6131 if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT && 6132 chain->bref.embed.dirent.namlen == name_len) { 6133 if (name_len > sizeof(chain->bref.check.buf) && 6134 bcmp(chain->data->buf, name, name_len) == 0) { 6135 return 1; 6136 } 6137 if (name_len <= sizeof(chain->bref.check.buf) && 6138 bcmp(chain->bref.check.buf, name, name_len) == 0) { 6139 return 1; 6140 } 6141 } 6142 return 0; 6143 } 6144