1 /* 2 * Copyright (c) 2011-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 
 *
 * Chains are the in-memory version of media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * Chains are no-longer delete-duplicated.  Instead, the original in-memory
 * chain will be moved along with its block reference (e.g. for things like
 * renames, hardlink operations, modifications, etc), and will be indexed
 * on a secondary list for flush handling instead of propagating a flag
 * upward to the root.
 *
 * Concurrent front-end operations can still run against backend flushes
 * as long as they do not cross the current flush boundary.  An operation
 * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while an operation running below the current
 * flush can become part of the next flush.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/kern_syscall.h>
#include <sys/uuid.h>

#include <crypto/sha2/sha2.h>

#include "hammer2.h"

static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_chain_t *parent,
		hammer2_key_t key, int keybits,
		hammer2_tid_t mtid, int for_type, int *errorp);
static hammer2_io_t *hammer2_chain_drop_data(hammer2_chain_t *chain);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **bresp);

static struct krate krate_h2me = { .freq = 1 };

/*
 * Basic RBTree for chains (core->rbtree and core->dbtree).  Chains cannot
 * overlap in the RB trees.  Deleted chains are moved from rbtree to either
 * dbtree or to dbq.
 *
 * Chains in delete-duplicate sequences can always iterate through core_entry
 * to locate the live version of the chain.
88 */ 89 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 90 91 int 92 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 93 { 94 hammer2_key_t c1_beg; 95 hammer2_key_t c1_end; 96 hammer2_key_t c2_beg; 97 hammer2_key_t c2_end; 98 99 /* 100 * Compare chains. Overlaps are not supposed to happen and catch 101 * any software issues early we count overlaps as a match. 102 */ 103 c1_beg = chain1->bref.key; 104 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1; 105 c2_beg = chain2->bref.key; 106 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1; 107 108 if (c1_end < c2_beg) /* fully to the left */ 109 return(-1); 110 if (c1_beg > c2_end) /* fully to the right */ 111 return(1); 112 return(0); /* overlap (must not cross edge boundary) */ 113 } 114 115 /* 116 * Assert that a chain has no media data associated with it. 117 */ 118 static __inline void 119 hammer2_chain_assert_no_data(hammer2_chain_t *chain) 120 { 121 KKASSERT(chain->dio == NULL); 122 if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME && 123 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP && 124 chain->data) { 125 panic("hammer2_assert_no_data: chain %p still has data", chain); 126 } 127 } 128 129 /* 130 * Make a chain visible to the flusher. The flusher needs to be able to 131 * do flushes of subdirectory chains or single files so it does a top-down 132 * recursion using the ONFLUSH flag for the recursion. It locates MODIFIED 133 * or UPDATE chains and flushes back up the chain to the volume root. 134 * 135 * This routine sets ONFLUSH upward until it hits the volume root. For 136 * simplicity we ignore PFSROOT boundaries whos rules can be complex. 137 * Extra ONFLUSH flagging doesn't hurt the filesystem. 
 */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			if ((parent = chain->parent) == NULL)
				break;
			/*
			 * Hand-over-hand shared spinlocks going upward:
			 * acquire the parent's spinlock before releasing
			 * the child's so chain->parent cannot be ripped
			 * out from under us.
			 */
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->pmp inherits pmp unless the chain is an inode (other than the
 * super-root inode).
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
		    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes;

	/*
	 * Special case - radix of 0 indicates a chain that does not
	 * need a data reference (context is completely embedded in the
	 * bref).
	 */
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		bytes = 0;

	atomic_add_long(&hammer2_chain_allocs, 1);

	/*
	 * Construct the appropriate system structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Chain's are really only associated with the hmp but we
		 * maintain a pmp association for per-mount memory tracking
		 * purposes.  The pmp can be NULL.
		 */
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Only hammer2_chain_bulksnap() calls this function with these
		 * types.
		 */
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	default:
		/* unreachable after panic; chain = NULL quiets the compiler */
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	/*
	 * Initialize the new chain structure.  pmp must be set to NULL for
	 * chains belonging to the super-root topology of a device mount.
	 */
	if (pmp == hmp->spmp)
		chain->pmp = NULL;
	else
		chain->pmp = pmp;
	chain->hmp = hmp;
	chain->bref = *bref;
	chain->bytes = bytes;
	chain->refs = 1;
	chain->flags = HAMMER2_CHAIN_ALLOCATED;

	/*
	 * Set the PFS boundary flag if this chain represents a PFS root.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
	hammer2_chain_core_init(chain);

	return (chain);
}

/*
 * Initialize a chain's core structure.  This structure used to be allocated
 * but is now embedded.
 *
 * The core is not locked.  No additional refs on the chain are made.
 * (trans) must not be NULL if (core) is not NULL.
 */
void
hammer2_chain_core_init(hammer2_chain_t *chain)
{
	/*
	 * Fresh core under nchain (no multi-homing of ochain's
	 * sub-tree).
	 */
	RB_INIT(&chain->core.rbtree);	/* live chains */
	hammer2_mtx_init(&chain->lock, "h2chain");
}

/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * (can be called with spinlock held)
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
		/*
		 * 0->non-zero transition must ensure that chain is removed
		 * from the LRU list.
		 *
		 * The ONLRU flag is re-tested after acquiring lru_spin
		 * because another cpu may have raced us and already taken
		 * the chain off the list.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			hammer2_pfs_t *pmp = chain->pmp;
			hammer2_spin_ex(&pmp->lru_spin);
			if (chain->flags & HAMMER2_CHAIN_ONLRU) {
				atomic_add_int(&pmp->lru_count, -1);
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_ONLRU);
				TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
			}
			hammer2_spin_unex(&pmp->lru_spin);
		}
	}
#if 0
	kprintf("REFC %p %d %08x\n", chain, chain->refs - 1, chain->flags);
	print_backtrace(8);
#endif
}

/*
 * Ref a locked chain and force the data to be held across an unlock.
 * Chain must be currently locked.  The user of the chain who desires
 * to release the hold must call hammer2_chain_lock_unhold() to relock
 * and unhold the chain, then unlock normally, or may simply call
 * hammer2_chain_drop_unhold() (which is safer against deadlocks).
 */
void
hammer2_chain_ref_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_ref(chain);
}

/*
 * Insert the chain in the core rbtree.
 *
 * Normal insertions are placed in the live rbtree.  Insertion of a deleted
 * chain is a special case used by the flush code that is placed on the
 * unstaged deleted list to avoid confusing the live view.
315 */ 316 #define HAMMER2_CHAIN_INSERT_SPIN 0x0001 317 #define HAMMER2_CHAIN_INSERT_LIVE 0x0002 318 #define HAMMER2_CHAIN_INSERT_RACE 0x0004 319 320 static 321 int 322 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain, 323 int flags, int generation) 324 { 325 hammer2_chain_t *xchain; 326 int error = 0; 327 328 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 329 hammer2_spin_ex(&parent->core.spin); 330 331 /* 332 * Interlocked by spinlock, check for race 333 */ 334 if ((flags & HAMMER2_CHAIN_INSERT_RACE) && 335 parent->core.generation != generation) { 336 error = HAMMER2_ERROR_EAGAIN; 337 goto failed; 338 } 339 340 /* 341 * Insert chain 342 */ 343 xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain); 344 KASSERT(xchain == NULL, 345 ("hammer2_chain_insert: collision %p %p (key=%016jx)", 346 chain, xchain, chain->bref.key)); 347 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 348 chain->parent = parent; 349 ++parent->core.chain_count; 350 ++parent->core.generation; /* XXX incs for _get() too, XXX */ 351 352 /* 353 * We have to keep track of the effective live-view blockref count 354 * so the create code knows when to push an indirect block. 355 */ 356 if (flags & HAMMER2_CHAIN_INSERT_LIVE) 357 atomic_add_int(&parent->core.live_count, 1); 358 failed: 359 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 360 hammer2_spin_unex(&parent->core.spin); 361 return error; 362 } 363 364 /* 365 * Drop the caller's reference to the chain. When the ref count drops to 366 * zero this function will try to disassociate the chain from its parent and 367 * deallocate it, then recursely drop the parent using the implied ref 368 * from the chain's chain->parent. 369 * 370 * Nobody should own chain's mutex on the 1->0 transition, unless this drop 371 * races an acquisition by another cpu. Therefore we can loop if we are 372 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again 373 * race against another drop. 
 */
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);

void
hammer2_chain_drop(hammer2_chain_t *chain)
{
	u_int refs;

	if (hammer2_debug & 0x200000)
		Debugger("drop");
#if 0
	kprintf("DROP %p %d %08x\n", chain, chain->refs - 1, chain->flags);
	print_backtrace(8);
#endif

	KKASSERT(chain->refs > 0);

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);

		if (refs == 1) {
			/*
			 * Possible 1->0 transition.  The disposal work is
			 * done under the chain's mutex; if the trylock
			 * fails another cpu holds it and we spin-retry.
			 * lastdrop() returns the next chain to drop (the
			 * parent, via the implied ref) or NULL when done,
			 * or the same chain if it raced another ref.
			 */
			if (mtx_lock_ex_try(&chain->lock) == 0)
				chain = hammer2_chain_lastdrop(chain);
			/* retry the same chain */
		} else {
			/*
			 * Normal lockless decrement.  The cmpset fails if
			 * refs changed underneath us; retry.
			 */
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}
		cpu_pause();
	}
}

/*
 * Unhold a held and probably not-locked chain, ensure that the data is
 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive
 * lock and then simply unlocking the chain.
 */
void
hammer2_chain_drop_unhold(hammer2_chain_t *chain)
{
	u_int lockcnt;
	int iter = 0;

	for (;;) {
		lockcnt = chain->lockcnt;
		cpu_ccfence();
		if (lockcnt > 1) {
			/* not the last hold, lockless decrement */
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				break;
			}
		} else if (mtx_lock_ex_try(&chain->lock) == 0) {
			/* last hold; the unlock path disposes of the data */
			hammer2_chain_unlock(chain);
			break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap inbetween the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race1 %p\n",
						chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race1", 1);
			}
			cpu_pause();
		}
	}
	hammer2_chain_drop(chain);
}

/*
 * Handles the (potential) last drop of chain->refs from 1->0.  Called with
Called with 455 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are 456 * possible against refs and lockcnt. We must dispose of the mutex on chain. 457 * 458 * This function returns an unlocked chain for recursive drop or NULL. It 459 * can return the same chain if it determines it has raced another ref. 460 * 461 * -- 462 * 463 * When two chains need to be recursively dropped we use the chain we 464 * would otherwise free to placehold the additional chain. It's a bit 465 * convoluted but we can't just recurse without potentially blowing out 466 * the kernel stack. 467 * 468 * The chain cannot be freed if it has any children. 469 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it. 470 * The chain cannot be freed if flagged UPDATE unless we can dispose of it. 471 * Any dedup registration can remain intact. 472 * 473 * The core spinlock is allowed to nest child-to-parent (not parent-to-child). 474 */ 475 static 476 hammer2_chain_t * 477 hammer2_chain_lastdrop(hammer2_chain_t *chain) 478 { 479 hammer2_pfs_t *pmp; 480 hammer2_dev_t *hmp; 481 hammer2_chain_t *parent; 482 hammer2_chain_t *rdrop; 483 #if 0 484 hammer2_io_t *dio; 485 #endif 486 487 #if 0 488 /* 489 * On last drop if there is no parent and data_off is good (at 490 * least does not represent the volume root), the modified chain 491 * is probably going to be destroyed. We have to make sure that 492 * the data area is not registered for dedup. 493 * 494 * XXX removed. In fact, we do not have to make sure that the 495 * data area is not registered for dedup. The data area 496 * can, in fact, still be used for dedup because it is 497 * still allocated in the freemap and the underlying I/O 498 * will still be flushed. 
499 */ 500 if (chain->parent == NULL && 501 (chain->flags & HAMMER2_CHAIN_MODIFIED) && 502 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 503 hmp = chain->hmp; 504 hammer2_io_dedup_delete(hmp, chain->bref.type, 505 chain->bref.data_off, chain->bytes); 506 } 507 #endif 508 /* 509 * We need chain's spinlock to interlock the sub-tree test. 510 * We already have chain's mutex, protecting chain->parent. 511 * 512 * Remember that chain->refs can be in flux. 513 */ 514 hammer2_spin_ex(&chain->core.spin); 515 516 if ((parent = chain->parent) != NULL) { 517 /* 518 * If the chain has a parent the UPDATE bit prevents scrapping 519 * as the chain is needed to properly flush the parent. Try 520 * to complete the 1->0 transition and return NULL. Retry 521 * (return chain) if we are unable to complete the 1->0 522 * transition, else return NULL (nothing more to do). 523 * 524 * If the chain has a parent the MODIFIED bit prevents 525 * scrapping. 526 * 527 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 528 */ 529 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 530 HAMMER2_CHAIN_MODIFIED)) { 531 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 532 hammer2_spin_unex(&chain->core.spin); 533 #if 0 534 dio = hammer2_chain_drop_data(chain, 0); 535 if (dio) 536 hammer2_io_bqrelse(&dio); 537 #endif 538 hammer2_chain_assert_no_data(chain); 539 hammer2_mtx_unlock(&chain->lock); 540 chain = NULL; 541 } else { 542 hammer2_spin_unex(&chain->core.spin); 543 hammer2_mtx_unlock(&chain->lock); 544 } 545 return (chain); 546 } 547 /* spinlock still held */ 548 } else { 549 /* 550 * The chain has no parent and can be flagged for destruction. 551 * Since it has no parent, UPDATE can also be cleared. 552 */ 553 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 554 if (chain->flags & HAMMER2_CHAIN_UPDATE) 555 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 556 557 /* 558 * If the chain has children we must still flush the chain. 
559 * Any dedup is already handled by the underlying DIO, so 560 * we do not have to specifically flush it here. 561 * 562 * In the case where it has children, the DESTROY flag test 563 * in the flush code will prevent unnecessary flushes of 564 * MODIFIED chains that are not flagged DEDUP so don't worry 565 * about that here. 566 */ 567 if (chain->core.chain_count) { 568 /* 569 * Put on flushq (should ensure refs > 1), retry 570 * the drop. 571 */ 572 hammer2_spin_unex(&chain->core.spin); 573 hammer2_delayed_flush(chain); 574 hammer2_mtx_unlock(&chain->lock); 575 576 return(chain); /* retry drop */ 577 } 578 579 /* 580 * Otherwise we can scrap the MODIFIED bit if it is set, 581 * and continue along the freeing path. 582 * 583 * Be sure to clean-out any dedup bits. Without a parent 584 * this chain will no longer be visible to the flush code. 585 * Easy check data_off to avoid the volume root. 586 */ 587 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 588 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 589 atomic_add_long(&hammer2_count_modified_chains, -1); 590 if (chain->pmp) 591 hammer2_pfs_memory_wakeup(chain->pmp); 592 } 593 /* spinlock still held */ 594 } 595 596 /* spinlock still held */ 597 #if 0 598 dio = NULL; 599 #endif 600 601 /* 602 * If any children exist we must leave the chain intact with refs == 0. 603 * They exist because chains are retained below us which have refs or 604 * may require flushing. 605 * 606 * Retry (return chain) if we fail to transition the refs to 0, else 607 * return NULL indication nothing more to do. 608 * 609 * Chains with children are NOT put on the LRU list. 
610 */ 611 if (chain->core.chain_count) { 612 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 613 hammer2_spin_unex(&chain->core.spin); 614 hammer2_chain_assert_no_data(chain); 615 hammer2_mtx_unlock(&chain->lock); 616 chain = NULL; 617 } else { 618 hammer2_spin_unex(&chain->core.spin); 619 hammer2_mtx_unlock(&chain->lock); 620 } 621 return (chain); 622 } 623 /* spinlock still held */ 624 /* no chains left under us */ 625 626 /* 627 * chain->core has no children left so no accessors can get to our 628 * chain from there. Now we have to lock the parent core to interlock 629 * remaining possible accessors that might bump chain's refs before 630 * we can safely drop chain's refs with intent to free the chain. 631 */ 632 hmp = chain->hmp; 633 pmp = chain->pmp; /* can be NULL */ 634 rdrop = NULL; 635 636 parent = chain->parent; 637 638 /* 639 * WARNING! chain's spin lock is still held here, and other spinlocks 640 * will be acquired and released in the code below. We 641 * cannot be making fancy procedure calls! 642 */ 643 644 /* 645 * We can cache the chain if it is associated with a pmp 646 * and not flagged as being destroyed or requesting a full 647 * release. In this situation the chain is not removed 648 * from its parent, i.e. it can still be looked up. 649 * 650 * We intentionally do not cache DATA chains because these 651 * were likely used to load data into the logical buffer cache 652 * and will not be accessed again for some time. 653 */ 654 if ((chain->flags & 655 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 656 chain->pmp && 657 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 658 if (parent) 659 hammer2_spin_ex(&parent->core.spin); 660 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 661 /* 662 * 1->0 transition failed, retry. Do not drop 663 * the chain's data yet! 
664 */ 665 if (parent) 666 hammer2_spin_unex(&parent->core.spin); 667 hammer2_spin_unex(&chain->core.spin); 668 hammer2_mtx_unlock(&chain->lock); 669 670 return(chain); 671 } 672 673 /* 674 * Success 675 */ 676 #if 0 677 dio = hammer2_chain_drop_data(chain, 1); 678 #endif 679 hammer2_chain_assert_no_data(chain); 680 681 KKASSERT((chain->flags & HAMMER2_CHAIN_ONLRU) == 0); 682 hammer2_spin_ex(&pmp->lru_spin); 683 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 684 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 685 686 /* 687 * If we are over the LRU limit we need to drop something. 688 */ 689 if (pmp->lru_count > HAMMER2_LRU_LIMIT) { 690 rdrop = TAILQ_FIRST(&pmp->lru_list); 691 atomic_clear_int(&rdrop->flags, HAMMER2_CHAIN_ONLRU); 692 TAILQ_REMOVE(&pmp->lru_list, rdrop, lru_node); 693 atomic_add_int(&rdrop->refs, 1); 694 atomic_set_int(&rdrop->flags, HAMMER2_CHAIN_RELEASE); 695 } else { 696 atomic_add_int(&pmp->lru_count, 1); 697 } 698 hammer2_spin_unex(&pmp->lru_spin); 699 if (parent) { 700 hammer2_spin_unex(&parent->core.spin); 701 parent = NULL; /* safety */ 702 } 703 hammer2_spin_unex(&chain->core.spin); 704 hammer2_mtx_unlock(&chain->lock); 705 #if 0 706 if (dio) 707 hammer2_io_bqrelse(&dio); 708 #endif 709 710 return rdrop; 711 /* NOT REACHED */ 712 } 713 714 /* 715 * Spinlock the parent and try to drop the last ref on chain. 716 * On success determine if we should dispose of the chain 717 * (remove the chain from its parent, etc). 718 * 719 * (normal core locks are top-down recursive but we define 720 * core spinlocks as bottom-up recursive, so this is safe). 
721 */ 722 if (parent) { 723 hammer2_spin_ex(&parent->core.spin); 724 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 725 #if 0 726 /* XXX remove, don't try to drop data on fail */ 727 hammer2_spin_unex(&parent->core.spin); 728 dio = hammer2_chain_drop_data(chain, 0); 729 hammer2_spin_unex(&chain->core.spin); 730 if (dio) 731 hammer2_io_bqrelse(&dio); 732 #endif 733 /* 734 * 1->0 transition failed, retry. 735 */ 736 hammer2_spin_unex(&parent->core.spin); 737 hammer2_spin_unex(&chain->core.spin); 738 hammer2_mtx_unlock(&chain->lock); 739 740 return(chain); 741 } 742 743 /* 744 * 1->0 transition successful, parent spin held to prevent 745 * new lookups, chain spinlock held to protect parent field. 746 * Remove chain from the parent. 747 * 748 * If the chain is being removed from the parent's btree but 749 * is not bmapped, we have to adjust live_count downward. If 750 * it is bmapped then the blockref is retained in the parent 751 * as is its associated live_count. This case can occur when 752 * a chain added to the topology is unable to flush and is 753 * then later deleted. 754 */ 755 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 756 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 757 (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) { 758 atomic_add_int(&parent->core.live_count, -1); 759 } 760 RB_REMOVE(hammer2_chain_tree, 761 &parent->core.rbtree, chain); 762 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 763 --parent->core.chain_count; 764 chain->parent = NULL; 765 } 766 767 /* 768 * If our chain was the last chain in the parent's core the 769 * core is now empty and its parent might have to be 770 * re-dropped if it has 0 refs. 
771 */ 772 if (parent->core.chain_count == 0) { 773 rdrop = parent; 774 atomic_add_int(&rdrop->refs, 1); 775 /* 776 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 777 rdrop = NULL; 778 */ 779 } 780 hammer2_spin_unex(&parent->core.spin); 781 parent = NULL; /* safety */ 782 /* FALL THROUGH */ 783 } else { 784 /* 785 * No-parent case. 786 */ 787 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 788 /* 789 * 1->0 transition failed, retry. 790 */ 791 hammer2_spin_unex(&parent->core.spin); 792 hammer2_spin_unex(&chain->core.spin); 793 hammer2_mtx_unlock(&chain->lock); 794 795 return(chain); 796 } 797 } 798 799 /* 800 * Successful 1->0 transition, no parent, no children... no way for 801 * anyone to ref this chain any more. We can clean-up and free it. 802 * 803 * We still have the core spinlock, and core's chain_count is 0. 804 * Any parent spinlock is gone. 805 */ 806 hammer2_spin_unex(&chain->core.spin); 807 hammer2_chain_assert_no_data(chain); 808 hammer2_mtx_unlock(&chain->lock); 809 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 810 chain->core.chain_count == 0); 811 812 /* 813 * All locks are gone, no pointers remain to the chain, finish 814 * freeing it. 815 */ 816 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 817 HAMMER2_CHAIN_MODIFIED)) == 0); 818 #if 0 819 dio = hammer2_chain_drop_data(chain, 1); 820 if (dio) 821 hammer2_io_bqrelse(&dio); 822 #endif 823 824 /* 825 * Once chain resources are gone we can use the now dead chain 826 * structure to placehold what might otherwise require a recursive 827 * drop, because we have potentially two things to drop and can only 828 * return one directly. 829 */ 830 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 831 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 832 chain->hmp = NULL; 833 kfree(chain, hmp->mchain); 834 } 835 836 /* 837 * Possible chaining loop when parent re-drop needed. 838 */ 839 return(rdrop); 840 } 841 842 /* 843 * On last lock release. 
844 */ 845 static hammer2_io_t * 846 hammer2_chain_drop_data(hammer2_chain_t *chain) 847 { 848 hammer2_io_t *dio; 849 850 if ((dio = chain->dio) != NULL) { 851 chain->dio = NULL; 852 chain->data = NULL; 853 } else { 854 switch(chain->bref.type) { 855 case HAMMER2_BREF_TYPE_VOLUME: 856 case HAMMER2_BREF_TYPE_FREEMAP: 857 break; 858 default: 859 if (chain->data != NULL) { 860 hammer2_spin_unex(&chain->core.spin); 861 panic("chain data not null: " 862 "chain %p bref %016jx.%02x " 863 "refs %d parent %p dio %p data %p", 864 chain, chain->bref.data_off, 865 chain->bref.type, chain->refs, 866 chain->parent, 867 chain->dio, chain->data); 868 } 869 KKASSERT(chain->data == NULL); 870 break; 871 } 872 } 873 return dio; 874 } 875 876 /* 877 * Lock a referenced chain element, acquiring its data with I/O if necessary, 878 * and specify how you would like the data to be resolved. 879 * 880 * If an I/O or other fatal error occurs, chain->error will be set to non-zero. 881 * 882 * The lock is allowed to recurse, multiple locking ops will aggregate 883 * the requested resolve types. Once data is assigned it will not be 884 * removed until the last unlock. 885 * 886 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 887 * (typically used to avoid device/logical buffer 888 * aliasing for data) 889 * 890 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 891 * the INITIAL-create state (indirect blocks only). 892 * 893 * Do not resolve data elements for DATA chains. 894 * (typically used to avoid device/logical buffer 895 * aliasing for data) 896 * 897 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 898 * 899 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 900 * it will be locked exclusive. 901 * 902 * NOTE: Embedded elements (volume header, inodes) are always resolved 903 * regardless. 
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * WARNING! This function blocks on I/O if data needs to be fetched.  This
 *	    blocking can run concurrent with other compatible lock holders
 *	    who do not need data returning.  The lock is not upgraded to
 *	    exclusive during a data fetch, a separate bit is used to
 *	    interlock I/O.  However, an exclusive lock holder can still count
 *	    on being interlocked against an I/O fetch managed by a shared
 *	    lock holder.
 */
void
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 */
	KKASSERT(chain->refs > 0);
	atomic_add_int(&chain->lockcnt, 1);

	/*
	 * Get the appropriate lock.  If LOCKAGAIN is flagged with SHARED
	 * the caller expects a shared lock to already be present and we
	 * are giving it another ref.  This case must importantly not block
	 * if there is a pending exclusive lock request.
	 */
	if (how & HAMMER2_RESOLVE_SHARED) {
		if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
			hammer2_mtx_sh_again(&chain->lock);
		} else {
			hammer2_mtx_sh(&chain->lock);
		}
	} else {
		hammer2_mtx_ex(&chain->lock);
	}
	++curthread->td_tracker;

	/*
	 * If we already have a valid data pointer no further action is
	 * necessary.
	 */
	if (chain->data)
		return;

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expect the data to be there.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return;
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return;
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return;
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return;
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return;
#endif
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
	default:
		break;
	}

	/*
	 * Caller requires data
	 */
	hammer2_chain_load_data(chain);
}

/*
 * Lock the chain, retain the hold, and drop the data persistence count.
 * The data should remain valid because we never transitioned lockcnt
 * through 0.
 */
void
hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
{
	hammer2_chain_lock(chain, how);
	atomic_add_int(&chain->lockcnt, -1);
}

#if 0
/*
 * Downgrade an exclusive chain lock to a shared chain lock.
 *
 * NOTE: There is no upgrade equivalent due to the ease of
 *	 deadlocks in that direction.
 */
void
hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
{
	hammer2_mtx_downgrade(&chain->lock);
}
#endif

/*
 * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
 * may be of any type.
 *
 * Once chain->data is set it cannot be disposed of until all locks are
 * released.
 */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
1031 */ 1032 if (chain->data) 1033 return; 1034 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) 1035 return; 1036 1037 hmp = chain->hmp; 1038 KKASSERT(hmp != NULL); 1039 1040 /* 1041 * Gain the IOINPROG bit, interlocked block. 1042 */ 1043 for (;;) { 1044 u_int oflags; 1045 u_int nflags; 1046 1047 oflags = chain->flags; 1048 cpu_ccfence(); 1049 if (oflags & HAMMER2_CHAIN_IOINPROG) { 1050 nflags = oflags | HAMMER2_CHAIN_IOSIGNAL; 1051 tsleep_interlock(&chain->flags, 0); 1052 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1053 tsleep(&chain->flags, PINTERLOCKED, 1054 "h2iocw", 0); 1055 } 1056 /* retry */ 1057 } else { 1058 nflags = oflags | HAMMER2_CHAIN_IOINPROG; 1059 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1060 break; 1061 } 1062 /* retry */ 1063 } 1064 } 1065 1066 /* 1067 * We own CHAIN_IOINPROG 1068 * 1069 * Degenerate case if we raced another load. 1070 */ 1071 if (chain->data) 1072 goto done; 1073 1074 /* 1075 * We must resolve to a device buffer, either by issuing I/O or 1076 * by creating a zero-fill element. We do not mark the buffer 1077 * dirty when creating a zero-fill element (the hammer2_chain_modify() 1078 * API must still be used to do that). 1079 * 1080 * The device buffer is variable-sized in powers of 2 down 1081 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 1082 * chunk always contains buffers of the same size. (XXX) 1083 * 1084 * The minimum physical IO size may be larger than the variable 1085 * block size. 1086 */ 1087 bref = &chain->bref; 1088 1089 /* 1090 * The getblk() optimization can only be used on newly created 1091 * elements if the physical block size matches the request. 
1092 */ 1093 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1094 error = hammer2_io_new(hmp, bref->type, 1095 bref->data_off, chain->bytes, 1096 &chain->dio); 1097 } else { 1098 error = hammer2_io_bread(hmp, bref->type, 1099 bref->data_off, chain->bytes, 1100 &chain->dio); 1101 hammer2_adjreadcounter(&chain->bref, chain->bytes); 1102 } 1103 if (error) { 1104 chain->error = HAMMER2_ERROR_EIO; 1105 kprintf("hammer2_chain_lock: I/O error %016jx: %d\n", 1106 (intmax_t)bref->data_off, error); 1107 hammer2_io_bqrelse(&chain->dio); 1108 goto done; 1109 } 1110 chain->error = 0; 1111 1112 /* 1113 * This isn't perfect and can be ignored on OSs which do not have 1114 * an indication as to whether a buffer is coming from cache or 1115 * if I/O was actually issued for the read. TESTEDGOOD will work 1116 * pretty well without the B_IOISSUED logic because chains are 1117 * cached, but in that situation (without B_IOISSUED) it will not 1118 * detect whether a re-read via I/O is corrupted verses the original 1119 * read. 1120 * 1121 * We can't re-run the CRC on every fresh lock. That would be 1122 * insanely expensive. 1123 * 1124 * If the underlying kernel buffer covers the entire chain we can 1125 * use the B_IOISSUED indication to determine if we have to re-run 1126 * the CRC on chain data for chains that managed to stay cached 1127 * across the kernel disposal of the original buffer. 1128 */ 1129 if ((dio = chain->dio) != NULL && dio->bp) { 1130 struct buf *bp = dio->bp; 1131 1132 if (dio->psize == chain->bytes && 1133 (bp->b_flags & B_IOISSUED)) { 1134 atomic_clear_int(&chain->flags, 1135 HAMMER2_CHAIN_TESTEDGOOD); 1136 bp->b_flags &= ~B_IOISSUED; 1137 } 1138 } 1139 1140 /* 1141 * NOTE: A locked chain's data cannot be modified without first 1142 * calling hammer2_chain_modify(). 1143 */ 1144 1145 /* 1146 * Clear INITIAL. In this case we used io_new() and the buffer has 1147 * been zero'd and marked dirty. 
1148 */ 1149 bdata = hammer2_io_data(chain->dio, chain->bref.data_off); 1150 1151 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1152 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1153 chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO; 1154 } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 1155 /* 1156 * check data not currently synchronized due to 1157 * modification. XXX assumes data stays in the buffer 1158 * cache, which might not be true (need biodep on flush 1159 * to calculate crc? or simple crc?). 1160 */ 1161 } else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) { 1162 if (hammer2_chain_testcheck(chain, bdata) == 0) { 1163 chain->error = HAMMER2_ERROR_CHECK; 1164 } else { 1165 atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD); 1166 } 1167 } 1168 1169 /* 1170 * Setup the data pointer, either pointing it to an embedded data 1171 * structure and copying the data from the buffer, or pointing it 1172 * into the buffer. 1173 * 1174 * The buffer is not retained when copying to an embedded data 1175 * structure in order to avoid potential deadlocks or recursions 1176 * on the same physical buffer. 1177 * 1178 * WARNING! Other threads can start using the data the instant we 1179 * set chain->data non-NULL. 1180 */ 1181 switch (bref->type) { 1182 case HAMMER2_BREF_TYPE_VOLUME: 1183 case HAMMER2_BREF_TYPE_FREEMAP: 1184 /* 1185 * Copy data from bp to embedded buffer 1186 */ 1187 panic("hammer2_chain_load_data: unresolved volume header"); 1188 break; 1189 case HAMMER2_BREF_TYPE_DIRENT: 1190 KKASSERT(chain->bytes != 0); 1191 /* fall through */ 1192 case HAMMER2_BREF_TYPE_INODE: 1193 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1194 case HAMMER2_BREF_TYPE_INDIRECT: 1195 case HAMMER2_BREF_TYPE_DATA: 1196 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1197 default: 1198 /* 1199 * Point data at the device buffer and leave dio intact. 1200 */ 1201 chain->data = (void *)bdata; 1202 break; 1203 } 1204 1205 /* 1206 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested. 
1207 */ 1208 done: 1209 for (;;) { 1210 u_int oflags; 1211 u_int nflags; 1212 1213 oflags = chain->flags; 1214 nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG | 1215 HAMMER2_CHAIN_IOSIGNAL); 1216 KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG); 1217 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1218 if (oflags & HAMMER2_CHAIN_IOSIGNAL) 1219 wakeup(&chain->flags); 1220 break; 1221 } 1222 } 1223 } 1224 1225 /* 1226 * Unlock and deref a chain element. 1227 * 1228 * Remember that the presence of children under chain prevent the chain's 1229 * destruction but do not add additional references, so the dio will still 1230 * be dropped. 1231 */ 1232 void 1233 hammer2_chain_unlock(hammer2_chain_t *chain) 1234 { 1235 hammer2_io_t *dio; 1236 u_int lockcnt; 1237 int iter = 0; 1238 1239 --curthread->td_tracker; 1240 1241 /* 1242 * If multiple locks are present (or being attempted) on this 1243 * particular chain we can just unlock, drop refs, and return. 1244 * 1245 * Otherwise fall-through on the 1->0 transition. 1246 */ 1247 for (;;) { 1248 lockcnt = chain->lockcnt; 1249 KKASSERT(lockcnt > 0); 1250 cpu_ccfence(); 1251 if (lockcnt > 1) { 1252 if (atomic_cmpset_int(&chain->lockcnt, 1253 lockcnt, lockcnt - 1)) { 1254 hammer2_mtx_unlock(&chain->lock); 1255 return; 1256 } 1257 } else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) { 1258 /* while holding the mutex exclusively */ 1259 if (atomic_cmpset_int(&chain->lockcnt, 1, 0)) 1260 break; 1261 } else { 1262 /* 1263 * This situation can easily occur on SMP due to 1264 * the gap inbetween the 1->0 transition and the 1265 * final unlock. We cannot safely block on the 1266 * mutex because lockcnt might go above 1. 1267 * 1268 * XXX Sleep for one tick if it takes too long. 
1269 */ 1270 if (++iter > 1000) { 1271 if (iter > 1000 + hz) { 1272 kprintf("hammer2: h2race2 %p\n", chain); 1273 iter = 1000; 1274 } 1275 tsleep(&iter, 0, "h2race2", 1); 1276 } 1277 cpu_pause(); 1278 } 1279 /* retry */ 1280 } 1281 1282 /* 1283 * Last unlock / mutex upgraded to exclusive. Drop the data 1284 * reference. 1285 */ 1286 dio = hammer2_chain_drop_data(chain); 1287 if (dio) 1288 hammer2_io_bqrelse(&dio); 1289 hammer2_mtx_unlock(&chain->lock); 1290 } 1291 1292 /* 1293 * Unlock and hold chain data intact 1294 */ 1295 void 1296 hammer2_chain_unlock_hold(hammer2_chain_t *chain) 1297 { 1298 atomic_add_int(&chain->lockcnt, 1); 1299 hammer2_chain_unlock(chain); 1300 } 1301 1302 /* 1303 * Helper to obtain the blockref[] array base and count for a chain. 1304 * 1305 * XXX Not widely used yet, various use cases need to be validated and 1306 * converted to use this function. 1307 */ 1308 static 1309 hammer2_blockref_t * 1310 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1311 { 1312 hammer2_blockref_t *base; 1313 int count; 1314 1315 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1316 base = NULL; 1317 1318 switch(parent->bref.type) { 1319 case HAMMER2_BREF_TYPE_INODE: 1320 count = HAMMER2_SET_COUNT; 1321 break; 1322 case HAMMER2_BREF_TYPE_INDIRECT: 1323 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1324 count = parent->bytes / sizeof(hammer2_blockref_t); 1325 break; 1326 case HAMMER2_BREF_TYPE_VOLUME: 1327 count = HAMMER2_SET_COUNT; 1328 break; 1329 case HAMMER2_BREF_TYPE_FREEMAP: 1330 count = HAMMER2_SET_COUNT; 1331 break; 1332 default: 1333 panic("hammer2_chain_create_indirect: " 1334 "unrecognized blockref type: %d", 1335 parent->bref.type); 1336 count = 0; 1337 break; 1338 } 1339 } else { 1340 switch(parent->bref.type) { 1341 case HAMMER2_BREF_TYPE_INODE: 1342 base = &parent->data->ipdata.u.blockset.blockref[0]; 1343 count = HAMMER2_SET_COUNT; 1344 break; 1345 case HAMMER2_BREF_TYPE_INDIRECT: 1346 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1347 base = 
&parent->data->npdata[0]; 1348 count = parent->bytes / sizeof(hammer2_blockref_t); 1349 break; 1350 case HAMMER2_BREF_TYPE_VOLUME: 1351 base = &parent->data->voldata. 1352 sroot_blockset.blockref[0]; 1353 count = HAMMER2_SET_COUNT; 1354 break; 1355 case HAMMER2_BREF_TYPE_FREEMAP: 1356 base = &parent->data->blkset.blockref[0]; 1357 count = HAMMER2_SET_COUNT; 1358 break; 1359 default: 1360 panic("hammer2_chain_create_indirect: " 1361 "unrecognized blockref type: %d", 1362 parent->bref.type); 1363 count = 0; 1364 break; 1365 } 1366 } 1367 *countp = count; 1368 1369 return base; 1370 } 1371 1372 /* 1373 * This counts the number of live blockrefs in a block array and 1374 * also calculates the point at which all remaining blockrefs are empty. 1375 * This routine can only be called on a live chain. 1376 * 1377 * Caller holds the chain locked, but possibly with a shared lock. We 1378 * must use an exclusive spinlock to prevent corruption. 1379 * 1380 * NOTE: Flag is not set until after the count is complete, allowing 1381 * callers to test the flag without holding the spinlock. 1382 * 1383 * NOTE: If base is NULL the related chain is still in the INITIAL 1384 * state and there are no blockrefs to count. 1385 * 1386 * NOTE: live_count may already have some counts accumulated due to 1387 * creation and deletion and could even be initially negative. 
1388 */ 1389 void 1390 hammer2_chain_countbrefs(hammer2_chain_t *chain, 1391 hammer2_blockref_t *base, int count) 1392 { 1393 hammer2_spin_ex(&chain->core.spin); 1394 if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) { 1395 if (base) { 1396 while (--count >= 0) { 1397 if (base[count].type) 1398 break; 1399 } 1400 chain->core.live_zero = count + 1; 1401 while (count >= 0) { 1402 if (base[count].type) 1403 atomic_add_int(&chain->core.live_count, 1404 1); 1405 --count; 1406 } 1407 } else { 1408 chain->core.live_zero = 0; 1409 } 1410 /* else do not modify live_count */ 1411 atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS); 1412 } 1413 hammer2_spin_unex(&chain->core.spin); 1414 } 1415 1416 /* 1417 * Resize the chain's physical storage allocation in-place. This function does 1418 * not usually adjust the data pointer and must be followed by (typically) a 1419 * hammer2_chain_modify() call to copy any old data over and adjust the 1420 * data pointer. 1421 * 1422 * Chains can be resized smaller without reallocating the storage. Resizing 1423 * larger will reallocate the storage. Excess or prior storage is reclaimed 1424 * asynchronously at a later time. 1425 * 1426 * An nradix value of 0 is special-cased to mean that the storage should 1427 * be disassociated, that is the chain is being resized to 0 bytes (not 1 1428 * byte). 1429 * 1430 * Must be passed an exclusively locked parent and chain. 1431 * 1432 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order 1433 * to avoid instantiating a device buffer that conflicts with the vnode data 1434 * buffer. However, because H2 can compress or encrypt data, the chain may 1435 * have a dio assigned to it in those situations, and they do not conflict. 1436 * 1437 * XXX return error if cannot resize. 
1438 */ 1439 int 1440 hammer2_chain_resize(hammer2_chain_t *chain, 1441 hammer2_tid_t mtid, hammer2_off_t dedup_off, 1442 int nradix, int flags) 1443 { 1444 hammer2_dev_t *hmp; 1445 size_t obytes; 1446 size_t nbytes; 1447 int error; 1448 1449 hmp = chain->hmp; 1450 1451 /* 1452 * Only data and indirect blocks can be resized for now. 1453 * (The volu root, inodes, and freemap elements use a fixed size). 1454 */ 1455 KKASSERT(chain != &hmp->vchain); 1456 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1457 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 1458 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT); 1459 1460 /* 1461 * Nothing to do if the element is already the proper size 1462 */ 1463 obytes = chain->bytes; 1464 nbytes = (nradix) ? (1U << nradix) : 0; 1465 if (obytes == nbytes) 1466 return (chain->error); 1467 1468 /* 1469 * Make sure the old data is instantiated so we can copy it. If this 1470 * is a data block, the device data may be superfluous since the data 1471 * might be in a logical block, but compressed or encrypted data is 1472 * another matter. 1473 * 1474 * NOTE: The modify will set BMAPUPD for us if BMAPPED is set. 1475 */ 1476 error = hammer2_chain_modify(chain, mtid, dedup_off, 0); 1477 if (error) 1478 return error; 1479 1480 /* 1481 * Relocate the block, even if making it smaller (because different 1482 * block sizes may be in different regions). 1483 * 1484 * NOTE: Operation does not copy the data and may only be used 1485 * to resize data blocks in-place, or directory entry blocks 1486 * which are about to be modified in some manner. 1487 */ 1488 error = hammer2_freemap_alloc(chain, nbytes); 1489 if (error) 1490 return error; 1491 1492 chain->bytes = nbytes; 1493 1494 /* 1495 * We don't want the followup chain_modify() to try to copy data 1496 * from the old (wrong-sized) buffer. It won't know how much to 1497 * copy. This case should only occur during writes when the 1498 * originator already has the data to write in-hand. 
1499 */ 1500 if (chain->dio) { 1501 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1502 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT); 1503 hammer2_io_brelse(&chain->dio); 1504 chain->data = NULL; 1505 } 1506 return (chain->error); 1507 } 1508 1509 /* 1510 * Set the chain modified so its data can be changed by the caller, or 1511 * install deduplicated data. The caller must call this routine for each 1512 * set of modifications it makes, even if the chain is already flagged 1513 * MODIFIED. 1514 * 1515 * Sets bref.modify_tid to mtid only if mtid != 0. Note that bref.modify_tid 1516 * is a CLC (cluster level change) field and is not updated by parent 1517 * propagation during a flush. 1518 * 1519 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect 1520 * chain->error except for HAMMER2_ERROR_ENOSPC. If the allocation fails 1521 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain 1522 * remains unmodified with its old data ref intact and chain->error 1523 * unchanged. 1524 * 1525 * Dedup Handling 1526 * 1527 * If the DEDUPABLE flag is set in the chain the storage must be reallocated 1528 * even if the chain is still flagged MODIFIED. In this case the chain's 1529 * DEDUPABLE flag will be cleared once the new storage has been assigned. 1530 * 1531 * If the caller passes a non-zero dedup_off we will use it to assign the 1532 * new storage. The MODIFIED flag will be *CLEARED* in this case, and 1533 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set). The caller 1534 * must not modify the data content upon return. 
 */
int
hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
		     hammer2_off_t dedup_off, int flags)
{
	hammer2_blockref_t obref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	int error;
	int wasinitial;
	int setmodified;
	int setupdate;
	int newmod;
	char *bdata;

	hmp = chain->hmp;
	obref = chain->bref;	/* NOTE(review): snapshot of the old bref;
				 * appears unused in this function - confirm */
	KKASSERT((chain->flags & HAMMER2_CHAIN_FICTITIOUS) == 0);

	/*
	 * Data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * Data must be resolved if already assigned, unless explicitly
	 * flagged otherwise.  If we cannot safely load the data the
	 * modification fails and we return early.
	 */
	if (chain->data == NULL && chain->bytes != 0 &&
	    (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_load_data(chain);
		if (chain->error)
			return (chain->error);
	}
	error = 0;

	/*
	 * Set MODIFIED to indicate that the chain has been modified.  A new
	 * allocation is required when modifying a chain.
	 *
	 * Set UPDATE to ensure that the blockref is updated in the parent.
	 *
	 * If MODIFIED is already set determine if we can reuse the assigned
	 * data block or if we need a new data block.  setmodified/setupdate
	 * remember which flags we set here so they can be rolled back on
	 * error.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		/*
		 * Must set modified bit.
		 */
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(chain->pmp);	/* can be NULL */
		setmodified = 1;

		/*
		 * We may be able to avoid a copy-on-write if the chain's
		 * check mode is set to NONE and the chain's current
		 * modify_tid is beyond the last explicit snapshot tid.
		 *
		 * This implements HAMMER2's overwrite-in-place feature.
		 *
		 * NOTE! This data-block cannot be used as a de-duplication
		 *	 source when the check mode is set to NONE.
		 */
		if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		     chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
		    (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
		    (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
		    HAMMER2_DEC_CHECK(chain->bref.methods) ==
		    HAMMER2_CHECK_NONE &&
		    chain->pmp &&
		    chain->bref.modify_tid >
		     chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * Sector overwrite allowed.
			 */
			newmod = 0;
		} else {
			/*
			 * Sector overwrite not allowed, must copy-on-write.
			 */
			newmod = 1;
		}
	} else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
		/*
		 * If the modified chain was registered for dedup we need
		 * a new allocation.  This only happens for delayed-flush
		 * chains (i.e. which run through the front-end buffer
		 * cache).
		 */
		newmod = 1;
		setmodified = 0;
	} else {
		/*
		 * Already flagged modified, no new allocation is needed.
		 */
		newmod = 0;
		setmodified = 0;
	}

	/*
	 * Flag parent update required.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		setupdate = 1;
	} else {
		setupdate = 0;
	}

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.  If an error occurs, the previous content and data
	 * reference is retained and the modification fails.
	 *
	 * If dedup_off is non-zero, the caller is requesting a deduplication
	 * rather than a modification.  The MODIFIED bit is not set and the
	 * data offset is set to the deduplication offset.  The data cannot
	 * be modified.
	 *
	 * NOTE: The dedup offset is allowed to be in a partially free state
	 *	 and we must be sure to reset it to a fully allocated state
	 *	 to force two bulkfree passes to free it again.
	 *
	 * NOTE: Only applicable when chain->bytes != 0.
	 *
	 * XXX can a chain already be marked MODIFIED without a data
	 * assignment?  If not, assert here instead of testing the case.
	 */
	if (chain != &hmp->vchain && chain != &hmp->fchain &&
	    chain->bytes) {
		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
		    newmod
		) {
			/*
			 * NOTE: We do not have to remove the dedup
			 *	 registration because the area is still
			 *	 allocated and the underlying DIO will
			 *	 still be flushed.
			 */
			if (dedup_off) {
				chain->bref.data_off = dedup_off;
				chain->bytes = 1 << (dedup_off &
						     HAMMER2_OFF_MASK_RADIX);
				chain->error = 0;
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_MODIFIED);
				atomic_add_long(&hammer2_count_modified_chains,
						-1);
				if (chain->pmp)
					hammer2_pfs_memory_wakeup(chain->pmp);
				hammer2_freemap_adjust(hmp, &chain->bref,
						HAMMER2_FREEMAP_DORECOVER);
				atomic_set_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			} else {
				error = hammer2_freemap_alloc(chain,
							      chain->bytes);
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			}
		}
	}

	/*
	 * Stop here if error.  We have to undo any flag bits we might
	 * have set above.
	 */
	if (error) {
		if (setmodified) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			atomic_add_long(&hammer2_count_modified_chains, -1);
			if (chain->pmp)
				hammer2_pfs_memory_wakeup(chain->pmp);
		}
		if (setupdate) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
		return error;
	}

	/*
	 * Update mirror_tid and modify_tid.  modify_tid is only updated
	 * if not passed as zero (during flushes, parent propagation passes
	 * the value 0).
	 *
	 * NOTE: chain->pmp could be the device spmp.
	 */
	chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
	if (mtid)
		chain->bref.modify_tid = mtid;

	/*
	 * Set BMAPUPD to tell the flush code that an existing blockmap entry
	 * requires updating as well as to tell the delete code that the
	 * chain's blockref might not exactly match (in terms of physical size
	 * or block offset) the one in the parent's blocktable.  The base key
	 * of course will still match.
	 */
	if (chain->flags & HAMMER2_CHAIN_BMAPPED)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD);

	/*
	 * Short-cut data blocks which the caller does not need an actual
	 * data reference to (aka OPTDATA), as long as the chain does not
	 * already have a data pointer to the data.  This generally means
	 * that the modifications are being done via the logical buffer cache.
	 * The INITIAL flag relates only to the device data buffer and thus
	 * remains unchanged in this situation.
	 *
	 * This code also handles bytes == 0 (most dirents).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
	    (flags & HAMMER2_MODIFY_OPTDATA) &&
	    chain->data == NULL) {
		KKASSERT(chain->dio == NULL);
		goto skip2;
	}

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->dio == NULL);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		/*
		 * The data might be fully embedded.
		 */
		if (chain->bytes == 0) {
			KKASSERT(chain->dio == NULL);
			break;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 *
		 * zero-fill or copy-on-write depending on whether
		 * chain->data exists or not and set the dirty state for
		 * the new buffer.  hammer2_io_new() will handle the
		 * zero-fill.
		 *
		 * If a dedup_off was supplied this is an existing block
		 * and no COW, copy, or further modification is required.
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		if (wasinitial && dedup_off == 0) {
			error = hammer2_io_new(hmp, chain->bref.type,
					       chain->bref.data_off,
					       chain->bytes, &dio);
		} else {
			error = hammer2_io_bread(hmp, chain->bref.type,
						 chain->bref.data_off,
						 chain->bytes, &dio);
		}
		hammer2_adjreadcounter(&chain->bref, chain->bytes);

		/*
		 * If an I/O error occurs make sure callers cannot accidentally
		 * modify the old buffer's contents and corrupt the filesystem.
		 */
		if (error) {
			kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
				hmp);
			chain->error = HAMMER2_ERROR_EIO;
			hammer2_io_brelse(&dio);
			hammer2_io_brelse(&chain->dio);
			chain->data = NULL;
			break;
		}
		chain->error = 0;
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		if (chain->data) {
			/*
			 * COW (unless a dedup).
			 */
			KKASSERT(chain->dio != NULL);
			if (chain->data != (void *)bdata && dedup_off == 0) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial == 0) {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}

		/*
		 * Retire the old buffer, replace with the new.  Dirty or
		 * redirty the new buffer.
		 *
		 * WARNING! The system buffer cache may have already flushed
		 *	    the buffer, so we must be sure to [re]dirty it
		 *	    for further modification.
		 *
		 *	    If dedup_off was supplied, the caller is not
		 *	    expected to make any further modification to the
		 *	    buffer.
		 */
		if (chain->dio)
			hammer2_io_bqrelse(&chain->dio);
		chain->data = (void *)bdata;
		chain->dio = dio;
		if (dedup_off == 0)
			hammer2_io_setdirty(dio);
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	return (chain->error);
}

/*
 * Modify the chain associated with an inode.  Marks the inode modified
 * first, then runs the normal chain modification.
 */
int
hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
			hammer2_tid_t mtid, int flags)
{
	int error;

	hammer2_inode_modify(ip);
	error = hammer2_chain_modify(chain, mtid, 0, flags);

	return error;
}

/*
 * Volume header data locks
 */
void
hammer2_voldata_lock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_EXCLUSIVE);
}

void
hammer2_voldata_unlock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_RELEASE);
}

/*
 * Flag the volume header chain MODIFIED (with accounting) if it is not
 * already.  Caller must hold the voldata lock.
 */
void
hammer2_voldata_modify(hammer2_dev_t *hmp)
{
	if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(hmp->vchain.pmp);
	}
}

/*
 * This function returns the chain at the nearest key within the specified
 * range.  The returned chain will be referenced but not locked.
 *
 * This function will recurse through chain->rbtree as necessary and will
 * return a *key_nextp suitable for iteration.  *key_nextp is only set if
 * the iteration value is less than the current value of *key_nextp.
 *
 * The caller should use (*key_nextp) to calculate the actual range of
 * the returned element, which will be (key_beg to *key_nextp - 1), because
 * there might be another element which is superior to the returned element
 * and overlaps it.
 *
 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
 * chains continue to be returned.  On EOF (*key_nextp) may overflow since
 * it will wind up being (key_end + 1).
 *
 * WARNING!  Must be called with child's spinlock held.  Spinlock remains
 *	     held through the operation.
 */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* best candidate so far */
	hammer2_key_t		key_beg;	/* inclusive search start */
	hammer2_key_t		key_end;	/* inclusive search end */
	hammer2_key_t		key_next;	/* iteration continuation */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
		hammer2_chain_find_cmp, hammer2_chain_find_callback,
		&info);
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN comparator: -1 if the child's key range lies entirely below the
 * search range, +1 if entirely above, 0 if it overlaps (visit it).
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

/*
 * RB_SCAN callback: track the overlapping child with the lowest key as
 * info->best and truncate info->key_next accordingly.  Overlapping keys
 * among siblings are illegal (asserted).
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *	    the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 * (child_end == 0 indicates wrap-around at the top of the key
	 * space and imposes no truncation).
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  The returned chain is
 * held but not locked.  The caller must lock it to crc-check and
 * dereference its data, and should check chain->error after locking
 * before assuming that the data is good.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.  Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * WARNING! chain->pmp is always set to NULL for any chain representing
 *	    part of the super-root topology.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 * PFS roots deliberately get a NULL pmp here (assigned later).
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain referenced but not locked, or NULL if
	 * a race occurred.
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API.  lookup_init refs and locks the
 * parent (shared when HAMMER2_LOOKUP_SHARED is passed); lookup_done
 * unwinds both (NULL parent is a no-op).
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	hammer2_chain_ref(parent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
}

/*
 * Take the locked chain and return a locked parent.  The chain remains
 * locked on return, but may have to be temporarily unlocked to acquire
 * the parent.  Can return NULL.
 *
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * This function handles the lock order reversal.
2180 */ 2181 hammer2_chain_t * 2182 hammer2_chain_getparent(hammer2_chain_t *chain, int flags) 2183 { 2184 hammer2_chain_t *parent; 2185 2186 /* 2187 * Be careful of order, chain must be unlocked before parent 2188 * is locked below to avoid a deadlock. 2189 * 2190 * Safe access to fu->parent requires fu's core spinlock. 2191 */ 2192 again: 2193 hammer2_spin_ex(&chain->core.spin); 2194 parent = chain->parent; 2195 if (parent == NULL) { 2196 hammer2_spin_unex(&chain->core.spin); 2197 panic("hammer2_chain_getparent: no parent"); 2198 } 2199 hammer2_chain_ref(parent); 2200 hammer2_spin_unex(&chain->core.spin); 2201 2202 hammer2_chain_unlock(chain); 2203 hammer2_chain_lock(parent, flags); 2204 hammer2_chain_lock(chain, flags); 2205 2206 /* 2207 * Parent relinking races are quite common. We have to get it right 2208 * or we will blow up the block table. 2209 */ 2210 if (chain->parent != parent) { 2211 hammer2_chain_unlock(parent); 2212 hammer2_chain_drop(parent); 2213 goto again; 2214 } 2215 return parent; 2216 } 2217 2218 /* 2219 * Take the locked chain and return a locked parent. The chain is unlocked 2220 * and dropped. *chainp is set to the returned parent as a convenience. 2221 * Pass HAMMER2_RESOLVE_* flags in flags. 2222 * 2223 * This will work even if the chain is errored, and the caller can check 2224 * parent->error on return if desired since the parent will be locked. 2225 * 2226 * This function handles the lock order reversal. 2227 */ 2228 hammer2_chain_t * 2229 hammer2_chain_repparent(hammer2_chain_t **chainp, int flags) 2230 { 2231 hammer2_chain_t *chain; 2232 hammer2_chain_t *parent; 2233 2234 /* 2235 * Be careful of order, chain must be unlocked before parent 2236 * is locked below to avoid a deadlock. 2237 * 2238 * Safe access to fu->parent requires fu's core spinlock. 
2239 */ 2240 chain = *chainp; 2241 again: 2242 hammer2_spin_ex(&chain->core.spin); 2243 parent = chain->parent; 2244 if (parent == NULL) { 2245 hammer2_spin_unex(&chain->core.spin); 2246 panic("hammer2_chain_getparent: no parent"); 2247 } 2248 hammer2_chain_ref(parent); 2249 hammer2_spin_unex(&chain->core.spin); 2250 2251 hammer2_chain_unlock(chain); 2252 hammer2_chain_lock(parent, flags); 2253 2254 /* 2255 * Parent relinking races are quite common. We have to get it right 2256 * or we will blow up the block table. 2257 */ 2258 if (chain->parent != parent) { 2259 hammer2_chain_lock(chain, flags); 2260 hammer2_chain_unlock(parent); 2261 hammer2_chain_drop(parent); 2262 goto again; 2263 } 2264 hammer2_chain_drop(chain); 2265 *chainp = parent; 2266 2267 return parent; 2268 } 2269 2270 /* 2271 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 2272 * (*parentp) typically points to an inode but can also point to a related 2273 * indirect block and this function will recurse upwards and find the inode 2274 * or the nearest undeleted indirect block covering the key range. 2275 * 2276 * This function unconditionally sets *errorp, replacing any previous value. 2277 * 2278 * (*parentp) must be exclusive or shared locked (depending on flags) and 2279 * referenced and can be an inode or an existing indirect block within the 2280 * inode. 2281 * 2282 * If (*parent) is errored out, this function will not attempt to recurse 2283 * the radix tree and will return NULL along with an appropriate *errorp. 2284 * If NULL is returned and *errorp is 0, the requested lookup could not be 2285 * located. 2286 * 2287 * On return (*parentp) will be modified to point at the deepest parent chain 2288 * element encountered during the search, as a helper for an insertion or 2289 * deletion. 
2290 * 2291 * The new (*parentp) will be locked shared or exclusive (depending on flags), 2292 * and referenced, and the old will be unlocked and dereferenced (no change 2293 * if they are both the same). This is particularly important if the caller 2294 * wishes to insert a new chain, (*parentp) will be set properly even if NULL 2295 * is returned, as long as no error occurred. 2296 * 2297 * The matching chain will be returned locked according to flags. If NOLOCK 2298 * is requested the chain will be returned only referenced. Note that the 2299 * parent chain must always be locked shared or exclusive, matching the 2300 * HAMMER2_LOOKUP_SHARED flag. We can conceivably lock it SHARED temporarily 2301 * when NOLOCK is specified but that complicates matters if *parentp must 2302 * inherit the chain. 2303 * 2304 * NOLOCK also implies NODATA, since an unlocked chain usually has a NULL 2305 * data pointer or can otherwise be in flux. 2306 * 2307 * NULL is returned if no match was found, but (*parentp) will still 2308 * potentially be adjusted. 2309 * 2310 * On return (*key_nextp) will point to an iterative value for key_beg. 2311 * (If NULL is returned (*key_nextp) is set to (key_end + 1)). 2312 * 2313 * This function will also recurse up the chain if the key is not within the 2314 * current parent's range. (*parentp) can never be set to NULL. An iteration 2315 * can simply allow (*parentp) to float inside the loop. 2316 * 2317 * NOTE! chain->data is not always resolved. By default it will not be 2318 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 2319 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 2320 * BREF_TYPE_DATA as the device buffer can alias the logical file 2321 * buffer). 
2322 */ 2323 2324 hammer2_chain_t * 2325 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp, 2326 hammer2_key_t key_beg, hammer2_key_t key_end, 2327 int *errorp, int flags) 2328 { 2329 hammer2_dev_t *hmp; 2330 hammer2_chain_t *parent; 2331 hammer2_chain_t *chain; 2332 hammer2_blockref_t *base; 2333 hammer2_blockref_t *bref; 2334 hammer2_blockref_t bcopy; 2335 hammer2_key_t scan_beg; 2336 hammer2_key_t scan_end; 2337 int count = 0; 2338 int how_always = HAMMER2_RESOLVE_ALWAYS; 2339 int how_maybe = HAMMER2_RESOLVE_MAYBE; 2340 int how; 2341 int generation; 2342 int maxloops = 300000; 2343 volatile hammer2_mtx_t save_mtx; 2344 2345 if (flags & HAMMER2_LOOKUP_ALWAYS) { 2346 how_maybe = how_always; 2347 how = HAMMER2_RESOLVE_ALWAYS; 2348 } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) { 2349 how = HAMMER2_RESOLVE_NEVER; 2350 } else { 2351 how = HAMMER2_RESOLVE_MAYBE; 2352 } 2353 if (flags & HAMMER2_LOOKUP_SHARED) { 2354 how_maybe |= HAMMER2_RESOLVE_SHARED; 2355 how_always |= HAMMER2_RESOLVE_SHARED; 2356 how |= HAMMER2_RESOLVE_SHARED; 2357 } 2358 2359 /* 2360 * Recurse (*parentp) upward if necessary until the parent completely 2361 * encloses the key range or we hit the inode. 2362 * 2363 * Handle races against the flusher deleting indirect nodes on its 2364 * way back up by continuing to recurse upward past the deletion. 
2365 */ 2366 parent = *parentp; 2367 hmp = parent->hmp; 2368 *errorp = 0; 2369 2370 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2371 parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2372 scan_beg = parent->bref.key; 2373 scan_end = scan_beg + 2374 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 2375 if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) { 2376 if (key_beg >= scan_beg && key_end <= scan_end) 2377 break; 2378 } 2379 parent = hammer2_chain_repparent(parentp, how_maybe); 2380 } 2381 again: 2382 2383 if (--maxloops == 0) 2384 panic("hammer2_chain_lookup: maxloops"); 2385 /* 2386 * Locate the blockref array. Currently we do a fully associative 2387 * search through the array. 2388 */ 2389 switch(parent->bref.type) { 2390 case HAMMER2_BREF_TYPE_INODE: 2391 /* 2392 * Special shortcut for embedded data returns the inode 2393 * itself. Callers must detect this condition and access 2394 * the embedded data (the strategy code does this for us). 2395 * 2396 * This is only applicable to regular files and softlinks. 2397 * 2398 * We need a second lock on parent. 
Since we already have 2399 * a lock we must pass LOCKAGAIN to prevent unexpected 2400 * blocking (we don't want to block on a second shared 2401 * ref if an exclusive lock is pending) 2402 */ 2403 if (parent->data->ipdata.meta.op_flags & 2404 HAMMER2_OPFLAG_DIRECTDATA) { 2405 if (flags & HAMMER2_LOOKUP_NODIRECT) { 2406 chain = NULL; 2407 *key_nextp = key_end + 1; 2408 goto done; 2409 } 2410 hammer2_chain_ref(parent); 2411 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 2412 hammer2_chain_lock(parent, 2413 how_always | 2414 HAMMER2_RESOLVE_LOCKAGAIN); 2415 *key_nextp = key_end + 1; 2416 return (parent); 2417 } 2418 base = &parent->data->ipdata.u.blockset.blockref[0]; 2419 count = HAMMER2_SET_COUNT; 2420 break; 2421 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2422 case HAMMER2_BREF_TYPE_INDIRECT: 2423 /* 2424 * Handle MATCHIND on the parent 2425 */ 2426 if (flags & HAMMER2_LOOKUP_MATCHIND) { 2427 scan_beg = parent->bref.key; 2428 scan_end = scan_beg + 2429 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 2430 if (key_beg == scan_beg && key_end == scan_end) { 2431 chain = parent; 2432 hammer2_chain_ref(chain); 2433 hammer2_chain_lock(chain, how_maybe); 2434 *key_nextp = scan_end + 1; 2435 goto done; 2436 } 2437 } 2438 2439 /* 2440 * Optimize indirect blocks in the INITIAL state to avoid 2441 * I/O. 
2442 */ 2443 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2444 base = NULL; 2445 } else { 2446 if (parent->data == NULL) { 2447 kprintf("parent->data is NULL %p\n", parent); 2448 while (1) 2449 tsleep(parent, 0, "xxx", 0); 2450 } 2451 base = &parent->data->npdata[0]; 2452 } 2453 count = parent->bytes / sizeof(hammer2_blockref_t); 2454 break; 2455 case HAMMER2_BREF_TYPE_VOLUME: 2456 base = &parent->data->voldata.sroot_blockset.blockref[0]; 2457 count = HAMMER2_SET_COUNT; 2458 break; 2459 case HAMMER2_BREF_TYPE_FREEMAP: 2460 base = &parent->data->blkset.blockref[0]; 2461 count = HAMMER2_SET_COUNT; 2462 break; 2463 default: 2464 kprintf("hammer2_chain_lookup: unrecognized " 2465 "blockref(B) type: %d", 2466 parent->bref.type); 2467 while (1) 2468 tsleep(&base, 0, "dead", 0); 2469 panic("hammer2_chain_lookup: unrecognized " 2470 "blockref(B) type: %d", 2471 parent->bref.type); 2472 base = NULL; /* safety */ 2473 count = 0; /* safety */ 2474 } 2475 2476 /* 2477 * No lookup is possible if the parent is errored. We delayed 2478 * this check as long as we could to ensure that the parent backup, 2479 * embedded data, and MATCHIND code could still execute. 2480 */ 2481 if (parent->error) { 2482 *errorp = parent->error; 2483 return NULL; 2484 } 2485 2486 /* 2487 * Merged scan to find next candidate. 2488 * 2489 * hammer2_base_*() functions require the parent->core.live_* fields 2490 * to be synchronized. 2491 * 2492 * We need to hold the spinlock to access the block array and RB tree 2493 * and to interlock chain creation. 2494 */ 2495 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 2496 hammer2_chain_countbrefs(parent, base, count); 2497 2498 /* 2499 * Combined search 2500 */ 2501 hammer2_spin_ex(&parent->core.spin); 2502 chain = hammer2_combined_find(parent, base, count, 2503 key_nextp, 2504 key_beg, key_end, 2505 &bref); 2506 generation = parent->core.generation; 2507 2508 /* 2509 * Exhausted parent chain, iterate. 
2510 */ 2511 if (bref == NULL) { 2512 KKASSERT(chain == NULL); 2513 hammer2_spin_unex(&parent->core.spin); 2514 if (key_beg == key_end) /* short cut single-key case */ 2515 return (NULL); 2516 2517 /* 2518 * Stop if we reached the end of the iteration. 2519 */ 2520 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 2521 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2522 return (NULL); 2523 } 2524 2525 /* 2526 * Calculate next key, stop if we reached the end of the 2527 * iteration, otherwise go up one level and loop. 2528 */ 2529 key_beg = parent->bref.key + 2530 ((hammer2_key_t)1 << parent->bref.keybits); 2531 if (key_beg == 0 || key_beg > key_end) 2532 return (NULL); 2533 parent = hammer2_chain_repparent(parentp, how_maybe); 2534 goto again; 2535 } 2536 2537 /* 2538 * Selected from blockref or in-memory chain. 2539 */ 2540 if (chain == NULL) { 2541 bcopy = *bref; 2542 hammer2_spin_unex(&parent->core.spin); 2543 chain = hammer2_chain_get(parent, generation, 2544 &bcopy); 2545 if (chain == NULL) { 2546 /* 2547 kprintf("retry lookup parent %p keys %016jx:%016jx\n", 2548 parent, key_beg, key_end); 2549 */ 2550 goto again; 2551 } 2552 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 2553 hammer2_chain_drop(chain); 2554 chain = NULL; /* SAFETY */ 2555 goto again; 2556 } 2557 } else { 2558 hammer2_chain_ref(chain); 2559 hammer2_spin_unex(&parent->core.spin); 2560 } 2561 2562 /* 2563 * chain is referenced but not locked. We must lock the chain 2564 * to obtain definitive state. 2565 */ 2566 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2567 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2568 hammer2_chain_lock(chain, how_maybe); 2569 } else { 2570 hammer2_chain_lock(chain, how); 2571 } 2572 KKASSERT(chain->parent == parent); 2573 2574 /* 2575 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 2576 * 2577 * NOTE: Chain's key range is not relevant as there might be 2578 * one-offs within the range that are not deleted. 
2579 * 2580 * NOTE: Lookups can race delete-duplicate because 2581 * delete-duplicate does not lock the parent's core 2582 * (they just use the spinlock on the core). 2583 */ 2584 if (chain->flags & HAMMER2_CHAIN_DELETED) { 2585 kprintf("skip deleted chain %016jx.%02x key=%016jx\n", 2586 chain->bref.data_off, chain->bref.type, 2587 chain->bref.key); 2588 hammer2_chain_unlock(chain); 2589 hammer2_chain_drop(chain); 2590 chain = NULL; /* SAFETY */ 2591 key_beg = *key_nextp; 2592 if (key_beg == 0 || key_beg > key_end) 2593 return(NULL); 2594 goto again; 2595 } 2596 2597 /* 2598 * If the chain element is an indirect block it becomes the new 2599 * parent and we loop on it. We must maintain our top-down locks 2600 * to prevent the flusher from interfering (i.e. doing a 2601 * delete-duplicate and leaving us recursing down a deleted chain). 2602 * 2603 * The parent always has to be locked with at least RESOLVE_MAYBE 2604 * so we can access its data. It might need a fixup if the caller 2605 * passed incompatible flags. Be careful not to cause a deadlock 2606 * as a data-load requires an exclusive lock. 2607 * 2608 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key 2609 * range is within the requested key range we return the indirect 2610 * block and do NOT loop. This is usually only used to acquire 2611 * freemap nodes. 2612 */ 2613 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2614 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2615 save_mtx = parent->lock; 2616 hammer2_chain_unlock(parent); 2617 hammer2_chain_drop(parent); 2618 *parentp = parent = chain; 2619 chain = NULL; /* SAFETY */ 2620 goto again; 2621 } 2622 done: 2623 /* 2624 * All done, return the chain. 2625 * 2626 * If the caller does not want a locked chain, replace the lock with 2627 * a ref. Perhaps this can eventually be optimized to not obtain the 2628 * lock in the first place for situations where the data does not 2629 * need to be resolved. 2630 * 2631 * NOTE! 
A chain->error must be tested by the caller upon return.
	 *	  *errorp is only set based on issues which occur while
	 *	  trying to reach the chain.
	 */
	if (chain) {
		/* NOLOCK callers get a referenced-only chain back */
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_unlock(chain);
	}
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after it's index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.  This
 * chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/* advance just past the previous chain's key range */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		/* release the previous chain per the caller's lock policy */
		if ((flags & (HAMMER2_LOOKUP_NOLOCK |
			      HAMMER2_LOOKUP_NOUNLOCK)) == 0) {
			hammer2_chain_unlock(chain);
		}
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}

/*
 * Caller wishes to iterate chains under parent, loading new chains into
 * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
 * then call hammer2_chain_scan() repeatedly until a non-zero return.
 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
 * with the returned chain for the scan.  The returned *chainp will be
 * locked and referenced.  Any prior contents will be unlocked and dropped.
 *
 * Caller should check the return value.
A normal scan EOF will return
 * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
 * error trying to access parent data.  Any error in the returned chain
 * must be tested separately by the caller.
 *
 * (*chainp) is dropped on each scan, but will only be set if the returned
 * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
 * returned via *chainp.  The caller will get their bref only.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_bref = (parent, NULL) and
 * next_chain = (parent, bref).
 *
 * The passed-in parent must be locked and its data resolved.  The function
 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
 * in).  The caller must check the chain->bref.type either way.
 */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	int error;

	/* NOTE(review): hmp, how_always, and how_maybe are computed but not
	 * read anywhere in this function's visible body - possibly vestigial;
	 * confirm before removing. */
	hmp = parent->hmp;
	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow.
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		/* key overflows to 0 exactly when the keyspace is exhausted */
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, bref);
			if (chain == NULL) {
				/* insertion race lost, retry the scan */
				kprintf("retry scan parent %p keys %016jx\n",
					parent, key);
				goto again;
			}
			/* blockref changed while unlocked; retry */
			if (bcmp(bref, bref_ptr, sizeof(*bref))) {
				hammer2_chain_drop(chain);
				chain = NULL;
				goto again;
			}
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
	}

	/*
	 * chain is referenced but not locked.  We must lock the chain
	 * to obtain definitive state.
	 */
	if (chain)
		hammer2_chain_lock(chain, how);

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it under (*parentp).  This is a full
 * insertion, based on the supplied key/keybits, and may involve creating
 * indirect blocks and moving other chains around via delete/duplicate.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * (*parentp) must be exclusive locked and may be replaced on return
 * depending on how much work the function had to do.
 *
 * (*parentp) must not be errored or this function will assert.
 *
 * (*chainp) usually starts out NULL and returns the newly created chain,
 * but if the caller desires the caller may allocate a disconnected chain
 * and pass it in instead.
 *
 * This function should NOT be used to insert INDIRECT blocks.  It is
 * typically used to create/insert inodes and data blocks.
 *
 * Caller must pass-in an exclusively locked parent the new chain is to
 * be inserted under, and optionally pass-in a disconnected, exclusively
 * locked chain to insert (else we create a new chain).
The function will 3027 * adjust (*parentp) as necessary, create or connect the chain, and 3028 * return an exclusively locked chain in *chainp. 3029 * 3030 * When creating a PFSROOT inode under the super-root, pmp is typically NULL 3031 * and will be reassigned. 3032 * 3033 * NOTE: returns HAMMER_ERROR_* flags 3034 */ 3035 int 3036 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp, 3037 hammer2_pfs_t *pmp, int methods, 3038 hammer2_key_t key, int keybits, int type, size_t bytes, 3039 hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags) 3040 { 3041 hammer2_dev_t *hmp; 3042 hammer2_chain_t *chain; 3043 hammer2_chain_t *parent; 3044 hammer2_blockref_t *base; 3045 hammer2_blockref_t dummy; 3046 int allocated = 0; 3047 int error = 0; 3048 int count; 3049 int maxloops = 300000; 3050 3051 /* 3052 * Topology may be crossing a PFS boundary. 3053 */ 3054 parent = *parentp; 3055 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3056 KKASSERT(parent->error == 0); 3057 hmp = parent->hmp; 3058 chain = *chainp; 3059 3060 if (chain == NULL) { 3061 /* 3062 * First allocate media space and construct the dummy bref, 3063 * then allocate the in-memory chain structure. Set the 3064 * INITIAL flag for fresh chains which do not have embedded 3065 * data. 3066 * 3067 * XXX for now set the check mode of the child based on 3068 * the parent or, if the parent is an inode, the 3069 * specification in the inode. 3070 */ 3071 bzero(&dummy, sizeof(dummy)); 3072 dummy.type = type; 3073 dummy.key = key; 3074 dummy.keybits = keybits; 3075 dummy.data_off = hammer2_getradix(bytes); 3076 3077 /* 3078 * Inherit methods from parent by default. Primarily used 3079 * for BREF_TYPE_DATA. Non-data types *must* be set to 3080 * a non-NONE check algorithm. 
3081 */ 3082 if (methods == -1) 3083 dummy.methods = parent->bref.methods; 3084 else 3085 dummy.methods = (uint8_t)methods; 3086 3087 if (type != HAMMER2_BREF_TYPE_DATA && 3088 HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) { 3089 dummy.methods |= 3090 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT); 3091 } 3092 3093 chain = hammer2_chain_alloc(hmp, pmp, &dummy); 3094 3095 /* 3096 * Lock the chain manually, chain_lock will load the chain 3097 * which we do NOT want to do. (note: chain->refs is set 3098 * to 1 by chain_alloc() for us, but lockcnt is not). 3099 */ 3100 chain->lockcnt = 1; 3101 hammer2_mtx_ex(&chain->lock); 3102 allocated = 1; 3103 ++curthread->td_tracker; 3104 3105 /* 3106 * Set INITIAL to optimize I/O. The flag will generally be 3107 * processed when we call hammer2_chain_modify(). 3108 * 3109 * Recalculate bytes to reflect the actual media block 3110 * allocation. Handle special case radix 0 == 0 bytes. 3111 */ 3112 bytes = (size_t)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 3113 if (bytes) 3114 bytes = (hammer2_off_t)1 << bytes; 3115 chain->bytes = bytes; 3116 3117 switch(type) { 3118 case HAMMER2_BREF_TYPE_VOLUME: 3119 case HAMMER2_BREF_TYPE_FREEMAP: 3120 panic("hammer2_chain_create: called with volume type"); 3121 break; 3122 case HAMMER2_BREF_TYPE_INDIRECT: 3123 panic("hammer2_chain_create: cannot be used to" 3124 "create indirect block"); 3125 break; 3126 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3127 panic("hammer2_chain_create: cannot be used to" 3128 "create freemap root or node"); 3129 break; 3130 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3131 KKASSERT(bytes == sizeof(chain->data->bmdata)); 3132 /* fall through */ 3133 case HAMMER2_BREF_TYPE_DIRENT: 3134 case HAMMER2_BREF_TYPE_INODE: 3135 case HAMMER2_BREF_TYPE_DATA: 3136 default: 3137 /* 3138 * leave chain->data NULL, set INITIAL 3139 */ 3140 KKASSERT(chain->data == NULL); 3141 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 3142 break; 3143 } 3144 } else { 3145 /* 3146 * We are 
reattaching a previously deleted chain, possibly 3147 * under a new parent and possibly with a new key/keybits. 3148 * The chain does not have to be in a modified state. The 3149 * UPDATE flag will be set later on in this routine. 3150 * 3151 * Do NOT mess with the current state of the INITIAL flag. 3152 */ 3153 chain->bref.key = key; 3154 chain->bref.keybits = keybits; 3155 if (chain->flags & HAMMER2_CHAIN_DELETED) 3156 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3157 KKASSERT(chain->parent == NULL); 3158 } 3159 if (flags & HAMMER2_INSERT_PFSROOT) 3160 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 3161 else 3162 chain->bref.flags &= ~HAMMER2_BREF_FLAG_PFSROOT; 3163 3164 /* 3165 * Calculate how many entries we have in the blockref array and 3166 * determine if an indirect block is required. 3167 */ 3168 again: 3169 if (--maxloops == 0) 3170 panic("hammer2_chain_create: maxloops"); 3171 3172 switch(parent->bref.type) { 3173 case HAMMER2_BREF_TYPE_INODE: 3174 if ((parent->data->ipdata.meta.op_flags & 3175 HAMMER2_OPFLAG_DIRECTDATA) != 0) { 3176 kprintf("hammer2: parent set for direct-data! 
" 3177 "pkey=%016jx ckey=%016jx\n", 3178 parent->bref.key, 3179 chain->bref.key); 3180 } 3181 KKASSERT((parent->data->ipdata.meta.op_flags & 3182 HAMMER2_OPFLAG_DIRECTDATA) == 0); 3183 KKASSERT(parent->data != NULL); 3184 base = &parent->data->ipdata.u.blockset.blockref[0]; 3185 count = HAMMER2_SET_COUNT; 3186 break; 3187 case HAMMER2_BREF_TYPE_INDIRECT: 3188 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3189 if (parent->flags & HAMMER2_CHAIN_INITIAL) 3190 base = NULL; 3191 else 3192 base = &parent->data->npdata[0]; 3193 count = parent->bytes / sizeof(hammer2_blockref_t); 3194 break; 3195 case HAMMER2_BREF_TYPE_VOLUME: 3196 KKASSERT(parent->data != NULL); 3197 base = &parent->data->voldata.sroot_blockset.blockref[0]; 3198 count = HAMMER2_SET_COUNT; 3199 break; 3200 case HAMMER2_BREF_TYPE_FREEMAP: 3201 KKASSERT(parent->data != NULL); 3202 base = &parent->data->blkset.blockref[0]; 3203 count = HAMMER2_SET_COUNT; 3204 break; 3205 default: 3206 panic("hammer2_chain_create: unrecognized blockref type: %d", 3207 parent->bref.type); 3208 base = NULL; 3209 count = 0; 3210 break; 3211 } 3212 3213 /* 3214 * Make sure we've counted the brefs 3215 */ 3216 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 3217 hammer2_chain_countbrefs(parent, base, count); 3218 3219 KASSERT(parent->core.live_count >= 0 && 3220 parent->core.live_count <= count, 3221 ("bad live_count %d/%d (%02x, %d)", 3222 parent->core.live_count, count, 3223 parent->bref.type, parent->bytes)); 3224 3225 /* 3226 * If no free blockref could be found we must create an indirect 3227 * block and move a number of blockrefs into it. With the parent 3228 * locked we can safely lock each child in order to delete+duplicate 3229 * it without causing a deadlock. 3230 * 3231 * This may return the new indirect block or the old parent depending 3232 * on where the key falls. NULL is returned on error. 
3233 */ 3234 if (parent->core.live_count == count) { 3235 hammer2_chain_t *nparent; 3236 3237 KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0); 3238 3239 nparent = hammer2_chain_create_indirect(parent, key, keybits, 3240 mtid, type, &error); 3241 if (nparent == NULL) { 3242 if (allocated) 3243 hammer2_chain_drop(chain); 3244 chain = NULL; 3245 goto done; 3246 } 3247 if (parent != nparent) { 3248 hammer2_chain_unlock(parent); 3249 hammer2_chain_drop(parent); 3250 parent = *parentp = nparent; 3251 } 3252 goto again; 3253 } 3254 3255 if (chain->flags & HAMMER2_CHAIN_DELETED) 3256 kprintf("Inserting deleted chain @%016jx\n", 3257 chain->bref.key); 3258 3259 /* 3260 * Link the chain into its parent. 3261 */ 3262 if (chain->parent != NULL) 3263 panic("hammer2: hammer2_chain_create: chain already connected"); 3264 KKASSERT(chain->parent == NULL); 3265 KKASSERT(parent->core.live_count < count); 3266 hammer2_chain_insert(parent, chain, 3267 HAMMER2_CHAIN_INSERT_SPIN | 3268 HAMMER2_CHAIN_INSERT_LIVE, 3269 0); 3270 3271 if (allocated) { 3272 /* 3273 * Mark the newly created chain modified. This will cause 3274 * UPDATE to be set and process the INITIAL flag. 3275 * 3276 * Device buffers are not instantiated for DATA elements 3277 * as these are handled by logical buffers. 3278 * 3279 * Indirect and freemap node indirect blocks are handled 3280 * by hammer2_chain_create_indirect() and not by this 3281 * function. 3282 * 3283 * Data for all other bref types is expected to be 3284 * instantiated (INODE, LEAF). 3285 */ 3286 switch(chain->bref.type) { 3287 case HAMMER2_BREF_TYPE_DATA: 3288 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3289 case HAMMER2_BREF_TYPE_DIRENT: 3290 case HAMMER2_BREF_TYPE_INODE: 3291 error = hammer2_chain_modify(chain, mtid, dedup_off, 3292 HAMMER2_MODIFY_OPTDATA); 3293 break; 3294 default: 3295 /* 3296 * Remaining types are not supported by this function. 3297 * In particular, INDIRECT and LEAF_NODE types are 3298 * handled by create_indirect(). 
3299 */ 3300 panic("hammer2_chain_create: bad type: %d", 3301 chain->bref.type); 3302 /* NOT REACHED */ 3303 break; 3304 } 3305 } else { 3306 /* 3307 * When reconnecting a chain we must set UPDATE and 3308 * setflush so the flush recognizes that it must update 3309 * the bref in the parent. 3310 */ 3311 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) 3312 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3313 } 3314 3315 /* 3316 * We must setflush(parent) to ensure that it recurses through to 3317 * chain. setflush(chain) might not work because ONFLUSH is possibly 3318 * already set in the chain (so it won't recurse up to set it in the 3319 * parent). 3320 */ 3321 hammer2_chain_setflush(parent); 3322 3323 done: 3324 *chainp = chain; 3325 3326 return (error); 3327 } 3328 3329 /* 3330 * Move the chain from its old parent to a new parent. The chain must have 3331 * already been deleted or already disconnected (or never associated) with 3332 * a parent. The chain is reassociated with the new parent and the deleted 3333 * flag will be cleared (no longer deleted). The chain's modification state 3334 * is not altered. 3335 * 3336 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 3337 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 3338 * FULL. This typically means that the caller is creating the chain after 3339 * doing a hammer2_chain_lookup(). 3340 * 3341 * A non-NULL bref is typically passed when key and keybits must be overridden. 3342 * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields 3343 * from a passed-in bref and uses the old chain's bref for everything else. 3344 * 3345 * Neither (parent) or (chain) can be errored. 3346 * 3347 * If (parent) is non-NULL then the chain is inserted under the parent. 
3348 * 3349 * If (parent) is NULL then the newly duplicated chain is not inserted 3350 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 3351 * passing into hammer2_chain_create() after this function returns). 3352 * 3353 * WARNING! This function calls create which means it can insert indirect 3354 * blocks. This can cause other unrelated chains in the parent to 3355 * be moved to a newly inserted indirect block in addition to the 3356 * specific chain. 3357 */ 3358 void 3359 hammer2_chain_rename(hammer2_blockref_t *bref, 3360 hammer2_chain_t **parentp, hammer2_chain_t *chain, 3361 hammer2_tid_t mtid, int flags) 3362 { 3363 hammer2_dev_t *hmp; 3364 hammer2_chain_t *parent; 3365 size_t bytes; 3366 3367 /* 3368 * WARNING! We should never resolve DATA to device buffers 3369 * (XXX allow it if the caller did?), and since 3370 * we currently do not have the logical buffer cache 3371 * buffer in-hand to fix its cached physical offset 3372 * we also force the modify code to not COW it. XXX 3373 */ 3374 hmp = chain->hmp; 3375 KKASSERT(chain->parent == NULL); 3376 KKASSERT(chain->error == 0); 3377 3378 /* 3379 * Now create a duplicate of the chain structure, associating 3380 * it with the same core, making it the same size, pointing it 3381 * to the same bref (the same media block). 3382 * 3383 * NOTE: Handle special radix == 0 case (means 0 bytes). 3384 */ 3385 if (bref == NULL) 3386 bref = &chain->bref; 3387 bytes = (size_t)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 3388 if (bytes) 3389 bytes = (hammer2_off_t)1 << bytes; 3390 3391 /* 3392 * If parent is not NULL the duplicated chain will be entered under 3393 * the parent and the UPDATE bit set to tell flush to update 3394 * the blockref. 3395 * 3396 * We must setflush(parent) to ensure that it recurses through to 3397 * chain. setflush(chain) might not work because ONFLUSH is possibly 3398 * already set in the chain (so it won't recurse up to set it in the 3399 * parent). 
3400 * 3401 * Having both chains locked is extremely important for atomicy. 3402 */ 3403 if (parentp && (parent = *parentp) != NULL) { 3404 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3405 KKASSERT(parent->refs > 0); 3406 KKASSERT(parent->error == 0); 3407 3408 hammer2_chain_create(parentp, &chain, 3409 chain->pmp, HAMMER2_METH_DEFAULT, 3410 bref->key, bref->keybits, bref->type, 3411 chain->bytes, mtid, 0, flags); 3412 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE); 3413 hammer2_chain_setflush(*parentp); 3414 } 3415 } 3416 3417 /* 3418 * Helper function for deleting chains. 3419 * 3420 * The chain is removed from the live view (the RBTREE) as well as the parent's 3421 * blockmap. Both chain and its parent must be locked. 3422 * 3423 * parent may not be errored. chain can be errored. 3424 */ 3425 static int 3426 _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain, 3427 hammer2_tid_t mtid, int flags) 3428 { 3429 hammer2_dev_t *hmp; 3430 int error = 0; 3431 3432 KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED | 3433 HAMMER2_CHAIN_FICTITIOUS)) == 0); 3434 KKASSERT(chain->parent == parent); 3435 hmp = chain->hmp; 3436 3437 if (chain->flags & HAMMER2_CHAIN_BMAPPED) { 3438 /* 3439 * Chain is blockmapped, so there must be a parent. 3440 * Atomically remove the chain from the parent and remove 3441 * the blockmap entry. The parent must be set modified 3442 * to remove the blockmap entry. 
3443 */ 3444 hammer2_blockref_t *base; 3445 int count; 3446 3447 KKASSERT(parent != NULL); 3448 KKASSERT(parent->error == 0); 3449 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0); 3450 error = hammer2_chain_modify(parent, mtid, 0, 0); 3451 if (error) 3452 goto done; 3453 3454 /* 3455 * Calculate blockmap pointer 3456 */ 3457 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 3458 hammer2_spin_ex(&chain->core.spin); 3459 hammer2_spin_ex(&parent->core.spin); 3460 3461 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3462 atomic_add_int(&parent->core.live_count, -1); 3463 ++parent->core.generation; 3464 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 3465 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 3466 --parent->core.chain_count; 3467 chain->parent = NULL; 3468 3469 switch(parent->bref.type) { 3470 case HAMMER2_BREF_TYPE_INODE: 3471 /* 3472 * Access the inode's block array. However, there 3473 * is no block array if the inode is flagged 3474 * DIRECTDATA. 3475 */ 3476 if (parent->data && 3477 (parent->data->ipdata.meta.op_flags & 3478 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 3479 base = 3480 &parent->data->ipdata.u.blockset.blockref[0]; 3481 } else { 3482 base = NULL; 3483 } 3484 count = HAMMER2_SET_COUNT; 3485 break; 3486 case HAMMER2_BREF_TYPE_INDIRECT: 3487 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3488 if (parent->data) 3489 base = &parent->data->npdata[0]; 3490 else 3491 base = NULL; 3492 count = parent->bytes / sizeof(hammer2_blockref_t); 3493 break; 3494 case HAMMER2_BREF_TYPE_VOLUME: 3495 base = &parent->data->voldata. 3496 sroot_blockset.blockref[0]; 3497 count = HAMMER2_SET_COUNT; 3498 break; 3499 case HAMMER2_BREF_TYPE_FREEMAP: 3500 base = &parent->data->blkset.blockref[0]; 3501 count = HAMMER2_SET_COUNT; 3502 break; 3503 default: 3504 base = NULL; 3505 count = 0; 3506 panic("hammer2_flush_pass2: " 3507 "unrecognized blockref type: %d", 3508 parent->bref.type); 3509 } 3510 3511 /* 3512 * delete blockmapped chain from its parent. 
3513 * 3514 * The parent is not affected by any statistics in chain 3515 * which are pending synchronization. That is, there is 3516 * nothing to undo in the parent since they have not yet 3517 * been incorporated into the parent. 3518 * 3519 * The parent is affected by statistics stored in inodes. 3520 * Those have already been synchronized, so they must be 3521 * undone. XXX split update possible w/delete in middle? 3522 */ 3523 if (base) { 3524 hammer2_base_delete(parent, base, count, chain); 3525 } 3526 hammer2_spin_unex(&parent->core.spin); 3527 hammer2_spin_unex(&chain->core.spin); 3528 } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 3529 /* 3530 * Chain is not blockmapped but a parent is present. 3531 * Atomically remove the chain from the parent. There is 3532 * no blockmap entry to remove. 3533 * 3534 * Because chain was associated with a parent but not 3535 * synchronized, the chain's *_count_up fields contain 3536 * inode adjustment statistics which must be undone. 3537 */ 3538 hammer2_spin_ex(&chain->core.spin); 3539 hammer2_spin_ex(&parent->core.spin); 3540 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3541 atomic_add_int(&parent->core.live_count, -1); 3542 ++parent->core.generation; 3543 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 3544 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 3545 --parent->core.chain_count; 3546 chain->parent = NULL; 3547 hammer2_spin_unex(&parent->core.spin); 3548 hammer2_spin_unex(&chain->core.spin); 3549 } else { 3550 /* 3551 * Chain is not blockmapped and has no parent. This 3552 * is a degenerate case. 3553 */ 3554 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3555 } 3556 done: 3557 return error; 3558 } 3559 3560 /* 3561 * Create an indirect block that covers one or more of the elements in the 3562 * current parent. 
Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow four rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.  This
 *     is accomplished by always recursing through matching keyspaces in
 *     the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
 *
 * NOTE: *errorp set to HAMMER2_ERROR_* flags
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count,
			int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
			hammer2_key_t *keyp, int keybits,
			hammer2_blockref_t *base, int count,
			int ncount);
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      hammer2_tid_t mtid, int for_type, int *errorp)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;
	hammer2_chain_t dummy;
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int ncount;
	int nbytes;
	int loops;
	int error;
	int reason;	/* diagnostic code reported by the excessive-loop panic */
	int generation;
	int maxloops = 300000;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	hmp = parent->hmp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));

	/*
	 * Pre-modify the parent now to avoid having to deal with error
	 * processing if we tried to later (in the middle of our loop).
	 */
	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		return NULL;
	}

	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
	base = hammer2_chain_base_and_count(parent, &count);

	/*
	 * dummy used in later chain allocation (no longer used for lookups).
	 */
	bzero(&dummy, sizeof(dummy));

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent for splits to work properly.
	 *
	 * The freemap uses a specific indirect block size.  The number of
	 * levels are built dynamically and ultimately depend on the size
	 * volume.  Because freemap blocks are taken from the reserved areas
	 * of the volume our goal is efficiency (fewer levels) and not so
	 * much to save disk space.
	 *
	 * The first indirect block level for a directory usually uses
	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
	 * the hash mechanism, this typically gives us a nominal
	 * 32 * 4 entries with one level of indirection.
	 *
	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
	 * indirect blocks.  The initial 4 entries in the inode gives us
	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
	 * of indirection gives us 137GB, and so forth.  H2 can support
	 * huge file sizes but they are not typical, so we try to stick
	 * with compactness and do not use a larger indirect block size.
	 *
	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
	 * due to the way indirect blocks are created this usually winds
	 * up being extremely inefficient for small files.  Even though
	 * 16KB requires more levels of indirection for very large files,
	 * the 16KB records can be ganged together into 64KB DIOs.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		if (parent->data->ipdata.meta.type ==
		    HAMMER2_OBJTYPE_DIRECTORY)
			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
		else
			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */

	} else {
		nbytes = HAMMER2_IND_BYTES_NOM;
	}
	/* never smaller than the parent's own blockref array */
	if (nbytes < count * sizeof(hammer2_blockref_t)) {
		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
		nbytes = count * sizeof(hammer2_blockref_t);
	}
	ncount = nbytes / sizeof(hammer2_blockref_t);

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	switch(for_type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
						    base, count, ncount);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
						   base, count, ncount);
		break;
	default:
		panic("illegal indirect block for bref type %d", for_type);
		break;
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * Ok, create our new indirect block
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.bref.key = key;
	dummy.bref.keybits = keybits;
	dummy.bref.data_off = hammer2_getradix(nbytes);
	dummy.bref.methods =
		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy.bref);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	/* ichain has one ref at this point */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 */
	*errorp = hammer2_chain_modify(ichain, mtid, 0, HAMMER2_MODIFY_OPTDATA);
	if (*errorp) {
		kprintf("hammer2_alloc_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
		return NULL;
	}

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	key_next = 0;	/* avoid gcc warnings */
	hammer2_spin_ex(&parent->core.spin);
	loops = 0;
	reason = 0;

	for (;;) {
		/*
		 * Parent may have been modified, relocating its block array.
		 * Reload the base pointer.
		 */
		base = hammer2_chain_base_and_count(parent, &count);

		if (++loops > 100000) {
		    hammer2_spin_unex(&parent->core.spin);
		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
			  reason, parent, base, count, key_next);
		}

		/*
		 * NOTE: spinlock stays intact, returned chain (if not NULL)
		 *	 is not referenced or locked which means that we
		 *	 cannot safely check its flagged / deletion status
		 *	 until we lock it.
		 */
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);
		generation = parent->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if ((~(((hammer2_key_t)1 << keybits) - 1) &
		    (key ^ bref->key)) != 0) {
			goto next_key_spinlocked;
		}

		/*
		 * Load the new indirect block by acquiring the related
		 * chains (potentially from media as it might not be
		 * in-memory).  Then move it to the new parent (ichain).
		 *
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			hammer2_spin_unex(&parent->core.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 *
			 * The bref is re-verified after the lock; if the
			 * media blockref changed underneath us we retry.
			 */
			bcopy = *bref;
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, &bcopy);
			if (chain == NULL) {
				reason = 1;
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
			if (bcmp(&bcopy, bref, sizeof(bcopy))) {
				reason = 2;
				hammer2_chain_unlock(chain);
				hammer2_chain_drop(chain);
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
		}

		/*
		 * This is always live so if the chain has been deleted
		 * we raced someone and we have to retry.
		 *
		 * NOTE: Lookups can race delete-duplicate because
		 *	 delete-duplicate does not lock the parent's core
		 *	 (they just use the spinlock on the core).
		 *
		 *	 (note reversed logic for this one)
		 */
		if (chain->parent != parent ||
		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			kprintf("hammer2_chain_create_indirect "
				"RETRY (%p,%p)->%p %08x\n",
				parent, chain->parent, chain, chain->flags);
			hammer2_spin_ex(&parent->core.spin);
			continue;
		}

		/*
		 * Shift the chain to the indirect block.
		 *
		 * WARNING! No reason for us to load chain data, pass NOSTATS
		 *	    to prevent delete/insert from trying to access
		 *	    inode stats (and thus asserting if there is no
		 *	    chain->data loaded).
		 *
		 * WARNING! The (parent, chain) deletion may modify the parent
		 *	    and invalidate the base pointer.
		 *
		 * WARNING! Parent must already be marked modified, so we
		 *	    can assume that chain_delete always succeeds.
		 */
		error = hammer2_chain_delete(parent, chain, mtid, 0);
		KKASSERT(error == 0);
		hammer2_chain_rename(NULL, &ichain, chain, mtid, 0);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		base = NULL;	/* safety */
		hammer2_spin_ex(&parent->core.spin);
next_key_spinlocked:
		if (--maxloops == 0)
			panic("hammer2_chain_create_indirect: maxloops");
		reason = 4;
		if (key_next == 0 || key_next > key_end)
			break;
		key_beg = key_next;
		/* loop */
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set UPDATE here because we mark ichain
	 * modified down below (so the normal modified -> flush -> set-moved
	 * sequence applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	base = NULL;	/* safety, parent modify may change address */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, ichain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	/*
	 * Make sure flushes propagate after our manual insertion.
	 */
	hammer2_chain_setflush(ichain);
	hammer2_chain_setflush(parent);

	/*
	 * Figure out what to return.
	 */
	if (~(((hammer2_key_t)1 << keybits) - 1) &
	    (create_key ^ key)) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
	} else {
		/*
		 * Otherwise it's in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}

/*
 * Do maintenance on an indirect chain.  Both parent and chain are locked.
 *
 * Returns non-zero if (chain) is deleted, either due to being empty or
 * because its children were safely moved into the parent.
 */
int
hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
				   hammer2_chain_t *chain)
{
	hammer2_blockref_t *chain_base;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_key_t key_next;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_chain_t *sub;
	int chain_count;
	int count;
	int error;
	int generation;

	/*
	 * Make sure we have an accurate live_count
	 */
	if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		hammer2_chain_countbrefs(chain, base, count);
	}

	/*
	 * If the indirect block is empty we can delete it.
	 * (ignore deletion error)
	 */
	if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
		hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
		return 1;
	}

	base = hammer2_chain_base_and_count(parent, &count);

	if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
			      HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		hammer2_chain_countbrefs(parent, base, count);
	}

	/*
	 * Determine if we can collapse chain into parent, calculate
	 * hysteresis for chain emptiness.
4032 */ 4033 if (parent->core.live_count + chain->core.live_count - 1 > count) 4034 return 0; 4035 chain_count = chain->bytes / sizeof(hammer2_blockref_t); 4036 if (chain->core.live_count > chain_count * 3 / 4) 4037 return 0; 4038 4039 /* 4040 * Ok, theoretically we can collapse chain's contents into 4041 * parent. chain is locked, but any in-memory children of chain 4042 * are not. For this to work, we must be able to dispose of any 4043 * in-memory children of chain. 4044 * 4045 * For now require that there are no in-memory children of chain. 4046 * 4047 * WARNING! Both chain and parent must remain locked across this 4048 * entire operation. 4049 */ 4050 4051 /* 4052 * Parent must be marked modified. Don't try to collapse it if we 4053 * can't mark it modified. Once modified, destroy chain to make room 4054 * and to get rid of what will be a conflicting key (this is included 4055 * in the calculation above). Finally, move the children of chain 4056 * into chain's parent. 4057 * 4058 * This order creates an accounting problem for bref.embed.stats 4059 * because we destroy chain before we remove its children. Any 4060 * elements whos blockref is already synchronized will be counted 4061 * twice. To deal with the problem we clean out chain's stats prior 4062 * to deleting it. 
4063 */ 4064 error = hammer2_chain_modify(parent, 0, 0, 0); 4065 if (error) { 4066 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n", 4067 hammer2_error_str(error)); 4068 return 0; 4069 } 4070 error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0); 4071 if (error) { 4072 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n", 4073 hammer2_error_str(error)); 4074 return 0; 4075 } 4076 4077 chain->bref.embed.stats.inode_count = 0; 4078 chain->bref.embed.stats.data_count = 0; 4079 error = hammer2_chain_delete(parent, chain, 4080 chain->bref.modify_tid, 4081 HAMMER2_DELETE_PERMANENT); 4082 KKASSERT(error == 0); 4083 4084 /* 4085 * The combined_find call requires core.spin to be held. One would 4086 * think there wouldn't be any conflicts since we hold chain 4087 * exclusively locked, but the caching mechanism for 0-ref children 4088 * does not require a chain lock. 4089 */ 4090 hammer2_spin_ex(&chain->core.spin); 4091 4092 key_next = 0; 4093 key_beg = 0; 4094 key_end = HAMMER2_KEY_MAX; 4095 for (;;) { 4096 chain_base = &chain->data->npdata[0]; 4097 chain_count = chain->bytes / sizeof(hammer2_blockref_t); 4098 sub = hammer2_combined_find(chain, chain_base, chain_count, 4099 &key_next, 4100 key_beg, key_end, 4101 &bref); 4102 generation = chain->core.generation; 4103 if (bref == NULL) 4104 break; 4105 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4106 4107 if (sub) { 4108 hammer2_chain_ref(sub); 4109 hammer2_spin_unex(&chain->core.spin); 4110 hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER); 4111 if (sub->parent != chain || 4112 (sub->flags & HAMMER2_CHAIN_DELETED)) { 4113 hammer2_chain_unlock(sub); 4114 hammer2_chain_drop(sub); 4115 hammer2_spin_ex(&chain->core.spin); 4116 continue; 4117 } 4118 } else { 4119 bcopy = *bref; 4120 hammer2_spin_unex(&chain->core.spin); 4121 sub = hammer2_chain_get(chain, generation, &bcopy); 4122 if (sub == NULL) { 4123 hammer2_spin_ex(&chain->core.spin); 4124 continue; 4125 } 4126 hammer2_chain_lock(sub, 
HAMMER2_RESOLVE_NEVER); 4127 if (bcmp(&bcopy, bref, sizeof(bcopy)) != 0) { 4128 hammer2_chain_unlock(sub); 4129 hammer2_chain_drop(sub); 4130 hammer2_spin_ex(&chain->core.spin); 4131 continue; 4132 } 4133 } 4134 error = hammer2_chain_delete(chain, sub, 4135 sub->bref.modify_tid, 0); 4136 KKASSERT(error == 0); 4137 hammer2_chain_rename(NULL, &parent, sub, 4138 sub->bref.modify_tid, 4139 HAMMER2_INSERT_SAMEPARENT); 4140 hammer2_chain_unlock(sub); 4141 hammer2_chain_drop(sub); 4142 hammer2_spin_ex(&chain->core.spin); 4143 4144 if (key_next == 0) 4145 break; 4146 key_beg = key_next; 4147 } 4148 hammer2_spin_unex(&chain->core.spin); 4149 4150 return 1; 4151 } 4152 4153 /* 4154 * Freemap indirect blocks 4155 * 4156 * Calculate the keybits and highside/lowside of the freemap node the 4157 * caller is creating. 4158 * 4159 * This routine will specify the next higher-level freemap key/radix 4160 * representing the lowest-ordered set. By doing so, eventually all 4161 * low-ordered sets will be moved one level down. 4162 * 4163 * We have to be careful here because the freemap reserves a limited 4164 * number of blocks for a limited number of levels. So we can't just 4165 * push indiscriminately. 4166 */ 4167 int 4168 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp, 4169 int keybits, hammer2_blockref_t *base, int count) 4170 { 4171 hammer2_chain_t *chain; 4172 hammer2_blockref_t *bref; 4173 hammer2_key_t key; 4174 hammer2_key_t key_beg; 4175 hammer2_key_t key_end; 4176 hammer2_key_t key_next; 4177 int locount; 4178 int hicount; 4179 int maxloops = 300000; 4180 4181 key = *keyp; 4182 locount = 0; 4183 hicount = 0; 4184 keybits = 64; 4185 4186 /* 4187 * Calculate the range of keys in the array being careful to skip 4188 * slots which are overridden with a deletion. 
4189 */ 4190 key_beg = 0; 4191 key_end = HAMMER2_KEY_MAX; 4192 hammer2_spin_ex(&parent->core.spin); 4193 4194 for (;;) { 4195 if (--maxloops == 0) { 4196 panic("indkey_freemap shit %p %p:%d\n", 4197 parent, base, count); 4198 } 4199 chain = hammer2_combined_find(parent, base, count, 4200 &key_next, 4201 key_beg, key_end, 4202 &bref); 4203 4204 /* 4205 * Exhausted search 4206 */ 4207 if (bref == NULL) 4208 break; 4209 4210 /* 4211 * Skip deleted chains. 4212 */ 4213 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4214 if (key_next == 0 || key_next > key_end) 4215 break; 4216 key_beg = key_next; 4217 continue; 4218 } 4219 4220 /* 4221 * Use the full live (not deleted) element for the scan 4222 * iteration. HAMMER2 does not allow partial replacements. 4223 * 4224 * XXX should be built into hammer2_combined_find(). 4225 */ 4226 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4227 4228 if (keybits > bref->keybits) { 4229 key = bref->key; 4230 keybits = bref->keybits; 4231 } else if (keybits == bref->keybits && bref->key < key) { 4232 key = bref->key; 4233 } 4234 if (key_next == 0) 4235 break; 4236 key_beg = key_next; 4237 } 4238 hammer2_spin_unex(&parent->core.spin); 4239 4240 /* 4241 * Return the keybits for a higher-level FREEMAP_NODE covering 4242 * this node. 
4243 */ 4244 switch(keybits) { 4245 case HAMMER2_FREEMAP_LEVEL0_RADIX: 4246 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX; 4247 break; 4248 case HAMMER2_FREEMAP_LEVEL1_RADIX: 4249 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX; 4250 break; 4251 case HAMMER2_FREEMAP_LEVEL2_RADIX: 4252 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX; 4253 break; 4254 case HAMMER2_FREEMAP_LEVEL3_RADIX: 4255 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX; 4256 break; 4257 case HAMMER2_FREEMAP_LEVEL4_RADIX: 4258 keybits = HAMMER2_FREEMAP_LEVEL5_RADIX; 4259 break; 4260 case HAMMER2_FREEMAP_LEVEL5_RADIX: 4261 panic("hammer2_chain_indkey_freemap: level too high"); 4262 break; 4263 default: 4264 panic("hammer2_chain_indkey_freemap: bad radix"); 4265 break; 4266 } 4267 *keyp = key; 4268 4269 return (keybits); 4270 } 4271 4272 /* 4273 * File indirect blocks 4274 * 4275 * Calculate the key/keybits for the indirect block to create by scanning 4276 * existing keys. The key being created is also passed in *keyp and can be 4277 * inside or outside the indirect block. Regardless, the indirect block 4278 * must hold at least two keys in order to guarantee sufficient space. 4279 * 4280 * We use a modified version of the freemap's fixed radix tree, but taylored 4281 * for file data. Basically we configure an indirect block encompassing the 4282 * smallest key. 4283 */ 4284 static int 4285 hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp, 4286 int keybits, hammer2_blockref_t *base, int count, 4287 int ncount) 4288 { 4289 hammer2_chain_t *chain; 4290 hammer2_blockref_t *bref; 4291 hammer2_key_t key; 4292 hammer2_key_t key_beg; 4293 hammer2_key_t key_end; 4294 hammer2_key_t key_next; 4295 int nradix; 4296 int locount; 4297 int hicount; 4298 int maxloops = 300000; 4299 4300 key = *keyp; 4301 locount = 0; 4302 hicount = 0; 4303 keybits = 64; 4304 4305 /* 4306 * Calculate the range of keys in the array being careful to skip 4307 * slots which are overridden with a deletion. 4308 * 4309 * Locate the smallest key. 
4310 */ 4311 key_beg = 0; 4312 key_end = HAMMER2_KEY_MAX; 4313 hammer2_spin_ex(&parent->core.spin); 4314 4315 for (;;) { 4316 if (--maxloops == 0) { 4317 panic("indkey_freemap shit %p %p:%d\n", 4318 parent, base, count); 4319 } 4320 chain = hammer2_combined_find(parent, base, count, 4321 &key_next, 4322 key_beg, key_end, 4323 &bref); 4324 4325 /* 4326 * Exhausted search 4327 */ 4328 if (bref == NULL) 4329 break; 4330 4331 /* 4332 * Skip deleted chains. 4333 */ 4334 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4335 if (key_next == 0 || key_next > key_end) 4336 break; 4337 key_beg = key_next; 4338 continue; 4339 } 4340 4341 /* 4342 * Use the full live (not deleted) element for the scan 4343 * iteration. HAMMER2 does not allow partial replacements. 4344 * 4345 * XXX should be built into hammer2_combined_find(). 4346 */ 4347 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4348 4349 if (keybits > bref->keybits) { 4350 key = bref->key; 4351 keybits = bref->keybits; 4352 } else if (keybits == bref->keybits && bref->key < key) { 4353 key = bref->key; 4354 } 4355 if (key_next == 0) 4356 break; 4357 key_beg = key_next; 4358 } 4359 hammer2_spin_unex(&parent->core.spin); 4360 4361 /* 4362 * Calculate the static keybits for a higher-level indirect block 4363 * that contains the key. 4364 */ 4365 *keyp = key; 4366 4367 switch(ncount) { 4368 case HAMMER2_IND_BYTES_MIN / sizeof(hammer2_blockref_t): 4369 nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX; 4370 break; 4371 case HAMMER2_IND_BYTES_NOM / sizeof(hammer2_blockref_t): 4372 nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX; 4373 break; 4374 case HAMMER2_IND_BYTES_MAX / sizeof(hammer2_blockref_t): 4375 nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX; 4376 break; 4377 default: 4378 panic("bad ncount %d\n", ncount); 4379 nradix = 0; 4380 break; 4381 } 4382 4383 /* 4384 * The largest radix that can be returned for an indirect block is 4385 * 63 bits. 
(The largest practical indirect block radix is actually 4386 * 62 bits because the top-level inode or volume root contains four 4387 * entries, but allow 63 to be returned). 4388 */ 4389 if (nradix >= 64) 4390 nradix = 63; 4391 4392 return keybits + nradix; 4393 } 4394 4395 #if 1 4396 4397 /* 4398 * Directory indirect blocks. 4399 * 4400 * Covers both the inode index (directory of inodes), and directory contents 4401 * (filenames hardlinked to inodes). 4402 * 4403 * Because directory keys are hashed we generally try to cut the space in 4404 * half. We accomodate the inode index (which tends to have linearly 4405 * increasing inode numbers) by ensuring that the keyspace is at least large 4406 * enough to fill up the indirect block being created. 4407 */ 4408 static int 4409 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp, 4410 int keybits, hammer2_blockref_t *base, int count, 4411 int ncount) 4412 { 4413 hammer2_blockref_t *bref; 4414 hammer2_chain_t *chain; 4415 hammer2_key_t key_beg; 4416 hammer2_key_t key_end; 4417 hammer2_key_t key_next; 4418 hammer2_key_t key; 4419 int nkeybits; 4420 int locount; 4421 int hicount; 4422 int maxloops = 300000; 4423 4424 /* 4425 * NOTE: We can't take a shortcut here anymore for inodes because 4426 * the root directory can contain a mix of inodes and directory 4427 * entries (we used to just return 63 if parent->bref.type was 4428 * HAMMER2_BREF_TYPE_INODE. 4429 */ 4430 key = *keyp; 4431 locount = 0; 4432 hicount = 0; 4433 4434 /* 4435 * Calculate the range of keys in the array being careful to skip 4436 * slots which are overridden with a deletion. 
4437 */ 4438 key_beg = 0; 4439 key_end = HAMMER2_KEY_MAX; 4440 hammer2_spin_ex(&parent->core.spin); 4441 4442 for (;;) { 4443 if (--maxloops == 0) { 4444 panic("indkey_freemap shit %p %p:%d\n", 4445 parent, base, count); 4446 } 4447 chain = hammer2_combined_find(parent, base, count, 4448 &key_next, 4449 key_beg, key_end, 4450 &bref); 4451 4452 /* 4453 * Exhausted search 4454 */ 4455 if (bref == NULL) 4456 break; 4457 4458 /* 4459 * Deleted object 4460 */ 4461 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4462 if (key_next == 0 || key_next > key_end) 4463 break; 4464 key_beg = key_next; 4465 continue; 4466 } 4467 4468 /* 4469 * Use the full live (not deleted) element for the scan 4470 * iteration. HAMMER2 does not allow partial replacements. 4471 * 4472 * XXX should be built into hammer2_combined_find(). 4473 */ 4474 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4475 4476 /* 4477 * Expand our calculated key range (key, keybits) to fit 4478 * the scanned key. nkeybits represents the full range 4479 * that we will later cut in half (two halves @ nkeybits - 1). 4480 */ 4481 nkeybits = keybits; 4482 if (nkeybits < bref->keybits) { 4483 if (bref->keybits > 64) { 4484 kprintf("bad bref chain %p bref %p\n", 4485 chain, bref); 4486 Debugger("fubar"); 4487 } 4488 nkeybits = bref->keybits; 4489 } 4490 while (nkeybits < 64 && 4491 (~(((hammer2_key_t)1 << nkeybits) - 1) & 4492 (key ^ bref->key)) != 0) { 4493 ++nkeybits; 4494 } 4495 4496 /* 4497 * If the new key range is larger we have to determine 4498 * which side of the new key range the existing keys fall 4499 * under by checking the high bit, then collapsing the 4500 * locount into the hicount or vise-versa. 
4501 */ 4502 if (keybits != nkeybits) { 4503 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 4504 hicount += locount; 4505 locount = 0; 4506 } else { 4507 locount += hicount; 4508 hicount = 0; 4509 } 4510 keybits = nkeybits; 4511 } 4512 4513 /* 4514 * The newly scanned key will be in the lower half or the 4515 * upper half of the (new) key range. 4516 */ 4517 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 4518 ++hicount; 4519 else 4520 ++locount; 4521 4522 if (key_next == 0) 4523 break; 4524 key_beg = key_next; 4525 } 4526 hammer2_spin_unex(&parent->core.spin); 4527 bref = NULL; /* now invalid (safety) */ 4528 4529 /* 4530 * Adjust keybits to represent half of the full range calculated 4531 * above (radix 63 max) for our new indirect block. 4532 */ 4533 --keybits; 4534 4535 /* 4536 * Expand keybits to hold at least ncount elements. ncount will be 4537 * a power of 2. This is to try to completely fill leaf nodes (at 4538 * least for keys which are not hashes). 4539 * 4540 * We aren't counting 'in' or 'out', we are counting 'high side' 4541 * and 'low side' based on the bit at (1LL << keybits). We want 4542 * everything to be inside in these cases so shift it all to 4543 * the low or high side depending on the new high bit. 4544 */ 4545 while (((hammer2_key_t)1 << keybits) < ncount) { 4546 ++keybits; 4547 if (key & ((hammer2_key_t)1 << keybits)) { 4548 hicount += locount; 4549 locount = 0; 4550 } else { 4551 locount += hicount; 4552 hicount = 0; 4553 } 4554 } 4555 4556 if (hicount > locount) 4557 key |= (hammer2_key_t)1 << keybits; 4558 else 4559 key &= ~(hammer2_key_t)1 << keybits; 4560 4561 *keyp = key; 4562 4563 return (keybits); 4564 } 4565 4566 #else 4567 4568 /* 4569 * Directory indirect blocks. 4570 * 4571 * Covers both the inode index (directory of inodes), and directory contents 4572 * (filenames hardlinked to inodes). 4573 * 4574 * Because directory keys are hashed we generally try to cut the space in 4575 * half. 
We accomodate the inode index (which tends to have linearly 4576 * increasing inode numbers) by ensuring that the keyspace is at least large 4577 * enough to fill up the indirect block being created. 4578 */ 4579 static int 4580 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp, 4581 int keybits, hammer2_blockref_t *base, int count, 4582 int ncount) 4583 { 4584 hammer2_blockref_t *bref; 4585 hammer2_chain_t *chain; 4586 hammer2_key_t key_beg; 4587 hammer2_key_t key_end; 4588 hammer2_key_t key_next; 4589 hammer2_key_t key; 4590 int nkeybits; 4591 int locount; 4592 int hicount; 4593 int maxloops = 300000; 4594 4595 /* 4596 * Shortcut if the parent is the inode. In this situation the 4597 * parent has 4+1 directory entries and we are creating an indirect 4598 * block capable of holding many more. 4599 */ 4600 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) { 4601 return 63; 4602 } 4603 4604 key = *keyp; 4605 locount = 0; 4606 hicount = 0; 4607 4608 /* 4609 * Calculate the range of keys in the array being careful to skip 4610 * slots which are overridden with a deletion. 4611 */ 4612 key_beg = 0; 4613 key_end = HAMMER2_KEY_MAX; 4614 hammer2_spin_ex(&parent->core.spin); 4615 4616 for (;;) { 4617 if (--maxloops == 0) { 4618 panic("indkey_freemap shit %p %p:%d\n", 4619 parent, base, count); 4620 } 4621 chain = hammer2_combined_find(parent, base, count, 4622 &key_next, 4623 key_beg, key_end, 4624 &bref); 4625 4626 /* 4627 * Exhausted search 4628 */ 4629 if (bref == NULL) 4630 break; 4631 4632 /* 4633 * Deleted object 4634 */ 4635 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4636 if (key_next == 0 || key_next > key_end) 4637 break; 4638 key_beg = key_next; 4639 continue; 4640 } 4641 4642 /* 4643 * Use the full live (not deleted) element for the scan 4644 * iteration. HAMMER2 does not allow partial replacements. 4645 * 4646 * XXX should be built into hammer2_combined_find(). 
4647 */ 4648 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4649 4650 /* 4651 * Expand our calculated key range (key, keybits) to fit 4652 * the scanned key. nkeybits represents the full range 4653 * that we will later cut in half (two halves @ nkeybits - 1). 4654 */ 4655 nkeybits = keybits; 4656 if (nkeybits < bref->keybits) { 4657 if (bref->keybits > 64) { 4658 kprintf("bad bref chain %p bref %p\n", 4659 chain, bref); 4660 Debugger("fubar"); 4661 } 4662 nkeybits = bref->keybits; 4663 } 4664 while (nkeybits < 64 && 4665 (~(((hammer2_key_t)1 << nkeybits) - 1) & 4666 (key ^ bref->key)) != 0) { 4667 ++nkeybits; 4668 } 4669 4670 /* 4671 * If the new key range is larger we have to determine 4672 * which side of the new key range the existing keys fall 4673 * under by checking the high bit, then collapsing the 4674 * locount into the hicount or vise-versa. 4675 */ 4676 if (keybits != nkeybits) { 4677 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 4678 hicount += locount; 4679 locount = 0; 4680 } else { 4681 locount += hicount; 4682 hicount = 0; 4683 } 4684 keybits = nkeybits; 4685 } 4686 4687 /* 4688 * The newly scanned key will be in the lower half or the 4689 * upper half of the (new) key range. 4690 */ 4691 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 4692 ++hicount; 4693 else 4694 ++locount; 4695 4696 if (key_next == 0) 4697 break; 4698 key_beg = key_next; 4699 } 4700 hammer2_spin_unex(&parent->core.spin); 4701 bref = NULL; /* now invalid (safety) */ 4702 4703 /* 4704 * Adjust keybits to represent half of the full range calculated 4705 * above (radix 63 max) for our new indirect block. 4706 */ 4707 --keybits; 4708 4709 /* 4710 * Expand keybits to hold at least ncount elements. ncount will be 4711 * a power of 2. This is to try to completely fill leaf nodes (at 4712 * least for keys which are not hashes). 4713 * 4714 * We aren't counting 'in' or 'out', we are counting 'high side' 4715 * and 'low side' based on the bit at (1LL << keybits). 
We want 4716 * everything to be inside in these cases so shift it all to 4717 * the low or high side depending on the new high bit. 4718 */ 4719 while (((hammer2_key_t)1 << keybits) < ncount) { 4720 ++keybits; 4721 if (key & ((hammer2_key_t)1 << keybits)) { 4722 hicount += locount; 4723 locount = 0; 4724 } else { 4725 locount += hicount; 4726 hicount = 0; 4727 } 4728 } 4729 4730 if (hicount > locount) 4731 key |= (hammer2_key_t)1 << keybits; 4732 else 4733 key &= ~(hammer2_key_t)1 << keybits; 4734 4735 *keyp = key; 4736 4737 return (keybits); 4738 } 4739 4740 #endif 4741 4742 /* 4743 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if 4744 * it exists. 4745 * 4746 * Both parent and chain must be locked exclusively. 4747 * 4748 * This function will modify the parent if the blockref requires removal 4749 * from the parent's block table. 4750 * 4751 * This function is NOT recursive. Any entity already pushed into the 4752 * chain (such as an inode) may still need visibility into its contents, 4753 * as well as the ability to read and modify the contents. For example, 4754 * for an unlinked file which is still open. 4755 * 4756 * Also note that the flusher is responsible for cleaning up empty 4757 * indirect blocks. 4758 */ 4759 int 4760 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain, 4761 hammer2_tid_t mtid, int flags) 4762 { 4763 int error = 0; 4764 4765 KKASSERT(hammer2_mtx_owned(&chain->lock)); 4766 4767 /* 4768 * Nothing to do if already marked. 4769 * 4770 * We need the spinlock on the core whos RBTREE contains chain 4771 * to protect against races. 4772 */ 4773 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 4774 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 4775 chain->parent == parent); 4776 error = _hammer2_chain_delete_helper(parent, chain, 4777 mtid, flags); 4778 } 4779 4780 /* 4781 * Permanent deletions mark the chain as destroyed. 
4782 */ 4783 if (error == 0) { 4784 if (flags & HAMMER2_DELETE_PERMANENT) 4785 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 4786 hammer2_chain_setflush(chain); 4787 } 4788 4789 return error; 4790 } 4791 4792 /* 4793 * Returns the index of the nearest element in the blockref array >= elm. 4794 * Returns (count) if no element could be found. 4795 * 4796 * Sets *key_nextp to the next key for loop purposes but does not modify 4797 * it if the next key would be higher than the current value of *key_nextp. 4798 * Note that *key_nexp can overflow to 0, which should be tested by the 4799 * caller. 4800 * 4801 * WARNING! Must be called with parent's spinlock held. Spinlock remains 4802 * held through the operation. 4803 */ 4804 static int 4805 hammer2_base_find(hammer2_chain_t *parent, 4806 hammer2_blockref_t *base, int count, 4807 hammer2_key_t *key_nextp, 4808 hammer2_key_t key_beg, hammer2_key_t key_end) 4809 { 4810 hammer2_blockref_t *scan; 4811 hammer2_key_t scan_end; 4812 int i; 4813 int limit; 4814 4815 /* 4816 * Require the live chain's already have their core's counted 4817 * so we can optimize operations. 4818 */ 4819 KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS); 4820 4821 /* 4822 * Degenerate case 4823 */ 4824 if (count == 0 || base == NULL) 4825 return(count); 4826 4827 /* 4828 * Sequential optimization using parent->cache_index. This is 4829 * the most likely scenario. 4830 * 4831 * We can avoid trailing empty entries on live chains, otherwise 4832 * we might have to check the whole block array. 
4833 */ 4834 i = parent->cache_index; /* SMP RACE OK */ 4835 cpu_ccfence(); 4836 limit = parent->core.live_zero; 4837 if (i >= limit) 4838 i = limit - 1; 4839 if (i < 0) 4840 i = 0; 4841 KKASSERT(i < count); 4842 4843 /* 4844 * Search backwards 4845 */ 4846 scan = &base[i]; 4847 while (i > 0 && (scan->type == 0 || scan->key > key_beg)) { 4848 --scan; 4849 --i; 4850 } 4851 parent->cache_index = i; 4852 4853 /* 4854 * Search forwards, stop when we find a scan element which 4855 * encloses the key or until we know that there are no further 4856 * elements. 4857 */ 4858 while (i < count) { 4859 if (scan->type != 0) { 4860 scan_end = scan->key + 4861 ((hammer2_key_t)1 << scan->keybits) - 1; 4862 if (scan->key > key_beg || scan_end >= key_beg) 4863 break; 4864 } 4865 if (i >= limit) 4866 return (count); 4867 ++scan; 4868 ++i; 4869 } 4870 if (i != count) { 4871 parent->cache_index = i; 4872 if (i >= limit) { 4873 i = count; 4874 } else { 4875 scan_end = scan->key + 4876 ((hammer2_key_t)1 << scan->keybits); 4877 if (scan_end && (*key_nextp > scan_end || 4878 *key_nextp == 0)) { 4879 *key_nextp = scan_end; 4880 } 4881 } 4882 } 4883 return (i); 4884 } 4885 4886 /* 4887 * Do a combined search and return the next match either from the blockref 4888 * array or from the in-memory chain. Sets *bresp to the returned bref in 4889 * both cases, or sets it to NULL if the search exhausted. Only returns 4890 * a non-NULL chain if the search matched from the in-memory chain. 4891 * 4892 * When no in-memory chain has been found and a non-NULL bref is returned 4893 * in *bresp. 4894 * 4895 * 4896 * The returned chain is not locked or referenced. Use the returned bref 4897 * to determine if the search exhausted or not. Iterate if the base find 4898 * is chosen but matches a deleted chain. 4899 * 4900 * WARNING! Must be called with parent's spinlock held. Spinlock remains 4901 * held through the operation. 
4902 */ 4903 hammer2_chain_t * 4904 hammer2_combined_find(hammer2_chain_t *parent, 4905 hammer2_blockref_t *base, int count, 4906 hammer2_key_t *key_nextp, 4907 hammer2_key_t key_beg, hammer2_key_t key_end, 4908 hammer2_blockref_t **bresp) 4909 { 4910 hammer2_blockref_t *bref; 4911 hammer2_chain_t *chain; 4912 int i; 4913 4914 /* 4915 * Lookup in block array and in rbtree. 4916 */ 4917 *key_nextp = key_end + 1; 4918 i = hammer2_base_find(parent, base, count, key_nextp, 4919 key_beg, key_end); 4920 chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end); 4921 4922 /* 4923 * Neither matched 4924 */ 4925 if (i == count && chain == NULL) { 4926 *bresp = NULL; 4927 return(NULL); 4928 } 4929 4930 /* 4931 * Only chain matched. 4932 */ 4933 if (i == count) { 4934 bref = &chain->bref; 4935 goto found; 4936 } 4937 4938 /* 4939 * Only blockref matched. 4940 */ 4941 if (chain == NULL) { 4942 bref = &base[i]; 4943 goto found; 4944 } 4945 4946 /* 4947 * Both in-memory and blockref matched, select the nearer element. 4948 * 4949 * If both are flush with the left-hand side or both are the 4950 * same distance away, select the chain. In this situation the 4951 * chain must have been loaded from the matching blockmap. 4952 */ 4953 if ((chain->bref.key <= key_beg && base[i].key <= key_beg) || 4954 chain->bref.key == base[i].key) { 4955 KKASSERT(chain->bref.key == base[i].key); 4956 bref = &chain->bref; 4957 goto found; 4958 } 4959 4960 /* 4961 * Select the nearer key 4962 */ 4963 if (chain->bref.key < base[i].key) { 4964 bref = &chain->bref; 4965 } else { 4966 bref = &base[i]; 4967 chain = NULL; 4968 } 4969 4970 /* 4971 * If the bref is out of bounds we've exhausted our search. 4972 */ 4973 found: 4974 if (bref->key > key_end) { 4975 *bresp = NULL; 4976 chain = NULL; 4977 } else { 4978 *bresp = bref; 4979 } 4980 return(chain); 4981 } 4982 4983 /* 4984 * Locate the specified block array element and delete it. The element 4985 * must exist. 
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain)
{
	hammer2_blockref_t *elm = &chain->bref;	/* element being removed */
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	/*
	 * The keybits check is skipped when BMAPUPD is set because the
	 * chain's keybits may have been changed since the blockmap entry
	 * was written.
	 */
	if (i == count || scan->type == 0 ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update stats and zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
			(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		--parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			/* saturated, exact count unknown; flag a hint */
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Recursive stats for indirect/inode brefs; DATA brefs
		 * reaching here via fallthrough carry no embedded stats.
		 */
		if (scan->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count -=
				scan->embed.stats.data_count;
			parent->bref.embed.stats.inode_count -=
				scan->embed.stats.inode_count;
		}
		if (scan->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			/* clamp at zero, do not underflow */
			if (parent->bref.leaf_count <= scan->leaf_count)
				parent->bref.leaf_count = 0;
			else
				parent->bref.leaf_count -= scan->leaf_count;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* NOTE(review): falls into default (which only breaks);
		 * harmless but a break here may be intended — confirm. */
	default:
		break;
	}

	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * Scan backwards over trailing empty entries to pull live_zero in.
	 */
	if (parent->core.live_zero == i + 1) {
		while (--i >= 0 && base[i].type == 0)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
					HAMMER2_CHAIN_BMAPUPD);
}

/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;		/* inclusive end key of elm's range */
	int i;				/* insertion index */
	int j;				/* backward empty-slot probe */
	int k;				/* forward empty-slot probe */
	int l;				/* validation iterator */
	int u = 1;			/* which path inserted (debugging) */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * Update stats accounting for the new element (mirror image of
	 * the adjustments in hammer2_base_delete()).
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
			(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		/* saturate leaf_count at LEAF_MAX rather than overflow */
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}


	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 * Appending past the last live entry needs no shuffling.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after, shifting the
	 * intervening elements toward it to open up index i.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == 0) {
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == 0) {
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging: verify no two live entries have overlapping key
	 * ranges after the insertion.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}

#if 0

/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
 */
typedef hammer2_blockref_t *hammer2_blockref_p;

/*
 * qsort comparator: empty (type == 0) elements sort to the end of the
 * array, live elements sort ascending by key.
 */
static
int
hammer2_base_sort_callback(const void *v1, const void *v2)
{
	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;

	/*
	 * Make sure empty elements are placed at the end of the array
	 */
	if (bref1->type == 0) {
		if (bref2->type == 0)
			return(0);
		return(1);
	} else if (bref2->type == 0) {
		return(-1);
	}

	/*
	 * Sort by key
	 */
	if (bref1->key < bref2->key)
		return(-1);
	if (bref1->key > bref2->key)
		return(1);
	return(0);
}

void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;
	int count;

	/*
	 * Locate the blockref array embedded in the chain's media data.
	 * The base/count pair depends on the chain type.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (chain->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			return;
		}
		base = &chain->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0);
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &chain->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &chain->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		/*
		 * NOTE(review): the message says "hammer2_chain_lookup",
		 * apparently copied from the lookup path.  The tsleep loop
		 * wedges the thread forever, so the panic() below is
		 * unreachable.
		 */
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(A) type: %d",
			chain->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(A) type: %d",
		      chain->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}
	kqsort(base, count, sizeof(*base), hammer2_base_sort_callback);
}

#endif

/*
 * Chain memory management
 */

/*
 * Sleep for one tick; used to wait out transient chain flush activity.
 */
void
hammer2_chain_wait(hammer2_chain_t *chain)
{
	tsleep(chain, 0, "chnflw", 1);
}

/*
 * Return the chain's media data for read access.  The data must already
 * be resolved (asserted non-NULL).
 */
const hammer2_media_data_t *
hammer2_chain_rdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

/*
 * Return the chain's media data for write access.  The data must already
 * be resolved (asserted non-NULL).
 */
hammer2_media_data_t *
hammer2_chain_wdata(hammer2_chain_t *chain)
{
	KKASSERT(chain->data != NULL);
	return (chain->data);
}

/*
 * Set the check data for a chain.  This can be a heavy-weight operation
 * and typically only runs on-flush.  For file data check data is calculated
 * when the logical buffers are flushed.
 */
void
hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata)
{
	/*
	 * A real check code is being recorded, so the ZERO shortcut flag
	 * (which lets hammer2_chain_testcheck() succeed without data) is
	 * no longer valid.
	 */
	chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO;

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		break;
	case HAMMER2_CHECK_DISABLED:
		break;
	case HAMMER2_CHECK_ISCSI32:
		chain->bref.check.iscsi32.value =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	case HAMMER2_CHECK_XXHASH64:
		chain->bref.check.xxhash64.value =
			XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		break;
	case HAMMER2_CHECK_SHA192:
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			/*
			 * Compute SHA-256 then fold the tail 64-bit word
			 * into word 2, storing only the first
			 * sizeof(sha192.data) bytes of the result.
			 */
			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			bcopy(u.digest,
			      chain->bref.check.sha192.data,
			      sizeof(chain->bref.check.sha192.data));
		}
		break;
	case HAMMER2_CHECK_FREEMAP:
		chain->bref.check.freemap.icrc32 =
			hammer2_icrc32(bdata, chain->bytes);
		break;
	default:
		kprintf("hammer2_chain_setcheck: unknown check type %02x\n",
			chain->bref.methods);
		break;
	}
}

/*
 * Verify bdata against the check code stored in the chain's blockref.
 * Returns non-zero on success (or when checking is not applicable),
 * zero on a check-code mismatch.  Failures are logged via kprintf.
 */
int
hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata)
{
	uint32_t check32;
	uint64_t check64;
	int r;

	/*
	 * Blocks flagged all-zero pass trivially without touching bdata.
	 */
	if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO)
		return 1;

	switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
	case HAMMER2_CHECK_NONE:
		r = 1;
		break;
	case HAMMER2_CHECK_DISABLED:
		r = 1;
		break;
	case HAMMER2_CHECK_ISCSI32:
		check32 = hammer2_icrc32(bdata, chain->bytes);
		r = (chain->bref.check.iscsi32.value == check32);
		if (r == 0) {
			kprintf("chain %016jx.%02x meth=%02x CHECK FAIL "
				"(flags=%08x, bref/data %08x/%08x)\n",
				chain->bref.data_off,
				chain->bref.type,
				chain->bref.methods,
				chain->flags,
				chain->bref.check.iscsi32.value,
				check32);
		}
		/* accumulate global byte counters for check-method stats */
		hammer2_check_icrc32 += chain->bytes;
		break;
	case HAMMER2_CHECK_XXHASH64:
		check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
		r = (chain->bref.check.xxhash64.value == check64);
		if (r == 0) {
			kprintf("chain %016jx.%02x key=%016jx "
				"meth=%02x CHECK FAIL "
				"(flags=%08x, bref/data %016jx/%016jx)\n",
				chain->bref.data_off,
				chain->bref.type,
				chain->bref.key,
				chain->bref.methods,
				chain->flags,
				chain->bref.check.xxhash64.value,
				check64);
		}
		hammer2_check_xxhash64 += chain->bytes;
		break;
	case HAMMER2_CHECK_SHA192:
		{
			SHA256_CTX hash_ctx;
			union {
				uint8_t digest[SHA256_DIGEST_LENGTH];
				uint64_t digest64[SHA256_DIGEST_LENGTH/8];
			} u;

			/*
			 * Must fold the digest exactly as
			 * hammer2_chain_setcheck() does before comparing.
			 */
			SHA256_Init(&hash_ctx);
			SHA256_Update(&hash_ctx, bdata, chain->bytes);
			SHA256_Final(u.digest, &hash_ctx);
			u.digest64[2] ^= u.digest64[3];
			if (bcmp(u.digest,
				 chain->bref.check.sha192.data,
				 sizeof(chain->bref.check.sha192.data)) == 0) {
				r = 1;
			} else {
				r = 0;
				kprintf("chain %016jx.%02x meth=%02x "
					"CHECK FAIL\n",
					chain->bref.data_off,
					chain->bref.type,
					chain->bref.methods);
			}
		}
		break;
	case HAMMER2_CHECK_FREEMAP:
		r = (chain->bref.check.freemap.icrc32 ==
		     hammer2_icrc32(bdata, chain->bytes));
		if (r == 0) {
			kprintf("chain %016jx.%02x meth=%02x "
				"CHECK FAIL\n",
				chain->bref.data_off,
				chain->bref.type,
				chain->bref.methods);
			kprintf("freemap.icrc %08x icrc32 %08x (%d)\n",
				chain->bref.check.freemap.icrc32,
				hammer2_icrc32(bdata, chain->bytes),
				chain->bytes);
			if (chain->dio)
				kprintf("dio %p buf %016jx,%d bdata %p/%p\n",
					chain->dio, chain->dio->bp->b_loffset,
					chain->dio->bp->b_bufsize, bdata,
					chain->dio->bp->b_data);
		}

		break;
	default:
kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 5520 chain->bref.methods); 5521 r = 1; 5522 break; 5523 } 5524 return r; 5525 } 5526 5527 /* 5528 * Acquire the chain and parent representing the specified inode for the 5529 * device at the specified cluster index. 5530 * 5531 * The flags passed in are LOOKUP flags, not RESOLVE flags. 5532 * 5533 * If we are unable to locate the hardlink, INVAL is returned and *chainp 5534 * will be NULL. *parentp may still be set error or not, or NULL if the 5535 * parent itself could not be resolved. 5536 * 5537 * Caller must pass-in a valid or NULL *parentp or *chainp. The passed-in 5538 * *parentp and *chainp will be unlocked if not NULL. 5539 */ 5540 int 5541 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 5542 int clindex, int flags, 5543 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 5544 { 5545 hammer2_chain_t *parent; 5546 hammer2_chain_t *rchain; 5547 hammer2_key_t key_dummy; 5548 int resolve_flags; 5549 int error; 5550 5551 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 5552 HAMMER2_RESOLVE_SHARED : 0; 5553 5554 /* 5555 * Caller expects us to replace these. 5556 */ 5557 if (*chainp) { 5558 hammer2_chain_unlock(*chainp); 5559 hammer2_chain_drop(*chainp); 5560 *chainp = NULL; 5561 } 5562 if (*parentp) { 5563 hammer2_chain_unlock(*parentp); 5564 hammer2_chain_drop(*parentp); 5565 *parentp = NULL; 5566 } 5567 5568 /* 5569 * Inodes hang off of the iroot (bit 63 is clear, differentiating 5570 * inodes from root directory entries in the key lookup). 
 */
	parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags);
	rchain = NULL;
	if (parent) {
		/* error is set by the lookup on both success and failure */
		rchain = hammer2_chain_lookup(&parent, &key_dummy,
					      inum, inum,
					      &error, flags);
	} else {
		/* could not resolve the cluster-index chain for iroot */
		error = HAMMER2_ERROR_EIO;
	}
	*parentp = parent;
	*chainp = rchain;

	return error;
}

/*
 * Used by the bulkscan code to snapshot the synchronized storage for
 * a volume, allowing it to be scanned concurrently against normal
 * operation.
 */
hammer2_chain_t *
hammer2_chain_bulksnap(hammer2_dev_t *hmp)
{
	hammer2_chain_t *copy;

	/*
	 * The copy gets private data storage holding the last synchronized
	 * volume header (hmp->volsync), copied under the voldata lock.
	 * The storage is released by hammer2_chain_bulkdrop().
	 */
	copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref);
	copy->data = kmalloc(sizeof(copy->data->voldata),
			     hmp->mchain,
			     M_WAITOK | M_ZERO);
	hammer2_voldata_lock(hmp);
	copy->data->voldata = hmp->volsync;
	hammer2_voldata_unlock(hmp);

	return copy;
}

/*
 * Release a snapshot chain obtained from hammer2_chain_bulksnap(),
 * freeing the privately allocated volume-data copy first.
 */
void
hammer2_chain_bulkdrop(hammer2_chain_t *copy)
{
	KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME);
	KKASSERT(copy->data);
	kfree(copy->data, copy->hmp->mchain);
	copy->data = NULL;
	/*
	 * NOTE(review): presumably balances an accounting increment made
	 * when the private data was attached — confirm against
	 * hammer2_chain_alloc/bulksnap accounting.
	 */
	atomic_add_long(&hammer2_chain_allocs, -1);
	hammer2_chain_drop(copy);
}

/*
 * Returns non-zero if the chain (INODE or DIRENT) matches the
 * filename.
5622 */ 5623 int 5624 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name, 5625 size_t name_len) 5626 { 5627 const hammer2_inode_data_t *ripdata; 5628 const hammer2_dirent_head_t *den; 5629 5630 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 5631 ripdata = &chain->data->ipdata; 5632 if (ripdata->meta.name_len == name_len && 5633 bcmp(ripdata->filename, name, name_len) == 0) { 5634 return 1; 5635 } 5636 } 5637 if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT && 5638 chain->bref.embed.dirent.namlen == name_len) { 5639 den = &chain->bref.embed.dirent; 5640 if (name_len > sizeof(chain->bref.check.buf) && 5641 bcmp(chain->data->buf, name, name_len) == 0) { 5642 return 1; 5643 } 5644 if (name_len <= sizeof(chain->bref.check.buf) && 5645 bcmp(chain->bref.check.buf, name, name_len) == 0) { 5646 return 1; 5647 } 5648 } 5649 return 0; 5650 } 5651