1 /* 2 * Copyright (c) 2011-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 
 *
 * Chains are the in-memory version on media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * Chains are no-longer delete-duplicated.  Instead, the original in-memory
 * chain will be moved along with its block reference (e.g. for things like
 * renames, hardlink operations, modifications, etc), and will be indexed
 * on a secondary list for flush handling instead of propagating a flag
 * upward to the root.
 *
 * Concurrent front-end operations can still run against backend flushes
 * as long as they do not cross the current flush boundary.  An operation
 * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while an operation running below the current
 * flush can become part of the next flush.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/kern_syscall.h>
#include <sys/uuid.h>

#include <crypto/sha2/sha2.h>

#include "hammer2.h"

static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_chain_t *parent,
		hammer2_key_t key, int keybits,
		hammer2_tid_t mtid, int for_type, int *errorp);
static hammer2_io_t *hammer2_chain_drop_data(hammer2_chain_t *chain);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **bresp);

static struct krate krate_h2me = { .freq = 1 };

/*
 * Basic RBTree for chains (core->rbtree and core->dbtree).  Chains cannot
 * overlap in the RB trees.  Deleted chains are moved from rbtree to either
 * dbtree or to dbq.
 *
 * Chains in delete-duplicate sequences can always iterate through core_entry
 * to locate the live version of the chain.
88 */ 89 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 90 91 int 92 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 93 { 94 hammer2_key_t c1_beg; 95 hammer2_key_t c1_end; 96 hammer2_key_t c2_beg; 97 hammer2_key_t c2_end; 98 99 /* 100 * Compare chains. Overlaps are not supposed to happen and catch 101 * any software issues early we count overlaps as a match. 102 */ 103 c1_beg = chain1->bref.key; 104 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1; 105 c2_beg = chain2->bref.key; 106 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1; 107 108 if (c1_end < c2_beg) /* fully to the left */ 109 return(-1); 110 if (c1_beg > c2_end) /* fully to the right */ 111 return(1); 112 return(0); /* overlap (must not cross edge boundary) */ 113 } 114 115 /* 116 * Assert that a chain has no media data associated with it. 117 */ 118 static __inline void 119 hammer2_chain_assert_no_data(hammer2_chain_t *chain) 120 { 121 KKASSERT(chain->dio == NULL); 122 if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME && 123 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP && 124 chain->data) { 125 panic("hammer2_assert_no_data: chain %p still has data", chain); 126 } 127 } 128 129 /* 130 * Make a chain visible to the flusher. The flusher needs to be able to 131 * do flushes of subdirectory chains or single files so it does a top-down 132 * recursion using the ONFLUSH flag for the recursion. It locates MODIFIED 133 * or UPDATE chains and flushes back up the chain to the volume root. 134 * 135 * This routine sets ONFLUSH upward until it hits the volume root. For 136 * simplicity we ignore PFSROOT boundaries whos rules can be complex. 137 * Extra ONFLUSH flagging doesn't hurt the filesystem. 
138 */ 139 void 140 hammer2_chain_setflush(hammer2_chain_t *chain) 141 { 142 hammer2_chain_t *parent; 143 144 if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 145 hammer2_spin_sh(&chain->core.spin); 146 while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 147 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH); 148 if ((parent = chain->parent) == NULL) 149 break; 150 hammer2_spin_sh(&parent->core.spin); 151 hammer2_spin_unsh(&chain->core.spin); 152 chain = parent; 153 } 154 hammer2_spin_unsh(&chain->core.spin); 155 } 156 } 157 158 /* 159 * Allocate a new disconnected chain element representing the specified 160 * bref. chain->refs is set to 1 and the passed bref is copied to 161 * chain->bref. chain->bytes is derived from the bref. 162 * 163 * chain->pmp inherits pmp unless the chain is an inode (other than the 164 * super-root inode). 165 * 166 * NOTE: Returns a referenced but unlocked (because there is no core) chain. 167 */ 168 hammer2_chain_t * 169 hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp, 170 hammer2_blockref_t *bref) 171 { 172 hammer2_chain_t *chain; 173 u_int bytes; 174 175 /* 176 * Special case - radix of 0 indicates a chain that does not 177 * need a data reference (context is completely embedded in the 178 * bref). 179 */ 180 if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX)) 181 bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 182 else 183 bytes = 0; 184 185 atomic_add_long(&hammer2_chain_allocs, 1); 186 187 /* 188 * Construct the appropriate system structure. 189 */ 190 switch(bref->type) { 191 case HAMMER2_BREF_TYPE_DIRENT: 192 case HAMMER2_BREF_TYPE_INODE: 193 case HAMMER2_BREF_TYPE_INDIRECT: 194 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 195 case HAMMER2_BREF_TYPE_DATA: 196 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 197 /* 198 * Chain's are really only associated with the hmp but we 199 * maintain a pmp association for per-mount memory tracking 200 * purposes. The pmp can be NULL. 
201 */ 202 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO); 203 break; 204 case HAMMER2_BREF_TYPE_VOLUME: 205 case HAMMER2_BREF_TYPE_FREEMAP: 206 /* 207 * Only hammer2_chain_bulksnap() calls this function with these 208 * types. 209 */ 210 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO); 211 break; 212 default: 213 chain = NULL; 214 panic("hammer2_chain_alloc: unrecognized blockref type: %d", 215 bref->type); 216 } 217 218 /* 219 * Initialize the new chain structure. pmp must be set to NULL for 220 * chains belonging to the super-root topology of a device mount. 221 */ 222 if (pmp == hmp->spmp) 223 chain->pmp = NULL; 224 else 225 chain->pmp = pmp; 226 chain->hmp = hmp; 227 chain->bref = *bref; 228 chain->bytes = bytes; 229 chain->refs = 1; 230 chain->flags = HAMMER2_CHAIN_ALLOCATED; 231 232 /* 233 * Set the PFS boundary flag if this chain represents a PFS root. 234 */ 235 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT) 236 atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY); 237 hammer2_chain_core_init(chain); 238 239 return (chain); 240 } 241 242 /* 243 * Initialize a chain's core structure. This structure used to be allocated 244 * but is now embedded. 245 * 246 * The core is not locked. No additional refs on the chain are made. 247 * (trans) must not be NULL if (core) is not NULL. 248 */ 249 void 250 hammer2_chain_core_init(hammer2_chain_t *chain) 251 { 252 /* 253 * Fresh core under nchain (no multi-homing of ochain's 254 * sub-tree). 255 */ 256 RB_INIT(&chain->core.rbtree); /* live chains */ 257 hammer2_mtx_init(&chain->lock, "h2chain"); 258 } 259 260 /* 261 * Add a reference to a chain element, preventing its destruction. 262 * 263 * (can be called with spinlock held) 264 */ 265 void 266 hammer2_chain_ref(hammer2_chain_t *chain) 267 { 268 if (atomic_fetchadd_int(&chain->refs, 1) == 0) { 269 /* 270 * 0->non-zero transition must ensure that chain is removed 271 * from the LRU list. 
272 * 273 * NOTE: Already holding lru_spin here so we cannot call 274 * hammer2_chain_ref() to get it off lru_list, do 275 * it manually. 276 */ 277 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 278 hammer2_pfs_t *pmp = chain->pmp; 279 hammer2_spin_ex(&pmp->lru_spin); 280 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 281 atomic_add_int(&pmp->lru_count, -1); 282 atomic_clear_int(&chain->flags, 283 HAMMER2_CHAIN_ONLRU); 284 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 285 } 286 hammer2_spin_unex(&pmp->lru_spin); 287 } 288 } 289 #if 0 290 kprintf("REFC %p %d %08x\n", chain, chain->refs - 1, chain->flags); 291 print_backtrace(8); 292 #endif 293 } 294 295 /* 296 * Ref a locked chain and force the data to be held across an unlock. 297 * Chain must be currently locked. The user of the chain who desires 298 * to release the hold must call hammer2_chain_lock_unhold() to relock 299 * and unhold the chain, then unlock normally, or may simply call 300 * hammer2_chain_drop_unhold() (which is safer against deadlocks). 301 */ 302 void 303 hammer2_chain_ref_hold(hammer2_chain_t *chain) 304 { 305 atomic_add_int(&chain->lockcnt, 1); 306 hammer2_chain_ref(chain); 307 } 308 309 /* 310 * Insert the chain in the core rbtree. 311 * 312 * Normal insertions are placed in the live rbtree. Insertion of a deleted 313 * chain is a special case used by the flush code that is placed on the 314 * unstaged deleted list to avoid confusing the live view. 
315 */ 316 #define HAMMER2_CHAIN_INSERT_SPIN 0x0001 317 #define HAMMER2_CHAIN_INSERT_LIVE 0x0002 318 #define HAMMER2_CHAIN_INSERT_RACE 0x0004 319 320 static 321 int 322 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain, 323 int flags, int generation) 324 { 325 hammer2_chain_t *xchain; 326 int error = 0; 327 328 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 329 hammer2_spin_ex(&parent->core.spin); 330 331 /* 332 * Interlocked by spinlock, check for race 333 */ 334 if ((flags & HAMMER2_CHAIN_INSERT_RACE) && 335 parent->core.generation != generation) { 336 error = HAMMER2_ERROR_EAGAIN; 337 goto failed; 338 } 339 340 /* 341 * Insert chain 342 */ 343 xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain); 344 KASSERT(xchain == NULL, 345 ("hammer2_chain_insert: collision %p %p (key=%016jx)", 346 chain, xchain, chain->bref.key)); 347 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 348 chain->parent = parent; 349 ++parent->core.chain_count; 350 ++parent->core.generation; /* XXX incs for _get() too, XXX */ 351 352 /* 353 * We have to keep track of the effective live-view blockref count 354 * so the create code knows when to push an indirect block. 355 */ 356 if (flags & HAMMER2_CHAIN_INSERT_LIVE) 357 atomic_add_int(&parent->core.live_count, 1); 358 failed: 359 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 360 hammer2_spin_unex(&parent->core.spin); 361 return error; 362 } 363 364 /* 365 * Drop the caller's reference to the chain. When the ref count drops to 366 * zero this function will try to disassociate the chain from its parent and 367 * deallocate it, then recursely drop the parent using the implied ref 368 * from the chain's chain->parent. 369 * 370 * Nobody should own chain's mutex on the 1->0 transition, unless this drop 371 * races an acquisition by another cpu. Therefore we can loop if we are 372 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again 373 * race against another drop. 
374 */ 375 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain); 376 377 void 378 hammer2_chain_drop(hammer2_chain_t *chain) 379 { 380 u_int refs; 381 382 if (hammer2_debug & 0x200000) 383 Debugger("drop"); 384 #if 0 385 kprintf("DROP %p %d %08x\n", chain, chain->refs - 1, chain->flags); 386 print_backtrace(8); 387 #endif 388 389 KKASSERT(chain->refs > 0); 390 391 while (chain) { 392 refs = chain->refs; 393 cpu_ccfence(); 394 KKASSERT(refs > 0); 395 396 if (refs == 1) { 397 if (mtx_lock_ex_try(&chain->lock) == 0) 398 chain = hammer2_chain_lastdrop(chain); 399 /* retry the same chain */ 400 } else { 401 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 402 break; 403 /* retry the same chain */ 404 } 405 cpu_pause(); 406 } 407 } 408 409 /* 410 * Unhold a held and probably not-locked chain, ensure that the data is 411 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive 412 * lock and then simply unlocking the chain. 413 */ 414 void 415 hammer2_chain_drop_unhold(hammer2_chain_t *chain) 416 { 417 u_int lockcnt; 418 int iter = 0; 419 420 for (;;) { 421 lockcnt = chain->lockcnt; 422 cpu_ccfence(); 423 if (lockcnt > 1) { 424 if (atomic_cmpset_int(&chain->lockcnt, 425 lockcnt, lockcnt - 1)) { 426 break; 427 } 428 } else if (mtx_lock_ex_try(&chain->lock) == 0) { 429 hammer2_chain_unlock(chain); 430 break; 431 } else { 432 /* 433 * This situation can easily occur on SMP due to 434 * the gap inbetween the 1->0 transition and the 435 * final unlock. We cannot safely block on the 436 * mutex because lockcnt might go above 1. 437 * 438 * XXX Sleep for one tick if it takes too long. 439 */ 440 if (++iter > 1000) { 441 if (iter > 1000 + hz) { 442 kprintf("hammer2: h2race1 %p\n", chain); 443 iter = 1000; 444 } 445 tsleep(&iter, 0, "h2race1", 1); 446 } 447 cpu_pause(); 448 } 449 } 450 hammer2_chain_drop(chain); 451 } 452 453 /* 454 * Handles the (potential) last drop of chain->refs from 1->0. 
Called with 455 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are 456 * possible against refs and lockcnt. We must dispose of the mutex on chain. 457 * 458 * This function returns an unlocked chain for recursive drop or NULL. It 459 * can return the same chain if it determines it has raced another ref. 460 * 461 * -- 462 * 463 * When two chains need to be recursively dropped we use the chain we 464 * would otherwise free to placehold the additional chain. It's a bit 465 * convoluted but we can't just recurse without potentially blowing out 466 * the kernel stack. 467 * 468 * The chain cannot be freed if it has any children. 469 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it. 470 * The chain cannot be freed if flagged UPDATE unless we can dispose of it. 471 * Any dedup registration can remain intact. 472 * 473 * The core spinlock is allowed to nest child-to-parent (not parent-to-child). 474 */ 475 static 476 hammer2_chain_t * 477 hammer2_chain_lastdrop(hammer2_chain_t *chain) 478 { 479 hammer2_pfs_t *pmp; 480 hammer2_dev_t *hmp; 481 hammer2_chain_t *parent; 482 hammer2_chain_t *rdrop; 483 #if 0 484 hammer2_io_t *dio; 485 #endif 486 487 #if 0 488 /* 489 * On last drop if there is no parent and data_off is good (at 490 * least does not represent the volume root), the modified chain 491 * is probably going to be destroyed. We have to make sure that 492 * the data area is not registered for dedup. 493 * 494 * XXX removed. In fact, we do not have to make sure that the 495 * data area is not registered for dedup. The data area 496 * can, in fact, still be used for dedup because it is 497 * still allocated in the freemap and the underlying I/O 498 * will still be flushed. 
499 */ 500 if (chain->parent == NULL && 501 (chain->flags & HAMMER2_CHAIN_MODIFIED) && 502 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 503 hmp = chain->hmp; 504 hammer2_io_dedup_delete(hmp, chain->bref.type, 505 chain->bref.data_off, chain->bytes); 506 } 507 #endif 508 /* 509 * We need chain's spinlock to interlock the sub-tree test. 510 * We already have chain's mutex, protecting chain->parent. 511 * 512 * Remember that chain->refs can be in flux. 513 */ 514 hammer2_spin_ex(&chain->core.spin); 515 516 if ((parent = chain->parent) != NULL) { 517 /* 518 * If the chain has a parent the UPDATE bit prevents scrapping 519 * as the chain is needed to properly flush the parent. Try 520 * to complete the 1->0 transition and return NULL. Retry 521 * (return chain) if we are unable to complete the 1->0 522 * transition, else return NULL (nothing more to do). 523 * 524 * If the chain has a parent the MODIFIED bit prevents 525 * scrapping. 526 * 527 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 528 */ 529 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 530 HAMMER2_CHAIN_MODIFIED)) { 531 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 532 hammer2_spin_unex(&chain->core.spin); 533 #if 0 534 dio = hammer2_chain_drop_data(chain, 0); 535 if (dio) 536 hammer2_io_bqrelse(&dio); 537 #endif 538 hammer2_chain_assert_no_data(chain); 539 hammer2_mtx_unlock(&chain->lock); 540 chain = NULL; 541 } else { 542 hammer2_spin_unex(&chain->core.spin); 543 hammer2_mtx_unlock(&chain->lock); 544 } 545 return (chain); 546 } 547 /* spinlock still held */ 548 } else { 549 /* 550 * The chain has no parent and can be flagged for destruction. 551 * Since it has no parent, UPDATE can also be cleared. 552 */ 553 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 554 if (chain->flags & HAMMER2_CHAIN_UPDATE) 555 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 556 557 /* 558 * If the chain has children we must still flush the chain. 
559 * Any dedup is already handled by the underlying DIO, so 560 * we do not have to specifically flush it here. 561 * 562 * In the case where it has children, the DESTROY flag test 563 * in the flush code will prevent unnecessary flushes of 564 * MODIFIED chains that are not flagged DEDUP so don't worry 565 * about that here. 566 */ 567 if (chain->core.chain_count) { 568 /* 569 * Put on flushq (should ensure refs > 1), retry 570 * the drop. 571 */ 572 hammer2_spin_unex(&chain->core.spin); 573 hammer2_delayed_flush(chain); 574 hammer2_mtx_unlock(&chain->lock); 575 576 return(chain); /* retry drop */ 577 } 578 579 /* 580 * Otherwise we can scrap the MODIFIED bit if it is set, 581 * and continue along the freeing path. 582 * 583 * Be sure to clean-out any dedup bits. Without a parent 584 * this chain will no longer be visible to the flush code. 585 * Easy check data_off to avoid the volume root. 586 */ 587 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 588 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 589 atomic_add_long(&hammer2_count_modified_chains, -1); 590 if (chain->pmp) 591 hammer2_pfs_memory_wakeup(chain->pmp); 592 } 593 /* spinlock still held */ 594 } 595 596 /* spinlock still held */ 597 #if 0 598 dio = NULL; 599 #endif 600 601 /* 602 * If any children exist we must leave the chain intact with refs == 0. 603 * They exist because chains are retained below us which have refs or 604 * may require flushing. 605 * 606 * Retry (return chain) if we fail to transition the refs to 0, else 607 * return NULL indication nothing more to do. 608 * 609 * Chains with children are NOT put on the LRU list. 
610 */ 611 if (chain->core.chain_count) { 612 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 613 hammer2_spin_unex(&chain->core.spin); 614 hammer2_chain_assert_no_data(chain); 615 hammer2_mtx_unlock(&chain->lock); 616 chain = NULL; 617 } else { 618 hammer2_spin_unex(&chain->core.spin); 619 hammer2_mtx_unlock(&chain->lock); 620 } 621 return (chain); 622 } 623 /* spinlock still held */ 624 /* no chains left under us */ 625 626 /* 627 * chain->core has no children left so no accessors can get to our 628 * chain from there. Now we have to lock the parent core to interlock 629 * remaining possible accessors that might bump chain's refs before 630 * we can safely drop chain's refs with intent to free the chain. 631 */ 632 hmp = chain->hmp; 633 pmp = chain->pmp; /* can be NULL */ 634 rdrop = NULL; 635 636 parent = chain->parent; 637 638 /* 639 * WARNING! chain's spin lock is still held here, and other spinlocks 640 * will be acquired and released in the code below. We 641 * cannot be making fancy procedure calls! 642 */ 643 644 /* 645 * We can cache the chain if it is associated with a pmp 646 * and not flagged as being destroyed or requesting a full 647 * release. In this situation the chain is not removed 648 * from its parent, i.e. it can still be looked up. 649 * 650 * We intentionally do not cache DATA chains because these 651 * were likely used to load data into the logical buffer cache 652 * and will not be accessed again for some time. 653 */ 654 if ((chain->flags & 655 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 656 chain->pmp && 657 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 658 if (parent) 659 hammer2_spin_ex(&parent->core.spin); 660 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 661 /* 662 * 1->0 transition failed, retry. Do not drop 663 * the chain's data yet! 
664 */ 665 if (parent) 666 hammer2_spin_unex(&parent->core.spin); 667 hammer2_spin_unex(&chain->core.spin); 668 hammer2_mtx_unlock(&chain->lock); 669 670 return(chain); 671 } 672 673 /* 674 * Success 675 */ 676 #if 0 677 dio = hammer2_chain_drop_data(chain, 1); 678 #endif 679 hammer2_chain_assert_no_data(chain); 680 681 KKASSERT((chain->flags & HAMMER2_CHAIN_ONLRU) == 0); 682 hammer2_spin_ex(&pmp->lru_spin); 683 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 684 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 685 686 /* 687 * If we are over the LRU limit we need to drop something. 688 */ 689 if (pmp->lru_count > HAMMER2_LRU_LIMIT) { 690 rdrop = TAILQ_FIRST(&pmp->lru_list); 691 atomic_clear_int(&rdrop->flags, HAMMER2_CHAIN_ONLRU); 692 TAILQ_REMOVE(&pmp->lru_list, rdrop, lru_node); 693 atomic_add_int(&rdrop->refs, 1); 694 atomic_set_int(&rdrop->flags, HAMMER2_CHAIN_RELEASE); 695 } else { 696 atomic_add_int(&pmp->lru_count, 1); 697 } 698 hammer2_spin_unex(&pmp->lru_spin); 699 if (parent) { 700 hammer2_spin_unex(&parent->core.spin); 701 parent = NULL; /* safety */ 702 } 703 hammer2_spin_unex(&chain->core.spin); 704 hammer2_mtx_unlock(&chain->lock); 705 #if 0 706 if (dio) 707 hammer2_io_bqrelse(&dio); 708 #endif 709 710 return rdrop; 711 /* NOT REACHED */ 712 } 713 714 /* 715 * Spinlock the parent and try to drop the last ref on chain. 716 * On success determine if we should dispose of the chain 717 * (remove the chain from its parent, etc). 718 * 719 * (normal core locks are top-down recursive but we define 720 * core spinlocks as bottom-up recursive, so this is safe). 
721 */ 722 if (parent) { 723 hammer2_spin_ex(&parent->core.spin); 724 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 725 #if 0 726 /* XXX remove, don't try to drop data on fail */ 727 hammer2_spin_unex(&parent->core.spin); 728 dio = hammer2_chain_drop_data(chain, 0); 729 hammer2_spin_unex(&chain->core.spin); 730 if (dio) 731 hammer2_io_bqrelse(&dio); 732 #endif 733 /* 734 * 1->0 transition failed, retry. 735 */ 736 hammer2_spin_unex(&parent->core.spin); 737 hammer2_spin_unex(&chain->core.spin); 738 hammer2_mtx_unlock(&chain->lock); 739 740 return(chain); 741 } 742 743 /* 744 * 1->0 transition successful, parent spin held to prevent 745 * new lookups, chain spinlock held to protect parent field. 746 * Remove chain from the parent. 747 * 748 * If the chain is being removed from the parent's btree but 749 * is not bmapped, we have to adjust live_count downward. If 750 * it is bmapped then the blockref is retained in the parent 751 * as is its associated live_count. This case can occur when 752 * a chain added to the topology is unable to flush and is 753 * then later deleted. 754 */ 755 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 756 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 757 (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) { 758 atomic_add_int(&parent->core.live_count, -1); 759 } 760 RB_REMOVE(hammer2_chain_tree, 761 &parent->core.rbtree, chain); 762 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 763 --parent->core.chain_count; 764 chain->parent = NULL; 765 } 766 767 /* 768 * If our chain was the last chain in the parent's core the 769 * core is now empty and its parent might have to be 770 * re-dropped if it has 0 refs. 
771 */ 772 if (parent->core.chain_count == 0) { 773 rdrop = parent; 774 atomic_add_int(&rdrop->refs, 1); 775 /* 776 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 777 rdrop = NULL; 778 */ 779 } 780 hammer2_spin_unex(&parent->core.spin); 781 parent = NULL; /* safety */ 782 /* FALL THROUGH */ 783 } else { 784 /* 785 * No-parent case. 786 */ 787 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 788 /* 789 * 1->0 transition failed, retry. 790 */ 791 hammer2_spin_unex(&parent->core.spin); 792 hammer2_spin_unex(&chain->core.spin); 793 hammer2_mtx_unlock(&chain->lock); 794 795 return(chain); 796 } 797 } 798 799 /* 800 * Successful 1->0 transition, no parent, no children... no way for 801 * anyone to ref this chain any more. We can clean-up and free it. 802 * 803 * We still have the core spinlock, and core's chain_count is 0. 804 * Any parent spinlock is gone. 805 */ 806 hammer2_spin_unex(&chain->core.spin); 807 hammer2_chain_assert_no_data(chain); 808 hammer2_mtx_unlock(&chain->lock); 809 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 810 chain->core.chain_count == 0); 811 812 /* 813 * All locks are gone, no pointers remain to the chain, finish 814 * freeing it. 815 */ 816 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 817 HAMMER2_CHAIN_MODIFIED)) == 0); 818 #if 0 819 dio = hammer2_chain_drop_data(chain, 1); 820 if (dio) 821 hammer2_io_bqrelse(&dio); 822 #endif 823 824 /* 825 * Once chain resources are gone we can use the now dead chain 826 * structure to placehold what might otherwise require a recursive 827 * drop, because we have potentially two things to drop and can only 828 * return one directly. 829 */ 830 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 831 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 832 chain->hmp = NULL; 833 kfree(chain, hmp->mchain); 834 } 835 836 /* 837 * Possible chaining loop when parent re-drop needed. 838 */ 839 return(rdrop); 840 } 841 842 /* 843 * On last lock release. 
844 */ 845 static hammer2_io_t * 846 hammer2_chain_drop_data(hammer2_chain_t *chain) 847 { 848 hammer2_io_t *dio; 849 850 if ((dio = chain->dio) != NULL) { 851 chain->dio = NULL; 852 chain->data = NULL; 853 } else { 854 switch(chain->bref.type) { 855 case HAMMER2_BREF_TYPE_VOLUME: 856 case HAMMER2_BREF_TYPE_FREEMAP: 857 break; 858 default: 859 if (chain->data != NULL) { 860 hammer2_spin_unex(&chain->core.spin); 861 panic("chain data not null: " 862 "chain %p bref %016jx.%02x " 863 "refs %d parent %p dio %p data %p", 864 chain, chain->bref.data_off, 865 chain->bref.type, chain->refs, 866 chain->parent, 867 chain->dio, chain->data); 868 } 869 KKASSERT(chain->data == NULL); 870 break; 871 } 872 } 873 return dio; 874 } 875 876 /* 877 * Lock a referenced chain element, acquiring its data with I/O if necessary, 878 * and specify how you would like the data to be resolved. 879 * 880 * If an I/O or other fatal error occurs, chain->error will be set to non-zero. 881 * 882 * The lock is allowed to recurse, multiple locking ops will aggregate 883 * the requested resolve types. Once data is assigned it will not be 884 * removed until the last unlock. 885 * 886 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 887 * (typically used to avoid device/logical buffer 888 * aliasing for data) 889 * 890 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 891 * the INITIAL-create state (indirect blocks only). 892 * 893 * Do not resolve data elements for DATA chains. 894 * (typically used to avoid device/logical buffer 895 * aliasing for data) 896 * 897 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 898 * 899 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 900 * it will be locked exclusive. 901 * 902 * NOTE: Embedded elements (volume header, inodes) are always resolved 903 * regardless. 
904 * 905 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded 906 * element will instantiate and zero its buffer, and flush it on 907 * release. 908 * 909 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE 910 * so as not to instantiate a device buffer, which could alias against 911 * a logical file buffer. However, if ALWAYS is specified the 912 * device buffer will be instantiated anyway. 913 * 914 * WARNING! This function blocks on I/O if data needs to be fetched. This 915 * blocking can run concurrent with other compatible lock holders 916 * who do not need data returning. The lock is not upgraded to 917 * exclusive during a data fetch, a separate bit is used to 918 * interlock I/O. However, an exclusive lock holder can still count 919 * on being interlocked against an I/O fetch managed by a shared 920 * lock holder. 921 */ 922 void 923 hammer2_chain_lock(hammer2_chain_t *chain, int how) 924 { 925 /* 926 * Ref and lock the element. Recursive locks are allowed. 927 */ 928 KKASSERT(chain->refs > 0); 929 atomic_add_int(&chain->lockcnt, 1); 930 931 /* 932 * Get the appropriate lock. If LOCKAGAIN is flagged with SHARED 933 * the caller expects a shared lock to already be present and we 934 * are giving it another ref. This case must importantly not block 935 * if there is a pending exclusive lock request. 936 */ 937 if (how & HAMMER2_RESOLVE_SHARED) { 938 if (how & HAMMER2_RESOLVE_LOCKAGAIN) { 939 hammer2_mtx_sh_again(&chain->lock); 940 } else { 941 hammer2_mtx_sh(&chain->lock); 942 } 943 } else { 944 hammer2_mtx_ex(&chain->lock); 945 } 946 ++curthread->td_tracker; 947 948 /* 949 * If we already have a valid data pointer make sure the data is 950 * synchronized to the current cpu, and then no further action is 951 * necessary. 952 */ 953 if (chain->data) { 954 if (chain->dio) 955 hammer2_io_bkvasync(chain->dio); 956 return; 957 } 958 959 /* 960 * Do we have to resolve the data? 
This is generally only 961 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased. 962 * Other BREF types expects the data to be there. 963 */ 964 switch(how & HAMMER2_RESOLVE_MASK) { 965 case HAMMER2_RESOLVE_NEVER: 966 return; 967 case HAMMER2_RESOLVE_MAYBE: 968 if (chain->flags & HAMMER2_CHAIN_INITIAL) 969 return; 970 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) 971 return; 972 #if 0 973 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) 974 return; 975 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) 976 return; 977 #endif 978 /* fall through */ 979 case HAMMER2_RESOLVE_ALWAYS: 980 default: 981 break; 982 } 983 984 /* 985 * Caller requires data 986 */ 987 hammer2_chain_load_data(chain); 988 } 989 990 /* 991 * Lock the chain, retain the hold, and drop the data persistence count. 992 * The data should remain valid because we never transitioned lockcnt 993 * through 0. 994 */ 995 void 996 hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how) 997 { 998 hammer2_chain_lock(chain, how); 999 atomic_add_int(&chain->lockcnt, -1); 1000 } 1001 1002 #if 0 1003 /* 1004 * Downgrade an exclusive chain lock to a shared chain lock. 1005 * 1006 * NOTE: There is no upgrade equivalent due to the ease of 1007 * deadlocks in that direction. 1008 */ 1009 void 1010 hammer2_chain_lock_downgrade(hammer2_chain_t *chain) 1011 { 1012 hammer2_mtx_downgrade(&chain->lock); 1013 } 1014 #endif 1015 1016 /* 1017 * Issue I/O and install chain->data. Caller must hold a chain lock, lock 1018 * may be of any type. 1019 * 1020 * Once chain->data is set it cannot be disposed of until all locks are 1021 * released. 1022 * 1023 * Make sure the data is synchronized to the current cpu. 1024 */ 1025 void 1026 hammer2_chain_load_data(hammer2_chain_t *chain) 1027 { 1028 hammer2_blockref_t *bref; 1029 hammer2_dev_t *hmp; 1030 hammer2_io_t *dio; 1031 char *bdata; 1032 int error; 1033 1034 /* 1035 * Degenerate case, data already present, or chain has no media 1036 * reference to load. 
         */
        if (chain->data) {
                if (chain->dio)
                        hammer2_io_bkvasync(chain->dio);
                return;
        }
        if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
                return;

        hmp = chain->hmp;
        KKASSERT(hmp != NULL);

        /*
         * Gain the IOINPROG bit, interlocked block.  IOINPROG acts as a
         * bit-lock serializing concurrent loaders; losers set IOSIGNAL
         * and sleep until the winner wakes them in the done: path below.
         */
        for (;;) {
                u_int oflags;
                u_int nflags;

                oflags = chain->flags;
                cpu_ccfence();
                if (oflags & HAMMER2_CHAIN_IOINPROG) {
                        nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
                        tsleep_interlock(&chain->flags, 0);
                        if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
                                tsleep(&chain->flags, PINTERLOCKED,
                                       "h2iocw", 0);
                        }
                        /* retry */
                } else {
                        nflags = oflags | HAMMER2_CHAIN_IOINPROG;
                        if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
                                break;
                        }
                        /* retry */
                }
        }

        /*
         * We own CHAIN_IOINPROG
         *
         * Degenerate case if we raced another load.
         */
        if (chain->data) {
                if (chain->dio)
                        hammer2_io_bkvasync(chain->dio);
                goto done;
        }

        /*
         * We must resolve to a device buffer, either by issuing I/O or
         * by creating a zero-fill element.  We do not mark the buffer
         * dirty when creating a zero-fill element (the hammer2_chain_modify()
         * API must still be used to do that).
         *
         * The device buffer is variable-sized in powers of 2 down
         * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
         * chunk always contains buffers of the same size. (XXX)
         *
         * The minimum physical IO size may be larger than the variable
         * block size.
         */
        bref = &chain->bref;

        /*
         * The getblk() optimization can only be used on newly created
         * elements if the physical block size matches the request.
         */
        if (chain->flags & HAMMER2_CHAIN_INITIAL) {
                error = hammer2_io_new(hmp, bref->type,
                                       bref->data_off, chain->bytes,
                                       &chain->dio);
        } else {
                error = hammer2_io_bread(hmp, bref->type,
                                         bref->data_off, chain->bytes,
                                         &chain->dio);
                hammer2_adjreadcounter(&chain->bref, chain->bytes);
        }
        if (error) {
                chain->error = HAMMER2_ERROR_EIO;
                kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
                        (intmax_t)bref->data_off, error);
                hammer2_io_bqrelse(&chain->dio);
                goto done;
        }
        chain->error = 0;

        /*
         * This isn't perfect and can be ignored on OSs which do not have
         * an indication as to whether a buffer is coming from cache or
         * if I/O was actually issued for the read.  TESTEDGOOD will work
         * pretty well without the B_IOISSUED logic because chains are
         * cached, but in that situation (without B_IOISSUED) it will not
         * detect whether a re-read via I/O is corrupted versus the original
         * read.
         *
         * We can't re-run the CRC on every fresh lock.  That would be
         * insanely expensive.
         *
         * If the underlying kernel buffer covers the entire chain we can
         * use the B_IOISSUED indication to determine if we have to re-run
         * the CRC on chain data for chains that managed to stay cached
         * across the kernel disposal of the original buffer.
         */
        if ((dio = chain->dio) != NULL && dio->bp) {
                struct buf *bp = dio->bp;

                if (dio->psize == chain->bytes &&
                    (bp->b_flags & B_IOISSUED)) {
                        atomic_clear_int(&chain->flags,
                                         HAMMER2_CHAIN_TESTEDGOOD);
                        bp->b_flags &= ~B_IOISSUED;
                }
        }

        /*
         * NOTE: A locked chain's data cannot be modified without first
         *       calling hammer2_chain_modify().
         */

        /*
         * Clear INITIAL.  In this case we used io_new() and the buffer has
         * been zero'd and marked dirty.
         *
         * NOTE: hammer2_io_data() call issues bkvasync()
         */
        bdata = hammer2_io_data(chain->dio, chain->bref.data_off);

        if (chain->flags & HAMMER2_CHAIN_INITIAL) {
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
                chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO;
        } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                /*
                 * check data not currently synchronized due to
                 * modification.  XXX assumes data stays in the buffer
                 * cache, which might not be true (need biodep on flush
                 * to calculate crc?  or simple crc?).
                 */
        } else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) {
                if (hammer2_chain_testcheck(chain, bdata) == 0) {
                        chain->error = HAMMER2_ERROR_CHECK;
                } else {
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD);
                }
        }

        /*
         * Setup the data pointer, either pointing it to an embedded data
         * structure and copying the data from the buffer, or pointing it
         * into the buffer.
         *
         * The buffer is not retained when copying to an embedded data
         * structure in order to avoid potential deadlocks or recursions
         * on the same physical buffer.
         *
         * WARNING! Other threads can start using the data the instant we
         *          set chain->data non-NULL.
         */
        switch (bref->type) {
        case HAMMER2_BREF_TYPE_VOLUME:
        case HAMMER2_BREF_TYPE_FREEMAP:
                /*
                 * Copy data from bp to embedded buffer
                 */
                panic("hammer2_chain_load_data: unresolved volume header");
                break;
        case HAMMER2_BREF_TYPE_DIRENT:
                KKASSERT(chain->bytes != 0);
                /* fall through */
        case HAMMER2_BREF_TYPE_INODE:
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_DATA:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
        default:
                /*
                 * Point data at the device buffer and leave dio intact.
                 */
                chain->data = (void *)bdata;
                break;
        }

        /*
         * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
         */
done:
        for (;;) {
                u_int oflags;
                u_int nflags;

                oflags = chain->flags;
                nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
                                    HAMMER2_CHAIN_IOSIGNAL);
                KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
                if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
                        if (oflags & HAMMER2_CHAIN_IOSIGNAL)
                                wakeup(&chain->flags);
                        break;
                }
        }
}

/*
 * Unlock and deref a chain element.
 *
 * Remember that the presence of children under chain prevents the chain's
 * destruction but does not add additional references, so the dio will still
 * be dropped.
 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
        hammer2_io_t *dio;
        u_int lockcnt;
        int iter = 0;

        --curthread->td_tracker;

        /*
         * If multiple locks are present (or being attempted) on this
         * particular chain we can just unlock, drop refs, and return.
         *
         * Otherwise fall-through on the 1->0 transition.
         */
        for (;;) {
                lockcnt = chain->lockcnt;
                KKASSERT(lockcnt > 0);
                cpu_ccfence();
                if (lockcnt > 1) {
                        if (atomic_cmpset_int(&chain->lockcnt,
                                              lockcnt, lockcnt - 1)) {
                                hammer2_mtx_unlock(&chain->lock);
                                return;
                        }
                } else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) {
                        /* while holding the mutex exclusively */
                        if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
                                break;
                } else {
                        /*
                         * This situation can easily occur on SMP due to
                         * the gap in between the 1->0 transition and the
                         * final unlock.  We cannot safely block on the
                         * mutex because lockcnt might go above 1.
                         *
                         * XXX Sleep for one tick if it takes too long.
                         */
                        if (++iter > 1000) {
                                if (iter > 1000 + hz) {
                                        kprintf("hammer2: h2race2 %p\n", chain);
                                        iter = 1000;
                                }
                                /*
                                 * &iter is a stack-local wait channel; no
                                 * wakeup() targets it, so we rely purely on
                                 * the 1-tick timeout to retry.
                                 */
                                tsleep(&iter, 0, "h2race2", 1);
                        }
                        cpu_pause();
                }
                /* retry */
        }

        /*
         * Last unlock / mutex upgraded to exclusive.  Drop the data
         * reference.
         */
        dio = hammer2_chain_drop_data(chain);
        if (dio)
                hammer2_io_bqrelse(&dio);
        hammer2_mtx_unlock(&chain->lock);
}

/*
 * Unlock and hold chain data intact
 */
void
hammer2_chain_unlock_hold(hammer2_chain_t *chain)
{
        atomic_add_int(&chain->lockcnt, 1);
        hammer2_chain_unlock(chain);
}

/*
 * Helper to obtain the blockref[] array base and count for a chain.
 *
 * XXX Not widely used yet, various use cases need to be validated and
 *     converted to use this function.
 */
static
hammer2_blockref_t *
hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp)
{
        hammer2_blockref_t *base;
        int count;

        if (parent->flags & HAMMER2_CHAIN_INITIAL) {
                /*
                 * INITIAL chains have no media-backed blockref array yet;
                 * return NULL but still report the count for the type.
                 */
                base = NULL;

                switch(parent->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                        count = parent->bytes / sizeof(hammer2_blockref_t);
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_FREEMAP:
                        count = HAMMER2_SET_COUNT;
                        break;
                default:
                        panic("hammer2_chain_create_indirect: "
                              "unrecognized blockref type: %d",
                              parent->bref.type);
                        count = 0;
                        break;
                }
        } else {
                switch(parent->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
                        base = &parent->data->ipdata.u.blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                        base =
                            &parent->data->npdata[0];
                        count = parent->bytes / sizeof(hammer2_blockref_t);
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
                        base = &parent->data->voldata.
                                        sroot_blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_FREEMAP:
                        base = &parent->data->blkset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                        break;
                default:
                        panic("hammer2_chain_create_indirect: "
                              "unrecognized blockref type: %d",
                              parent->bref.type);
                        count = 0;
                        break;
                }
        }
        *countp = count;

        return base;
}

/*
 * This counts the number of live blockrefs in a block array and
 * also calculates the point at which all remaining blockrefs are empty.
 * This routine can only be called on a live chain.
 *
 * Caller holds the chain locked, but possibly with a shared lock.  We
 * must use an exclusive spinlock to prevent corruption.
 *
 * NOTE: Flag is not set until after the count is complete, allowing
 *       callers to test the flag without holding the spinlock.
 *
 * NOTE: If base is NULL the related chain is still in the INITIAL
 *       state and there are no blockrefs to count.
 *
 * NOTE: live_count may already have some counts accumulated due to
 *       creation and deletion and could even be initially negative.
 */
void
hammer2_chain_countbrefs(hammer2_chain_t *chain,
                         hammer2_blockref_t *base, int count)
{
        hammer2_spin_ex(&chain->core.spin);
        if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
                if (base) {
                        /*
                         * First loop scans backwards to find the last
                         * non-empty blockref; live_zero is the index just
                         * past it.  Second loop counts the live entries
                         * at or below that point.
                         */
                        while (--count >= 0) {
                                if (base[count].type)
                                        break;
                        }
                        chain->core.live_zero = count + 1;
                        while (count >= 0) {
                                if (base[count].type)
                                        atomic_add_int(&chain->core.live_count,
                                                       1);
                                --count;
                        }
                } else {
                        chain->core.live_zero = 0;
                }
                /* else do not modify live_count */
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
        }
        hammer2_spin_unex(&chain->core.spin);
}

/*
 * Resize the chain's physical storage allocation in-place.  This function does
 * not usually adjust the data pointer and must be followed by (typically) a
 * hammer2_chain_modify() call to copy any old data over and adjust the
 * data pointer.
 *
 * Chains can be resized smaller without reallocating the storage.  Resizing
 * larger will reallocate the storage.  Excess or prior storage is reclaimed
 * asynchronously at a later time.
 *
 * An nradix value of 0 is special-cased to mean that the storage should
 * be disassociated, that is the chain is being resized to 0 bytes (not 1
 * byte).
 *
 * Must be passed an exclusively locked parent and chain.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode data
 * buffer.  However, because H2 can compress or encrypt data, the chain may
 * have a dio assigned to it in those situations, and they do not conflict.
 *
 * XXX return error if cannot resize.
 */
int
hammer2_chain_resize(hammer2_chain_t *chain,
                     hammer2_tid_t mtid, hammer2_off_t dedup_off,
                     int nradix, int flags)
{
        hammer2_dev_t *hmp;
        size_t obytes;
        size_t nbytes;
        int error;

        hmp = chain->hmp;

        /*
         * Only data and indirect blocks can be resized for now.
         * (The volume root, inodes, and freemap elements use a fixed size).
         */
        KKASSERT(chain != &hmp->vchain);
        KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
                 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
                 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);

        /*
         * Nothing to do if the element is already the proper size
         */
        obytes = chain->bytes;
        nbytes = (nradix) ? (1U << nradix) : 0;
        if (obytes == nbytes)
                return (chain->error);

        /*
         * Make sure the old data is instantiated so we can copy it.  If this
         * is a data block, the device data may be superfluous since the data
         * might be in a logical block, but compressed or encrypted data is
         * another matter.
         *
         * NOTE: The modify will set BMAPUPD for us if BMAPPED is set.
         */
        error = hammer2_chain_modify(chain, mtid, dedup_off, 0);
        if (error)
                return error;

        /*
         * Relocate the block, even if making it smaller (because different
         * block sizes may be in different regions).
         *
         * NOTE: Operation does not copy the data and may only be used
         *       to resize data blocks in-place, or directory entry blocks
         *       which are about to be modified in some manner.
         */
        error = hammer2_freemap_alloc(chain, nbytes);
        if (error)
                return error;

        chain->bytes = nbytes;

        /*
         * We don't want the followup chain_modify() to try to copy data
         * from the old (wrong-sized) buffer.  It won't know how much to
         * copy.  This case should only occur during writes when the
         * originator already has the data to write in-hand.
         */
        if (chain->dio) {
                KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
                         chain->bref.type == HAMMER2_BREF_TYPE_DIRENT);
                hammer2_io_brelse(&chain->dio);
                chain->data = NULL;
        }
        return (chain->error);
}

/*
 * Set the chain modified so its data can be changed by the caller, or
 * install deduplicated data.  The caller must call this routine for each
 * set of modifications it makes, even if the chain is already flagged
 * MODIFIED.
 *
 * Sets bref.modify_tid to mtid only if mtid != 0.  Note that bref.modify_tid
 * is a CLC (cluster level change) field and is not updated by parent
 * propagation during a flush.
 *
 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect
 * chain->error except for HAMMER2_ERROR_ENOSPC.  If the allocation fails
 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain
 * remains unmodified with its old data ref intact and chain->error
 * unchanged.
 *
 *                              Dedup Handling
 *
 * If the DEDUPABLE flag is set in the chain the storage must be reallocated
 * even if the chain is still flagged MODIFIED.  In this case the chain's
 * DEDUPABLE flag will be cleared once the new storage has been assigned.
 *
 * If the caller passes a non-zero dedup_off we will use it to assign the
 * new storage.  The MODIFIED flag will be *CLEARED* in this case, and
 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set).  The caller
 * must not modify the data content upon return.
 */
int
hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
                     hammer2_off_t dedup_off, int flags)
{
        hammer2_blockref_t obref;
        hammer2_dev_t *hmp;
        hammer2_io_t *dio;
        int error;
        int wasinitial;
        int setmodified;
        int setupdate;
        int newmod;
        char *bdata;

        hmp = chain->hmp;
        /*
         * NOTE(review): obref snapshots the pre-modify blockref but does
         * not appear to be referenced again in this function body --
         * confirm whether it is vestigial.
         */
        obref = chain->bref;
        KKASSERT((chain->flags & HAMMER2_CHAIN_FICTITIOUS) == 0);

        /*
         * Data is not optional for freemap chains (we must always be sure
         * to copy the data on COW storage allocations).
         */
        if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
            chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
                KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
                         (flags & HAMMER2_MODIFY_OPTDATA) == 0);
        }

        /*
         * Data must be resolved if already assigned, unless explicitly
         * flagged otherwise.  If we cannot safely load the data the
         * modification fails and we return early.
         */
        if (chain->data == NULL && chain->bytes != 0 &&
            (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
            (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
                hammer2_chain_load_data(chain);
                if (chain->error)
                        return (chain->error);
        }
        error = 0;

        /*
         * Set MODIFIED to indicate that the chain has been modified.  A new
         * allocation is required when modifying a chain.
         *
         * Set UPDATE to ensure that the blockref is updated in the parent.
         *
         * If MODIFIED is already set determine if we can reuse the assigned
         * data block or if we need a new data block.
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
                /*
                 * Must set modified bit.
                 */
                atomic_add_long(&hammer2_count_modified_chains, 1);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                hammer2_pfs_memory_inc(chain->pmp);     /* can be NULL */
                setmodified = 1;

                /*
                 * We may be able to avoid a copy-on-write if the chain's
                 * check mode is set to NONE and the chain's current
                 * modify_tid is beyond the last explicit snapshot tid.
                 *
                 * This implements HAMMER2's overwrite-in-place feature.
                 *
                 * NOTE! This data-block cannot be used as a de-duplication
                 *       source when the check mode is set to NONE.
                 */
                if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
                     chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
                    (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
                    (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
                    HAMMER2_DEC_CHECK(chain->bref.methods) ==
                     HAMMER2_CHECK_NONE &&
                    chain->pmp &&
                    chain->bref.modify_tid >
                     chain->pmp->iroot->meta.pfs_lsnap_tid) {
                        /*
                         * Sector overwrite allowed.
                         */
                        newmod = 0;
                } else {
                        /*
                         * Sector overwrite not allowed, must copy-on-write.
                         */
                        newmod = 1;
                }
        } else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
                /*
                 * If the modified chain was registered for dedup we need
                 * a new allocation.  This only happens for delayed-flush
                 * chains (i.e. which run through the front-end buffer
                 * cache).
                 */
                newmod = 1;
                setmodified = 0;
        } else {
                /*
                 * Already flagged modified, no new allocation is needed.
                 */
                newmod = 0;
                setmodified = 0;
        }

        /*
         * Flag parent update required.
         */
        if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                setupdate = 1;
        } else {
                setupdate = 0;
        }

        /*
         * The modification or re-modification requires an allocation and
         * possible COW.  If an error occurs, the previous content and data
         * reference is retained and the modification fails.
         *
         * If dedup_off is non-zero, the caller is requesting a deduplication
         * rather than a modification.  The MODIFIED bit is not set and the
         * data offset is set to the deduplication offset.  The data cannot
         * be modified.
         *
         * NOTE: The dedup offset is allowed to be in a partially free state
         *       and we must be sure to reset it to a fully allocated state
         *       to force two bulkfree passes to free it again.
         *
         * NOTE: Only applicable when chain->bytes != 0.
         *
         * XXX can a chain already be marked MODIFIED without a data
         *     assignment?  If not, assert here instead of testing the case.
         */
        if (chain != &hmp->vchain && chain != &hmp->fchain &&
            chain->bytes) {
                if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
                    newmod
                ) {
                        /*
                         * NOTE: We do not have to remove the dedup
                         *       registration because the area is still
                         *       allocated and the underlying DIO will
                         *       still be flushed.
                         */
                        if (dedup_off) {
                                chain->bref.data_off = dedup_off;
                                chain->bytes = 1 << (dedup_off &
                                                     HAMMER2_OFF_MASK_RADIX);
                                chain->error = 0;
                                atomic_clear_int(&chain->flags,
                                                 HAMMER2_CHAIN_MODIFIED);
                                atomic_add_long(&hammer2_count_modified_chains,
                                                -1);
                                if (chain->pmp)
                                        hammer2_pfs_memory_wakeup(chain->pmp);
                                hammer2_freemap_adjust(hmp, &chain->bref,
                                                HAMMER2_FREEMAP_DORECOVER);
                                atomic_set_int(&chain->flags,
                                                HAMMER2_CHAIN_DEDUPABLE);
                        } else {
                                error = hammer2_freemap_alloc(chain,
                                                              chain->bytes);
                                atomic_clear_int(&chain->flags,
                                                HAMMER2_CHAIN_DEDUPABLE);
                        }
                }
        }

        /*
         * Stop here if error.  We have to undo any flag bits we might
         * have set above.
         */
        if (error) {
                if (setmodified) {
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        atomic_add_long(&hammer2_count_modified_chains, -1);
                        if (chain->pmp)
                                hammer2_pfs_memory_wakeup(chain->pmp);
                }
                if (setupdate) {
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                }
                return error;
        }

        /*
         * Update mirror_tid and modify_tid.  modify_tid is only updated
         * if not passed as zero (during flushes, parent propagation passes
         * the value 0).
         *
         * NOTE: chain->pmp could be the device spmp.
         */
        chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
        if (mtid)
                chain->bref.modify_tid = mtid;

        /*
         * Set BMAPUPD to tell the flush code that an existing blockmap entry
         * requires updating as well as to tell the delete code that the
         * chain's blockref might not exactly match (in terms of physical size
         * or block offset) the one in the parent's blocktable.  The base key
         * of course will still match.
         */
        if (chain->flags & HAMMER2_CHAIN_BMAPPED)
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD);

        /*
         * Short-cut data blocks which the caller does not need an actual
         * data reference to (aka OPTDATA), as long as the chain does not
         * already have a data pointer to the data.  This generally means
         * that the modifications are being done via the logical buffer cache.
         * The INITIAL flag relates only to the device data buffer and thus
         * remains unchanged in this situation.
         *
         * This code also handles bytes == 0 (most dirents).
         */
        if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
            (flags & HAMMER2_MODIFY_OPTDATA) &&
            chain->data == NULL) {
                KKASSERT(chain->dio == NULL);
                goto skip2;
        }

        /*
         * Clearing the INITIAL flag (for indirect blocks) indicates that
         * we've processed the uninitialized storage allocation.
         *
         * If this flag is already clear we are likely in a copy-on-write
         * situation but we have to be sure NOT to bzero the storage if
         * no data is present.
         */
        if (chain->flags & HAMMER2_CHAIN_INITIAL) {
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
                wasinitial = 1;
        } else {
                wasinitial = 0;
        }

        /*
         * Instantiate data buffer and possibly execute COW operation
         */
        switch(chain->bref.type) {
        case HAMMER2_BREF_TYPE_VOLUME:
        case HAMMER2_BREF_TYPE_FREEMAP:
                /*
                 * The data is embedded, no copy-on-write operation is
                 * needed.
                 */
                KKASSERT(chain->dio == NULL);
                break;
        case HAMMER2_BREF_TYPE_DIRENT:
                /*
                 * The data might be fully embedded.
                 */
                if (chain->bytes == 0) {
                        KKASSERT(chain->dio == NULL);
                        break;
                }
                /* fall through */
        case HAMMER2_BREF_TYPE_INODE:
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
        case HAMMER2_BREF_TYPE_DATA:
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                /*
                 * Perform the copy-on-write operation
                 *
                 * zero-fill or copy-on-write depending on whether
                 * chain->data exists or not and set the dirty state for
                 * the new buffer.  hammer2_io_new() will handle the
                 * zero-fill.
                 *
                 * If a dedup_off was supplied this is an existing block
                 * and no COW, copy, or further modification is required.
                 */
                KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

                if (wasinitial && dedup_off == 0) {
                        error = hammer2_io_new(hmp, chain->bref.type,
                                               chain->bref.data_off,
                                               chain->bytes, &dio);
                } else {
                        error = hammer2_io_bread(hmp, chain->bref.type,
                                                 chain->bref.data_off,
                                                 chain->bytes, &dio);
                }
                hammer2_adjreadcounter(&chain->bref, chain->bytes);

                /*
                 * If an I/O error occurs make sure callers cannot accidentally
                 * modify the old buffer's contents and corrupt the filesystem.
                 *
                 * NOTE: hammer2_io_data() call issues bkvasync()
                 */
                if (error) {
                        kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
                                hmp);
                        chain->error = HAMMER2_ERROR_EIO;
                        hammer2_io_brelse(&dio);
                        hammer2_io_brelse(&chain->dio);
                        chain->data = NULL;
                        break;
                }
                chain->error = 0;
                bdata = hammer2_io_data(dio, chain->bref.data_off);

                if (chain->data) {
                        /*
                         * COW (unless a dedup).
                         */
                        KKASSERT(chain->dio != NULL);
                        if (chain->data != (void *)bdata && dedup_off == 0) {
                                bcopy(chain->data, bdata, chain->bytes);
                        }
                } else if (wasinitial == 0) {
                        /*
                         * We have a problem.  We were asked to COW but
                         * we don't have any data to COW with!
                         */
                        panic("hammer2_chain_modify: having a COW %p\n",
                              chain);
                }

                /*
                 * Retire the old buffer, replace with the new.  Dirty or
                 * redirty the new buffer.
                 *
                 * WARNING! The system buffer cache may have already flushed
                 *          the buffer, so we must be sure to [re]dirty it
                 *          for further modification.
                 *
                 *          If dedup_off was supplied, the caller is not
                 *          expected to make any further modification to the
                 *          buffer.
                 */
                if (chain->dio)
                        hammer2_io_bqrelse(&chain->dio);
                chain->data = (void *)bdata;
                chain->dio = dio;
                if (dedup_off == 0)
                        hammer2_io_setdirty(dio);
                break;
        default:
                panic("hammer2_chain_modify: illegal non-embedded type %d",
                      chain->bref.type);
                break;

        }
skip2:
        /*
         * setflush on parent indicating that the parent must recurse down
         * to us.  Do not call on chain itself which might already have it
         * set.
         */
        if (chain->parent)
                hammer2_chain_setflush(chain->parent);
        return (chain->error);
}

/*
 * Modify the chain associated with an inode.
 */
int
hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
                        hammer2_tid_t mtid, int flags)
{
        int error;

        hammer2_inode_modify(ip);
        error = hammer2_chain_modify(chain, mtid, 0, flags);

        return error;
}

/*
 * Volume header data locks
 */
void
hammer2_voldata_lock(hammer2_dev_t *hmp)
{
        lockmgr(&hmp->vollk, LK_EXCLUSIVE);
}

void
hammer2_voldata_unlock(hammer2_dev_t *hmp)
{
        lockmgr(&hmp->vollk, LK_RELEASE);
}

void
hammer2_voldata_modify(hammer2_dev_t *hmp)
{
        if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
                atomic_add_long(&hammer2_count_modified_chains, 1);
                atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
                hammer2_pfs_memory_inc(hmp->vchain.pmp);
        }
}

/*
 * This function returns the chain at the nearest key within the specified
 * range.  The returned chain will be referenced but not locked.
 *
 * This function will recurse through chain->rbtree as necessary and will
 * return a *key_nextp suitable for iteration.  *key_nextp is only set if
 * the iteration value is less than the current value of *key_nextp.
 *
 * The caller should use (*key_nextp) to calculate the actual range of
 * the returned element, which will be (key_beg to *key_nextp - 1), because
 * there might be another element which is superior to the returned element
 * and overlaps it.
 *
 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
 * chains continue to be returned.  On EOF (*key_nextp) may overflow since
 * it will wind up being (key_end + 1).
 *
 * WARNING! Must be called with child's spinlock held.  Spinlock remains
 *          held through the operation.
 */
struct hammer2_chain_find_info {
        hammer2_chain_t         *best;          /* best candidate so far */
        hammer2_key_t           key_beg;        /* inclusive search start */
        hammer2_key_t           key_end;        /* inclusive search end */
        hammer2_key_t           key_next;       /* iteration continuation */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
                   hammer2_key_t key_beg, hammer2_key_t key_end)
{
        struct hammer2_chain_find_info info;

        info.best = NULL;
        info.key_beg = key_beg;
        info.key_end = key_end;
        info.key_next = *key_nextp;

        RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
                hammer2_chain_find_cmp, hammer2_chain_find_callback,
                &info);
        *key_nextp = info.key_next;
#if 0
        kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
                parent, key_beg, key_end, *key_nextp);
#endif

        return (info.best);
}

/*
 * RB_SCAN range comparator: -1 prunes children entirely below the search
 * range, +1 prunes children entirely above it, 0 visits the child via
 * hammer2_chain_find_callback().
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
        struct hammer2_chain_find_info *info = data;
        hammer2_key_t child_beg;
        hammer2_key_t child_end;

        child_beg = child->bref.key;
        child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

        if (child_end < info->key_beg)
                return(-1);
        if (child_beg > info->key_end)
                return(1);
        return(0);
}

static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
        struct hammer2_chain_find_info *info = data;
        hammer2_chain_t *best;
        hammer2_key_t child_end;

        /*
         * WARNING! Layerq is scanned forwards, exact matches should keep
         *          the existing info->best.
         */
        if ((best = info->best) == NULL) {
                /*
                 * No previous best.  Assign best
                 */
                info->best = child;
        } else if (best->bref.key <= info->key_beg &&
                   child->bref.key <= info->key_beg) {
                /*
                 * Illegal overlap.
                 */
                KKASSERT(0);
                /*info->best = child;*/
        } else if (child->bref.key < best->bref.key) {
                /*
                 * Child has a nearer key and best is not flush with key_beg.
                 * Set best to child.  Truncate key_next to the old best key.
                 */
                info->best = child;
                if (info->key_next > best->bref.key || info->key_next == 0)
                        info->key_next = best->bref.key;
        } else if (child->bref.key == best->bref.key) {
                /*
                 * If our current best is flush with the child then this
                 * is an illegal overlap.
                 *
                 * key_next will automatically be limited to the smaller of
                 * the two end-points.
                 */
                KKASSERT(0);
                info->best = child;
        } else {
                /*
                 * Keep the current best but truncate key_next to the child's
                 * base.
                 *
                 * key_next will also automatically be limited to the smaller
                 * of the two end-points (probably not necessary for this case
                 * but we do it anyway).
                 */
                if (info->key_next > child->bref.key || info->key_next == 0)
                        info->key_next = child->bref.key;
        }

        /*
         * Always truncate key_next based on child's end-of-range.
2080 */ 2081 child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits); 2082 if (child_end && (info->key_next > child_end || info->key_next == 0)) 2083 info->key_next = child_end; 2084 2085 return(0); 2086 } 2087 2088 /* 2089 * Retrieve the specified chain from a media blockref, creating the 2090 * in-memory chain structure which reflects it. The returned chain is 2091 * held but not locked. The caller must lock it to crc-check and 2092 * dereference its data, and should check chain->error after locking 2093 * before assuming that the data is good. 2094 * 2095 * To handle insertion races pass the INSERT_RACE flag along with the 2096 * generation number of the core. NULL will be returned if the generation 2097 * number changes before we have a chance to insert the chain. Insert 2098 * races can occur because the parent might be held shared. 2099 * 2100 * Caller must hold the parent locked shared or exclusive since we may 2101 * need the parent's bref array to find our block. 2102 * 2103 * WARNING! chain->pmp is always set to NULL for any chain representing 2104 * part of the super-root topology. 2105 */ 2106 hammer2_chain_t * 2107 hammer2_chain_get(hammer2_chain_t *parent, int generation, 2108 hammer2_blockref_t *bref) 2109 { 2110 hammer2_dev_t *hmp = parent->hmp; 2111 hammer2_chain_t *chain; 2112 int error; 2113 2114 /* 2115 * Allocate a chain structure representing the existing media 2116 * entry. Resulting chain has one ref and is not locked. 2117 */ 2118 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT) 2119 chain = hammer2_chain_alloc(hmp, NULL, bref); 2120 else 2121 chain = hammer2_chain_alloc(hmp, parent->pmp, bref); 2122 /* ref'd chain returned */ 2123 2124 /* 2125 * Flag that the chain is in the parent's blockmap so delete/flush 2126 * knows what to do with it. 2127 */ 2128 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED); 2129 2130 /* 2131 * Link the chain into its parent. 
A spinlock is required to safely 2132 * access the RBTREE, and it is possible to collide with another 2133 * hammer2_chain_get() operation because the caller might only hold 2134 * a shared lock on the parent. 2135 * 2136 * NOTE: Get races can occur quite often when we distribute 2137 * asynchronous read-aheads across multiple threads. 2138 */ 2139 KKASSERT(parent->refs > 0); 2140 error = hammer2_chain_insert(parent, chain, 2141 HAMMER2_CHAIN_INSERT_SPIN | 2142 HAMMER2_CHAIN_INSERT_RACE, 2143 generation); 2144 if (error) { 2145 KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 2146 /*kprintf("chain %p get race\n", chain);*/ 2147 hammer2_chain_drop(chain); 2148 chain = NULL; 2149 } else { 2150 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 2151 } 2152 2153 /* 2154 * Return our new chain referenced but not locked, or NULL if 2155 * a race occurred. 2156 */ 2157 return (chain); 2158 } 2159 2160 /* 2161 * Lookup initialization/completion API 2162 */ 2163 hammer2_chain_t * 2164 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags) 2165 { 2166 hammer2_chain_ref(parent); 2167 if (flags & HAMMER2_LOOKUP_SHARED) { 2168 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS | 2169 HAMMER2_RESOLVE_SHARED); 2170 } else { 2171 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2172 } 2173 return (parent); 2174 } 2175 2176 void 2177 hammer2_chain_lookup_done(hammer2_chain_t *parent) 2178 { 2179 if (parent) { 2180 hammer2_chain_unlock(parent); 2181 hammer2_chain_drop(parent); 2182 } 2183 } 2184 2185 /* 2186 * Take the locked chain and return a locked parent. The chain remains 2187 * locked on return, but may have to be temporarily unlocked to acquire 2188 * the parent. Can return NULL. 2189 * 2190 * Pass HAMMER2_RESOLVE_* flags in flags. 2191 * 2192 * This will work even if the chain is errored, and the caller can check 2193 * parent->error on return if desired since the parent will be locked. 2194 * 2195 * This function handles the lock order reversal. 
2196 */ 2197 hammer2_chain_t * 2198 hammer2_chain_getparent(hammer2_chain_t *chain, int flags) 2199 { 2200 hammer2_chain_t *parent; 2201 2202 /* 2203 * Be careful of order, chain must be unlocked before parent 2204 * is locked below to avoid a deadlock. 2205 * 2206 * Safe access to fu->parent requires fu's core spinlock. 2207 */ 2208 again: 2209 hammer2_spin_ex(&chain->core.spin); 2210 parent = chain->parent; 2211 if (parent == NULL) { 2212 hammer2_spin_unex(&chain->core.spin); 2213 panic("hammer2_chain_getparent: no parent"); 2214 } 2215 hammer2_chain_ref(parent); 2216 hammer2_spin_unex(&chain->core.spin); 2217 2218 hammer2_chain_unlock(chain); 2219 hammer2_chain_lock(parent, flags); 2220 hammer2_chain_lock(chain, flags); 2221 2222 /* 2223 * Parent relinking races are quite common. We have to get it right 2224 * or we will blow up the block table. 2225 */ 2226 if (chain->parent != parent) { 2227 hammer2_chain_unlock(parent); 2228 hammer2_chain_drop(parent); 2229 goto again; 2230 } 2231 return parent; 2232 } 2233 2234 /* 2235 * Take the locked chain and return a locked parent. The chain is unlocked 2236 * and dropped. *chainp is set to the returned parent as a convenience. 2237 * Pass HAMMER2_RESOLVE_* flags in flags. 2238 * 2239 * This will work even if the chain is errored, and the caller can check 2240 * parent->error on return if desired since the parent will be locked. 2241 * 2242 * This function handles the lock order reversal. 2243 */ 2244 hammer2_chain_t * 2245 hammer2_chain_repparent(hammer2_chain_t **chainp, int flags) 2246 { 2247 hammer2_chain_t *chain; 2248 hammer2_chain_t *parent; 2249 2250 /* 2251 * Be careful of order, chain must be unlocked before parent 2252 * is locked below to avoid a deadlock. 2253 * 2254 * Safe access to fu->parent requires fu's core spinlock. 
2255 */ 2256 chain = *chainp; 2257 again: 2258 hammer2_spin_ex(&chain->core.spin); 2259 parent = chain->parent; 2260 if (parent == NULL) { 2261 hammer2_spin_unex(&chain->core.spin); 2262 panic("hammer2_chain_getparent: no parent"); 2263 } 2264 hammer2_chain_ref(parent); 2265 hammer2_spin_unex(&chain->core.spin); 2266 2267 hammer2_chain_unlock(chain); 2268 hammer2_chain_lock(parent, flags); 2269 2270 /* 2271 * Parent relinking races are quite common. We have to get it right 2272 * or we will blow up the block table. 2273 */ 2274 if (chain->parent != parent) { 2275 hammer2_chain_lock(chain, flags); 2276 hammer2_chain_unlock(parent); 2277 hammer2_chain_drop(parent); 2278 goto again; 2279 } 2280 hammer2_chain_drop(chain); 2281 *chainp = parent; 2282 2283 return parent; 2284 } 2285 2286 /* 2287 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 2288 * (*parentp) typically points to an inode but can also point to a related 2289 * indirect block and this function will recurse upwards and find the inode 2290 * or the nearest undeleted indirect block covering the key range. 2291 * 2292 * This function unconditionally sets *errorp, replacing any previous value. 2293 * 2294 * (*parentp) must be exclusive or shared locked (depending on flags) and 2295 * referenced and can be an inode or an existing indirect block within the 2296 * inode. 2297 * 2298 * If (*parent) is errored out, this function will not attempt to recurse 2299 * the radix tree and will return NULL along with an appropriate *errorp. 2300 * If NULL is returned and *errorp is 0, the requested lookup could not be 2301 * located. 2302 * 2303 * On return (*parentp) will be modified to point at the deepest parent chain 2304 * element encountered during the search, as a helper for an insertion or 2305 * deletion. 
2306 * 2307 * The new (*parentp) will be locked shared or exclusive (depending on flags), 2308 * and referenced, and the old will be unlocked and dereferenced (no change 2309 * if they are both the same). This is particularly important if the caller 2310 * wishes to insert a new chain, (*parentp) will be set properly even if NULL 2311 * is returned, as long as no error occurred. 2312 * 2313 * The matching chain will be returned locked according to flags. If NOLOCK 2314 * is requested the chain will be returned only referenced. Note that the 2315 * parent chain must always be locked shared or exclusive, matching the 2316 * HAMMER2_LOOKUP_SHARED flag. We can conceivably lock it SHARED temporarily 2317 * when NOLOCK is specified but that complicates matters if *parentp must 2318 * inherit the chain. 2319 * 2320 * NOLOCK also implies NODATA, since an unlocked chain usually has a NULL 2321 * data pointer or can otherwise be in flux. 2322 * 2323 * NULL is returned if no match was found, but (*parentp) will still 2324 * potentially be adjusted. 2325 * 2326 * On return (*key_nextp) will point to an iterative value for key_beg. 2327 * (If NULL is returned (*key_nextp) is set to (key_end + 1)). 2328 * 2329 * This function will also recurse up the chain if the key is not within the 2330 * current parent's range. (*parentp) can never be set to NULL. An iteration 2331 * can simply allow (*parentp) to float inside the loop. 2332 * 2333 * NOTE! chain->data is not always resolved. By default it will not be 2334 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 2335 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 2336 * BREF_TYPE_DATA as the device buffer can alias the logical file 2337 * buffer). 
 */

hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;	/* sanity brake for the retry loop */
	/*
	 * NOTE(review): save_mtx is assigned below but never read in this
	 * function — appears vestigial; confirm before removing.
	 */
	volatile hammer2_mtx_t save_mtx;

	/*
	 * Translate LOOKUP flags into chain-lock resolve modes.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	hmp = parent->hmp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:

	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
				hammer2_chain_lock(parent,
						   how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Handle MATCHIND on the parent: if the indirect block's
		 * key range exactly matches the request, return the parent
		 * itself (used mainly to acquire freemap nodes).
		 */
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}

		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL) {
				/* should not happen; hang for debugging */
				kprintf("parent->data is NULL %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		/* hang for debugging rather than panicking immediately */
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(B) type: %d",
			parent->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search of the media blockref array and the in-memory
	 * RB tree.
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		KKASSERT(chain == NULL);
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		/*
		 * Media-only hit: copy the bref while still holding the
		 * spinlock, then instantiate the chain.  Retry on insert
		 * races or if the bref changed underneath us.
		 */
		bcopy = *bref;
		hammer2_spin_unex(&parent->core.spin);
		chain = hammer2_chain_get(parent, generation,
					  &bcopy);
		if (chain == NULL) {
			/*
			kprintf("retry lookup parent %p keys %016jx:%016jx\n",
				parent, key_beg, key_end);
			*/
			goto again;
		}
		if (bcmp(&bcopy, bref, sizeof(bcopy))) {
			hammer2_chain_drop(chain);
			chain = NULL;	/* SAFETY */
			goto again;
		}
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
	}

	/*
	 * chain is referenced but not locked.  We must lock the chain
	 * to obtain definitive state.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_lock(chain, how_maybe);
	} else {
		hammer2_chain_lock(chain, how);
	}
	KKASSERT(chain->parent == parent);

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		save_mtx = parent->lock;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		chain = NULL;	/* SAFETY */
		goto again;
	}
done:
	/*
	 * All done, return the chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE! A chain->error must be tested by the caller upon return.
	 *	 *errorp is only set based on issues which occur while
	 *	 trying to reach the chain.
	 */
	if (chain) {
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_unlock(chain);
	}
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after it's index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.  This
 * chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		if ((flags & (HAMMER2_LOOKUP_NOLOCK |
			      HAMMER2_LOOKUP_NOUNLOCK)) == 0) {
			hammer2_chain_unlock(chain);
		}
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}

/*
 * Caller wishes to iterate chains under parent, loading new chains into
 * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
 * then call hammer2_chain_scan() repeatedly until a non-zero return.
 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
 * with the returned chain for the scan.  The returned *chainp will be
 * locked and referenced.  Any prior contents will be unlocked and dropped.
 *
 * Caller should check the return value.
 * A normal scan EOF will return
 * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
 * error trying to access parent data.  Any error in the returned chain
 * must be tested separately by the caller.
 *
 * (*chainp) is dropped on each scan, but will only be set if the returned
 * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
 * returned via *chainp.  The caller will get their bref only.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_bref = (parent, NULL) and
 * next_chain = (parent, bref).
 *
 * The passed-in parent must be locked and its data resolved.  The function
 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
 * in).  The caller must check the chain->bref.type either way.
 */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;	/* sanity brake for the retry loop */
	int error;

	hmp = parent->hmp;
	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow.
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		if (key == 0) {
			/* key calculation wrapped: iteration is complete */
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, bref);
			if (chain == NULL) {
				kprintf("retry scan parent %p keys %016jx\n",
					parent, key);
				goto again;
			}
			if (bcmp(bref, bref_ptr, sizeof(*bref))) {
				/* bref changed underneath us, retry */
				hammer2_chain_drop(chain);
				chain = NULL;
				goto again;
			}
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
	}

	/*
	 * chain is referenced but not locked.  We must lock the chain
	 * to obtain definitive state.
	 */
	if (chain)
		hammer2_chain_lock(chain, how);

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it under (*parentp).  This is a full
 * insertion, based on the supplied key/keybits, and may involve creating
 * indirect blocks and moving other chains around via delete/duplicate.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * (*parentp) must be exclusive locked and may be replaced on return
 * depending on how much work the function had to do.
 *
 * (*parentp) must not be errored or this function will assert.
 *
 * (*chainp) usually starts out NULL and returns the newly created chain,
 * but if the caller desires the caller may allocate a disconnected chain
 * and pass it in instead.
 *
 * This function should NOT be used to insert INDIRECT blocks.  It is
 * typically used to create/insert inodes and data blocks.
 *
 * Caller must pass-in an exclusively locked parent the new chain is to
 * be inserted under, and optionally pass-in a disconnected, exclusively
 * locked chain to insert (else we create a new chain).
The function will 3043 * adjust (*parentp) as necessary, create or connect the chain, and 3044 * return an exclusively locked chain in *chainp. 3045 * 3046 * When creating a PFSROOT inode under the super-root, pmp is typically NULL 3047 * and will be reassigned. 3048 * 3049 * NOTE: returns HAMMER_ERROR_* flags 3050 */ 3051 int 3052 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp, 3053 hammer2_pfs_t *pmp, int methods, 3054 hammer2_key_t key, int keybits, int type, size_t bytes, 3055 hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags) 3056 { 3057 hammer2_dev_t *hmp; 3058 hammer2_chain_t *chain; 3059 hammer2_chain_t *parent; 3060 hammer2_blockref_t *base; 3061 hammer2_blockref_t dummy; 3062 int allocated = 0; 3063 int error = 0; 3064 int count; 3065 int maxloops = 300000; 3066 3067 /* 3068 * Topology may be crossing a PFS boundary. 3069 */ 3070 parent = *parentp; 3071 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3072 KKASSERT(parent->error == 0); 3073 hmp = parent->hmp; 3074 chain = *chainp; 3075 3076 if (chain == NULL) { 3077 /* 3078 * First allocate media space and construct the dummy bref, 3079 * then allocate the in-memory chain structure. Set the 3080 * INITIAL flag for fresh chains which do not have embedded 3081 * data. 3082 * 3083 * XXX for now set the check mode of the child based on 3084 * the parent or, if the parent is an inode, the 3085 * specification in the inode. 3086 */ 3087 bzero(&dummy, sizeof(dummy)); 3088 dummy.type = type; 3089 dummy.key = key; 3090 dummy.keybits = keybits; 3091 dummy.data_off = hammer2_getradix(bytes); 3092 3093 /* 3094 * Inherit methods from parent by default. Primarily used 3095 * for BREF_TYPE_DATA. Non-data types *must* be set to 3096 * a non-NONE check algorithm. 
3097 */ 3098 if (methods == -1) 3099 dummy.methods = parent->bref.methods; 3100 else 3101 dummy.methods = (uint8_t)methods; 3102 3103 if (type != HAMMER2_BREF_TYPE_DATA && 3104 HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) { 3105 dummy.methods |= 3106 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT); 3107 } 3108 3109 chain = hammer2_chain_alloc(hmp, pmp, &dummy); 3110 3111 /* 3112 * Lock the chain manually, chain_lock will load the chain 3113 * which we do NOT want to do. (note: chain->refs is set 3114 * to 1 by chain_alloc() for us, but lockcnt is not). 3115 */ 3116 chain->lockcnt = 1; 3117 hammer2_mtx_ex(&chain->lock); 3118 allocated = 1; 3119 ++curthread->td_tracker; 3120 3121 /* 3122 * Set INITIAL to optimize I/O. The flag will generally be 3123 * processed when we call hammer2_chain_modify(). 3124 * 3125 * Recalculate bytes to reflect the actual media block 3126 * allocation. Handle special case radix 0 == 0 bytes. 3127 */ 3128 bytes = (size_t)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 3129 if (bytes) 3130 bytes = (hammer2_off_t)1 << bytes; 3131 chain->bytes = bytes; 3132 3133 switch(type) { 3134 case HAMMER2_BREF_TYPE_VOLUME: 3135 case HAMMER2_BREF_TYPE_FREEMAP: 3136 panic("hammer2_chain_create: called with volume type"); 3137 break; 3138 case HAMMER2_BREF_TYPE_INDIRECT: 3139 panic("hammer2_chain_create: cannot be used to" 3140 "create indirect block"); 3141 break; 3142 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3143 panic("hammer2_chain_create: cannot be used to" 3144 "create freemap root or node"); 3145 break; 3146 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3147 KKASSERT(bytes == sizeof(chain->data->bmdata)); 3148 /* fall through */ 3149 case HAMMER2_BREF_TYPE_DIRENT: 3150 case HAMMER2_BREF_TYPE_INODE: 3151 case HAMMER2_BREF_TYPE_DATA: 3152 default: 3153 /* 3154 * leave chain->data NULL, set INITIAL 3155 */ 3156 KKASSERT(chain->data == NULL); 3157 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 3158 break; 3159 } 3160 } else { 3161 /* 3162 * We are 
reattaching a previously deleted chain, possibly 3163 * under a new parent and possibly with a new key/keybits. 3164 * The chain does not have to be in a modified state. The 3165 * UPDATE flag will be set later on in this routine. 3166 * 3167 * Do NOT mess with the current state of the INITIAL flag. 3168 */ 3169 chain->bref.key = key; 3170 chain->bref.keybits = keybits; 3171 if (chain->flags & HAMMER2_CHAIN_DELETED) 3172 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3173 KKASSERT(chain->parent == NULL); 3174 } 3175 if (flags & HAMMER2_INSERT_PFSROOT) 3176 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 3177 else 3178 chain->bref.flags &= ~HAMMER2_BREF_FLAG_PFSROOT; 3179 3180 /* 3181 * Calculate how many entries we have in the blockref array and 3182 * determine if an indirect block is required. 3183 */ 3184 again: 3185 if (--maxloops == 0) 3186 panic("hammer2_chain_create: maxloops"); 3187 3188 switch(parent->bref.type) { 3189 case HAMMER2_BREF_TYPE_INODE: 3190 if ((parent->data->ipdata.meta.op_flags & 3191 HAMMER2_OPFLAG_DIRECTDATA) != 0) { 3192 kprintf("hammer2: parent set for direct-data! 
" 3193 "pkey=%016jx ckey=%016jx\n", 3194 parent->bref.key, 3195 chain->bref.key); 3196 } 3197 KKASSERT((parent->data->ipdata.meta.op_flags & 3198 HAMMER2_OPFLAG_DIRECTDATA) == 0); 3199 KKASSERT(parent->data != NULL); 3200 base = &parent->data->ipdata.u.blockset.blockref[0]; 3201 count = HAMMER2_SET_COUNT; 3202 break; 3203 case HAMMER2_BREF_TYPE_INDIRECT: 3204 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3205 if (parent->flags & HAMMER2_CHAIN_INITIAL) 3206 base = NULL; 3207 else 3208 base = &parent->data->npdata[0]; 3209 count = parent->bytes / sizeof(hammer2_blockref_t); 3210 break; 3211 case HAMMER2_BREF_TYPE_VOLUME: 3212 KKASSERT(parent->data != NULL); 3213 base = &parent->data->voldata.sroot_blockset.blockref[0]; 3214 count = HAMMER2_SET_COUNT; 3215 break; 3216 case HAMMER2_BREF_TYPE_FREEMAP: 3217 KKASSERT(parent->data != NULL); 3218 base = &parent->data->blkset.blockref[0]; 3219 count = HAMMER2_SET_COUNT; 3220 break; 3221 default: 3222 panic("hammer2_chain_create: unrecognized blockref type: %d", 3223 parent->bref.type); 3224 base = NULL; 3225 count = 0; 3226 break; 3227 } 3228 3229 /* 3230 * Make sure we've counted the brefs 3231 */ 3232 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 3233 hammer2_chain_countbrefs(parent, base, count); 3234 3235 KASSERT(parent->core.live_count >= 0 && 3236 parent->core.live_count <= count, 3237 ("bad live_count %d/%d (%02x, %d)", 3238 parent->core.live_count, count, 3239 parent->bref.type, parent->bytes)); 3240 3241 /* 3242 * If no free blockref could be found we must create an indirect 3243 * block and move a number of blockrefs into it. With the parent 3244 * locked we can safely lock each child in order to delete+duplicate 3245 * it without causing a deadlock. 3246 * 3247 * This may return the new indirect block or the old parent depending 3248 * on where the key falls. NULL is returned on error. 
3249 */ 3250 if (parent->core.live_count == count) { 3251 hammer2_chain_t *nparent; 3252 3253 KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0); 3254 3255 nparent = hammer2_chain_create_indirect(parent, key, keybits, 3256 mtid, type, &error); 3257 if (nparent == NULL) { 3258 if (allocated) 3259 hammer2_chain_drop(chain); 3260 chain = NULL; 3261 goto done; 3262 } 3263 if (parent != nparent) { 3264 hammer2_chain_unlock(parent); 3265 hammer2_chain_drop(parent); 3266 parent = *parentp = nparent; 3267 } 3268 goto again; 3269 } 3270 3271 if (chain->flags & HAMMER2_CHAIN_DELETED) 3272 kprintf("Inserting deleted chain @%016jx\n", 3273 chain->bref.key); 3274 3275 /* 3276 * Link the chain into its parent. 3277 */ 3278 if (chain->parent != NULL) 3279 panic("hammer2: hammer2_chain_create: chain already connected"); 3280 KKASSERT(chain->parent == NULL); 3281 KKASSERT(parent->core.live_count < count); 3282 hammer2_chain_insert(parent, chain, 3283 HAMMER2_CHAIN_INSERT_SPIN | 3284 HAMMER2_CHAIN_INSERT_LIVE, 3285 0); 3286 3287 if (allocated) { 3288 /* 3289 * Mark the newly created chain modified. This will cause 3290 * UPDATE to be set and process the INITIAL flag. 3291 * 3292 * Device buffers are not instantiated for DATA elements 3293 * as these are handled by logical buffers. 3294 * 3295 * Indirect and freemap node indirect blocks are handled 3296 * by hammer2_chain_create_indirect() and not by this 3297 * function. 3298 * 3299 * Data for all other bref types is expected to be 3300 * instantiated (INODE, LEAF). 3301 */ 3302 switch(chain->bref.type) { 3303 case HAMMER2_BREF_TYPE_DATA: 3304 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3305 case HAMMER2_BREF_TYPE_DIRENT: 3306 case HAMMER2_BREF_TYPE_INODE: 3307 error = hammer2_chain_modify(chain, mtid, dedup_off, 3308 HAMMER2_MODIFY_OPTDATA); 3309 break; 3310 default: 3311 /* 3312 * Remaining types are not supported by this function. 3313 * In particular, INDIRECT and LEAF_NODE types are 3314 * handled by create_indirect(). 
3315 */ 3316 panic("hammer2_chain_create: bad type: %d", 3317 chain->bref.type); 3318 /* NOT REACHED */ 3319 break; 3320 } 3321 } else { 3322 /* 3323 * When reconnecting a chain we must set UPDATE and 3324 * setflush so the flush recognizes that it must update 3325 * the bref in the parent. 3326 */ 3327 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) 3328 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3329 } 3330 3331 /* 3332 * We must setflush(parent) to ensure that it recurses through to 3333 * chain. setflush(chain) might not work because ONFLUSH is possibly 3334 * already set in the chain (so it won't recurse up to set it in the 3335 * parent). 3336 */ 3337 hammer2_chain_setflush(parent); 3338 3339 done: 3340 *chainp = chain; 3341 3342 return (error); 3343 } 3344 3345 /* 3346 * Move the chain from its old parent to a new parent. The chain must have 3347 * already been deleted or already disconnected (or never associated) with 3348 * a parent. The chain is reassociated with the new parent and the deleted 3349 * flag will be cleared (no longer deleted). The chain's modification state 3350 * is not altered. 3351 * 3352 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 3353 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 3354 * FULL. This typically means that the caller is creating the chain after 3355 * doing a hammer2_chain_lookup(). 3356 * 3357 * A non-NULL bref is typically passed when key and keybits must be overridden. 3358 * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields 3359 * from a passed-in bref and uses the old chain's bref for everything else. 3360 * 3361 * Neither (parent) or (chain) can be errored. 3362 * 3363 * If (parent) is non-NULL then the chain is inserted under the parent. 
3364 * 3365 * If (parent) is NULL then the newly duplicated chain is not inserted 3366 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 3367 * passing into hammer2_chain_create() after this function returns). 3368 * 3369 * WARNING! This function calls create which means it can insert indirect 3370 * blocks. This can cause other unrelated chains in the parent to 3371 * be moved to a newly inserted indirect block in addition to the 3372 * specific chain. 3373 */ 3374 void 3375 hammer2_chain_rename(hammer2_blockref_t *bref, 3376 hammer2_chain_t **parentp, hammer2_chain_t *chain, 3377 hammer2_tid_t mtid, int flags) 3378 { 3379 hammer2_dev_t *hmp; 3380 hammer2_chain_t *parent; 3381 size_t bytes; 3382 3383 /* 3384 * WARNING! We should never resolve DATA to device buffers 3385 * (XXX allow it if the caller did?), and since 3386 * we currently do not have the logical buffer cache 3387 * buffer in-hand to fix its cached physical offset 3388 * we also force the modify code to not COW it. XXX 3389 */ 3390 hmp = chain->hmp; 3391 KKASSERT(chain->parent == NULL); 3392 KKASSERT(chain->error == 0); 3393 3394 /* 3395 * Now create a duplicate of the chain structure, associating 3396 * it with the same core, making it the same size, pointing it 3397 * to the same bref (the same media block). 3398 * 3399 * NOTE: Handle special radix == 0 case (means 0 bytes). 3400 */ 3401 if (bref == NULL) 3402 bref = &chain->bref; 3403 bytes = (size_t)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 3404 if (bytes) 3405 bytes = (hammer2_off_t)1 << bytes; 3406 3407 /* 3408 * If parent is not NULL the duplicated chain will be entered under 3409 * the parent and the UPDATE bit set to tell flush to update 3410 * the blockref. 3411 * 3412 * We must setflush(parent) to ensure that it recurses through to 3413 * chain. setflush(chain) might not work because ONFLUSH is possibly 3414 * already set in the chain (so it won't recurse up to set it in the 3415 * parent). 
3416 * 3417 * Having both chains locked is extremely important for atomicy. 3418 */ 3419 if (parentp && (parent = *parentp) != NULL) { 3420 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3421 KKASSERT(parent->refs > 0); 3422 KKASSERT(parent->error == 0); 3423 3424 hammer2_chain_create(parentp, &chain, 3425 chain->pmp, HAMMER2_METH_DEFAULT, 3426 bref->key, bref->keybits, bref->type, 3427 chain->bytes, mtid, 0, flags); 3428 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE); 3429 hammer2_chain_setflush(*parentp); 3430 } 3431 } 3432 3433 /* 3434 * Helper function for deleting chains. 3435 * 3436 * The chain is removed from the live view (the RBTREE) as well as the parent's 3437 * blockmap. Both chain and its parent must be locked. 3438 * 3439 * parent may not be errored. chain can be errored. 3440 */ 3441 static int 3442 _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain, 3443 hammer2_tid_t mtid, int flags) 3444 { 3445 hammer2_dev_t *hmp; 3446 int error = 0; 3447 3448 KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED | 3449 HAMMER2_CHAIN_FICTITIOUS)) == 0); 3450 KKASSERT(chain->parent == parent); 3451 hmp = chain->hmp; 3452 3453 if (chain->flags & HAMMER2_CHAIN_BMAPPED) { 3454 /* 3455 * Chain is blockmapped, so there must be a parent. 3456 * Atomically remove the chain from the parent and remove 3457 * the blockmap entry. The parent must be set modified 3458 * to remove the blockmap entry. 
3459 */ 3460 hammer2_blockref_t *base; 3461 int count; 3462 3463 KKASSERT(parent != NULL); 3464 KKASSERT(parent->error == 0); 3465 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0); 3466 error = hammer2_chain_modify(parent, mtid, 0, 0); 3467 if (error) 3468 goto done; 3469 3470 /* 3471 * Calculate blockmap pointer 3472 */ 3473 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 3474 hammer2_spin_ex(&chain->core.spin); 3475 hammer2_spin_ex(&parent->core.spin); 3476 3477 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3478 atomic_add_int(&parent->core.live_count, -1); 3479 ++parent->core.generation; 3480 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 3481 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 3482 --parent->core.chain_count; 3483 chain->parent = NULL; 3484 3485 switch(parent->bref.type) { 3486 case HAMMER2_BREF_TYPE_INODE: 3487 /* 3488 * Access the inode's block array. However, there 3489 * is no block array if the inode is flagged 3490 * DIRECTDATA. 3491 */ 3492 if (parent->data && 3493 (parent->data->ipdata.meta.op_flags & 3494 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 3495 base = 3496 &parent->data->ipdata.u.blockset.blockref[0]; 3497 } else { 3498 base = NULL; 3499 } 3500 count = HAMMER2_SET_COUNT; 3501 break; 3502 case HAMMER2_BREF_TYPE_INDIRECT: 3503 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3504 if (parent->data) 3505 base = &parent->data->npdata[0]; 3506 else 3507 base = NULL; 3508 count = parent->bytes / sizeof(hammer2_blockref_t); 3509 break; 3510 case HAMMER2_BREF_TYPE_VOLUME: 3511 base = &parent->data->voldata. 3512 sroot_blockset.blockref[0]; 3513 count = HAMMER2_SET_COUNT; 3514 break; 3515 case HAMMER2_BREF_TYPE_FREEMAP: 3516 base = &parent->data->blkset.blockref[0]; 3517 count = HAMMER2_SET_COUNT; 3518 break; 3519 default: 3520 base = NULL; 3521 count = 0; 3522 panic("hammer2_flush_pass2: " 3523 "unrecognized blockref type: %d", 3524 parent->bref.type); 3525 } 3526 3527 /* 3528 * delete blockmapped chain from its parent. 
3529 * 3530 * The parent is not affected by any statistics in chain 3531 * which are pending synchronization. That is, there is 3532 * nothing to undo in the parent since they have not yet 3533 * been incorporated into the parent. 3534 * 3535 * The parent is affected by statistics stored in inodes. 3536 * Those have already been synchronized, so they must be 3537 * undone. XXX split update possible w/delete in middle? 3538 */ 3539 if (base) { 3540 hammer2_base_delete(parent, base, count, chain); 3541 } 3542 hammer2_spin_unex(&parent->core.spin); 3543 hammer2_spin_unex(&chain->core.spin); 3544 } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 3545 /* 3546 * Chain is not blockmapped but a parent is present. 3547 * Atomically remove the chain from the parent. There is 3548 * no blockmap entry to remove. 3549 * 3550 * Because chain was associated with a parent but not 3551 * synchronized, the chain's *_count_up fields contain 3552 * inode adjustment statistics which must be undone. 3553 */ 3554 hammer2_spin_ex(&chain->core.spin); 3555 hammer2_spin_ex(&parent->core.spin); 3556 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3557 atomic_add_int(&parent->core.live_count, -1); 3558 ++parent->core.generation; 3559 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 3560 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 3561 --parent->core.chain_count; 3562 chain->parent = NULL; 3563 hammer2_spin_unex(&parent->core.spin); 3564 hammer2_spin_unex(&chain->core.spin); 3565 } else { 3566 /* 3567 * Chain is not blockmapped and has no parent. This 3568 * is a degenerate case. 3569 */ 3570 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3571 } 3572 done: 3573 return error; 3574 } 3575 3576 /* 3577 * Create an indirect block that covers one or more of the elements in the 3578 * current parent. 
Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect block only need to follow four rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.  This
 *     is accomplished by always recursing through matching keyspaces in
 *     the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
 *
 * NOTE: *errorp set to HAMMER2_ERROR_* flags
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
				hammer2_key_t *keyp, int keybits,
				hammer2_blockref_t *base, int count,
				int ncount);
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      hammer2_tid_t mtid, int for_type, int *errorp)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;
	hammer2_chain_t dummy;
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int ncount;
	int nbytes;
	int loops;
	int error;
	int reason;
	int generation;
	int maxloops = 300000;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	hmp = parent->hmp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));

	/*
	 * Pre-modify the parent now to avoid having to deal with error
	 * processing if we tried to later (in the middle of our loop).
	 */
	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		return NULL;
	}

	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
	base = hammer2_chain_base_and_count(parent, &count);

	/*
	 * dummy used in later chain allocation (no longer used for lookups).
	 */
	bzero(&dummy, sizeof(dummy));

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent for splits to work properly.
	 *
	 * The freemap uses a specific indirect block size.  The number of
	 * levels are built dynamically and ultimately depend on the size
	 * of the volume.  Because freemap blocks are taken from the reserved
	 * areas of the volume our goal is efficiency (fewer levels) and not
	 * so much to save disk space.
	 *
	 * The first indirect block level for a directory usually uses
	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
	 * the hash mechanism, this typically gives us a nominal
	 * 32 * 4 entries with one level of indirection.
	 *
	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
	 * indirect blocks.  The initial 4 entries in the inode gives us
	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
	 * of indirection gives us 137GB, and so forth.  H2 can support
	 * huge file sizes but they are not typical, so we try to stick
	 * with compactness and do not use a larger indirect block size.
	 *
	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
	 * due to the way indirect blocks are created this usually winds
	 * up being extremely inefficient for small files.  Even though
	 * 16KB requires more levels of indirection for very large files,
	 * the 16KB records can be ganged together into 64KB DIOs.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		if (parent->data->ipdata.meta.type ==
		    HAMMER2_OBJTYPE_DIRECTORY)
			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
		else
			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */

	} else {
		nbytes = HAMMER2_IND_BYTES_NOM;
	}
	/* never smaller than the parent's own blockref array */
	if (nbytes < count * sizeof(hammer2_blockref_t)) {
		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
		nbytes = count * sizeof(hammer2_blockref_t);
	}
	ncount = nbytes / sizeof(hammer2_blockref_t);

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	switch(for_type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
						    base, count, ncount);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
						   base, count, ncount);
		break;
	default:
		panic("illegal indirect block for bref type %d", for_type);
		break;
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * Ok, create our new indirect block
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.bref.key = key;
	dummy.bref.keybits = keybits;
	dummy.bref.data_off = hammer2_getradix(nbytes);
	/* inherit parent's check method, indirect blocks are uncompressed */
	dummy.bref.methods =
		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy.bref);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	/* ichain has one ref at this point */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 */
	*errorp = hammer2_chain_modify(ichain, mtid, 0, HAMMER2_MODIFY_OPTDATA);
	if (*errorp) {
		kprintf("hammer2_alloc_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
		return NULL;
	}

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	key_next = 0;	/* avoid gcc warnings */
	hammer2_spin_ex(&parent->core.spin);
	loops = 0;
	reason = 0;

	for (;;) {
		/*
		 * Parent may have been modified, relocating its block array.
		 * Reload the base pointer.
		 */
		base = hammer2_chain_base_and_count(parent, &count);

		if (++loops > 100000) {
		    hammer2_spin_unex(&parent->core.spin);
		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
			  reason, parent, base, count, key_next);
		}

		/*
		 * NOTE: spinlock stays intact, returned chain (if not NULL)
		 *	 is not referenced or locked which means that we
		 *	 cannot safely check its flagged / deletion status
		 *	 until we lock it.
		 */
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);
		/* snapshot generation to detect insert races while unlocked */
		generation = parent->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if ((~(((hammer2_key_t)1 << keybits) - 1) &
		    (key ^ bref->key)) != 0) {
			goto next_key_spinlocked;
		}

		/*
		 * Load the new indirect block by acquiring the related
		 * chains (potentially from media as it might not be
		 * in-memory).  Then move it to the new parent (ichain).
		 *
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			hammer2_spin_unex(&parent->core.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 */
			bcopy = *bref;
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, &bcopy);
			if (chain == NULL) {
				reason = 1;
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
			/* bref changed underneath us, retry the element */
			if (bcmp(&bcopy, bref, sizeof(bcopy))) {
				reason = 2;
				hammer2_chain_unlock(chain);
				hammer2_chain_drop(chain);
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
		}

		/*
		 * This is always live so if the chain has been deleted
		 * we raced someone and we have to retry.
		 *
		 * NOTE: Lookups can race delete-duplicate because
		 *	 delete-duplicate does not lock the parent's core
		 *	 (they just use the spinlock on the core).
		 *
		 *	 (note reversed logic for this one)
		 */
		if (chain->parent != parent ||
		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			kprintf("hammer2_chain_create_indirect "
				"RETRY (%p,%p)->%p %08x\n",
				parent, chain->parent, chain, chain->flags);
			hammer2_spin_ex(&parent->core.spin);
			continue;
		}

		/*
		 * Shift the chain to the indirect block.
		 *
		 * WARNING! No reason for us to load chain data, pass NOSTATS
		 *	    to prevent delete/insert from trying to access
		 *	    inode stats (and thus asserting if there is no
		 *	    chain->data loaded).
		 *
		 * WARNING! The (parent, chain) deletion may modify the parent
		 *	    and invalidate the base pointer.
		 *
		 * WARNING! Parent must already be marked modified, so we
		 *	    can assume that chain_delete always succeeds.
		 */
		error = hammer2_chain_delete(parent, chain, mtid, 0);
		KKASSERT(error == 0);
		hammer2_chain_rename(NULL, &ichain, chain, mtid, 0);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		base = NULL;	/* safety */
		hammer2_spin_ex(&parent->core.spin);
next_key_spinlocked:
		if (--maxloops == 0)
			panic("hammer2_chain_create_indirect: maxloops");
		reason = 4;
		if (key_next == 0 || key_next > key_end)
			break;
		key_beg = key_next;
		/* loop */
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set UPDATE here because we mark ichain
	 * modified down below (so the normal modified -> flush -> set-moved
	 * sequence applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	base = NULL;	/* safety, parent modify may change address */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, ichain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	/*
	 * Make sure flushes propagate after our manual insertion.
	 */
	hammer2_chain_setflush(ichain);
	hammer2_chain_setflush(parent);

	/*
	 * Figure out what to return.
	 */
	if (~(((hammer2_key_t)1 << keybits) - 1) &
	    (create_key ^ key)) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
	} else {
		/*
		 * Otherwise its in the range, return the new parent.
		 * (leave both the new and old parent locked).
3987 */ 3988 parent = ichain; 3989 } 3990 3991 return(parent); 3992 } 3993 3994 /* 3995 * Do maintenance on an indirect chain. Both parent and chain are locked. 3996 * 3997 * Returns non-zero if (chain) is deleted, either due to being empty or 3998 * because its children were safely moved into the parent. 3999 */ 4000 int 4001 hammer2_chain_indirect_maintenance(hammer2_chain_t *parent, 4002 hammer2_chain_t *chain) 4003 { 4004 hammer2_blockref_t *chain_base; 4005 hammer2_blockref_t *base; 4006 hammer2_blockref_t *bref; 4007 hammer2_blockref_t bcopy; 4008 hammer2_key_t key_next; 4009 hammer2_key_t key_beg; 4010 hammer2_key_t key_end; 4011 hammer2_chain_t *sub; 4012 int chain_count; 4013 int count; 4014 int error; 4015 int generation; 4016 4017 /* 4018 * Make sure we have an accurate live_count 4019 */ 4020 if ((chain->flags & (HAMMER2_CHAIN_INITIAL | 4021 HAMMER2_CHAIN_COUNTEDBREFS)) == 0) { 4022 base = &chain->data->npdata[0]; 4023 count = chain->bytes / sizeof(hammer2_blockref_t); 4024 hammer2_chain_countbrefs(chain, base, count); 4025 } 4026 4027 /* 4028 * If the indirect block is empty we can delete it. 4029 * (ignore deletion error) 4030 */ 4031 if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) { 4032 hammer2_chain_delete(parent, chain, 4033 chain->bref.modify_tid, 4034 HAMMER2_DELETE_PERMANENT); 4035 return 1; 4036 } 4037 4038 base = hammer2_chain_base_and_count(parent, &count); 4039 4040 if ((parent->flags & (HAMMER2_CHAIN_INITIAL | 4041 HAMMER2_CHAIN_COUNTEDBREFS)) == 0) { 4042 hammer2_chain_countbrefs(parent, base, count); 4043 } 4044 4045 /* 4046 * Determine if we can collapse chain into parent, calculate 4047 * hysteresis for chain emptiness. 
4048 */ 4049 if (parent->core.live_count + chain->core.live_count - 1 > count) 4050 return 0; 4051 chain_count = chain->bytes / sizeof(hammer2_blockref_t); 4052 if (chain->core.live_count > chain_count * 3 / 4) 4053 return 0; 4054 4055 /* 4056 * Ok, theoretically we can collapse chain's contents into 4057 * parent. chain is locked, but any in-memory children of chain 4058 * are not. For this to work, we must be able to dispose of any 4059 * in-memory children of chain. 4060 * 4061 * For now require that there are no in-memory children of chain. 4062 * 4063 * WARNING! Both chain and parent must remain locked across this 4064 * entire operation. 4065 */ 4066 4067 /* 4068 * Parent must be marked modified. Don't try to collapse it if we 4069 * can't mark it modified. Once modified, destroy chain to make room 4070 * and to get rid of what will be a conflicting key (this is included 4071 * in the calculation above). Finally, move the children of chain 4072 * into chain's parent. 4073 * 4074 * This order creates an accounting problem for bref.embed.stats 4075 * because we destroy chain before we remove its children. Any 4076 * elements whos blockref is already synchronized will be counted 4077 * twice. To deal with the problem we clean out chain's stats prior 4078 * to deleting it. 
4079 */ 4080 error = hammer2_chain_modify(parent, 0, 0, 0); 4081 if (error) { 4082 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n", 4083 hammer2_error_str(error)); 4084 return 0; 4085 } 4086 error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0); 4087 if (error) { 4088 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n", 4089 hammer2_error_str(error)); 4090 return 0; 4091 } 4092 4093 chain->bref.embed.stats.inode_count = 0; 4094 chain->bref.embed.stats.data_count = 0; 4095 error = hammer2_chain_delete(parent, chain, 4096 chain->bref.modify_tid, 4097 HAMMER2_DELETE_PERMANENT); 4098 KKASSERT(error == 0); 4099 4100 /* 4101 * The combined_find call requires core.spin to be held. One would 4102 * think there wouldn't be any conflicts since we hold chain 4103 * exclusively locked, but the caching mechanism for 0-ref children 4104 * does not require a chain lock. 4105 */ 4106 hammer2_spin_ex(&chain->core.spin); 4107 4108 key_next = 0; 4109 key_beg = 0; 4110 key_end = HAMMER2_KEY_MAX; 4111 for (;;) { 4112 chain_base = &chain->data->npdata[0]; 4113 chain_count = chain->bytes / sizeof(hammer2_blockref_t); 4114 sub = hammer2_combined_find(chain, chain_base, chain_count, 4115 &key_next, 4116 key_beg, key_end, 4117 &bref); 4118 generation = chain->core.generation; 4119 if (bref == NULL) 4120 break; 4121 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4122 4123 if (sub) { 4124 hammer2_chain_ref(sub); 4125 hammer2_spin_unex(&chain->core.spin); 4126 hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER); 4127 if (sub->parent != chain || 4128 (sub->flags & HAMMER2_CHAIN_DELETED)) { 4129 hammer2_chain_unlock(sub); 4130 hammer2_chain_drop(sub); 4131 hammer2_spin_ex(&chain->core.spin); 4132 continue; 4133 } 4134 } else { 4135 bcopy = *bref; 4136 hammer2_spin_unex(&chain->core.spin); 4137 sub = hammer2_chain_get(chain, generation, &bcopy); 4138 if (sub == NULL) { 4139 hammer2_spin_ex(&chain->core.spin); 4140 continue; 4141 } 4142 hammer2_chain_lock(sub, 
HAMMER2_RESOLVE_NEVER); 4143 if (bcmp(&bcopy, bref, sizeof(bcopy)) != 0) { 4144 hammer2_chain_unlock(sub); 4145 hammer2_chain_drop(sub); 4146 hammer2_spin_ex(&chain->core.spin); 4147 continue; 4148 } 4149 } 4150 error = hammer2_chain_delete(chain, sub, 4151 sub->bref.modify_tid, 0); 4152 KKASSERT(error == 0); 4153 hammer2_chain_rename(NULL, &parent, sub, 4154 sub->bref.modify_tid, 4155 HAMMER2_INSERT_SAMEPARENT); 4156 hammer2_chain_unlock(sub); 4157 hammer2_chain_drop(sub); 4158 hammer2_spin_ex(&chain->core.spin); 4159 4160 if (key_next == 0) 4161 break; 4162 key_beg = key_next; 4163 } 4164 hammer2_spin_unex(&chain->core.spin); 4165 4166 return 1; 4167 } 4168 4169 /* 4170 * Freemap indirect blocks 4171 * 4172 * Calculate the keybits and highside/lowside of the freemap node the 4173 * caller is creating. 4174 * 4175 * This routine will specify the next higher-level freemap key/radix 4176 * representing the lowest-ordered set. By doing so, eventually all 4177 * low-ordered sets will be moved one level down. 4178 * 4179 * We have to be careful here because the freemap reserves a limited 4180 * number of blocks for a limited number of levels. So we can't just 4181 * push indiscriminately. 4182 */ 4183 int 4184 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp, 4185 int keybits, hammer2_blockref_t *base, int count) 4186 { 4187 hammer2_chain_t *chain; 4188 hammer2_blockref_t *bref; 4189 hammer2_key_t key; 4190 hammer2_key_t key_beg; 4191 hammer2_key_t key_end; 4192 hammer2_key_t key_next; 4193 int locount; 4194 int hicount; 4195 int maxloops = 300000; 4196 4197 key = *keyp; 4198 locount = 0; 4199 hicount = 0; 4200 keybits = 64; 4201 4202 /* 4203 * Calculate the range of keys in the array being careful to skip 4204 * slots which are overridden with a deletion. 
4205 */ 4206 key_beg = 0; 4207 key_end = HAMMER2_KEY_MAX; 4208 hammer2_spin_ex(&parent->core.spin); 4209 4210 for (;;) { 4211 if (--maxloops == 0) { 4212 panic("indkey_freemap shit %p %p:%d\n", 4213 parent, base, count); 4214 } 4215 chain = hammer2_combined_find(parent, base, count, 4216 &key_next, 4217 key_beg, key_end, 4218 &bref); 4219 4220 /* 4221 * Exhausted search 4222 */ 4223 if (bref == NULL) 4224 break; 4225 4226 /* 4227 * Skip deleted chains. 4228 */ 4229 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4230 if (key_next == 0 || key_next > key_end) 4231 break; 4232 key_beg = key_next; 4233 continue; 4234 } 4235 4236 /* 4237 * Use the full live (not deleted) element for the scan 4238 * iteration. HAMMER2 does not allow partial replacements. 4239 * 4240 * XXX should be built into hammer2_combined_find(). 4241 */ 4242 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4243 4244 if (keybits > bref->keybits) { 4245 key = bref->key; 4246 keybits = bref->keybits; 4247 } else if (keybits == bref->keybits && bref->key < key) { 4248 key = bref->key; 4249 } 4250 if (key_next == 0) 4251 break; 4252 key_beg = key_next; 4253 } 4254 hammer2_spin_unex(&parent->core.spin); 4255 4256 /* 4257 * Return the keybits for a higher-level FREEMAP_NODE covering 4258 * this node. 
4259 */ 4260 switch(keybits) { 4261 case HAMMER2_FREEMAP_LEVEL0_RADIX: 4262 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX; 4263 break; 4264 case HAMMER2_FREEMAP_LEVEL1_RADIX: 4265 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX; 4266 break; 4267 case HAMMER2_FREEMAP_LEVEL2_RADIX: 4268 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX; 4269 break; 4270 case HAMMER2_FREEMAP_LEVEL3_RADIX: 4271 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX; 4272 break; 4273 case HAMMER2_FREEMAP_LEVEL4_RADIX: 4274 keybits = HAMMER2_FREEMAP_LEVEL5_RADIX; 4275 break; 4276 case HAMMER2_FREEMAP_LEVEL5_RADIX: 4277 panic("hammer2_chain_indkey_freemap: level too high"); 4278 break; 4279 default: 4280 panic("hammer2_chain_indkey_freemap: bad radix"); 4281 break; 4282 } 4283 *keyp = key; 4284 4285 return (keybits); 4286 } 4287 4288 /* 4289 * File indirect blocks 4290 * 4291 * Calculate the key/keybits for the indirect block to create by scanning 4292 * existing keys. The key being created is also passed in *keyp and can be 4293 * inside or outside the indirect block. Regardless, the indirect block 4294 * must hold at least two keys in order to guarantee sufficient space. 4295 * 4296 * We use a modified version of the freemap's fixed radix tree, but taylored 4297 * for file data. Basically we configure an indirect block encompassing the 4298 * smallest key. 4299 */ 4300 static int 4301 hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp, 4302 int keybits, hammer2_blockref_t *base, int count, 4303 int ncount) 4304 { 4305 hammer2_chain_t *chain; 4306 hammer2_blockref_t *bref; 4307 hammer2_key_t key; 4308 hammer2_key_t key_beg; 4309 hammer2_key_t key_end; 4310 hammer2_key_t key_next; 4311 int nradix; 4312 int locount; 4313 int hicount; 4314 int maxloops = 300000; 4315 4316 key = *keyp; 4317 locount = 0; 4318 hicount = 0; 4319 keybits = 64; 4320 4321 /* 4322 * Calculate the range of keys in the array being careful to skip 4323 * slots which are overridden with a deletion. 4324 * 4325 * Locate the smallest key. 
4326 */ 4327 key_beg = 0; 4328 key_end = HAMMER2_KEY_MAX; 4329 hammer2_spin_ex(&parent->core.spin); 4330 4331 for (;;) { 4332 if (--maxloops == 0) { 4333 panic("indkey_freemap shit %p %p:%d\n", 4334 parent, base, count); 4335 } 4336 chain = hammer2_combined_find(parent, base, count, 4337 &key_next, 4338 key_beg, key_end, 4339 &bref); 4340 4341 /* 4342 * Exhausted search 4343 */ 4344 if (bref == NULL) 4345 break; 4346 4347 /* 4348 * Skip deleted chains. 4349 */ 4350 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4351 if (key_next == 0 || key_next > key_end) 4352 break; 4353 key_beg = key_next; 4354 continue; 4355 } 4356 4357 /* 4358 * Use the full live (not deleted) element for the scan 4359 * iteration. HAMMER2 does not allow partial replacements. 4360 * 4361 * XXX should be built into hammer2_combined_find(). 4362 */ 4363 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4364 4365 if (keybits > bref->keybits) { 4366 key = bref->key; 4367 keybits = bref->keybits; 4368 } else if (keybits == bref->keybits && bref->key < key) { 4369 key = bref->key; 4370 } 4371 if (key_next == 0) 4372 break; 4373 key_beg = key_next; 4374 } 4375 hammer2_spin_unex(&parent->core.spin); 4376 4377 /* 4378 * Calculate the static keybits for a higher-level indirect block 4379 * that contains the key. 4380 */ 4381 *keyp = key; 4382 4383 switch(ncount) { 4384 case HAMMER2_IND_BYTES_MIN / sizeof(hammer2_blockref_t): 4385 nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX; 4386 break; 4387 case HAMMER2_IND_BYTES_NOM / sizeof(hammer2_blockref_t): 4388 nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX; 4389 break; 4390 case HAMMER2_IND_BYTES_MAX / sizeof(hammer2_blockref_t): 4391 nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX; 4392 break; 4393 default: 4394 panic("bad ncount %d\n", ncount); 4395 nradix = 0; 4396 break; 4397 } 4398 4399 /* 4400 * The largest radix that can be returned for an indirect block is 4401 * 63 bits. 
(The largest practical indirect block radix is actually 4402 * 62 bits because the top-level inode or volume root contains four 4403 * entries, but allow 63 to be returned). 4404 */ 4405 if (nradix >= 64) 4406 nradix = 63; 4407 4408 return keybits + nradix; 4409 } 4410 4411 #if 1 4412 4413 /* 4414 * Directory indirect blocks. 4415 * 4416 * Covers both the inode index (directory of inodes), and directory contents 4417 * (filenames hardlinked to inodes). 4418 * 4419 * Because directory keys are hashed we generally try to cut the space in 4420 * half. We accomodate the inode index (which tends to have linearly 4421 * increasing inode numbers) by ensuring that the keyspace is at least large 4422 * enough to fill up the indirect block being created. 4423 */ 4424 static int 4425 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp, 4426 int keybits, hammer2_blockref_t *base, int count, 4427 int ncount) 4428 { 4429 hammer2_blockref_t *bref; 4430 hammer2_chain_t *chain; 4431 hammer2_key_t key_beg; 4432 hammer2_key_t key_end; 4433 hammer2_key_t key_next; 4434 hammer2_key_t key; 4435 int nkeybits; 4436 int locount; 4437 int hicount; 4438 int maxloops = 300000; 4439 4440 /* 4441 * NOTE: We can't take a shortcut here anymore for inodes because 4442 * the root directory can contain a mix of inodes and directory 4443 * entries (we used to just return 63 if parent->bref.type was 4444 * HAMMER2_BREF_TYPE_INODE. 4445 */ 4446 key = *keyp; 4447 locount = 0; 4448 hicount = 0; 4449 4450 /* 4451 * Calculate the range of keys in the array being careful to skip 4452 * slots which are overridden with a deletion. 
4453 */ 4454 key_beg = 0; 4455 key_end = HAMMER2_KEY_MAX; 4456 hammer2_spin_ex(&parent->core.spin); 4457 4458 for (;;) { 4459 if (--maxloops == 0) { 4460 panic("indkey_freemap shit %p %p:%d\n", 4461 parent, base, count); 4462 } 4463 chain = hammer2_combined_find(parent, base, count, 4464 &key_next, 4465 key_beg, key_end, 4466 &bref); 4467 4468 /* 4469 * Exhausted search 4470 */ 4471 if (bref == NULL) 4472 break; 4473 4474 /* 4475 * Deleted object 4476 */ 4477 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4478 if (key_next == 0 || key_next > key_end) 4479 break; 4480 key_beg = key_next; 4481 continue; 4482 } 4483 4484 /* 4485 * Use the full live (not deleted) element for the scan 4486 * iteration. HAMMER2 does not allow partial replacements. 4487 * 4488 * XXX should be built into hammer2_combined_find(). 4489 */ 4490 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4491 4492 /* 4493 * Expand our calculated key range (key, keybits) to fit 4494 * the scanned key. nkeybits represents the full range 4495 * that we will later cut in half (two halves @ nkeybits - 1). 4496 */ 4497 nkeybits = keybits; 4498 if (nkeybits < bref->keybits) { 4499 if (bref->keybits > 64) { 4500 kprintf("bad bref chain %p bref %p\n", 4501 chain, bref); 4502 Debugger("fubar"); 4503 } 4504 nkeybits = bref->keybits; 4505 } 4506 while (nkeybits < 64 && 4507 (~(((hammer2_key_t)1 << nkeybits) - 1) & 4508 (key ^ bref->key)) != 0) { 4509 ++nkeybits; 4510 } 4511 4512 /* 4513 * If the new key range is larger we have to determine 4514 * which side of the new key range the existing keys fall 4515 * under by checking the high bit, then collapsing the 4516 * locount into the hicount or vise-versa. 
4517 */ 4518 if (keybits != nkeybits) { 4519 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 4520 hicount += locount; 4521 locount = 0; 4522 } else { 4523 locount += hicount; 4524 hicount = 0; 4525 } 4526 keybits = nkeybits; 4527 } 4528 4529 /* 4530 * The newly scanned key will be in the lower half or the 4531 * upper half of the (new) key range. 4532 */ 4533 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 4534 ++hicount; 4535 else 4536 ++locount; 4537 4538 if (key_next == 0) 4539 break; 4540 key_beg = key_next; 4541 } 4542 hammer2_spin_unex(&parent->core.spin); 4543 bref = NULL; /* now invalid (safety) */ 4544 4545 /* 4546 * Adjust keybits to represent half of the full range calculated 4547 * above (radix 63 max) for our new indirect block. 4548 */ 4549 --keybits; 4550 4551 /* 4552 * Expand keybits to hold at least ncount elements. ncount will be 4553 * a power of 2. This is to try to completely fill leaf nodes (at 4554 * least for keys which are not hashes). 4555 * 4556 * We aren't counting 'in' or 'out', we are counting 'high side' 4557 * and 'low side' based on the bit at (1LL << keybits). We want 4558 * everything to be inside in these cases so shift it all to 4559 * the low or high side depending on the new high bit. 4560 */ 4561 while (((hammer2_key_t)1 << keybits) < ncount) { 4562 ++keybits; 4563 if (key & ((hammer2_key_t)1 << keybits)) { 4564 hicount += locount; 4565 locount = 0; 4566 } else { 4567 locount += hicount; 4568 hicount = 0; 4569 } 4570 } 4571 4572 if (hicount > locount) 4573 key |= (hammer2_key_t)1 << keybits; 4574 else 4575 key &= ~(hammer2_key_t)1 << keybits; 4576 4577 *keyp = key; 4578 4579 return (keybits); 4580 } 4581 4582 #else 4583 4584 /* 4585 * Directory indirect blocks. 4586 * 4587 * Covers both the inode index (directory of inodes), and directory contents 4588 * (filenames hardlinked to inodes). 4589 * 4590 * Because directory keys are hashed we generally try to cut the space in 4591 * half. 
 * We accommodate the inode index (which tends to have linearly
 * increasing inode numbers) by ensuring that the keyspace is at least large
 * enough to fill up the indirect block being created.
 *
 * NOTE(review): this is the dead (#else) variant of indkey_dir; the live
 * version is compiled under #if 1 above.
 */
static int
hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp,
			 int keybits, hammer2_blockref_t *base, int count,
			 int ncount)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	hammer2_key_t key;
	int nkeybits;
	int locount;
	int hicount;
	int maxloops = 300000;

	/*
	 * Shortcut if the parent is the inode.  In this situation the
	 * parent has 4+1 directory entries and we are creating an indirect
	 * block capable of holding many more.
	 */
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		return 63;
	}

	key = *keyp;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	hammer2_spin_ex(&parent->core.spin);

	for (;;) {
		if (--maxloops == 0) {
			/* NOTE(review): message copied from indkey_freemap */
			panic("indkey_freemap shit %p %p:%d\n",
			      parent, base, count);
		}
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);

		/*
		 * Exhausted search
		 */
		if (bref == NULL)
			break;

		/*
		 * Deleted object
		 */
		if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
			if (key_next == 0 || key_next > key_end)
				break;
			key_beg = key_next;
			continue;
		}

		/*
		 * Use the full live (not deleted) element for the scan
		 * iteration.  HAMMER2 does not allow partial replacements.
		 *
		 * XXX should be built into hammer2_combined_find().
		 */
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			if (bref->keybits > 64) {
				kprintf("bad bref chain %p bref %p\n",
					chain, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
			(key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice-versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * upper half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;

		if (key_next == 0)
			break;
		key_beg = key_next;
	}
	hammer2_spin_unex(&parent->core.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max) for our new indirect block.
	 */
	--keybits;

	/*
	 * Expand keybits to hold at least ncount elements.  ncount will be
	 * a power of 2.  This is to try to completely fill leaf nodes (at
	 * least for keys which are not hashes).
	 *
	 * We aren't counting 'in' or 'out', we are counting 'high side'
	 * and 'low side' based on the bit at (1LL << keybits).  We want
	 * everything to be inside in these cases so shift it all to
	 * the low or high side depending on the new high bit.
	 */
	while (((hammer2_key_t)1 << keybits) < ncount) {
		++keybits;
		if (key & ((hammer2_key_t)1 << keybits)) {
			hicount += locount;
			locount = 0;
		} else {
			locount += hicount;
			hicount = 0;
		}
	}

	if (hicount > locount)
		key |= (hammer2_key_t)1 << keybits;
	else
		key &= ~(hammer2_key_t)1 << keybits;

	*keyp = key;

	return (keybits);
}

#endif

/*
 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if
 * it exists.
 *
 * Both parent and chain must be locked exclusively.
 *
 * This function will modify the parent if the blockref requires removal
 * from the parent's block table.
 *
 * This function is NOT recursive.  Any entity already pushed into the
 * chain (such as an inode) may still need visibility into its contents,
 * as well as the ability to read and modify the contents.  For example,
 * for an unlinked file which is still open.
 *
 * Also note that the flusher is responsible for cleaning up empty
 * indirect blocks.
 */
int
hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	int error = 0;

	KKASSERT(hammer2_mtx_owned(&chain->lock));

	/*
	 * Nothing to do if already marked.
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
			 chain->parent == parent);
		error = _hammer2_chain_delete_helper(parent, chain,
						     mtid, flags);
	}

	/*
	 * Permanent deletions mark the chain as destroyed.
	 */
	if (error == 0) {
		if (flags & HAMMER2_DELETE_PERMANENT)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
		hammer2_chain_setflush(chain);
	}

	return error;
}

/*
 * Returns the index of the nearest element in the blockref array >= elm.
 * Returns (count) if no element could be found.
 *
 * Sets *key_nextp to the next key for loop purposes but does not modify
 * it if the next key would be higher than the current value of *key_nextp.
 * Note that *key_nextp can overflow to 0, which should be tested by the
 * caller.
 *
 * WARNING! Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
static int
hammer2_base_find(hammer2_chain_t *parent,
		  hammer2_blockref_t *base, int count,
		  hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require the live chain's already have their core's counted
	 * so we can optimize operations.
	 */
	KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using parent->cache_index.  This is
	 * the most likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 */
	i = parent->cache_index;	/* SMP RACE OK */
	cpu_ccfence();
	limit = parent->core.live_zero;
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == 0 || scan->key > key_beg)) {
		--scan;
		--i;
	}
	parent->cache_index = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != 0) {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan->key > key_beg || scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		/*
		 * Remember the hit for the next sequential lookup and
		 * clip *key_nextp to the end of the matched element
		 * (scan_end of 0 means the element wrapped past the top
		 * of the key space and there is no next key).
		 */
		parent->cache_index = i;
		if (i >= limit) {
			i = count;
		} else {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *bresp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * When no in-memory chain has been found and a non-NULL bref is returned
 * in *bresp, the match came from the media blockref array.
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.  Iterate if the base find
 * is chosen but matches a deleted chain.
 *
 * WARNING! Must be called with parent's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **bresp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Lookup in block array and in rbtree.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, key_nextp,
			      key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*bresp = NULL;
		return(NULL);
	}

	/*
	 * Only chain matched.
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref matched, select the nearer element.
	 *
	 * If both are flush with the left-hand side or both are the
	 * same distance away, select the chain.  In this situation the
	 * chain must have been loaded from the matching blockmap.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		KKASSERT(chain->bref.key == base[i].key);
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Select the nearer key
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*bresp = NULL;
		chain = NULL;
	} else {
		*bresp = bref;
	}
	return(chain);
}

/*
 * Locate the specified block array element and delete it.  The element
 * must exist.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_delete(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain)
{
	hammer2_blockref_t *elm = &chain->bref;
	hammer2_blockref_t *scan;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);
	scan = &base[i];
	if (i == count || scan->type == 0 ||
	    scan->key != elm->key ||
	    ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
	     scan->keybits != elm->keybits)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}

	/*
	 * Update stats and zero the entry.
	 *
	 * NOTE: Handle radix == 0 (0 bytes) case.
	 */
	if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count -= (hammer2_off_t)1 <<
				(int)(scan->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(scan->type) {
	case HAMMER2_BREF_TYPE_INODE:
		--parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (scan->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count -=
				scan->embed.stats.data_count;
			parent->bref.embed.stats.inode_count -=
				scan->embed.stats.inode_count;
		}
		if (scan->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count <= scan->leaf_count)
				parent->bref.leaf_count = 0;
			else
				parent->bref.leaf_count -= scan->leaf_count;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) {
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_HINT_LEAF_COUNT);
		} else {
			if (parent->bref.leaf_count)
				--parent->bref.leaf_count;
		}
		/* fall through (default only breaks, so this is harmless) */
	default:
		break;
	}

	bzero(scan, sizeof(*scan));

	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (parent->core.live_zero == i + 1) {
		while (--i >= 0 && base[i].type == 0)
			;
		parent->core.live_zero = i + 1;
	}

	/*
	 * Clear appropriate blockmap flags in chain.
	 */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
					HAMMER2_CHAIN_BMAPUPD);
}

/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE(review): comment mirrors hammer2_base_delete(); live_count is
 *	 maintained by the in-memory topology operations, so it does not
 *	 need to be adjusted when we commit the media change here.
 */
void
hammer2_base_insert(hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    hammer2_chain_t *chain, hammer2_blockref_t *elm)
{
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 *     re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, &key_next,
			      elm->key, elm->key);

	/*
	 * Shortcut fill optimization, typical ordered insertion(s) may not
	 * require a search.
	 */
	KKASSERT(i >= 0 && i <= count);

	/*
	 * Set appropriate blockmap flags in chain (if not NULL)
	 */
	if (chain)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * Update parent stats for the inserted element.
	 */
	if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) {
		parent->bref.embed.stats.data_count += (hammer2_off_t)1 <<
				(int)(elm->data_off & HAMMER2_OFF_MASK_RADIX);
	}
	switch(elm->type) {
	case HAMMER2_BREF_TYPE_INODE:
		++parent->bref.embed.stats.inode_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_DATA:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		/* fall through */
	case HAMMER2_BREF_TYPE_INDIRECT:
		if (elm->type != HAMMER2_BREF_TYPE_DATA) {
			parent->bref.embed.stats.data_count +=
				elm->embed.stats.data_count;
			parent->bref.embed.stats.inode_count +=
				elm->embed.stats.inode_count;
		}
		if (elm->type == HAMMER2_BREF_TYPE_INODE)
			break;
		if (parent->bref.leaf_count + elm->leaf_count <
		    HAMMER2_BLOCKREF_LEAF_MAX) {
			parent->bref.leaf_count += elm->leaf_count;
		} else {
			parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX;
		}
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX)
			++parent->bref.leaf_count;
		break;
	default:
		break;
	}


	/*
	 * We can only optimize parent->core.live_zero for live chains.
	 */
	if (i == count && parent->core.live_zero < count) {
		i = parent->core.live_zero++;
		base[i] = *elm;
		return;
	}

	xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
	if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
		hammer2_spin_unex(&parent->core.spin);
		panic("insert base %p overlapping elements at %d elm %p\n",
		      base, i, elm);
	}

	/*
	 * Try to find an empty slot before or after.
	 */
	j = i;
	k = i;
	while (j > 0 || k < count) {
		--j;
		if (j >= 0 && base[j].type == 0) {
			/*
			 * Empty slot found below the insertion point;
			 * shift the intervening elements down one.
			 */
			if (j == i - 1) {
				base[j] = *elm;
			} else {
				bcopy(&base[j+1], &base[j],
				      (i - j - 1) * sizeof(*base));
				base[i - 1] = *elm;
			}
			goto validate;
		}
		++k;
		if (k < count && base[k].type == 0) {
			/*
			 * Empty slot found above the insertion point;
			 * shift the intervening elements up one.
			 */
			bcopy(&base[i], &base[i+1],
			      (k - i) * sizeof(hammer2_blockref_t));
			base[i] = *elm;

			/*
			 * We can only update parent->core.live_zero for live
			 * chains.
			 */
			if (parent->core.live_zero <= k)
				parent->core.live_zero = k + 1;
			u = 2;
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging check: verify the array remains sorted with no
	 * overlapping key ranges.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d",
				      u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}

#if 0

/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
5276 */ 5277 typedef hammer2_blockref_t *hammer2_blockref_p; 5278 5279 static 5280 int 5281 hammer2_base_sort_callback(const void *v1, const void *v2) 5282 { 5283 hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1; 5284 hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2; 5285 5286 /* 5287 * Make sure empty elements are placed at the end of the array 5288 */ 5289 if (bref1->type == 0) { 5290 if (bref2->type == 0) 5291 return(0); 5292 return(1); 5293 } else if (bref2->type == 0) { 5294 return(-1); 5295 } 5296 5297 /* 5298 * Sort by key 5299 */ 5300 if (bref1->key < bref2->key) 5301 return(-1); 5302 if (bref1->key > bref2->key) 5303 return(1); 5304 return(0); 5305 } 5306 5307 void 5308 hammer2_base_sort(hammer2_chain_t *chain) 5309 { 5310 hammer2_blockref_t *base; 5311 int count; 5312 5313 switch(chain->bref.type) { 5314 case HAMMER2_BREF_TYPE_INODE: 5315 /* 5316 * Special shortcut for embedded data returns the inode 5317 * itself. Callers must detect this condition and access 5318 * the embedded data (the strategy code does this for us). 5319 * 5320 * This is only applicable to regular files and softlinks. 5321 */ 5322 if (chain->data->ipdata.meta.op_flags & 5323 HAMMER2_OPFLAG_DIRECTDATA) { 5324 return; 5325 } 5326 base = &chain->data->ipdata.u.blockset.blockref[0]; 5327 count = HAMMER2_SET_COUNT; 5328 break; 5329 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 5330 case HAMMER2_BREF_TYPE_INDIRECT: 5331 /* 5332 * Optimize indirect blocks in the INITIAL state to avoid 5333 * I/O. 
5334 */ 5335 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0); 5336 base = &chain->data->npdata[0]; 5337 count = chain->bytes / sizeof(hammer2_blockref_t); 5338 break; 5339 case HAMMER2_BREF_TYPE_VOLUME: 5340 base = &chain->data->voldata.sroot_blockset.blockref[0]; 5341 count = HAMMER2_SET_COUNT; 5342 break; 5343 case HAMMER2_BREF_TYPE_FREEMAP: 5344 base = &chain->data->blkset.blockref[0]; 5345 count = HAMMER2_SET_COUNT; 5346 break; 5347 default: 5348 kprintf("hammer2_chain_lookup: unrecognized " 5349 "blockref(A) type: %d", 5350 chain->bref.type); 5351 while (1) 5352 tsleep(&base, 0, "dead", 0); 5353 panic("hammer2_chain_lookup: unrecognized " 5354 "blockref(A) type: %d", 5355 chain->bref.type); 5356 base = NULL; /* safety */ 5357 count = 0; /* safety */ 5358 } 5359 kqsort(base, count, sizeof(*base), hammer2_base_sort_callback); 5360 } 5361 5362 #endif 5363 5364 /* 5365 * Chain memory management 5366 */ 5367 void 5368 hammer2_chain_wait(hammer2_chain_t *chain) 5369 { 5370 tsleep(chain, 0, "chnflw", 1); 5371 } 5372 5373 const hammer2_media_data_t * 5374 hammer2_chain_rdata(hammer2_chain_t *chain) 5375 { 5376 KKASSERT(chain->data != NULL); 5377 return (chain->data); 5378 } 5379 5380 hammer2_media_data_t * 5381 hammer2_chain_wdata(hammer2_chain_t *chain) 5382 { 5383 KKASSERT(chain->data != NULL); 5384 return (chain->data); 5385 } 5386 5387 /* 5388 * Set the check data for a chain. This can be a heavy-weight operation 5389 * and typically only runs on-flush. For file data check data is calculated 5390 * when the logical buffers are flushed. 
5391 */ 5392 void 5393 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata) 5394 { 5395 chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO; 5396 5397 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5398 case HAMMER2_CHECK_NONE: 5399 break; 5400 case HAMMER2_CHECK_DISABLED: 5401 break; 5402 case HAMMER2_CHECK_ISCSI32: 5403 chain->bref.check.iscsi32.value = 5404 hammer2_icrc32(bdata, chain->bytes); 5405 break; 5406 case HAMMER2_CHECK_XXHASH64: 5407 chain->bref.check.xxhash64.value = 5408 XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5409 break; 5410 case HAMMER2_CHECK_SHA192: 5411 { 5412 SHA256_CTX hash_ctx; 5413 union { 5414 uint8_t digest[SHA256_DIGEST_LENGTH]; 5415 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5416 } u; 5417 5418 SHA256_Init(&hash_ctx); 5419 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5420 SHA256_Final(u.digest, &hash_ctx); 5421 u.digest64[2] ^= u.digest64[3]; 5422 bcopy(u.digest, 5423 chain->bref.check.sha192.data, 5424 sizeof(chain->bref.check.sha192.data)); 5425 } 5426 break; 5427 case HAMMER2_CHECK_FREEMAP: 5428 chain->bref.check.freemap.icrc32 = 5429 hammer2_icrc32(bdata, chain->bytes); 5430 break; 5431 default: 5432 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 5433 chain->bref.methods); 5434 break; 5435 } 5436 } 5437 5438 int 5439 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 5440 { 5441 uint32_t check32; 5442 uint64_t check64; 5443 int r; 5444 5445 if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO) 5446 return 1; 5447 5448 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5449 case HAMMER2_CHECK_NONE: 5450 r = 1; 5451 break; 5452 case HAMMER2_CHECK_DISABLED: 5453 r = 1; 5454 break; 5455 case HAMMER2_CHECK_ISCSI32: 5456 check32 = hammer2_icrc32(bdata, chain->bytes); 5457 r = (chain->bref.check.iscsi32.value == check32); 5458 if (r == 0) { 5459 kprintf("chain %016jx.%02x meth=%02x CHECK FAIL " 5460 "(flags=%08x, bref/data %08x/%08x)\n", 5461 chain->bref.data_off, 5462 chain->bref.type, 5463 chain->bref.methods, 
5464 chain->flags, 5465 chain->bref.check.iscsi32.value, 5466 check32); 5467 } 5468 hammer2_check_icrc32 += chain->bytes; 5469 break; 5470 case HAMMER2_CHECK_XXHASH64: 5471 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5472 r = (chain->bref.check.xxhash64.value == check64); 5473 if (r == 0) { 5474 kprintf("chain %016jx.%02x key=%016jx " 5475 "meth=%02x CHECK FAIL " 5476 "(flags=%08x, bref/data %016jx/%016jx)\n", 5477 chain->bref.data_off, 5478 chain->bref.type, 5479 chain->bref.key, 5480 chain->bref.methods, 5481 chain->flags, 5482 chain->bref.check.xxhash64.value, 5483 check64); 5484 } 5485 hammer2_check_xxhash64 += chain->bytes; 5486 break; 5487 case HAMMER2_CHECK_SHA192: 5488 { 5489 SHA256_CTX hash_ctx; 5490 union { 5491 uint8_t digest[SHA256_DIGEST_LENGTH]; 5492 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5493 } u; 5494 5495 SHA256_Init(&hash_ctx); 5496 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5497 SHA256_Final(u.digest, &hash_ctx); 5498 u.digest64[2] ^= u.digest64[3]; 5499 if (bcmp(u.digest, 5500 chain->bref.check.sha192.data, 5501 sizeof(chain->bref.check.sha192.data)) == 0) { 5502 r = 1; 5503 } else { 5504 r = 0; 5505 kprintf("chain %016jx.%02x meth=%02x " 5506 "CHECK FAIL\n", 5507 chain->bref.data_off, 5508 chain->bref.type, 5509 chain->bref.methods); 5510 } 5511 } 5512 break; 5513 case HAMMER2_CHECK_FREEMAP: 5514 r = (chain->bref.check.freemap.icrc32 == 5515 hammer2_icrc32(bdata, chain->bytes)); 5516 if (r == 0) { 5517 kprintf("chain %016jx.%02x meth=%02x " 5518 "CHECK FAIL\n", 5519 chain->bref.data_off, 5520 chain->bref.type, 5521 chain->bref.methods); 5522 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 5523 chain->bref.check.freemap.icrc32, 5524 hammer2_icrc32(bdata, chain->bytes), 5525 chain->bytes); 5526 if (chain->dio) 5527 kprintf("dio %p buf %016jx,%d bdata %p/%p\n", 5528 chain->dio, chain->dio->bp->b_loffset, 5529 chain->dio->bp->b_bufsize, bdata, 5530 chain->dio->bp->b_data); 5531 } 5532 5533 break; 5534 default: 5535 
kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 5536 chain->bref.methods); 5537 r = 1; 5538 break; 5539 } 5540 return r; 5541 } 5542 5543 /* 5544 * Acquire the chain and parent representing the specified inode for the 5545 * device at the specified cluster index. 5546 * 5547 * The flags passed in are LOOKUP flags, not RESOLVE flags. 5548 * 5549 * If we are unable to locate the hardlink, INVAL is returned and *chainp 5550 * will be NULL. *parentp may still be set error or not, or NULL if the 5551 * parent itself could not be resolved. 5552 * 5553 * Caller must pass-in a valid or NULL *parentp or *chainp. The passed-in 5554 * *parentp and *chainp will be unlocked if not NULL. 5555 */ 5556 int 5557 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 5558 int clindex, int flags, 5559 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 5560 { 5561 hammer2_chain_t *parent; 5562 hammer2_chain_t *rchain; 5563 hammer2_key_t key_dummy; 5564 int resolve_flags; 5565 int error; 5566 5567 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 5568 HAMMER2_RESOLVE_SHARED : 0; 5569 5570 /* 5571 * Caller expects us to replace these. 5572 */ 5573 if (*chainp) { 5574 hammer2_chain_unlock(*chainp); 5575 hammer2_chain_drop(*chainp); 5576 *chainp = NULL; 5577 } 5578 if (*parentp) { 5579 hammer2_chain_unlock(*parentp); 5580 hammer2_chain_drop(*parentp); 5581 *parentp = NULL; 5582 } 5583 5584 /* 5585 * Inodes hang off of the iroot (bit 63 is clear, differentiating 5586 * inodes from root directory entries in the key lookup). 
5587 */ 5588 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 5589 rchain = NULL; 5590 if (parent) { 5591 rchain = hammer2_chain_lookup(&parent, &key_dummy, 5592 inum, inum, 5593 &error, flags); 5594 } else { 5595 error = HAMMER2_ERROR_EIO; 5596 } 5597 *parentp = parent; 5598 *chainp = rchain; 5599 5600 return error; 5601 } 5602 5603 /* 5604 * Used by the bulkscan code to snapshot the synchronized storage for 5605 * a volume, allowing it to be scanned concurrently against normal 5606 * operation. 5607 */ 5608 hammer2_chain_t * 5609 hammer2_chain_bulksnap(hammer2_dev_t *hmp) 5610 { 5611 hammer2_chain_t *copy; 5612 5613 copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref); 5614 copy->data = kmalloc(sizeof(copy->data->voldata), 5615 hmp->mchain, 5616 M_WAITOK | M_ZERO); 5617 hammer2_voldata_lock(hmp); 5618 copy->data->voldata = hmp->volsync; 5619 hammer2_voldata_unlock(hmp); 5620 5621 return copy; 5622 } 5623 5624 void 5625 hammer2_chain_bulkdrop(hammer2_chain_t *copy) 5626 { 5627 KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME); 5628 KKASSERT(copy->data); 5629 kfree(copy->data, copy->hmp->mchain); 5630 copy->data = NULL; 5631 atomic_add_long(&hammer2_chain_allocs, -1); 5632 hammer2_chain_drop(copy); 5633 } 5634 5635 /* 5636 * Returns non-zero if the chain (INODE or DIRENT) matches the 5637 * filename. 
5638 */ 5639 int 5640 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name, 5641 size_t name_len) 5642 { 5643 const hammer2_inode_data_t *ripdata; 5644 const hammer2_dirent_head_t *den; 5645 5646 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 5647 ripdata = &chain->data->ipdata; 5648 if (ripdata->meta.name_len == name_len && 5649 bcmp(ripdata->filename, name, name_len) == 0) { 5650 return 1; 5651 } 5652 } 5653 if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT && 5654 chain->bref.embed.dirent.namlen == name_len) { 5655 den = &chain->bref.embed.dirent; 5656 if (name_len > sizeof(chain->bref.check.buf) && 5657 bcmp(chain->data->buf, name, name_len) == 0) { 5658 return 1; 5659 } 5660 if (name_len <= sizeof(chain->bref.check.buf) && 5661 bcmp(chain->bref.check.buf, name, name_len) == 0) { 5662 return 1; 5663 } 5664 } 5665 return 0; 5666 } 5667