1 /* 2 * Copyright (c) 2011-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 
 *
 * Chains are the in-memory version of media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * Chains are no-longer delete-duplicated.  Instead, the original in-memory
 * chain will be moved along with its block reference (e.g. for things like
 * renames, hardlink operations, modifications, etc), and will be indexed
 * on a secondary list for flush handling instead of propagating a flag
 * upward to the root.
 *
 * Concurrent front-end operations can still run against backend flushes
 * as long as they do not cross the current flush boundary.  An operation
 * running above the current flush (in areas not yet flushed) can become
 * part of the current flush while another operation running below the
 * current flush can become part of the next flush.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/kern_syscall.h>
#include <sys/uuid.h>

#include <crypto/sha2/sha2.h>

#include "hammer2.h"

static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_chain_t *parent,
		hammer2_key_t key, int keybits,
		hammer2_tid_t mtid, int for_type, int *errorp);
static hammer2_io_t *hammer2_chain_drop_data(hammer2_chain_t *chain);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **bresp);

static struct krate krate_h2me = { .freq = 1 };

/*
 * Basic RBTree for chains (core->rbtree and core->dbtree).  Chains cannot
 * overlap in the RB trees.  Deleted chains are moved from rbtree to either
 * dbtree or to dbq.
 *
 * Chains in delete-duplicate sequences can always iterate through core_entry
 * to locate the live version of the chain.
 */
RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);

/*
 * RB tree comparator for chains, ordering by the key range covered by
 * each chain's blockref (key / keybits).
 */
int
hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
{
	hammer2_key_t c1_beg;
	hammer2_key_t c1_end;
	hammer2_key_t c2_beg;
	hammer2_key_t c2_end;

	/*
	 * Compare the inclusive key ranges covered by each chain.
	 * Overlaps are not supposed to happen, and to catch any software
	 * issues early we count overlaps as a match.
	 */
	c1_beg = chain1->bref.key;
	c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
	c2_beg = chain2->bref.key;
	c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;

	if (c1_end < c2_beg)		/* fully to the left */
		return(-1);
	if (c1_beg > c2_end)		/* fully to the right */
		return(1);
	return(0);			/* overlap (must not cross edge boundary) */
}

/*
 * Assert that a chain has no media data associated with it.
 *
 * VOLUME and FREEMAP chains are exempt from the data check (only the
 * dio is asserted NULL for them); all other types must have both dio
 * and data cleared.
 */
static __inline void
hammer2_chain_assert_no_data(hammer2_chain_t *chain)
{
	KKASSERT(chain->dio == NULL);
	if (chain->bref.type != HAMMER2_BREF_TYPE_VOLUME &&
	    chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP &&
	    chain->data) {
		panic("hammer2_assert_no_data: chain %p still has data", chain);
	}
}

/*
 * Make a chain visible to the flusher.  The flusher needs to be able to
 * do flushes of subdirectory chains or single files so it does a top-down
 * recursion using the ONFLUSH flag for the recursion.  It locates MODIFIED
 * or UPDATE chains and flushes back up the chain to the volume root.
 *
 * This routine sets ONFLUSH upward until it hits the volume root.  For
 * simplicity we ignore PFSROOT boundaries whose rules can be complex.
 * Extra ONFLUSH flagging doesn't hurt the filesystem.
 */
void
hammer2_chain_setflush(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/*
	 * Walk upward setting ONFLUSH, stopping as soon as a chain that is
	 * already flagged is encountered.  The parent spinlock is acquired
	 * hand-over-hand (child first, then parent, then release child) so
	 * chain->parent remains stable across each step.
	 */
	if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
		hammer2_spin_sh(&chain->core.spin);
		while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
			if ((parent = chain->parent) == NULL)
				break;
			hammer2_spin_sh(&parent->core.spin);
			hammer2_spin_unsh(&chain->core.spin);
			chain = parent;
		}
		hammer2_spin_unsh(&chain->core.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->pmp inherits pmp unless the chain is an inode (other than the
 * super-root inode).
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
		    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes;

	/*
	 * Special case - radix of 0 indicates a chain that does not
	 * need a data reference (context is completely embedded in the
	 * bref).
	 */
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		bytes = 0;

	atomic_add_long(&hammer2_chain_allocs, 1);

	/*
	 * Construct the appropriate system structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Chains are really only associated with the hmp but we
		 * maintain a pmp association for per-mount memory tracking
		 * purposes.  The pmp can be NULL.
		 */
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Only hammer2_chain_bulksnap() calls this function with these
		 * types.
		 */
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	default:
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	/*
	 * Initialize the new chain structure.  pmp must be set to NULL for
	 * chains belonging to the super-root topology of a device mount.
	 */
	if (pmp == hmp->spmp)
		chain->pmp = NULL;
	else
		chain->pmp = pmp;
	chain->hmp = hmp;
	chain->bref = *bref;
	chain->bytes = bytes;
	chain->refs = 1;
	chain->flags = HAMMER2_CHAIN_ALLOCATED;

	/*
	 * Set the PFS boundary flag if this chain represents a PFS root.
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY);
	hammer2_chain_core_init(chain);

	return (chain);
}

/*
 * Initialize a chain's core structure.  This structure used to be allocated
 * but is now embedded.
 *
 * The core is not locked.  No additional refs on the chain are made.
 *
 * NOTE(review): the old note "(trans) must not be NULL if (core) is not
 * NULL" refers to an earlier API; this function takes neither argument.
 */
void
hammer2_chain_core_init(hammer2_chain_t *chain)
{
	/*
	 * Fresh core under nchain (no multi-homing of ochain's
	 * sub-tree).
	 */
	RB_INIT(&chain->core.rbtree);	/* live chains */
	hammer2_mtx_init(&chain->lock, "h2chain");
}

/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * (can be called with spinlock held)
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	if (atomic_fetchadd_int(&chain->refs, 1) == 0) {
		/*
		 * 0->non-zero transition must ensure that chain is removed
		 * from the LRU list.
		 *
		 * ONLRU is tested unlocked first to avoid the spinlock in
		 * the common case, then re-tested under lru_spin before the
		 * actual removal.
		 *
		 * NOTE(review): an older comment here claimed lru_spin was
		 * already held; the code below explicitly acquires it.
		 */
		if (chain->flags & HAMMER2_CHAIN_ONLRU) {
			hammer2_pfs_t *pmp = chain->pmp;
			hammer2_spin_ex(&pmp->lru_spin);
			if (chain->flags & HAMMER2_CHAIN_ONLRU) {
				atomic_add_int(&pmp->lru_count, -1);
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_ONLRU);
				TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
			}
			hammer2_spin_unex(&pmp->lru_spin);
		}
	}
#if 0
	kprintf("REFC %p %d %08x\n", chain, chain->refs - 1, chain->flags);
	print_backtrace(8);
#endif
}

/*
 * Ref a locked chain and force the data to be held across an unlock.
 * Chain must be currently locked.  The user of the chain who desires
 * to release the hold must call hammer2_chain_lock_unhold() to relock
 * and unhold the chain, then unlock normally, or may simply call
 * hammer2_chain_drop_unhold() (which is safer against deadlocks).
 */
void
hammer2_chain_ref_hold(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->lockcnt, 1);
	hammer2_chain_ref(chain);
}

/*
 * Insert the chain in the core rbtree.
 *
 * Normal insertions are placed in the live rbtree.  Insertion of a deleted
 * chain is a special case used by the flush code that is placed on the
 * unstaged deleted list to avoid confusing the live view.
315 */ 316 #define HAMMER2_CHAIN_INSERT_SPIN 0x0001 317 #define HAMMER2_CHAIN_INSERT_LIVE 0x0002 318 #define HAMMER2_CHAIN_INSERT_RACE 0x0004 319 320 static 321 int 322 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain, 323 int flags, int generation) 324 { 325 hammer2_chain_t *xchain; 326 int error = 0; 327 328 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 329 hammer2_spin_ex(&parent->core.spin); 330 331 /* 332 * Interlocked by spinlock, check for race 333 */ 334 if ((flags & HAMMER2_CHAIN_INSERT_RACE) && 335 parent->core.generation != generation) { 336 error = HAMMER2_ERROR_EAGAIN; 337 goto failed; 338 } 339 340 /* 341 * Insert chain 342 */ 343 xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain); 344 KASSERT(xchain == NULL, 345 ("hammer2_chain_insert: collision %p %p (key=%016jx)", 346 chain, xchain, chain->bref.key)); 347 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 348 chain->parent = parent; 349 ++parent->core.chain_count; 350 ++parent->core.generation; /* XXX incs for _get() too, XXX */ 351 352 /* 353 * We have to keep track of the effective live-view blockref count 354 * so the create code knows when to push an indirect block. 355 */ 356 if (flags & HAMMER2_CHAIN_INSERT_LIVE) 357 atomic_add_int(&parent->core.live_count, 1); 358 failed: 359 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 360 hammer2_spin_unex(&parent->core.spin); 361 return error; 362 } 363 364 /* 365 * Drop the caller's reference to the chain. When the ref count drops to 366 * zero this function will try to disassociate the chain from its parent and 367 * deallocate it, then recursely drop the parent using the implied ref 368 * from the chain's chain->parent. 369 * 370 * Nobody should own chain's mutex on the 1->0 transition, unless this drop 371 * races an acquisition by another cpu. Therefore we can loop if we are 372 * unable to acquire the mutex, and refs is unlikely to be 1 unless we again 373 * race against another drop. 
 */
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);

void
hammer2_chain_drop(hammer2_chain_t *chain)
{
	u_int refs;

	if (hammer2_debug & 0x200000)
		Debugger("drop");
#if 0
	kprintf("DROP %p %d %08x\n", chain, chain->refs - 1, chain->flags);
	print_backtrace(8);
#endif

	KKASSERT(chain->refs > 0);

	/*
	 * CAS retry loop.  hammer2_chain_lastdrop() may return another
	 * chain (e.g. the parent) which we then continue dropping here,
	 * avoiding unbounded recursion.
	 */
	while (chain) {
		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);

		if (refs == 1) {
			/*
			 * Possible last drop.  Only a try-lock is used;
			 * if the mutex cannot be obtained we simply loop
			 * and re-sample refs (see function comment above).
			 */
			if (mtx_lock_ex_try(&chain->lock) == 0)
				chain = hammer2_chain_lastdrop(chain);
			/* retry the same chain */
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}
		cpu_pause();
	}
}

/*
 * Unhold a held and probably not-locked chain, ensure that the data is
 * dropped on the 1->0 transition of lockcnt by obtaining an exclusive
 * lock and then simply unlocking the chain.
 */
void
hammer2_chain_drop_unhold(hammer2_chain_t *chain)
{
	u_int lockcnt;
	int iter = 0;

	for (;;) {
		lockcnt = chain->lockcnt;
		cpu_ccfence();
		if (lockcnt > 1) {
			/* not the last hold, just decrement */
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				break;
			}
		} else if (mtx_lock_ex_try(&chain->lock) == 0) {
			/* last hold; lock/unlock cycles the data drop */
			hammer2_chain_unlock(chain);
			break;
		} else {
			/*
			 * This situation can easily occur on SMP due to
			 * the gap in between the 1->0 transition and the
			 * final unlock.  We cannot safely block on the
			 * mutex because lockcnt might go above 1.
			 *
			 * XXX Sleep for one tick if it takes too long.
			 */
			if (++iter > 1000) {
				if (iter > 1000 + hz) {
					kprintf("hammer2: h2race1 %p\n",
						chain);
					iter = 1000;
				}
				tsleep(&iter, 0, "h2race1", 1);
			}
			cpu_pause();
		}
	}
	hammer2_chain_drop(chain);
}

/*
 * Handles the (potential) last drop of chain->refs from 1->0.
Called with 455 * the mutex exclusively locked, refs == 1, and lockcnt 0. SMP races are 456 * possible against refs and lockcnt. We must dispose of the mutex on chain. 457 * 458 * This function returns an unlocked chain for recursive drop or NULL. It 459 * can return the same chain if it determines it has raced another ref. 460 * 461 * -- 462 * 463 * When two chains need to be recursively dropped we use the chain we 464 * would otherwise free to placehold the additional chain. It's a bit 465 * convoluted but we can't just recurse without potentially blowing out 466 * the kernel stack. 467 * 468 * The chain cannot be freed if it has any children. 469 * The chain cannot be freed if flagged MODIFIED unless we can dispose of it. 470 * The chain cannot be freed if flagged UPDATE unless we can dispose of it. 471 * Any dedup registration can remain intact. 472 * 473 * The core spinlock is allowed to nest child-to-parent (not parent-to-child). 474 */ 475 static 476 hammer2_chain_t * 477 hammer2_chain_lastdrop(hammer2_chain_t *chain) 478 { 479 hammer2_pfs_t *pmp; 480 hammer2_dev_t *hmp; 481 hammer2_chain_t *parent; 482 hammer2_chain_t *rdrop; 483 #if 0 484 hammer2_io_t *dio; 485 #endif 486 487 #if 0 488 /* 489 * On last drop if there is no parent and data_off is good (at 490 * least does not represent the volume root), the modified chain 491 * is probably going to be destroyed. We have to make sure that 492 * the data area is not registered for dedup. 493 * 494 * XXX removed. In fact, we do not have to make sure that the 495 * data area is not registered for dedup. The data area 496 * can, in fact, still be used for dedup because it is 497 * still allocated in the freemap and the underlying I/O 498 * will still be flushed. 
499 */ 500 if (chain->parent == NULL && 501 (chain->flags & HAMMER2_CHAIN_MODIFIED) && 502 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 503 hmp = chain->hmp; 504 hammer2_io_dedup_delete(hmp, chain->bref.type, 505 chain->bref.data_off, chain->bytes); 506 } 507 #endif 508 /* 509 * We need chain's spinlock to interlock the sub-tree test. 510 * We already have chain's mutex, protecting chain->parent. 511 * 512 * Remember that chain->refs can be in flux. 513 */ 514 hammer2_spin_ex(&chain->core.spin); 515 516 if ((parent = chain->parent) != NULL) { 517 /* 518 * If the chain has a parent the UPDATE bit prevents scrapping 519 * as the chain is needed to properly flush the parent. Try 520 * to complete the 1->0 transition and return NULL. Retry 521 * (return chain) if we are unable to complete the 1->0 522 * transition, else return NULL (nothing more to do). 523 * 524 * If the chain has a parent the MODIFIED bit prevents 525 * scrapping. 526 * 527 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 528 */ 529 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 530 HAMMER2_CHAIN_MODIFIED)) { 531 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 532 hammer2_spin_unex(&chain->core.spin); 533 #if 0 534 dio = hammer2_chain_drop_data(chain, 0); 535 if (dio) 536 hammer2_io_bqrelse(&dio); 537 #endif 538 hammer2_chain_assert_no_data(chain); 539 hammer2_mtx_unlock(&chain->lock); 540 chain = NULL; 541 } else { 542 hammer2_spin_unex(&chain->core.spin); 543 hammer2_mtx_unlock(&chain->lock); 544 } 545 return (chain); 546 } 547 /* spinlock still held */ 548 } else { 549 /* 550 * The chain has no parent and can be flagged for destruction. 551 * Since it has no parent, UPDATE can also be cleared. 552 */ 553 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 554 if (chain->flags & HAMMER2_CHAIN_UPDATE) 555 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 556 557 /* 558 * If the chain has children we must still flush the chain. 
559 * Any dedup is already handled by the underlying DIO, so 560 * we do not have to specifically flush it here. 561 * 562 * In the case where it has children, the DESTROY flag test 563 * in the flush code will prevent unnecessary flushes of 564 * MODIFIED chains that are not flagged DEDUP so don't worry 565 * about that here. 566 */ 567 if (chain->core.chain_count) { 568 /* 569 * Put on flushq (should ensure refs > 1), retry 570 * the drop. 571 */ 572 hammer2_spin_unex(&chain->core.spin); 573 hammer2_delayed_flush(chain); 574 hammer2_mtx_unlock(&chain->lock); 575 576 return(chain); /* retry drop */ 577 } 578 579 /* 580 * Otherwise we can scrap the MODIFIED bit if it is set, 581 * and continue along the freeing path. 582 * 583 * Be sure to clean-out any dedup bits. Without a parent 584 * this chain will no longer be visible to the flush code. 585 * Easy check data_off to avoid the volume root. 586 */ 587 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 588 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 589 atomic_add_long(&hammer2_count_modified_chains, -1); 590 if (chain->pmp) 591 hammer2_pfs_memory_wakeup(chain->pmp); 592 } 593 /* spinlock still held */ 594 } 595 596 /* spinlock still held */ 597 #if 0 598 dio = NULL; 599 #endif 600 601 /* 602 * If any children exist we must leave the chain intact with refs == 0. 603 * They exist because chains are retained below us which have refs or 604 * may require flushing. 605 * 606 * Retry (return chain) if we fail to transition the refs to 0, else 607 * return NULL indication nothing more to do. 608 * 609 * Chains with children are NOT put on the LRU list. 
610 */ 611 if (chain->core.chain_count) { 612 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 613 hammer2_spin_unex(&chain->core.spin); 614 hammer2_chain_assert_no_data(chain); 615 hammer2_mtx_unlock(&chain->lock); 616 chain = NULL; 617 } else { 618 hammer2_spin_unex(&chain->core.spin); 619 hammer2_mtx_unlock(&chain->lock); 620 } 621 return (chain); 622 } 623 /* spinlock still held */ 624 /* no chains left under us */ 625 626 /* 627 * chain->core has no children left so no accessors can get to our 628 * chain from there. Now we have to lock the parent core to interlock 629 * remaining possible accessors that might bump chain's refs before 630 * we can safely drop chain's refs with intent to free the chain. 631 */ 632 hmp = chain->hmp; 633 pmp = chain->pmp; /* can be NULL */ 634 rdrop = NULL; 635 636 parent = chain->parent; 637 638 /* 639 * WARNING! chain's spin lock is still held here, and other spinlocks 640 * will be acquired and released in the code below. We 641 * cannot be making fancy procedure calls! 642 */ 643 644 /* 645 * We can cache the chain if it is associated with a pmp 646 * and not flagged as being destroyed or requesting a full 647 * release. In this situation the chain is not removed 648 * from its parent, i.e. it can still be looked up. 649 * 650 * We intentionally do not cache DATA chains because these 651 * were likely used to load data into the logical buffer cache 652 * and will not be accessed again for some time. 653 */ 654 if ((chain->flags & 655 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 656 chain->pmp && 657 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 658 if (parent) 659 hammer2_spin_ex(&parent->core.spin); 660 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 661 /* 662 * 1->0 transition failed, retry. Do not drop 663 * the chain's data yet! 
664 */ 665 if (parent) 666 hammer2_spin_unex(&parent->core.spin); 667 hammer2_spin_unex(&chain->core.spin); 668 hammer2_mtx_unlock(&chain->lock); 669 670 return(chain); 671 } 672 673 /* 674 * Success 675 */ 676 #if 0 677 dio = hammer2_chain_drop_data(chain, 1); 678 #endif 679 hammer2_chain_assert_no_data(chain); 680 681 KKASSERT((chain->flags & HAMMER2_CHAIN_ONLRU) == 0); 682 hammer2_spin_ex(&pmp->lru_spin); 683 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 684 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 685 686 /* 687 * If we are over the LRU limit we need to drop something. 688 */ 689 if (pmp->lru_count > HAMMER2_LRU_LIMIT) { 690 rdrop = TAILQ_FIRST(&pmp->lru_list); 691 atomic_clear_int(&rdrop->flags, HAMMER2_CHAIN_ONLRU); 692 TAILQ_REMOVE(&pmp->lru_list, rdrop, lru_node); 693 atomic_add_int(&rdrop->refs, 1); 694 atomic_set_int(&rdrop->flags, HAMMER2_CHAIN_RELEASE); 695 } else { 696 atomic_add_int(&pmp->lru_count, 1); 697 } 698 hammer2_spin_unex(&pmp->lru_spin); 699 if (parent) { 700 hammer2_spin_unex(&parent->core.spin); 701 parent = NULL; /* safety */ 702 } 703 hammer2_spin_unex(&chain->core.spin); 704 hammer2_mtx_unlock(&chain->lock); 705 #if 0 706 if (dio) 707 hammer2_io_bqrelse(&dio); 708 #endif 709 710 return rdrop; 711 /* NOT REACHED */ 712 } 713 714 /* 715 * Spinlock the parent and try to drop the last ref on chain. 716 * On success determine if we should dispose of the chain 717 * (remove the chain from its parent, etc). 718 * 719 * (normal core locks are top-down recursive but we define 720 * core spinlocks as bottom-up recursive, so this is safe). 
721 */ 722 if (parent) { 723 hammer2_spin_ex(&parent->core.spin); 724 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 725 #if 0 726 /* XXX remove, don't try to drop data on fail */ 727 hammer2_spin_unex(&parent->core.spin); 728 dio = hammer2_chain_drop_data(chain, 0); 729 hammer2_spin_unex(&chain->core.spin); 730 if (dio) 731 hammer2_io_bqrelse(&dio); 732 #endif 733 /* 734 * 1->0 transition failed, retry. 735 */ 736 hammer2_spin_unex(&parent->core.spin); 737 hammer2_spin_unex(&chain->core.spin); 738 hammer2_mtx_unlock(&chain->lock); 739 740 return(chain); 741 } 742 743 /* 744 * 1->0 transition successful, parent spin held to prevent 745 * new lookups, chain spinlock held to protect parent field. 746 * Remove chain from the parent. 747 * 748 * If the chain is being removed from the parent's btree but 749 * is not bmapped, we have to adjust live_count downward. If 750 * it is bmapped then the blockref is retained in the parent 751 * as is its associated live_count. This case can occur when 752 * a chain added to the topology is unable to flush and is 753 * then later deleted. 754 */ 755 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 756 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) && 757 (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) { 758 atomic_add_int(&parent->core.live_count, -1); 759 } 760 RB_REMOVE(hammer2_chain_tree, 761 &parent->core.rbtree, chain); 762 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 763 --parent->core.chain_count; 764 chain->parent = NULL; 765 } 766 767 /* 768 * If our chain was the last chain in the parent's core the 769 * core is now empty and its parent might have to be 770 * re-dropped if it has 0 refs. 
771 */ 772 if (parent->core.chain_count == 0) { 773 rdrop = parent; 774 atomic_add_int(&rdrop->refs, 1); 775 /* 776 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 777 rdrop = NULL; 778 */ 779 } 780 hammer2_spin_unex(&parent->core.spin); 781 parent = NULL; /* safety */ 782 /* FALL THROUGH */ 783 } else { 784 /* 785 * No-parent case. 786 */ 787 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 788 /* 789 * 1->0 transition failed, retry. 790 */ 791 hammer2_spin_unex(&parent->core.spin); 792 hammer2_spin_unex(&chain->core.spin); 793 hammer2_mtx_unlock(&chain->lock); 794 795 return(chain); 796 } 797 } 798 799 /* 800 * Successful 1->0 transition, no parent, no children... no way for 801 * anyone to ref this chain any more. We can clean-up and free it. 802 * 803 * We still have the core spinlock, and core's chain_count is 0. 804 * Any parent spinlock is gone. 805 */ 806 hammer2_spin_unex(&chain->core.spin); 807 hammer2_chain_assert_no_data(chain); 808 hammer2_mtx_unlock(&chain->lock); 809 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 810 chain->core.chain_count == 0); 811 812 /* 813 * All locks are gone, no pointers remain to the chain, finish 814 * freeing it. 815 */ 816 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 817 HAMMER2_CHAIN_MODIFIED)) == 0); 818 #if 0 819 dio = hammer2_chain_drop_data(chain, 1); 820 if (dio) 821 hammer2_io_bqrelse(&dio); 822 #endif 823 824 /* 825 * Once chain resources are gone we can use the now dead chain 826 * structure to placehold what might otherwise require a recursive 827 * drop, because we have potentially two things to drop and can only 828 * return one directly. 829 */ 830 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 831 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 832 chain->hmp = NULL; 833 kfree(chain, hmp->mchain); 834 } 835 836 /* 837 * Possible chaining loop when parent re-drop needed. 838 */ 839 return(rdrop); 840 } 841 842 /* 843 * On last lock release. 
844 */ 845 static hammer2_io_t * 846 hammer2_chain_drop_data(hammer2_chain_t *chain) 847 { 848 hammer2_io_t *dio; 849 850 if ((dio = chain->dio) != NULL) { 851 chain->dio = NULL; 852 chain->data = NULL; 853 } else { 854 switch(chain->bref.type) { 855 case HAMMER2_BREF_TYPE_VOLUME: 856 case HAMMER2_BREF_TYPE_FREEMAP: 857 break; 858 default: 859 if (chain->data != NULL) { 860 hammer2_spin_unex(&chain->core.spin); 861 panic("chain data not null: " 862 "chain %p bref %016jx.%02x " 863 "refs %d parent %p dio %p data %p", 864 chain, chain->bref.data_off, 865 chain->bref.type, chain->refs, 866 chain->parent, 867 chain->dio, chain->data); 868 } 869 KKASSERT(chain->data == NULL); 870 break; 871 } 872 } 873 return dio; 874 } 875 876 /* 877 * Lock a referenced chain element, acquiring its data with I/O if necessary, 878 * and specify how you would like the data to be resolved. 879 * 880 * If an I/O or other fatal error occurs, chain->error will be set to non-zero. 881 * 882 * The lock is allowed to recurse, multiple locking ops will aggregate 883 * the requested resolve types. Once data is assigned it will not be 884 * removed until the last unlock. 885 * 886 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 887 * (typically used to avoid device/logical buffer 888 * aliasing for data) 889 * 890 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 891 * the INITIAL-create state (indirect blocks only). 892 * 893 * Do not resolve data elements for DATA chains. 894 * (typically used to avoid device/logical buffer 895 * aliasing for data) 896 * 897 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 898 * 899 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 900 * it will be locked exclusive. 901 * 902 * NOTE: Embedded elements (volume header, inodes) are always resolved 903 * regardless. 
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * WARNING! This function blocks on I/O if data needs to be fetched.  This
 *	    blocking can run concurrent with other compatible lock holders
 *	    who do not need data returning.  The lock is not upgraded to
 *	    exclusive during a data fetch, a separate bit is used to
 *	    interlock I/O.  However, an exclusive lock holder can still count
 *	    on being interlocked against an I/O fetch managed by a shared
 *	    lock holder.
 */
void
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 */
	KKASSERT(chain->refs > 0);
	atomic_add_int(&chain->lockcnt, 1);

	/*
	 * Get the appropriate lock.  If LOCKAGAIN is flagged with SHARED
	 * the caller expects a shared lock to already be present and we
	 * are giving it another ref.  This case must importantly not block
	 * if there is a pending exclusive lock request.
	 */
	if (how & HAMMER2_RESOLVE_SHARED) {
		if (how & HAMMER2_RESOLVE_LOCKAGAIN) {
			hammer2_mtx_sh_again(&chain->lock);
		} else {
			hammer2_mtx_sh(&chain->lock);
		}
	} else {
		hammer2_mtx_ex(&chain->lock);
	}
	/* per-thread lock accounting — presumably debug bookkeeping; confirm */
	++curthread->td_tracker;

	/*
	 * If we already have a valid data pointer make sure the data is
	 * synchronized to the current cpu, and then no further action is
	 * necessary.
	 */
	if (chain->data) {
		if (chain->dio)
			hammer2_io_bkvasync(chain->dio);
		return;
	}

	/*
	 * Do we have to resolve the data?  This is generally only
	 * applicable to HAMMER2_BREF_TYPE_DATA which is special-cased.
	 * Other BREF types expect the data to be there.
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return;
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return;
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return;
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return;
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return;
#endif
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
	default:
		break;
	}

	/*
	 * Caller requires data
	 */
	hammer2_chain_load_data(chain);
}

/*
 * Lock the chain, retain the hold, and drop the data persistence count.
 * The data should remain valid because we never transitioned lockcnt
 * through 0.
 */
void
hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how)
{
	hammer2_chain_lock(chain, how);
	atomic_add_int(&chain->lockcnt, -1);
}

#if 0
/*
 * Downgrade an exclusive chain lock to a shared chain lock.
 *
 * NOTE: There is no upgrade equivalent due to the ease of
 *	 deadlocks in that direction.
 */
void
hammer2_chain_lock_downgrade(hammer2_chain_t *chain)
{
	hammer2_mtx_downgrade(&chain->lock);
}
#endif

/*
 * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
 * may be of any type.
 *
 * Once chain->data is set it cannot be disposed of until all locks are
 * released.
 *
 * Make sure the data is synchronized to the current cpu.
 */
void
hammer2_chain_load_data(hammer2_chain_t *chain)
{
	hammer2_blockref_t *bref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	/*
	 * Degenerate case, data already present, or chain has no media
	 * reference to load.
1037 */ 1038 if (chain->data) { 1039 if (chain->dio) 1040 hammer2_io_bkvasync(chain->dio); 1041 return; 1042 } 1043 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) 1044 return; 1045 1046 hmp = chain->hmp; 1047 KKASSERT(hmp != NULL); 1048 1049 /* 1050 * Gain the IOINPROG bit, interlocked block. 1051 */ 1052 for (;;) { 1053 u_int oflags; 1054 u_int nflags; 1055 1056 oflags = chain->flags; 1057 cpu_ccfence(); 1058 if (oflags & HAMMER2_CHAIN_IOINPROG) { 1059 nflags = oflags | HAMMER2_CHAIN_IOSIGNAL; 1060 tsleep_interlock(&chain->flags, 0); 1061 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1062 tsleep(&chain->flags, PINTERLOCKED, 1063 "h2iocw", 0); 1064 } 1065 /* retry */ 1066 } else { 1067 nflags = oflags | HAMMER2_CHAIN_IOINPROG; 1068 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1069 break; 1070 } 1071 /* retry */ 1072 } 1073 } 1074 1075 /* 1076 * We own CHAIN_IOINPROG 1077 * 1078 * Degenerate case if we raced another load. 1079 */ 1080 if (chain->data) 1081 goto done; 1082 1083 /* 1084 * We must resolve to a device buffer, either by issuing I/O or 1085 * by creating a zero-fill element. We do not mark the buffer 1086 * dirty when creating a zero-fill element (the hammer2_chain_modify() 1087 * API must still be used to do that). 1088 * 1089 * The device buffer is variable-sized in powers of 2 down 1090 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 1091 * chunk always contains buffers of the same size. (XXX) 1092 * 1093 * The minimum physical IO size may be larger than the variable 1094 * block size. 1095 */ 1096 bref = &chain->bref; 1097 1098 /* 1099 * The getblk() optimization can only be used on newly created 1100 * elements if the physical block size matches the request. 
1101 */ 1102 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1103 error = hammer2_io_new(hmp, bref->type, 1104 bref->data_off, chain->bytes, 1105 &chain->dio); 1106 } else { 1107 error = hammer2_io_bread(hmp, bref->type, 1108 bref->data_off, chain->bytes, 1109 &chain->dio); 1110 hammer2_adjreadcounter(&chain->bref, chain->bytes); 1111 } 1112 if (error) { 1113 chain->error = HAMMER2_ERROR_EIO; 1114 kprintf("hammer2_chain_lock: I/O error %016jx: %d\n", 1115 (intmax_t)bref->data_off, error); 1116 hammer2_io_bqrelse(&chain->dio); 1117 goto done; 1118 } 1119 chain->error = 0; 1120 1121 /* 1122 * This isn't perfect and can be ignored on OSs which do not have 1123 * an indication as to whether a buffer is coming from cache or 1124 * if I/O was actually issued for the read. TESTEDGOOD will work 1125 * pretty well without the B_IOISSUED logic because chains are 1126 * cached, but in that situation (without B_IOISSUED) it will not 1127 * detect whether a re-read via I/O is corrupted verses the original 1128 * read. 1129 * 1130 * We can't re-run the CRC on every fresh lock. That would be 1131 * insanely expensive. 1132 * 1133 * If the underlying kernel buffer covers the entire chain we can 1134 * use the B_IOISSUED indication to determine if we have to re-run 1135 * the CRC on chain data for chains that managed to stay cached 1136 * across the kernel disposal of the original buffer. 1137 */ 1138 if ((dio = chain->dio) != NULL && dio->bp) { 1139 struct buf *bp = dio->bp; 1140 1141 if (dio->psize == chain->bytes && 1142 (bp->b_flags & B_IOISSUED)) { 1143 atomic_clear_int(&chain->flags, 1144 HAMMER2_CHAIN_TESTEDGOOD); 1145 bp->b_flags &= ~B_IOISSUED; 1146 } 1147 } 1148 1149 /* 1150 * NOTE: A locked chain's data cannot be modified without first 1151 * calling hammer2_chain_modify(). 1152 */ 1153 1154 /* 1155 * Clear INITIAL. In this case we used io_new() and the buffer has 1156 * been zero'd and marked dirty. 
1157 * 1158 * NOTE: hammer2_io_data() call issues bkvasync() 1159 */ 1160 bdata = hammer2_io_data(chain->dio, chain->bref.data_off); 1161 1162 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1163 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1164 chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO; 1165 } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 1166 /* 1167 * check data not currently synchronized due to 1168 * modification. XXX assumes data stays in the buffer 1169 * cache, which might not be true (need biodep on flush 1170 * to calculate crc? or simple crc?). 1171 */ 1172 } else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) { 1173 if (hammer2_chain_testcheck(chain, bdata) == 0) { 1174 chain->error = HAMMER2_ERROR_CHECK; 1175 } else { 1176 atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD); 1177 } 1178 } 1179 1180 /* 1181 * Setup the data pointer, either pointing it to an embedded data 1182 * structure and copying the data from the buffer, or pointing it 1183 * into the buffer. 1184 * 1185 * The buffer is not retained when copying to an embedded data 1186 * structure in order to avoid potential deadlocks or recursions 1187 * on the same physical buffer. 1188 * 1189 * WARNING! Other threads can start using the data the instant we 1190 * set chain->data non-NULL. 1191 */ 1192 switch (bref->type) { 1193 case HAMMER2_BREF_TYPE_VOLUME: 1194 case HAMMER2_BREF_TYPE_FREEMAP: 1195 /* 1196 * Copy data from bp to embedded buffer 1197 */ 1198 panic("hammer2_chain_load_data: unresolved volume header"); 1199 break; 1200 case HAMMER2_BREF_TYPE_DIRENT: 1201 KKASSERT(chain->bytes != 0); 1202 /* fall through */ 1203 case HAMMER2_BREF_TYPE_INODE: 1204 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1205 case HAMMER2_BREF_TYPE_INDIRECT: 1206 case HAMMER2_BREF_TYPE_DATA: 1207 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1208 default: 1209 /* 1210 * Point data at the device buffer and leave dio intact. 
1211 */ 1212 chain->data = (void *)bdata; 1213 break; 1214 } 1215 1216 /* 1217 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested. 1218 */ 1219 done: 1220 for (;;) { 1221 u_int oflags; 1222 u_int nflags; 1223 1224 oflags = chain->flags; 1225 nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG | 1226 HAMMER2_CHAIN_IOSIGNAL); 1227 KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG); 1228 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1229 if (oflags & HAMMER2_CHAIN_IOSIGNAL) 1230 wakeup(&chain->flags); 1231 break; 1232 } 1233 } 1234 } 1235 1236 /* 1237 * Unlock and deref a chain element. 1238 * 1239 * Remember that the presence of children under chain prevent the chain's 1240 * destruction but do not add additional references, so the dio will still 1241 * be dropped. 1242 */ 1243 void 1244 hammer2_chain_unlock(hammer2_chain_t *chain) 1245 { 1246 hammer2_io_t *dio; 1247 u_int lockcnt; 1248 int iter = 0; 1249 1250 --curthread->td_tracker; 1251 1252 /* 1253 * If multiple locks are present (or being attempted) on this 1254 * particular chain we can just unlock, drop refs, and return. 1255 * 1256 * Otherwise fall-through on the 1->0 transition. 1257 */ 1258 for (;;) { 1259 lockcnt = chain->lockcnt; 1260 KKASSERT(lockcnt > 0); 1261 cpu_ccfence(); 1262 if (lockcnt > 1) { 1263 if (atomic_cmpset_int(&chain->lockcnt, 1264 lockcnt, lockcnt - 1)) { 1265 hammer2_mtx_unlock(&chain->lock); 1266 return; 1267 } 1268 } else if (hammer2_mtx_upgrade_try(&chain->lock) == 0) { 1269 /* while holding the mutex exclusively */ 1270 if (atomic_cmpset_int(&chain->lockcnt, 1, 0)) 1271 break; 1272 } else { 1273 /* 1274 * This situation can easily occur on SMP due to 1275 * the gap inbetween the 1->0 transition and the 1276 * final unlock. We cannot safely block on the 1277 * mutex because lockcnt might go above 1. 1278 * 1279 * XXX Sleep for one tick if it takes too long. 
1280 */ 1281 if (++iter > 1000) { 1282 if (iter > 1000 + hz) { 1283 kprintf("hammer2: h2race2 %p\n", chain); 1284 iter = 1000; 1285 } 1286 tsleep(&iter, 0, "h2race2", 1); 1287 } 1288 cpu_pause(); 1289 } 1290 /* retry */ 1291 } 1292 1293 /* 1294 * Last unlock / mutex upgraded to exclusive. Drop the data 1295 * reference. 1296 */ 1297 dio = hammer2_chain_drop_data(chain); 1298 if (dio) 1299 hammer2_io_bqrelse(&dio); 1300 hammer2_mtx_unlock(&chain->lock); 1301 } 1302 1303 /* 1304 * Unlock and hold chain data intact 1305 */ 1306 void 1307 hammer2_chain_unlock_hold(hammer2_chain_t *chain) 1308 { 1309 atomic_add_int(&chain->lockcnt, 1); 1310 hammer2_chain_unlock(chain); 1311 } 1312 1313 /* 1314 * Helper to obtain the blockref[] array base and count for a chain. 1315 * 1316 * XXX Not widely used yet, various use cases need to be validated and 1317 * converted to use this function. 1318 */ 1319 static 1320 hammer2_blockref_t * 1321 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1322 { 1323 hammer2_blockref_t *base; 1324 int count; 1325 1326 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1327 base = NULL; 1328 1329 switch(parent->bref.type) { 1330 case HAMMER2_BREF_TYPE_INODE: 1331 count = HAMMER2_SET_COUNT; 1332 break; 1333 case HAMMER2_BREF_TYPE_INDIRECT: 1334 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1335 count = parent->bytes / sizeof(hammer2_blockref_t); 1336 break; 1337 case HAMMER2_BREF_TYPE_VOLUME: 1338 count = HAMMER2_SET_COUNT; 1339 break; 1340 case HAMMER2_BREF_TYPE_FREEMAP: 1341 count = HAMMER2_SET_COUNT; 1342 break; 1343 default: 1344 panic("hammer2_chain_create_indirect: " 1345 "unrecognized blockref type: %d", 1346 parent->bref.type); 1347 count = 0; 1348 break; 1349 } 1350 } else { 1351 switch(parent->bref.type) { 1352 case HAMMER2_BREF_TYPE_INODE: 1353 base = &parent->data->ipdata.u.blockset.blockref[0]; 1354 count = HAMMER2_SET_COUNT; 1355 break; 1356 case HAMMER2_BREF_TYPE_INDIRECT: 1357 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1358 base = 
&parent->data->npdata[0]; 1359 count = parent->bytes / sizeof(hammer2_blockref_t); 1360 break; 1361 case HAMMER2_BREF_TYPE_VOLUME: 1362 base = &parent->data->voldata. 1363 sroot_blockset.blockref[0]; 1364 count = HAMMER2_SET_COUNT; 1365 break; 1366 case HAMMER2_BREF_TYPE_FREEMAP: 1367 base = &parent->data->blkset.blockref[0]; 1368 count = HAMMER2_SET_COUNT; 1369 break; 1370 default: 1371 panic("hammer2_chain_create_indirect: " 1372 "unrecognized blockref type: %d", 1373 parent->bref.type); 1374 count = 0; 1375 break; 1376 } 1377 } 1378 *countp = count; 1379 1380 return base; 1381 } 1382 1383 /* 1384 * This counts the number of live blockrefs in a block array and 1385 * also calculates the point at which all remaining blockrefs are empty. 1386 * This routine can only be called on a live chain. 1387 * 1388 * Caller holds the chain locked, but possibly with a shared lock. We 1389 * must use an exclusive spinlock to prevent corruption. 1390 * 1391 * NOTE: Flag is not set until after the count is complete, allowing 1392 * callers to test the flag without holding the spinlock. 1393 * 1394 * NOTE: If base is NULL the related chain is still in the INITIAL 1395 * state and there are no blockrefs to count. 1396 * 1397 * NOTE: live_count may already have some counts accumulated due to 1398 * creation and deletion and could even be initially negative. 
1399 */ 1400 void 1401 hammer2_chain_countbrefs(hammer2_chain_t *chain, 1402 hammer2_blockref_t *base, int count) 1403 { 1404 hammer2_spin_ex(&chain->core.spin); 1405 if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) { 1406 if (base) { 1407 while (--count >= 0) { 1408 if (base[count].type) 1409 break; 1410 } 1411 chain->core.live_zero = count + 1; 1412 while (count >= 0) { 1413 if (base[count].type) 1414 atomic_add_int(&chain->core.live_count, 1415 1); 1416 --count; 1417 } 1418 } else { 1419 chain->core.live_zero = 0; 1420 } 1421 /* else do not modify live_count */ 1422 atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS); 1423 } 1424 hammer2_spin_unex(&chain->core.spin); 1425 } 1426 1427 /* 1428 * Resize the chain's physical storage allocation in-place. This function does 1429 * not usually adjust the data pointer and must be followed by (typically) a 1430 * hammer2_chain_modify() call to copy any old data over and adjust the 1431 * data pointer. 1432 * 1433 * Chains can be resized smaller without reallocating the storage. Resizing 1434 * larger will reallocate the storage. Excess or prior storage is reclaimed 1435 * asynchronously at a later time. 1436 * 1437 * An nradix value of 0 is special-cased to mean that the storage should 1438 * be disassociated, that is the chain is being resized to 0 bytes (not 1 1439 * byte). 1440 * 1441 * Must be passed an exclusively locked parent and chain. 1442 * 1443 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order 1444 * to avoid instantiating a device buffer that conflicts with the vnode data 1445 * buffer. However, because H2 can compress or encrypt data, the chain may 1446 * have a dio assigned to it in those situations, and they do not conflict. 1447 * 1448 * XXX return error if cannot resize. 
1449 */ 1450 int 1451 hammer2_chain_resize(hammer2_chain_t *chain, 1452 hammer2_tid_t mtid, hammer2_off_t dedup_off, 1453 int nradix, int flags) 1454 { 1455 hammer2_dev_t *hmp; 1456 size_t obytes; 1457 size_t nbytes; 1458 int error; 1459 1460 hmp = chain->hmp; 1461 1462 /* 1463 * Only data and indirect blocks can be resized for now. 1464 * (The volu root, inodes, and freemap elements use a fixed size). 1465 */ 1466 KKASSERT(chain != &hmp->vchain); 1467 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1468 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 1469 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT); 1470 1471 /* 1472 * Nothing to do if the element is already the proper size 1473 */ 1474 obytes = chain->bytes; 1475 nbytes = (nradix) ? (1U << nradix) : 0; 1476 if (obytes == nbytes) 1477 return (chain->error); 1478 1479 /* 1480 * Make sure the old data is instantiated so we can copy it. If this 1481 * is a data block, the device data may be superfluous since the data 1482 * might be in a logical block, but compressed or encrypted data is 1483 * another matter. 1484 * 1485 * NOTE: The modify will set BMAPUPD for us if BMAPPED is set. 1486 */ 1487 error = hammer2_chain_modify(chain, mtid, dedup_off, 0); 1488 if (error) 1489 return error; 1490 1491 /* 1492 * Relocate the block, even if making it smaller (because different 1493 * block sizes may be in different regions). 1494 * 1495 * NOTE: Operation does not copy the data and may only be used 1496 * to resize data blocks in-place, or directory entry blocks 1497 * which are about to be modified in some manner. 1498 */ 1499 error = hammer2_freemap_alloc(chain, nbytes); 1500 if (error) 1501 return error; 1502 1503 chain->bytes = nbytes; 1504 1505 /* 1506 * We don't want the followup chain_modify() to try to copy data 1507 * from the old (wrong-sized) buffer. It won't know how much to 1508 * copy. This case should only occur during writes when the 1509 * originator already has the data to write in-hand. 
1510 */ 1511 if (chain->dio) { 1512 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1513 chain->bref.type == HAMMER2_BREF_TYPE_DIRENT); 1514 hammer2_io_brelse(&chain->dio); 1515 chain->data = NULL; 1516 } 1517 return (chain->error); 1518 } 1519 1520 /* 1521 * Set the chain modified so its data can be changed by the caller, or 1522 * install deduplicated data. The caller must call this routine for each 1523 * set of modifications it makes, even if the chain is already flagged 1524 * MODIFIED. 1525 * 1526 * Sets bref.modify_tid to mtid only if mtid != 0. Note that bref.modify_tid 1527 * is a CLC (cluster level change) field and is not updated by parent 1528 * propagation during a flush. 1529 * 1530 * Returns an appropriate HAMMER2_ERROR_* code, which will generally reflect 1531 * chain->error except for HAMMER2_ERROR_ENOSPC. If the allocation fails 1532 * due to no space available, HAMMER2_ERROR_ENOSPC is returned and the chain 1533 * remains unmodified with its old data ref intact and chain->error 1534 * unchanged. 1535 * 1536 * Dedup Handling 1537 * 1538 * If the DEDUPABLE flag is set in the chain the storage must be reallocated 1539 * even if the chain is still flagged MODIFIED. In this case the chain's 1540 * DEDUPABLE flag will be cleared once the new storage has been assigned. 1541 * 1542 * If the caller passes a non-zero dedup_off we will use it to assign the 1543 * new storage. The MODIFIED flag will be *CLEARED* in this case, and 1544 * DEDUPABLE will be set (NOTE: the UPDATE flag is always set). The caller 1545 * must not modify the data content upon return. 
 */
int
hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
		     hammer2_off_t dedup_off, int flags)
{
	hammer2_blockref_t obref;
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	int error;
	int wasinitial;
	int setmodified;
	int setupdate;
	int newmod;
	char *bdata;

	hmp = chain->hmp;
	/*
	 * NOTE(review): obref snapshots the pre-modify blockref but is not
	 * referenced again in this function as visible here — confirm
	 * whether it is dead or used by code outside this view.
	 */
	obref = chain->bref;
	KKASSERT((chain->flags & HAMMER2_CHAIN_FICTITIOUS) == 0);

	/*
	 * Data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * Data must be resolved if already assigned, unless explicitly
	 * flagged otherwise.  If we cannot safely load the data the
	 * modification fails and we return early.
	 */
	if (chain->data == NULL && chain->bytes != 0 &&
	    (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_load_data(chain);
		if (chain->error)
			return (chain->error);
	}
	error = 0;

	/*
	 * Set MODIFIED to indicate that the chain has been modified.  A new
	 * allocation is required when modifying a chain.
	 *
	 * Set UPDATE to ensure that the blockref is updated in the parent.
	 *
	 * If MODIFIED is already set determine if we can reuse the assigned
	 * data block or if we need a new data block.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		/*
		 * Must set modified bit.
		 */
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(chain->pmp);	/* can be NULL */
		setmodified = 1;

		/*
		 * We may be able to avoid a copy-on-write if the chain's
		 * check mode is set to NONE and the chain's current
		 * modify_tid is beyond the last explicit snapshot tid.
		 *
		 * This implements HAMMER2's overwrite-in-place feature.
		 *
		 * NOTE! This data-block cannot be used as a de-duplication
		 *	 source when the check mode is set to NONE.
		 */
		if ((chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		     chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) &&
		    (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
		    (chain->flags & HAMMER2_CHAIN_DEDUPABLE) == 0 &&
		    HAMMER2_DEC_CHECK(chain->bref.methods) ==
		    HAMMER2_CHECK_NONE &&
		    chain->pmp &&
		    chain->bref.modify_tid >
		     chain->pmp->iroot->meta.pfs_lsnap_tid) {
			/*
			 * Sector overwrite allowed.
			 */
			newmod = 0;
		} else {
			/*
			 * Sector overwrite not allowed, must copy-on-write.
			 */
			newmod = 1;
		}
	} else if (chain->flags & HAMMER2_CHAIN_DEDUPABLE) {
		/*
		 * If the modified chain was registered for dedup we need
		 * a new allocation.  This only happens for delayed-flush
		 * chains (i.e. which run through the front-end buffer
		 * cache).
		 */
		newmod = 1;
		setmodified = 0;
	} else {
		/*
		 * Already flagged modified, no new allocation is needed.
		 */
		newmod = 0;
		setmodified = 0;
	}

	/*
	 * Flag parent update required.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		setupdate = 1;
	} else {
		setupdate = 0;
	}

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.  If an error occurs, the previous content and data
	 * reference is retained and the modification fails.
	 *
	 * If dedup_off is non-zero, the caller is requesting a deduplication
	 * rather than a modification.  The MODIFIED bit is not set and the
	 * data offset is set to the deduplication offset.  The data cannot
	 * be modified.
	 *
	 * NOTE: The dedup offset is allowed to be in a partially free state
	 *	 and we must be sure to reset it to a fully allocated state
	 *	 to force two bulkfree passes to free it again.
	 *
	 * NOTE: Only applicable when chain->bytes != 0.
	 *
	 * XXX can a chain already be marked MODIFIED without a data
	 * assignment?  If not, assert here instead of testing the case.
	 */
	if (chain != &hmp->vchain && chain != &hmp->fchain &&
	    chain->bytes) {
		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
		    newmod
		) {
			/*
			 * NOTE: We do not have to remove the dedup
			 *	 registration because the area is still
			 *	 allocated and the underlying DIO will
			 *	 still be flushed.
			 */
			if (dedup_off) {
				chain->bref.data_off = dedup_off;
				chain->bytes = 1 << (dedup_off &
						     HAMMER2_OFF_MASK_RADIX);
				chain->error = 0;
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_MODIFIED);
				atomic_add_long(&hammer2_count_modified_chains,
						-1);
				if (chain->pmp)
					hammer2_pfs_memory_wakeup(chain->pmp);
				hammer2_freemap_adjust(hmp, &chain->bref,
						HAMMER2_FREEMAP_DORECOVER);
				atomic_set_int(&chain->flags,
						HAMMER2_CHAIN_DEDUPABLE);
			} else {
				error = hammer2_freemap_alloc(chain,
							      chain->bytes);
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_DEDUPABLE);
			}
		}
	}

	/*
	 * Stop here if error.  We have to undo any flag bits we might
	 * have set above.
	 */
	if (error) {
		if (setmodified) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			atomic_add_long(&hammer2_count_modified_chains, -1);
			if (chain->pmp)
				hammer2_pfs_memory_wakeup(chain->pmp);
		}
		if (setupdate) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}
		return error;
	}

	/*
	 * Update mirror_tid and modify_tid.  modify_tid is only updated
	 * if not passed as zero (during flushes, parent propagation passes
	 * the value 0).
	 *
	 * NOTE: chain->pmp could be the device spmp.
	 */
	chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1;
	if (mtid)
		chain->bref.modify_tid = mtid;

	/*
	 * Set BMAPUPD to tell the flush code that an existing blockmap entry
	 * requires updating as well as to tell the delete code that the
	 * chain's blockref might not exactly match (in terms of physical size
	 * or block offset) the one in the parent's blocktable.  The base key
	 * of course will still match.
	 */
	if (chain->flags & HAMMER2_CHAIN_BMAPPED)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD);

	/*
	 * Short-cut data blocks which the caller does not need an actual
	 * data reference to (aka OPTDATA), as long as the chain does not
	 * already have a data pointer to the data.  This generally means
	 * that the modifications are being done via the logical buffer cache.
	 * The INITIAL flag relates only to the device data buffer and thus
	 * remains unchanged in this situation.
	 *
	 * This code also handles bytes == 0 (most dirents).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
	    (flags & HAMMER2_MODIFY_OPTDATA) &&
	    chain->data == NULL) {
		KKASSERT(chain->dio == NULL);
		goto skip2;
	}

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->dio == NULL);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
		/*
		 * The data might be fully embedded.
		 */
		if (chain->bytes == 0) {
			KKASSERT(chain->dio == NULL);
			break;
		}
		/* fall through */
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 *
		 * zero-fill or copy-on-write depending on whether
		 * chain->data exists or not and set the dirty state for
		 * the new buffer.  hammer2_io_new() will handle the
		 * zero-fill.
		 *
		 * If a dedup_off was supplied this is an existing block
		 * and no COW, copy, or further modification is required.
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		if (wasinitial && dedup_off == 0) {
			error = hammer2_io_new(hmp, chain->bref.type,
					       chain->bref.data_off,
					       chain->bytes, &dio);
		} else {
			error = hammer2_io_bread(hmp, chain->bref.type,
						 chain->bref.data_off,
						 chain->bytes, &dio);
		}
		hammer2_adjreadcounter(&chain->bref, chain->bytes);

		/*
		 * If an I/O error occurs make sure callers cannot accidentally
		 * modify the old buffer's contents and corrupt the filesystem.
		 *
		 * NOTE: hammer2_io_data() call issues bkvasync()
		 */
		if (error) {
			kprintf("hammer2_chain_modify: hmp=%p I/O error\n",
				hmp);
			chain->error = HAMMER2_ERROR_EIO;
			hammer2_io_brelse(&dio);
			hammer2_io_brelse(&chain->dio);
			chain->data = NULL;
			break;
		}
		chain->error = 0;
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		if (chain->data) {
			/*
			 * COW (unless a dedup).
			 */
			KKASSERT(chain->dio != NULL);
			if (chain->data != (void *)bdata && dedup_off == 0) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial == 0) {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}

		/*
		 * Retire the old buffer, replace with the new.  Dirty or
		 * redirty the new buffer.
		 *
		 * WARNING! The system buffer cache may have already flushed
		 *	    the buffer, so we must be sure to [re]dirty it
		 *	    for further modification.
		 *
		 *	    If dedup_off was supplied, the caller is not
		 *	    expected to make any further modification to the
		 *	    buffer.
		 */
		if (chain->dio)
			hammer2_io_bqrelse(&chain->dio);
		chain->data = (void *)bdata;
		chain->dio = dio;
		if (dedup_off == 0)
			hammer2_io_setdirty(dio);
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	/*
	 * setflush on parent indicating that the parent must recurse down
	 * to us.  Do not call on chain itself which might already have it
	 * set.
	 */
	if (chain->parent)
		hammer2_chain_setflush(chain->parent);
	return (chain->error);
}

/*
 * Modify the chain associated with an inode.  Marks the inode modified
 * and then runs the normal chain modification path.
 */
int
hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain,
			hammer2_tid_t mtid, int flags)
{
	int error;

	hammer2_inode_modify(ip);
	error = hammer2_chain_modify(chain, mtid, 0, flags);

	return error;
}

/*
 * Volume header data locks
 */
void
hammer2_voldata_lock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_EXCLUSIVE);
}

void
hammer2_voldata_unlock(hammer2_dev_t *hmp)
{
	lockmgr(&hmp->vollk, LK_RELEASE);
}

/*
 * Flag the volume chain (vchain) modified, accounting for the transition
 * in the global modified-chain count and pfs memory usage.
 */
void
hammer2_voldata_modify(hammer2_dev_t *hmp)
{
	if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_add_long(&hammer2_count_modified_chains, 1);
		atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_pfs_memory_inc(hmp->vchain.pmp);
	}
}

/*
 * This function returns the chain at the nearest key within the specified
 * range.  The returned chain will be referenced but not locked.
 *
 * This function will recurse through chain->rbtree as necessary and will
 * return a *key_nextp suitable for iteration.  *key_nextp is only set if
 * the iteration value is less than the current value of *key_nextp.
 *
 * The caller should use (*key_nextp) to calculate the actual range of
 * the returned element, which will be (key_beg to *key_nextp - 1), because
 * there might be another element which is superior to the returned element
 * and overlaps it.
 *
 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
 * chains continue to be returned.  On EOF (*key_nextp) may overflow since
 * it will wind up being (key_end + 1).
 *
 * WARNING! Must be called with child's spinlock held.  Spinlock remains
 *	    held through the operation.
 */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;		/* best match so far */
	hammer2_key_t		key_beg;	/* search range start */
	hammer2_key_t		key_end;	/* search range end (incl) */
	hammer2_key_t		key_next;	/* iteration limit output */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
		hammer2_chain_find_cmp, hammer2_chain_find_callback,
		&info);
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN comparator: -1 if the child lies entirely below the search
 * range, +1 if entirely above, 0 if the child's key range overlaps
 * [key_beg, key_end] and the callback should run.
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

/*
 * RB_SCAN callback: track the overlapping child with the nearest key in
 * info->best and truncate info->key_next so the caller knows where the
 * returned element's effective range ends.
 */
static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *	    the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * Illegal overlap.
		 */
		KKASSERT(0);
		/*info->best = child;*/
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Set best to child.  Truncate key_next to the old best key.
		 */
		info->best = child;
		if (info->key_next > best->bref.key || info->key_next == 0)
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then this
		 * is an illegal overlap.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		KKASSERT(0);
		info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (info->key_next > child->bref.key || info->key_next == 0)
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  The returned chain is
 * held but not locked.  The caller must lock it to crc-check and
 * dereference its data, and should check chain->error after locking
 * before assuming that the data is good.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.  Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * WARNING! chain->pmp is always set to NULL for any chain representing
 *	    part of the super-root topology.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int generation,
		  hammer2_blockref_t *bref)
{
	hammer2_dev_t *hmp = parent->hmp;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 *
	 * PFS roots deliberately get a NULL pmp (super-root topology).
	 */
	if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
		chain = hammer2_chain_alloc(hmp, NULL, bref);
	else
		chain = hammer2_chain_alloc(hmp, parent->pmp, bref);
	/* ref'd chain returned */

	/*
	 * Flag that the chain is in the parent's blockmap so delete/flush
	 * knows what to do with it.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 *
	 * NOTE: Get races can occur quite often when we distribute
	 *	 asynchronous read-aheads across multiple threads.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(parent, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		/*kprintf("chain %p get race\n", chain);*/
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain referenced but not locked, or NULL if
	 * a race occurred.
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API.  Refs and locks the parent for
 * a lookup sequence; the SHARED flag selects a shared lock.
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	hammer2_chain_ref(parent);
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

/*
 * Terminate a lookup sequence, releasing the lock and ref acquired by
 * hammer2_chain_lookup_init().  NULL parent is allowed.
 */
void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
}

/*
 * Take the locked chain and return a locked parent.  The chain remains
 * locked on return, but may have to be temporarily unlocked to acquire
 * the parent.  Can return NULL.
 *
 * Pass HAMMER2_RESOLVE_* flags in flags.
 *
 * This will work even if the chain is errored, and the caller can check
 * parent->error on return if desired since the parent will be locked.
 *
 * This function handles the lock order reversal.
2193 */ 2194 hammer2_chain_t * 2195 hammer2_chain_getparent(hammer2_chain_t *chain, int flags) 2196 { 2197 hammer2_chain_t *parent; 2198 2199 /* 2200 * Be careful of order, chain must be unlocked before parent 2201 * is locked below to avoid a deadlock. 2202 * 2203 * Safe access to fu->parent requires fu's core spinlock. 2204 */ 2205 again: 2206 hammer2_spin_ex(&chain->core.spin); 2207 parent = chain->parent; 2208 if (parent == NULL) { 2209 hammer2_spin_unex(&chain->core.spin); 2210 panic("hammer2_chain_getparent: no parent"); 2211 } 2212 hammer2_chain_ref(parent); 2213 hammer2_spin_unex(&chain->core.spin); 2214 2215 hammer2_chain_unlock(chain); 2216 hammer2_chain_lock(parent, flags); 2217 hammer2_chain_lock(chain, flags); 2218 2219 /* 2220 * Parent relinking races are quite common. We have to get it right 2221 * or we will blow up the block table. 2222 */ 2223 if (chain->parent != parent) { 2224 hammer2_chain_unlock(parent); 2225 hammer2_chain_drop(parent); 2226 goto again; 2227 } 2228 return parent; 2229 } 2230 2231 /* 2232 * Take the locked chain and return a locked parent. The chain is unlocked 2233 * and dropped. *chainp is set to the returned parent as a convenience. 2234 * Pass HAMMER2_RESOLVE_* flags in flags. 2235 * 2236 * This will work even if the chain is errored, and the caller can check 2237 * parent->error on return if desired since the parent will be locked. 2238 * 2239 * This function handles the lock order reversal. 2240 */ 2241 hammer2_chain_t * 2242 hammer2_chain_repparent(hammer2_chain_t **chainp, int flags) 2243 { 2244 hammer2_chain_t *chain; 2245 hammer2_chain_t *parent; 2246 2247 /* 2248 * Be careful of order, chain must be unlocked before parent 2249 * is locked below to avoid a deadlock. 2250 * 2251 * Safe access to fu->parent requires fu's core spinlock. 
2252 */ 2253 chain = *chainp; 2254 again: 2255 hammer2_spin_ex(&chain->core.spin); 2256 parent = chain->parent; 2257 if (parent == NULL) { 2258 hammer2_spin_unex(&chain->core.spin); 2259 panic("hammer2_chain_getparent: no parent"); 2260 } 2261 hammer2_chain_ref(parent); 2262 hammer2_spin_unex(&chain->core.spin); 2263 2264 hammer2_chain_unlock(chain); 2265 hammer2_chain_lock(parent, flags); 2266 2267 /* 2268 * Parent relinking races are quite common. We have to get it right 2269 * or we will blow up the block table. 2270 */ 2271 if (chain->parent != parent) { 2272 hammer2_chain_lock(chain, flags); 2273 hammer2_chain_unlock(parent); 2274 hammer2_chain_drop(parent); 2275 goto again; 2276 } 2277 hammer2_chain_drop(chain); 2278 *chainp = parent; 2279 2280 return parent; 2281 } 2282 2283 /* 2284 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 2285 * (*parentp) typically points to an inode but can also point to a related 2286 * indirect block and this function will recurse upwards and find the inode 2287 * or the nearest undeleted indirect block covering the key range. 2288 * 2289 * This function unconditionally sets *errorp, replacing any previous value. 2290 * 2291 * (*parentp) must be exclusive or shared locked (depending on flags) and 2292 * referenced and can be an inode or an existing indirect block within the 2293 * inode. 2294 * 2295 * If (*parent) is errored out, this function will not attempt to recurse 2296 * the radix tree and will return NULL along with an appropriate *errorp. 2297 * If NULL is returned and *errorp is 0, the requested lookup could not be 2298 * located. 2299 * 2300 * On return (*parentp) will be modified to point at the deepest parent chain 2301 * element encountered during the search, as a helper for an insertion or 2302 * deletion. 
 *
 * The new (*parentp) will be locked shared or exclusive (depending on flags),
 * and referenced, and the old will be unlocked and dereferenced (no change
 * if they are both the same).  This is particularly important if the caller
 * wishes to insert a new chain, (*parentp) will be set properly even if NULL
 * is returned, as long as no error occurred.
 *
 * The matching chain will be returned locked according to flags.  If NOLOCK
 * is requested the chain will be returned only referenced.  Note that the
 * parent chain must always be locked shared or exclusive, matching the
 * HAMMER2_LOOKUP_SHARED flag.  We can conceivably lock it SHARED temporarily
 * when NOLOCK is specified but that complicates matters if *parentp must
 * inherit the chain.
 *
 * NOLOCK also implies NODATA, since an unlocked chain usually has a NULL
 * data pointer or can otherwise be in flux.
 *
 * NULL is returned if no match was found, but (*parentp) will still
 * potentially be adjusted.
 *
 * On return (*key_nextp) will point to an iterative value for key_beg.
 * (If NULL is returned (*key_nextp) is set to (key_end + 1)).
 *
 * This function will also recurse up the chain if the key is not within the
 * current parent's range.  (*parentp) can never be set to NULL.  An iteration
 * can simply allow (*parentp) to float inside the loop.
 *
 * NOTE!  chain->data is not always resolved.  By default it will not be
 *	  resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
 *	  HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
 *	  BREF_TYPE_DATA as the device buffer can alias the logical file
 *	  buffer).
 */

hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *errorp, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	/*
	 * NOTE(review): save_mtx is assigned below in the indirect-recursion
	 * path but never read afterwards — looks like dead code left over
	 * from an earlier locking scheme; confirm before removing.
	 */
	volatile hammer2_mtx_t save_mtx;

	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * Handle races against the flusher deleting indirect nodes on its
	 * way back up by continuing to recurse upward past the deletion.
	 */
	parent = *parentp;
	hmp = parent->hmp;
	*errorp = 0;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
			if (key_beg >= scan_beg && key_end <= scan_end)
				break;
		}
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}
again:

	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 *
		 * We need a second lock on parent.  Since we already have
		 * a lock we must pass LOCKAGAIN to prevent unexpected
		 * blocking (we don't want to block on a second shared
		 * ref if an exclusive lock is pending)
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NODIRECT) {
				chain = NULL;
				*key_nextp = key_end + 1;
				goto done;
			}
			hammer2_chain_ref(parent);
			if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
				hammer2_chain_lock(parent,
						   how_always |
						   HAMMER2_RESOLVE_LOCKAGAIN);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Handle MATCHIND on the parent
		 */
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_ref(chain);
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}

		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			/* debug stall rather than panic on missing data */
			if (parent->data == NULL) {
				kprintf("parent->data is NULL %p\n", parent);
				while (1)
					tsleep(parent, 0, "xxx", 0);
			}
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		kprintf("hammer2_chain_lookup: unrecognized "
			"blockref(B) type: %d",
			parent->bref.type);
		while (1)
			tsleep(&base, 0, "dead", 0);
		panic("hammer2_chain_lookup: unrecognized "
		      "blockref(B) type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * No lookup is possible if the parent is errored.  We delayed
	 * this check as long as we could to ensure that the parent backup,
	 * embedded data, and MATCHIND code could still execute.
	 */
	if (parent->error) {
		*errorp = parent->error;
		return NULL;
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search
	 */
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      key_nextp,
				      key_beg, key_end,
				      &bref);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		KKASSERT(chain == NULL);
		hammer2_spin_unex(&parent->core.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);

		/*
		 * Stop if we reached the end of the iteration.
		 */
		if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		    parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
			return (NULL);
		}

		/*
		 * Calculate next key, stop if we reached the end of the
		 * iteration, otherwise go up one level and loop.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
		goto again;
	}

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		bcopy = *bref;
		hammer2_spin_unex(&parent->core.spin);
		chain = hammer2_chain_get(parent, generation,
					  &bcopy);
		if (chain == NULL) {
			/*
			kprintf("retry lookup parent %p keys %016jx:%016jx\n",
				parent, key_beg, key_end);
			*/
			goto again;
		}
		/* re-validate bref after dropping the spinlock */
		if (bcmp(&bcopy, bref, sizeof(bcopy))) {
			hammer2_chain_drop(chain);
			chain = NULL;	/* SAFETY */
			goto again;
		}
	} else {
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
	}

	/*
	 * chain is referenced but not locked.  We must lock the chain
	 * to obtain definitive state.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_lock(chain, how_maybe);
	} else {
		hammer2_chain_lock(chain, how);
	}
	KKASSERT(chain->parent == parent);

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		kprintf("skip deleted chain %016jx.%02x key=%016jx\n",
			chain->bref.data_off, chain->bref.type,
			chain->bref.key);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;	/* SAFETY */
		key_beg = *key_nextp;
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		save_mtx = parent->lock;
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		*parentp = parent = chain;
		chain = NULL;	/* SAFETY */
		goto again;
	}
done:
	/*
	 * All done, return the chain.
	 *
	 * If the caller does not want a locked chain, replace the lock with
	 * a ref.  Perhaps this can eventually be optimized to not obtain the
	 * lock in the first place for situations where the data does not
	 * need to be resolved.
	 *
	 * NOTE!  A chain->error must be tested by the caller upon return.
	 *	  *errorp is only set based on issues which occur while
	 *	  trying to reach the chain.
	 */
	if (chain) {
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_unlock(chain);
	}
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after it's index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * If a fatal error occurs (typically an I/O error), a dummy chain is
 * returned with chain->error and error-identifying information set.  This
 * chain will assert if you try to do anything fancy with it.
 *
 * XXX Depending on where the error occurs we should allow continued iteration.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *errorp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & HAMMER2_LOOKUP_SHARED)
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	*errorp = 0;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		if ((flags & (HAMMER2_LOOKUP_NOLOCK |
			      HAMMER2_LOOKUP_NOUNLOCK)) == 0) {
			hammer2_chain_unlock(chain);
		}
		hammer2_chain_drop(chain);

		/*
		 * chain invalid past this point, but we can still do a
		 * pointer comparison w/parent.
		 *
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 *
		 * (This also handles the case of a deleted, empty indirect
		 * node).
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_repparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     errorp, flags));
}

/*
 * Caller wishes to iterate chains under parent, loading new chains into
 * chainp.  Caller must initialize *chainp to NULL and *firstp to 1, and
 * then call hammer2_chain_scan() repeatedly until a non-zero return.
 * During the scan, *firstp will be set to 0 and (*chainp) will be replaced
 * with the returned chain for the scan.  The returned *chainp will be
 * locked and referenced.  Any prior contents will be unlocked and dropped.
 *
 * Caller should check the return value.  A normal scan EOF will return
 * exactly HAMMER2_ERROR_EOF.  Any other non-zero value indicates an
 * error trying to access parent data.  Any error in the returned chain
 * must be tested separately by the caller.
 *
 * (*chainp) is dropped on each scan, but will only be set if the returned
 * element itself can recurse.  Leaf elements are NOT resolved, loaded, or
 * returned via *chainp.  The caller will get their bref only.
 *
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_bref = (parent, NULL) and
 * next_chain = (parent, bref).
 *
 * The passed-in parent must be locked and its data resolved.  The function
 * nominally returns a locked and referenced *chainp != NULL for chains
 * the caller might need to recurse on (and will dispose of any *chainp passed
 * in).  The caller must check the chain->bref.type either way.
 */
int
hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
		   hammer2_blockref_t *bref, int *firstp,
		   int flags)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref_ptr;
	hammer2_key_t key;
	hammer2_key_t next_key;
	hammer2_chain_t *chain = NULL;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	int error;

	hmp = parent->hmp;
	error = 0;

	/*
	 * Scan flags borrowed from lookup.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & HAMMER2_LOOKUP_SHARED) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Calculate key to locate first/next element, unlocking the previous
	 * element as we go.  Be careful, the key calculation can overflow.
	 *
	 * (also reset bref to NULL)
	 */
	if (*firstp) {
		key = 0;
		*firstp = 0;
	} else {
		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
		if ((chain = *chainp) != NULL) {
			*chainp = NULL;
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			chain = NULL;
		}
		/* key wrapped to 0 == iteration complete */
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
	}

again:
	if (parent->error) {
		error = parent->error;
		goto done;
	}
	if (--maxloops == 0)
		panic("hammer2_chain_scan: maxloops");

	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * An inode with embedded data has no sub-chains.
		 *
		 * WARNING! Bulk scan code may pass a static chain marked
		 *	    as BREF_TYPE_INODE with a copy of the volume
		 *	    root blockset to snapshot the volume.
		 */
		if (parent->data->ipdata.meta.op_flags &
		    HAMMER2_OPFLAG_DIRECTDATA) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the parent->core.live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	next_key = 0;
	bref_ptr = NULL;
	hammer2_spin_ex(&parent->core.spin);
	chain = hammer2_combined_find(parent, base, count,
				      &next_key,
				      key, HAMMER2_KEY_MAX,
				      &bref_ptr);
	generation = parent->core.generation;

	/*
	 * Exhausted parent chain, we're done.
	 */
	if (bref_ptr == NULL) {
		hammer2_spin_unex(&parent->core.spin);
		KKASSERT(chain == NULL);
		error |= HAMMER2_ERROR_EOF;
		goto done;
	}

	/*
	 * Copy into the supplied stack-based blockref.
	 */
	*bref = *bref_ptr;

	/*
	 * Selected from blockref or in-memory chain.
	 */
	if (chain == NULL) {
		switch(bref->type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Recursion, always get the chain
			 */
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, bref);
			if (chain == NULL) {
				kprintf("retry scan parent %p keys %016jx\n",
					parent, key);
				goto again;
			}
			/* re-validate bref after dropping the spinlock */
			if (bcmp(bref, bref_ptr, sizeof(*bref))) {
				hammer2_chain_drop(chain);
				chain = NULL;
				goto again;
			}
			break;
		default:
			/*
			 * No recursion, do not waste time instantiating
			 * a chain, just iterate using the bref.
			 */
			hammer2_spin_unex(&parent->core.spin);
			break;
		}
	} else {
		/*
		 * Recursion or not we need the chain in order to supply
		 * the bref.
		 */
		hammer2_chain_ref(chain);
		hammer2_spin_unex(&parent->core.spin);
	}

	/*
	 * chain is referenced but not locked.  We must lock the chain
	 * to obtain definitive state.
	 */
	if (chain)
		hammer2_chain_lock(chain, how);

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: XXX this could create problems with scans used in
	 *	 situations other than mount-time recovery.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).
	 */
	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		chain = NULL;

		key = next_key;
		if (key == 0) {
			error |= HAMMER2_ERROR_EOF;
			goto done;
		}
		goto again;
	}

done:
	/*
	 * All done, return the bref or NULL, supply chain if necessary.
	 */
	if (chain)
		*chainp = chain;
	return (error);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it under (*parentp).  This is a full
 * insertion, based on the supplied key/keybits, and may involve creating
 * indirect blocks and moving other chains around via delete/duplicate.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * (*parentp) must be exclusive locked and may be replaced on return
 * depending on how much work the function had to do.
 *
 * (*parentp) must not be errored or this function will assert.
 *
 * (*chainp) usually starts out NULL and returns the newly created chain,
 * but if the caller desires the caller may allocate a disconnected chain
 * and pass it in instead.
 *
 * This function should NOT be used to insert INDIRECT blocks.  It is
 * typically used to create/insert inodes and data blocks.
 *
 * Caller must pass-in an exclusively locked parent the new chain is to
 * be inserted under, and optionally pass-in a disconnected, exclusively
 * locked chain to insert (else we create a new chain).
 * The function will
 * adjust (*parentp) as necessary, create or connect the chain, and
 * return an exclusively locked chain in *chainp.
 *
 * When creating a PFSROOT inode under the super-root, pmp is typically NULL
 * and will be reassigned.
 *
 * NOTE: returns HAMMER2_ERROR_* flags
 */
int
hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
		     hammer2_pfs_t *pmp, int methods,
		     hammer2_key_t key, int keybits, int type, size_t bytes,
		     hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_chain_t *chain;
	hammer2_chain_t *parent;
	hammer2_blockref_t *base;
	hammer2_blockref_t dummy;
	int allocated = 0;	/* non-zero if we allocated `chain` ourselves */
	int error = 0;
	int count;
	int maxloops = 300000;	/* sanity brake on the `again` retry loop */

	/*
	 * Topology may be crossing a PFS boundary.  Contract: (*parentp)
	 * is exclusively locked and not errored (asserted below).
	 */
	parent = *parentp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));
	KKASSERT(parent->error == 0);
	hmp = parent->hmp;
	chain = *chainp;

	if (chain == NULL) {
		/*
		 * First allocate media space and construct the dummy bref,
		 * then allocate the in-memory chain structure.  Set the
		 * INITIAL flag for fresh chains which do not have embedded
		 * data.
		 *
		 * XXX for now set the check mode of the child based on
		 *     the parent or, if the parent is an inode, the
		 *     specification in the inode.
		 */
		bzero(&dummy, sizeof(dummy));
		dummy.type = type;
		dummy.key = key;
		dummy.keybits = keybits;
		dummy.data_off = hammer2_getradix(bytes);

		/*
		 * Inherit methods from parent by default.  Primarily used
		 * for BREF_TYPE_DATA.  Non-data types *must* be set to
		 * a non-NONE check algorithm.
		 */
		if (methods == -1)
			dummy.methods = parent->bref.methods;
		else
			dummy.methods = (uint8_t)methods;

		if (type != HAMMER2_BREF_TYPE_DATA &&
		    HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) {
			dummy.methods |=
				HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT);
		}

		chain = hammer2_chain_alloc(hmp, pmp, &dummy);

		/*
		 * Lock the chain manually, chain_lock will load the chain
		 * which we do NOT want to do.  (note: chain->refs is set
		 * to 1 by chain_alloc() for us, but lockcnt is not).
		 */
		chain->lockcnt = 1;
		hammer2_mtx_ex(&chain->lock);
		allocated = 1;
		++curthread->td_tracker;

		/*
		 * Set INITIAL to optimize I/O.  The flag will generally be
		 * processed when we call hammer2_chain_modify().
		 *
		 * Recalculate bytes to reflect the actual media block
		 * allocation.  Handle special case radix 0 == 0 bytes.
		 */
		bytes = (size_t)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
		if (bytes)
			bytes = (hammer2_off_t)1 << bytes;
		chain->bytes = bytes;

		switch(type) {
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			panic("hammer2_chain_create: called with volume type");
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			panic("hammer2_chain_create: cannot be used to"
			      "create indirect block");
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			panic("hammer2_chain_create: cannot be used to"
			      "create freemap root or node");
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			KKASSERT(bytes == sizeof(chain->data->bmdata));
			/* fall through */
		case HAMMER2_BREF_TYPE_DIRENT:
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_DATA:
		default:
			/*
			 * leave chain->data NULL, set INITIAL
			 */
			KKASSERT(chain->data == NULL);
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
			break;
		}
	} else {
		/*
		 * We are reattaching a previously deleted chain, possibly
		 * under a new parent and possibly with a new key/keybits.
		 * The chain does not have to be in a modified state.  The
		 * UPDATE flag will be set later on in this routine.
		 *
		 * Do NOT mess with the current state of the INITIAL flag.
		 */
		chain->bref.key = key;
		chain->bref.keybits = keybits;
		if (chain->flags & HAMMER2_CHAIN_DELETED)
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		KKASSERT(chain->parent == NULL);
	}
	if (flags & HAMMER2_INSERT_PFSROOT)
		chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
	else
		chain->bref.flags &= ~HAMMER2_BREF_FLAG_PFSROOT;

	/*
	 * Calculate how many entries we have in the blockref array and
	 * determine if an indirect block is required.
	 */
again:
	if (--maxloops == 0)
		panic("hammer2_chain_create: maxloops");

	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * NOTE(review): parent->data is dereferenced here before the
		 * KKASSERT(parent->data != NULL) below — presumably inode
		 * parents always have data loaded at this point; confirm.
		 */
		if ((parent->data->ipdata.meta.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) != 0) {
			kprintf("hammer2: parent set for direct-data! "
				"pkey=%016jx ckey=%016jx\n",
				parent->bref.key,
				chain->bref.key);
	        }
		KKASSERT((parent->data->ipdata.meta.op_flags &
			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
		KKASSERT(parent->data != NULL);
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		if (parent->flags & HAMMER2_CHAIN_INITIAL)
			base = NULL;
		else
			base = &parent->data->npdata[0];
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		KKASSERT(parent->data != NULL);
		base = &parent->data->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		KKASSERT(parent->data != NULL);
		base = &parent->data->blkset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_create: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;
		count = 0;
		break;
	}

	/*
	 * Make sure we've counted the brefs
	 */
	if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	KASSERT(parent->core.live_count >= 0 &&
		parent->core.live_count <= count,
		("bad live_count %d/%d (%02x, %d)",
		 parent->core.live_count, count,
		 parent->bref.type, parent->bytes));

	/*
	 * If no free blockref could be found we must create an indirect
	 * block and move a number of blockrefs into it.  With the parent
	 * locked we can safely lock each child in order to delete+duplicate
	 * it without causing a deadlock.
	 *
	 * This may return the new indirect block or the old parent depending
	 * on where the key falls.  NULL is returned on error.
	 */
	if (parent->core.live_count == count) {
		hammer2_chain_t *nparent;

		KKASSERT((flags & HAMMER2_INSERT_SAMEPARENT) == 0);

		nparent = hammer2_chain_create_indirect(parent, key, keybits,
							mtid, type, &error);
		if (nparent == NULL) {
			if (allocated)
				hammer2_chain_drop(chain);
			chain = NULL;
			goto done;
		}
		if (parent != nparent) {
			hammer2_chain_unlock(parent);
			hammer2_chain_drop(parent);
			parent = *parentp = nparent;
		}
		goto again;
	}

	if (chain->flags & HAMMER2_CHAIN_DELETED)
		kprintf("Inserting deleted chain @%016jx\n",
			chain->bref.key);

	/*
	 * Link the chain into its parent.
	 */
	if (chain->parent != NULL)
		panic("hammer2: hammer2_chain_create: chain already connected");
	KKASSERT(chain->parent == NULL);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, chain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	if (allocated) {
		/*
		 * Mark the newly created chain modified.  This will cause
		 * UPDATE to be set and process the INITIAL flag.
		 *
		 * Device buffers are not instantiated for DATA elements
		 * as these are handled by logical buffers.
		 *
		 * Indirect and freemap node indirect blocks are handled
		 * by hammer2_chain_create_indirect() and not by this
		 * function.
		 *
		 * Data for all other bref types is expected to be
		 * instantiated (INODE, LEAF).
		 */
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		case HAMMER2_BREF_TYPE_DIRENT:
		case HAMMER2_BREF_TYPE_INODE:
			error = hammer2_chain_modify(chain, mtid, dedup_off,
						     HAMMER2_MODIFY_OPTDATA);
			break;
		default:
			/*
			 * Remaining types are not supported by this function.
			 * In particular, INDIRECT and LEAF_NODE types are
			 * handled by create_indirect().
			 */
			panic("hammer2_chain_create: bad type: %d",
			      chain->bref.type);
			/* NOT REACHED */
			break;
		}
	} else {
		/*
		 * When reconnecting a chain we must set UPDATE and
		 * setflush so the flush recognizes that it must update
		 * the bref in the parent.
		 */
		if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
	}

	/*
	 * We must setflush(parent) to ensure that it recurses through to
	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
	 * already set in the chain (so it won't recurse up to set it in the
	 * parent).
	 */
	hammer2_chain_setflush(parent);

done:
	*chainp = chain;

	return (error);
}

/*
 * Move the chain from its old parent to a new parent.  The chain must have
 * already been deleted or already disconnected (or never associated) with
 * a parent.  The chain is reassociated with the new parent and the deleted
 * flag will be cleared (no longer deleted).  The chain's modification state
 * is not altered.
 *
 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION
 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
 * FULL.  This typically means that the caller is creating the chain after
 * doing a hammer2_chain_lookup().
 *
 * A non-NULL bref is typically passed when key and keybits must be overridden.
 * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields
 * from a passed-in bref and uses the old chain's bref for everything else.
 *
 * Neither (parent) or (chain) can be errored.
 *
 * If (parent) is non-NULL then the chain is inserted under the parent.
 *
 * If (parent) is NULL then the newly duplicated chain is not inserted
 * anywhere, similar to if it had just been chain_alloc()'d (suitable for
 * passing into hammer2_chain_create() after this function returns).
 *
 * WARNING! This function calls create which means it can insert indirect
 *          blocks.  This can cause other unrelated chains in the parent to
 *          be moved to a newly inserted indirect block in addition to the
 *          specific chain.
 */
void
hammer2_chain_rename(hammer2_blockref_t *bref,
		     hammer2_chain_t **parentp, hammer2_chain_t *chain,
		     hammer2_tid_t mtid, int flags)
{
	hammer2_dev_t *hmp;
	hammer2_chain_t *parent;
	size_t bytes;

	/*
	 * WARNING!  We should never resolve DATA to device buffers
	 *	     (XXX allow it if the caller did?), and since
	 *	     we currently do not have the logical buffer cache
	 *	     buffer in-hand to fix its cached physical offset
	 *	     we also force the modify code to not COW it. XXX
	 *
	 * Contract: chain must be disconnected (no parent) and not errored.
	 */
	hmp = chain->hmp;
	KKASSERT(chain->parent == NULL);
	KKASSERT(chain->error == 0);

	/*
	 * Now create a duplicate of the chain structure, associating
	 * it with the same core, making it the same size, pointing it
	 * to the same bref (the same media block).
	 *
	 * NOTE: Handle special radix == 0 case (means 0 bytes).
	 */
	if (bref == NULL)
		bref = &chain->bref;
	bytes = (size_t)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	if (bytes)
		bytes = (hammer2_off_t)1 << bytes;

	/*
	 * If parent is not NULL the duplicated chain will be entered under
	 * the parent and the UPDATE bit set to tell flush to update
	 * the blockref.
	 *
	 * We must setflush(parent) to ensure that it recurses through to
	 * chain.  setflush(chain) might not work because ONFLUSH is possibly
	 * already set in the chain (so it won't recurse up to set it in the
	 * parent).
	 *
	 * Having both chains locked is extremely important for atomicity.
	 *
	 * NOTE(review): the local 'bytes' computed above (and 'hmp') are not
	 * otherwise used in this visible body — hammer2_chain_create() is
	 * passed chain->bytes, presumably identical to 'bytes'; confirm.
	 */
	if (parentp && (parent = *parentp) != NULL) {
		KKASSERT(hammer2_mtx_owned(&parent->lock));
		KKASSERT(parent->refs > 0);
		KKASSERT(parent->error == 0);

		hammer2_chain_create(parentp, &chain,
				     chain->pmp, HAMMER2_METH_DEFAULT,
				     bref->key, bref->keybits, bref->type,
				     chain->bytes, mtid, 0, flags);
		KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
		hammer2_chain_setflush(*parentp);
	}
}

/*
 * Helper function for deleting chains.
 *
 * The chain is removed from the live view (the RBTREE) as well as the parent's
 * blockmap.  Both chain and its parent must be locked.
 *
 * parent may not be errored.  chain can be errored.
 */
static int
_hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
			     hammer2_tid_t mtid, int flags)
{
	hammer2_dev_t *hmp;
	int error = 0;

	KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED |
				  HAMMER2_CHAIN_FICTITIOUS)) == 0);
	KKASSERT(chain->parent == parent);
	hmp = chain->hmp;

	if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
		/*
		 * Chain is blockmapped, so there must be a parent.
		 * Atomically remove the chain from the parent and remove
		 * the blockmap entry.  The parent must be set modified
		 * to remove the blockmap entry.
		 */
		hammer2_blockref_t *base;
		int count;

		KKASSERT(parent != NULL);
		KKASSERT(parent->error == 0);
		KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
		error = hammer2_chain_modify(parent, mtid, 0, 0);
		if (error)
			goto done;

		/*
		 * Calculate blockmap pointer.  Note lock order: chain's
		 * core spinlock is acquired before the parent's.
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);

		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there
			 * is no block array if the inode is flagged
			 * DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.meta.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base =
				   &parent->data->ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &parent->data->voldata.
					sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->blkset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			/*
			 * NOTE(review): panic text says "hammer2_flush_pass2"
			 * — looks like a stale copy/paste; this function is
			 * _hammer2_chain_delete_helper.
			 */
			panic("hammer2_flush_pass2: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
		}

		/*
		 * delete blockmapped chain from its parent.
		 *
		 * The parent is not affected by any statistics in chain
		 * which are pending synchronization.  That is, there is
		 * nothing to undo in the parent since they have not yet
		 * been incorporated into the parent.
		 *
		 * The parent is affected by statistics stored in inodes.
		 * Those have already been synchronized, so they must be
		 * undone.  XXX split update possible w/delete in middle?
		 */
		if (base) {
			hammer2_base_delete(parent, base, count, chain);
		}
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
		/*
		 * Chain is not blockmapped but a parent is present.
		 * Atomically remove the chain from the parent.  There is
		 * no blockmap entry to remove.
		 *
		 * Because chain was associated with a parent but not
		 * synchronized, the chain's *_count_up fields contain
		 * inode adjustment statistics which must be undone.
		 */
		hammer2_spin_ex(&chain->core.spin);
		hammer2_spin_ex(&parent->core.spin);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&parent->core.live_count, -1);
		++parent->core.generation;
		RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		--parent->core.chain_count;
		chain->parent = NULL;
		hammer2_spin_unex(&parent->core.spin);
		hammer2_spin_unex(&chain->core.spin);
	} else {
		/*
		 * Chain is not blockmapped and has no parent.  This
		 * is a degenerate case.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	}
done:
	return error;
}

/*
 * Create an indirect block that covers one or more of the elements in the
 * current parent.
 * Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the H2 error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow four rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.  This
 *     is accomplished by always recursing through matching keyspaces in
 *     the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
 *
 * NOTE: *errorp set to HAMMER2_ERROR_* flags
 */
static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent,
		hammer2_key_t *keyp, int keybits,
		hammer2_blockref_t *base, int count);
static int hammer2_chain_indkey_file(hammer2_chain_t *parent,
		hammer2_key_t *keyp, int keybits,
		hammer2_blockref_t *base, int count,
		int ncount);
static int hammer2_chain_indkey_dir(hammer2_chain_t *parent,
		hammer2_key_t *keyp, int keybits,
		hammer2_blockref_t *base, int count,
		int ncount);
static
hammer2_chain_t *
hammer2_chain_create_indirect(hammer2_chain_t *parent,
			      hammer2_key_t create_key, int create_bits,
			      hammer2_tid_t mtid, int for_type, int *errorp)
{
	hammer2_dev_t *hmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_chain_t *chain;
	hammer2_chain_t *ichain;	/* the new indirect block */
	hammer2_chain_t dummy;
	hammer2_key_t key = create_key;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_key_t key_next;
	int keybits = create_bits;
	int count;
	int ncount;
	int nbytes;
	int loops;
	int error;
	int reason;	/* diagnostic code reported by the loop panics */
	int generation;
	int maxloops = 300000;

	/*
	 * Calculate the base blockref pointer or NULL if the chain
	 * is known to be empty.  We need to calculate the array count
	 * for RB lookups either way.
	 */
	hmp = parent->hmp;
	KKASSERT(hammer2_mtx_owned(&parent->lock));

	/*
	 * Pre-modify the parent now to avoid having to deal with error
	 * processing if we tried to later (in the middle of our loop).
	 */
	*errorp = hammer2_chain_modify(parent, mtid, 0, 0);
	if (*errorp) {
		kprintf("hammer2_create_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		return NULL;
	}

	/*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
	base = hammer2_chain_base_and_count(parent, &count);

	/*
	 * dummy used in later chain allocation (no longer used for lookups).
	 */
	bzero(&dummy, sizeof(dummy));

	/*
	 * How big should our new indirect block be?  It has to be at least
	 * as large as its parent for splits to work properly.
	 *
	 * The freemap uses a specific indirect block size.  The number of
	 * levels are built dynamically and ultimately depend on the size
	 * volume.  Because freemap blocks are taken from the reserved areas
	 * of the volume our goal is efficiency (fewer levels) and not so
	 * much to save disk space.
	 *
	 * The first indirect block level for a directory usually uses
	 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries).  Due to
	 * the hash mechanism, this typically gives us a nominal
	 * 32 * 4 entries with one level of indirection.
	 *
	 * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE
	 * indirect blocks.  The initial 4 entries in the inode gives us
	 * 256KB.  Up to 4 indirect blocks gives us 32MB.  Three levels
	 * of indirection gives us 137GB, and so forth.  H2 can support
	 * huge file sizes but they are not typical, so we try to stick
	 * with compactness and do not use a larger indirect block size.
	 *
	 * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but
	 * due to the way indirect blocks are created this usually winds
	 * up being extremely inefficient for small files.  Even though
	 * 16KB requires more levels of indirection for very large files,
	 * the 16KB records can be ganged together into 64KB DIOs.
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE;
	} else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		if (parent->data->ipdata.meta.type ==
		    HAMMER2_OBJTYPE_DIRECTORY)
			nbytes = HAMMER2_IND_BYTES_MIN;	/* 4KB = 32 entries */
		else
			nbytes = HAMMER2_IND_BYTES_NOM;	/* 16KB = ~8MB file */

	} else {
		nbytes = HAMMER2_IND_BYTES_NOM;
	}
	if (nbytes < count * sizeof(hammer2_blockref_t)) {
		KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE &&
			 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF);
		nbytes = count * sizeof(hammer2_blockref_t);
	}
	ncount = nbytes / sizeof(hammer2_blockref_t);

	/*
	 * When creating an indirect block for a freemap node or leaf
	 * the key/keybits must be fitted to static radix levels because
	 * particular radix levels use particular reserved blocks in the
	 * related zone.
	 *
	 * This routine calculates the key/radix of the indirect block
	 * we need to create, and whether it is on the high-side or the
	 * low-side.
	 */
	switch(for_type) {
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		keybits = hammer2_chain_indkey_freemap(parent, &key, keybits,
						       base, count);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		keybits = hammer2_chain_indkey_file(parent, &key, keybits,
						    base, count, ncount);
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		keybits = hammer2_chain_indkey_dir(parent, &key, keybits,
						   base, count, ncount);
		break;
	default:
		panic("illegal indirect block for bref type %d", for_type);
		break;
	}

	/*
	 * Normalize the key for the radix being represented, keeping the
	 * high bits and throwing away the low bits.
	 */
	key &= ~(((hammer2_key_t)1 << keybits) - 1);

	/*
	 * Ok, create our new indirect block
	 */
	if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE;
	} else {
		dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
	}
	dummy.bref.key = key;
	dummy.bref.keybits = keybits;
	dummy.bref.data_off = hammer2_getradix(nbytes);
	dummy.bref.methods =
		HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) |
		HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

	ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy.bref);
	atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
	hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
	/* ichain has one ref at this point */

	/*
	 * We have to mark it modified to allocate its block, but use
	 * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
	 * it won't be acted upon by the flush code.
	 */
	*errorp = hammer2_chain_modify(ichain, mtid, 0, HAMMER2_MODIFY_OPTDATA);
	if (*errorp) {
		kprintf("hammer2_alloc_indirect: error %08x %s\n",
			*errorp, hammer2_error_str(*errorp));
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
		return NULL;
	}

	/*
	 * Iterate the original parent and move the matching brefs into
	 * the new indirect block.
	 *
	 * XXX handle flushes.
	 */
	key_beg = 0;
	key_end = HAMMER2_KEY_MAX;
	key_next = 0;	/* avoid gcc warnings */
	hammer2_spin_ex(&parent->core.spin);
	loops = 0;
	reason = 0;

	for (;;) {
		/*
		 * Parent may have been modified, relocating its block array.
		 * Reload the base pointer.
		 */
		base = hammer2_chain_base_and_count(parent, &count);

		if (++loops > 100000) {
		    hammer2_spin_unex(&parent->core.spin);
		    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
			  reason, parent, base, count, key_next);
		}

		/*
		 * NOTE: spinlock stays intact, returned chain (if not NULL)
		 *	 is not referenced or locked which means that we
		 *	 cannot safely check its flagged / deletion status
		 *	 until we lock it.
		 */
		chain = hammer2_combined_find(parent, base, count,
					      &key_next,
					      key_beg, key_end,
					      &bref);
		generation = parent->core.generation;
		if (bref == NULL)
			break;
		key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);

		/*
		 * Skip keys that are not within the key/radix of the new
		 * indirect block.  They stay in the parent.
		 */
		if ((~(((hammer2_key_t)1 << keybits) - 1) &
		    (key ^ bref->key)) != 0) {
			goto next_key_spinlocked;
		}

		/*
		 * Load the new indirect block by acquiring the related
		 * chains (potentially from media as it might not be
		 * in-memory).  Then move it to the new parent (ichain).
		 *
		 * chain is referenced but not locked.  We must lock the
		 * chain to obtain definitive state.
		 */
		if (chain) {
			/*
			 * Use chain already present in the RBTREE
			 */
			hammer2_chain_ref(chain);
			hammer2_spin_unex(&parent->core.spin);
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
		} else {
			/*
			 * Get chain for blockref element.  _get returns NULL
			 * on insertion race.
			 */
			bcopy = *bref;
			hammer2_spin_unex(&parent->core.spin);
			chain = hammer2_chain_get(parent, generation, &bcopy);
			if (chain == NULL) {
				reason = 1;
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
			hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
			if (bcmp(&bcopy, bref, sizeof(bcopy))) {
				reason = 2;
				hammer2_chain_unlock(chain);
				hammer2_chain_drop(chain);
				hammer2_spin_ex(&parent->core.spin);
				continue;
			}
		}

		/*
		 * This is always live so if the chain has been deleted
		 * we raced someone and we have to retry.
		 *
		 * NOTE: Lookups can race delete-duplicate because
		 *	 delete-duplicate does not lock the parent's core
		 *	 (they just use the spinlock on the core).
		 *
		 *	 (note reversed logic for this one)
		 */
		if (chain->parent != parent ||
		    (chain->flags & HAMMER2_CHAIN_DELETED)) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
			kprintf("hammer2_chain_create_indirect "
				"RETRY (%p,%p)->%p %08x\n",
				parent, chain->parent, chain, chain->flags);
			hammer2_spin_ex(&parent->core.spin);
			continue;
		}

		/*
		 * Shift the chain to the indirect block.
		 *
		 * WARNING! No reason for us to load chain data, pass NOSTATS
		 *	    to prevent delete/insert from trying to access
		 *	    inode stats (and thus asserting if there is no
		 *	    chain->data loaded).
		 *
		 * WARNING! The (parent, chain) deletion may modify the parent
		 *	    and invalidate the base pointer.
		 *
		 * WARNING! Parent must already be marked modified, so we
		 *	    can assume that chain_delete always succeeds.
		 */
		error = hammer2_chain_delete(parent, chain, mtid, 0);
		KKASSERT(error == 0);
		hammer2_chain_rename(NULL, &ichain, chain, mtid, 0);
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
		KKASSERT(parent->refs > 0);
		chain = NULL;
		base = NULL;	/* safety */
		hammer2_spin_ex(&parent->core.spin);
next_key_spinlocked:
		if (--maxloops == 0)
			panic("hammer2_chain_create_indirect: maxloops");
		reason = 4;
		if (key_next == 0 || key_next > key_end)
			break;
		key_beg = key_next;
		/* loop */
	}
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Insert the new indirect block into the parent now that we've
	 * cleared out some entries in the parent.  We calculated a good
	 * insertion index in the loop above (ichain->index).
	 *
	 * We don't have to set UPDATE here because we mark ichain
	 * modified down below (so the normal modified -> flush -> set-moved
	 * sequence applies).
	 *
	 * The insertion shouldn't race as this is a completely new block
	 * and the parent is locked.
	 */
	base = NULL;	/* safety, parent modify may change address */
	KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
	KKASSERT(parent->core.live_count < count);
	hammer2_chain_insert(parent, ichain,
			     HAMMER2_CHAIN_INSERT_SPIN |
			     HAMMER2_CHAIN_INSERT_LIVE,
			     0);

	/*
	 * Make sure flushes propagate after our manual insertion.
	 */
	hammer2_chain_setflush(ichain);
	hammer2_chain_setflush(parent);

	/*
	 * Figure out what to return.
	 */
	if (~(((hammer2_key_t)1 << keybits) - 1) &
	    (create_key ^ key)) {
		/*
		 * Key being created is outside the key range,
		 * return the original parent.
		 */
		hammer2_chain_unlock(ichain);
		hammer2_chain_drop(ichain);
	} else {
		/*
		 * Otherwise its in the range, return the new parent.
		 * (leave both the new and old parent locked).
		 */
		parent = ichain;
	}

	return(parent);
}

/*
 * Do maintenance on an indirect chain.  Both parent and chain are locked.
 *
 * Returns non-zero if (chain) is deleted, either due to being empty or
 * because its children were safely moved into the parent.
 */
int
hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
				   hammer2_chain_t *chain)
{
	hammer2_blockref_t *chain_base;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_key_t key_next;
	hammer2_key_t key_beg;
	hammer2_key_t key_end;
	hammer2_chain_t *sub;
	int chain_count;
	int count;
	int error;
	int generation;

	/*
	 * Make sure we have an accurate live_count
	 */
	if ((chain->flags & (HAMMER2_CHAIN_INITIAL |
			     HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		base = &chain->data->npdata[0];
		count = chain->bytes / sizeof(hammer2_blockref_t);
		hammer2_chain_countbrefs(chain, base, count);
	}

	/*
	 * If the indirect block is empty we can delete it.
	 * (ignore deletion error)
	 */
	if (chain->core.live_count == 0 && RB_EMPTY(&chain->core.rbtree)) {
		hammer2_chain_delete(parent, chain,
				     chain->bref.modify_tid,
				     HAMMER2_DELETE_PERMANENT);
		return 1;
	}

	base = hammer2_chain_base_and_count(parent, &count);

	if ((parent->flags & (HAMMER2_CHAIN_INITIAL |
			      HAMMER2_CHAIN_COUNTEDBREFS)) == 0) {
		hammer2_chain_countbrefs(parent, base, count);
	}

	/*
	 * Determine if we can collapse chain into parent, calculate
	 * hysteresis for chain emptiness.
4045 */ 4046 if (parent->core.live_count + chain->core.live_count - 1 > count) 4047 return 0; 4048 chain_count = chain->bytes / sizeof(hammer2_blockref_t); 4049 if (chain->core.live_count > chain_count * 3 / 4) 4050 return 0; 4051 4052 /* 4053 * Ok, theoretically we can collapse chain's contents into 4054 * parent. chain is locked, but any in-memory children of chain 4055 * are not. For this to work, we must be able to dispose of any 4056 * in-memory children of chain. 4057 * 4058 * For now require that there are no in-memory children of chain. 4059 * 4060 * WARNING! Both chain and parent must remain locked across this 4061 * entire operation. 4062 */ 4063 4064 /* 4065 * Parent must be marked modified. Don't try to collapse it if we 4066 * can't mark it modified. Once modified, destroy chain to make room 4067 * and to get rid of what will be a conflicting key (this is included 4068 * in the calculation above). Finally, move the children of chain 4069 * into chain's parent. 4070 * 4071 * This order creates an accounting problem for bref.embed.stats 4072 * because we destroy chain before we remove its children. Any 4073 * elements whos blockref is already synchronized will be counted 4074 * twice. To deal with the problem we clean out chain's stats prior 4075 * to deleting it. 
4076 */ 4077 error = hammer2_chain_modify(parent, 0, 0, 0); 4078 if (error) { 4079 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n", 4080 hammer2_error_str(error)); 4081 return 0; 4082 } 4083 error = hammer2_chain_modify(chain, chain->bref.modify_tid, 0, 0); 4084 if (error) { 4085 krateprintf(&krate_h2me, "hammer2: indirect_maint: %s\n", 4086 hammer2_error_str(error)); 4087 return 0; 4088 } 4089 4090 chain->bref.embed.stats.inode_count = 0; 4091 chain->bref.embed.stats.data_count = 0; 4092 error = hammer2_chain_delete(parent, chain, 4093 chain->bref.modify_tid, 4094 HAMMER2_DELETE_PERMANENT); 4095 KKASSERT(error == 0); 4096 4097 /* 4098 * The combined_find call requires core.spin to be held. One would 4099 * think there wouldn't be any conflicts since we hold chain 4100 * exclusively locked, but the caching mechanism for 0-ref children 4101 * does not require a chain lock. 4102 */ 4103 hammer2_spin_ex(&chain->core.spin); 4104 4105 key_next = 0; 4106 key_beg = 0; 4107 key_end = HAMMER2_KEY_MAX; 4108 for (;;) { 4109 chain_base = &chain->data->npdata[0]; 4110 chain_count = chain->bytes / sizeof(hammer2_blockref_t); 4111 sub = hammer2_combined_find(chain, chain_base, chain_count, 4112 &key_next, 4113 key_beg, key_end, 4114 &bref); 4115 generation = chain->core.generation; 4116 if (bref == NULL) 4117 break; 4118 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4119 4120 if (sub) { 4121 hammer2_chain_ref(sub); 4122 hammer2_spin_unex(&chain->core.spin); 4123 hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER); 4124 if (sub->parent != chain || 4125 (sub->flags & HAMMER2_CHAIN_DELETED)) { 4126 hammer2_chain_unlock(sub); 4127 hammer2_chain_drop(sub); 4128 hammer2_spin_ex(&chain->core.spin); 4129 continue; 4130 } 4131 } else { 4132 bcopy = *bref; 4133 hammer2_spin_unex(&chain->core.spin); 4134 sub = hammer2_chain_get(chain, generation, &bcopy); 4135 if (sub == NULL) { 4136 hammer2_spin_ex(&chain->core.spin); 4137 continue; 4138 } 4139 hammer2_chain_lock(sub, 
HAMMER2_RESOLVE_NEVER); 4140 if (bcmp(&bcopy, bref, sizeof(bcopy)) != 0) { 4141 hammer2_chain_unlock(sub); 4142 hammer2_chain_drop(sub); 4143 hammer2_spin_ex(&chain->core.spin); 4144 continue; 4145 } 4146 } 4147 error = hammer2_chain_delete(chain, sub, 4148 sub->bref.modify_tid, 0); 4149 KKASSERT(error == 0); 4150 hammer2_chain_rename(NULL, &parent, sub, 4151 sub->bref.modify_tid, 4152 HAMMER2_INSERT_SAMEPARENT); 4153 hammer2_chain_unlock(sub); 4154 hammer2_chain_drop(sub); 4155 hammer2_spin_ex(&chain->core.spin); 4156 4157 if (key_next == 0) 4158 break; 4159 key_beg = key_next; 4160 } 4161 hammer2_spin_unex(&chain->core.spin); 4162 4163 return 1; 4164 } 4165 4166 /* 4167 * Freemap indirect blocks 4168 * 4169 * Calculate the keybits and highside/lowside of the freemap node the 4170 * caller is creating. 4171 * 4172 * This routine will specify the next higher-level freemap key/radix 4173 * representing the lowest-ordered set. By doing so, eventually all 4174 * low-ordered sets will be moved one level down. 4175 * 4176 * We have to be careful here because the freemap reserves a limited 4177 * number of blocks for a limited number of levels. So we can't just 4178 * push indiscriminately. 4179 */ 4180 int 4181 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp, 4182 int keybits, hammer2_blockref_t *base, int count) 4183 { 4184 hammer2_chain_t *chain; 4185 hammer2_blockref_t *bref; 4186 hammer2_key_t key; 4187 hammer2_key_t key_beg; 4188 hammer2_key_t key_end; 4189 hammer2_key_t key_next; 4190 int locount; 4191 int hicount; 4192 int maxloops = 300000; 4193 4194 key = *keyp; 4195 locount = 0; 4196 hicount = 0; 4197 keybits = 64; 4198 4199 /* 4200 * Calculate the range of keys in the array being careful to skip 4201 * slots which are overridden with a deletion. 
4202 */ 4203 key_beg = 0; 4204 key_end = HAMMER2_KEY_MAX; 4205 hammer2_spin_ex(&parent->core.spin); 4206 4207 for (;;) { 4208 if (--maxloops == 0) { 4209 panic("indkey_freemap shit %p %p:%d\n", 4210 parent, base, count); 4211 } 4212 chain = hammer2_combined_find(parent, base, count, 4213 &key_next, 4214 key_beg, key_end, 4215 &bref); 4216 4217 /* 4218 * Exhausted search 4219 */ 4220 if (bref == NULL) 4221 break; 4222 4223 /* 4224 * Skip deleted chains. 4225 */ 4226 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4227 if (key_next == 0 || key_next > key_end) 4228 break; 4229 key_beg = key_next; 4230 continue; 4231 } 4232 4233 /* 4234 * Use the full live (not deleted) element for the scan 4235 * iteration. HAMMER2 does not allow partial replacements. 4236 * 4237 * XXX should be built into hammer2_combined_find(). 4238 */ 4239 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4240 4241 if (keybits > bref->keybits) { 4242 key = bref->key; 4243 keybits = bref->keybits; 4244 } else if (keybits == bref->keybits && bref->key < key) { 4245 key = bref->key; 4246 } 4247 if (key_next == 0) 4248 break; 4249 key_beg = key_next; 4250 } 4251 hammer2_spin_unex(&parent->core.spin); 4252 4253 /* 4254 * Return the keybits for a higher-level FREEMAP_NODE covering 4255 * this node. 
4256 */ 4257 switch(keybits) { 4258 case HAMMER2_FREEMAP_LEVEL0_RADIX: 4259 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX; 4260 break; 4261 case HAMMER2_FREEMAP_LEVEL1_RADIX: 4262 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX; 4263 break; 4264 case HAMMER2_FREEMAP_LEVEL2_RADIX: 4265 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX; 4266 break; 4267 case HAMMER2_FREEMAP_LEVEL3_RADIX: 4268 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX; 4269 break; 4270 case HAMMER2_FREEMAP_LEVEL4_RADIX: 4271 keybits = HAMMER2_FREEMAP_LEVEL5_RADIX; 4272 break; 4273 case HAMMER2_FREEMAP_LEVEL5_RADIX: 4274 panic("hammer2_chain_indkey_freemap: level too high"); 4275 break; 4276 default: 4277 panic("hammer2_chain_indkey_freemap: bad radix"); 4278 break; 4279 } 4280 *keyp = key; 4281 4282 return (keybits); 4283 } 4284 4285 /* 4286 * File indirect blocks 4287 * 4288 * Calculate the key/keybits for the indirect block to create by scanning 4289 * existing keys. The key being created is also passed in *keyp and can be 4290 * inside or outside the indirect block. Regardless, the indirect block 4291 * must hold at least two keys in order to guarantee sufficient space. 4292 * 4293 * We use a modified version of the freemap's fixed radix tree, but taylored 4294 * for file data. Basically we configure an indirect block encompassing the 4295 * smallest key. 4296 */ 4297 static int 4298 hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp, 4299 int keybits, hammer2_blockref_t *base, int count, 4300 int ncount) 4301 { 4302 hammer2_chain_t *chain; 4303 hammer2_blockref_t *bref; 4304 hammer2_key_t key; 4305 hammer2_key_t key_beg; 4306 hammer2_key_t key_end; 4307 hammer2_key_t key_next; 4308 int nradix; 4309 int locount; 4310 int hicount; 4311 int maxloops = 300000; 4312 4313 key = *keyp; 4314 locount = 0; 4315 hicount = 0; 4316 keybits = 64; 4317 4318 /* 4319 * Calculate the range of keys in the array being careful to skip 4320 * slots which are overridden with a deletion. 4321 * 4322 * Locate the smallest key. 
4323 */ 4324 key_beg = 0; 4325 key_end = HAMMER2_KEY_MAX; 4326 hammer2_spin_ex(&parent->core.spin); 4327 4328 for (;;) { 4329 if (--maxloops == 0) { 4330 panic("indkey_freemap shit %p %p:%d\n", 4331 parent, base, count); 4332 } 4333 chain = hammer2_combined_find(parent, base, count, 4334 &key_next, 4335 key_beg, key_end, 4336 &bref); 4337 4338 /* 4339 * Exhausted search 4340 */ 4341 if (bref == NULL) 4342 break; 4343 4344 /* 4345 * Skip deleted chains. 4346 */ 4347 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4348 if (key_next == 0 || key_next > key_end) 4349 break; 4350 key_beg = key_next; 4351 continue; 4352 } 4353 4354 /* 4355 * Use the full live (not deleted) element for the scan 4356 * iteration. HAMMER2 does not allow partial replacements. 4357 * 4358 * XXX should be built into hammer2_combined_find(). 4359 */ 4360 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4361 4362 if (keybits > bref->keybits) { 4363 key = bref->key; 4364 keybits = bref->keybits; 4365 } else if (keybits == bref->keybits && bref->key < key) { 4366 key = bref->key; 4367 } 4368 if (key_next == 0) 4369 break; 4370 key_beg = key_next; 4371 } 4372 hammer2_spin_unex(&parent->core.spin); 4373 4374 /* 4375 * Calculate the static keybits for a higher-level indirect block 4376 * that contains the key. 4377 */ 4378 *keyp = key; 4379 4380 switch(ncount) { 4381 case HAMMER2_IND_BYTES_MIN / sizeof(hammer2_blockref_t): 4382 nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX; 4383 break; 4384 case HAMMER2_IND_BYTES_NOM / sizeof(hammer2_blockref_t): 4385 nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX; 4386 break; 4387 case HAMMER2_IND_BYTES_MAX / sizeof(hammer2_blockref_t): 4388 nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX; 4389 break; 4390 default: 4391 panic("bad ncount %d\n", ncount); 4392 nradix = 0; 4393 break; 4394 } 4395 4396 /* 4397 * The largest radix that can be returned for an indirect block is 4398 * 63 bits. 
(The largest practical indirect block radix is actually 4399 * 62 bits because the top-level inode or volume root contains four 4400 * entries, but allow 63 to be returned). 4401 */ 4402 if (nradix >= 64) 4403 nradix = 63; 4404 4405 return keybits + nradix; 4406 } 4407 4408 #if 1 4409 4410 /* 4411 * Directory indirect blocks. 4412 * 4413 * Covers both the inode index (directory of inodes), and directory contents 4414 * (filenames hardlinked to inodes). 4415 * 4416 * Because directory keys are hashed we generally try to cut the space in 4417 * half. We accomodate the inode index (which tends to have linearly 4418 * increasing inode numbers) by ensuring that the keyspace is at least large 4419 * enough to fill up the indirect block being created. 4420 */ 4421 static int 4422 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp, 4423 int keybits, hammer2_blockref_t *base, int count, 4424 int ncount) 4425 { 4426 hammer2_blockref_t *bref; 4427 hammer2_chain_t *chain; 4428 hammer2_key_t key_beg; 4429 hammer2_key_t key_end; 4430 hammer2_key_t key_next; 4431 hammer2_key_t key; 4432 int nkeybits; 4433 int locount; 4434 int hicount; 4435 int maxloops = 300000; 4436 4437 /* 4438 * NOTE: We can't take a shortcut here anymore for inodes because 4439 * the root directory can contain a mix of inodes and directory 4440 * entries (we used to just return 63 if parent->bref.type was 4441 * HAMMER2_BREF_TYPE_INODE. 4442 */ 4443 key = *keyp; 4444 locount = 0; 4445 hicount = 0; 4446 4447 /* 4448 * Calculate the range of keys in the array being careful to skip 4449 * slots which are overridden with a deletion. 
4450 */ 4451 key_beg = 0; 4452 key_end = HAMMER2_KEY_MAX; 4453 hammer2_spin_ex(&parent->core.spin); 4454 4455 for (;;) { 4456 if (--maxloops == 0) { 4457 panic("indkey_freemap shit %p %p:%d\n", 4458 parent, base, count); 4459 } 4460 chain = hammer2_combined_find(parent, base, count, 4461 &key_next, 4462 key_beg, key_end, 4463 &bref); 4464 4465 /* 4466 * Exhausted search 4467 */ 4468 if (bref == NULL) 4469 break; 4470 4471 /* 4472 * Deleted object 4473 */ 4474 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4475 if (key_next == 0 || key_next > key_end) 4476 break; 4477 key_beg = key_next; 4478 continue; 4479 } 4480 4481 /* 4482 * Use the full live (not deleted) element for the scan 4483 * iteration. HAMMER2 does not allow partial replacements. 4484 * 4485 * XXX should be built into hammer2_combined_find(). 4486 */ 4487 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4488 4489 /* 4490 * Expand our calculated key range (key, keybits) to fit 4491 * the scanned key. nkeybits represents the full range 4492 * that we will later cut in half (two halves @ nkeybits - 1). 4493 */ 4494 nkeybits = keybits; 4495 if (nkeybits < bref->keybits) { 4496 if (bref->keybits > 64) { 4497 kprintf("bad bref chain %p bref %p\n", 4498 chain, bref); 4499 Debugger("fubar"); 4500 } 4501 nkeybits = bref->keybits; 4502 } 4503 while (nkeybits < 64 && 4504 (~(((hammer2_key_t)1 << nkeybits) - 1) & 4505 (key ^ bref->key)) != 0) { 4506 ++nkeybits; 4507 } 4508 4509 /* 4510 * If the new key range is larger we have to determine 4511 * which side of the new key range the existing keys fall 4512 * under by checking the high bit, then collapsing the 4513 * locount into the hicount or vise-versa. 
4514 */ 4515 if (keybits != nkeybits) { 4516 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 4517 hicount += locount; 4518 locount = 0; 4519 } else { 4520 locount += hicount; 4521 hicount = 0; 4522 } 4523 keybits = nkeybits; 4524 } 4525 4526 /* 4527 * The newly scanned key will be in the lower half or the 4528 * upper half of the (new) key range. 4529 */ 4530 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 4531 ++hicount; 4532 else 4533 ++locount; 4534 4535 if (key_next == 0) 4536 break; 4537 key_beg = key_next; 4538 } 4539 hammer2_spin_unex(&parent->core.spin); 4540 bref = NULL; /* now invalid (safety) */ 4541 4542 /* 4543 * Adjust keybits to represent half of the full range calculated 4544 * above (radix 63 max) for our new indirect block. 4545 */ 4546 --keybits; 4547 4548 /* 4549 * Expand keybits to hold at least ncount elements. ncount will be 4550 * a power of 2. This is to try to completely fill leaf nodes (at 4551 * least for keys which are not hashes). 4552 * 4553 * We aren't counting 'in' or 'out', we are counting 'high side' 4554 * and 'low side' based on the bit at (1LL << keybits). We want 4555 * everything to be inside in these cases so shift it all to 4556 * the low or high side depending on the new high bit. 4557 */ 4558 while (((hammer2_key_t)1 << keybits) < ncount) { 4559 ++keybits; 4560 if (key & ((hammer2_key_t)1 << keybits)) { 4561 hicount += locount; 4562 locount = 0; 4563 } else { 4564 locount += hicount; 4565 hicount = 0; 4566 } 4567 } 4568 4569 if (hicount > locount) 4570 key |= (hammer2_key_t)1 << keybits; 4571 else 4572 key &= ~(hammer2_key_t)1 << keybits; 4573 4574 *keyp = key; 4575 4576 return (keybits); 4577 } 4578 4579 #else 4580 4581 /* 4582 * Directory indirect blocks. 4583 * 4584 * Covers both the inode index (directory of inodes), and directory contents 4585 * (filenames hardlinked to inodes). 4586 * 4587 * Because directory keys are hashed we generally try to cut the space in 4588 * half. 
We accomodate the inode index (which tends to have linearly 4589 * increasing inode numbers) by ensuring that the keyspace is at least large 4590 * enough to fill up the indirect block being created. 4591 */ 4592 static int 4593 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp, 4594 int keybits, hammer2_blockref_t *base, int count, 4595 int ncount) 4596 { 4597 hammer2_blockref_t *bref; 4598 hammer2_chain_t *chain; 4599 hammer2_key_t key_beg; 4600 hammer2_key_t key_end; 4601 hammer2_key_t key_next; 4602 hammer2_key_t key; 4603 int nkeybits; 4604 int locount; 4605 int hicount; 4606 int maxloops = 300000; 4607 4608 /* 4609 * Shortcut if the parent is the inode. In this situation the 4610 * parent has 4+1 directory entries and we are creating an indirect 4611 * block capable of holding many more. 4612 */ 4613 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) { 4614 return 63; 4615 } 4616 4617 key = *keyp; 4618 locount = 0; 4619 hicount = 0; 4620 4621 /* 4622 * Calculate the range of keys in the array being careful to skip 4623 * slots which are overridden with a deletion. 4624 */ 4625 key_beg = 0; 4626 key_end = HAMMER2_KEY_MAX; 4627 hammer2_spin_ex(&parent->core.spin); 4628 4629 for (;;) { 4630 if (--maxloops == 0) { 4631 panic("indkey_freemap shit %p %p:%d\n", 4632 parent, base, count); 4633 } 4634 chain = hammer2_combined_find(parent, base, count, 4635 &key_next, 4636 key_beg, key_end, 4637 &bref); 4638 4639 /* 4640 * Exhausted search 4641 */ 4642 if (bref == NULL) 4643 break; 4644 4645 /* 4646 * Deleted object 4647 */ 4648 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4649 if (key_next == 0 || key_next > key_end) 4650 break; 4651 key_beg = key_next; 4652 continue; 4653 } 4654 4655 /* 4656 * Use the full live (not deleted) element for the scan 4657 * iteration. HAMMER2 does not allow partial replacements. 4658 * 4659 * XXX should be built into hammer2_combined_find(). 
4660 */ 4661 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4662 4663 /* 4664 * Expand our calculated key range (key, keybits) to fit 4665 * the scanned key. nkeybits represents the full range 4666 * that we will later cut in half (two halves @ nkeybits - 1). 4667 */ 4668 nkeybits = keybits; 4669 if (nkeybits < bref->keybits) { 4670 if (bref->keybits > 64) { 4671 kprintf("bad bref chain %p bref %p\n", 4672 chain, bref); 4673 Debugger("fubar"); 4674 } 4675 nkeybits = bref->keybits; 4676 } 4677 while (nkeybits < 64 && 4678 (~(((hammer2_key_t)1 << nkeybits) - 1) & 4679 (key ^ bref->key)) != 0) { 4680 ++nkeybits; 4681 } 4682 4683 /* 4684 * If the new key range is larger we have to determine 4685 * which side of the new key range the existing keys fall 4686 * under by checking the high bit, then collapsing the 4687 * locount into the hicount or vise-versa. 4688 */ 4689 if (keybits != nkeybits) { 4690 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 4691 hicount += locount; 4692 locount = 0; 4693 } else { 4694 locount += hicount; 4695 hicount = 0; 4696 } 4697 keybits = nkeybits; 4698 } 4699 4700 /* 4701 * The newly scanned key will be in the lower half or the 4702 * upper half of the (new) key range. 4703 */ 4704 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 4705 ++hicount; 4706 else 4707 ++locount; 4708 4709 if (key_next == 0) 4710 break; 4711 key_beg = key_next; 4712 } 4713 hammer2_spin_unex(&parent->core.spin); 4714 bref = NULL; /* now invalid (safety) */ 4715 4716 /* 4717 * Adjust keybits to represent half of the full range calculated 4718 * above (radix 63 max) for our new indirect block. 4719 */ 4720 --keybits; 4721 4722 /* 4723 * Expand keybits to hold at least ncount elements. ncount will be 4724 * a power of 2. This is to try to completely fill leaf nodes (at 4725 * least for keys which are not hashes). 4726 * 4727 * We aren't counting 'in' or 'out', we are counting 'high side' 4728 * and 'low side' based on the bit at (1LL << keybits). 
We want 4729 * everything to be inside in these cases so shift it all to 4730 * the low or high side depending on the new high bit. 4731 */ 4732 while (((hammer2_key_t)1 << keybits) < ncount) { 4733 ++keybits; 4734 if (key & ((hammer2_key_t)1 << keybits)) { 4735 hicount += locount; 4736 locount = 0; 4737 } else { 4738 locount += hicount; 4739 hicount = 0; 4740 } 4741 } 4742 4743 if (hicount > locount) 4744 key |= (hammer2_key_t)1 << keybits; 4745 else 4746 key &= ~(hammer2_key_t)1 << keybits; 4747 4748 *keyp = key; 4749 4750 return (keybits); 4751 } 4752 4753 #endif 4754 4755 /* 4756 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if 4757 * it exists. 4758 * 4759 * Both parent and chain must be locked exclusively. 4760 * 4761 * This function will modify the parent if the blockref requires removal 4762 * from the parent's block table. 4763 * 4764 * This function is NOT recursive. Any entity already pushed into the 4765 * chain (such as an inode) may still need visibility into its contents, 4766 * as well as the ability to read and modify the contents. For example, 4767 * for an unlinked file which is still open. 4768 * 4769 * Also note that the flusher is responsible for cleaning up empty 4770 * indirect blocks. 4771 */ 4772 int 4773 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain, 4774 hammer2_tid_t mtid, int flags) 4775 { 4776 int error = 0; 4777 4778 KKASSERT(hammer2_mtx_owned(&chain->lock)); 4779 4780 /* 4781 * Nothing to do if already marked. 4782 * 4783 * We need the spinlock on the core whos RBTREE contains chain 4784 * to protect against races. 4785 */ 4786 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 4787 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 4788 chain->parent == parent); 4789 error = _hammer2_chain_delete_helper(parent, chain, 4790 mtid, flags); 4791 } 4792 4793 /* 4794 * Permanent deletions mark the chain as destroyed. 
4795 */ 4796 if (error == 0) { 4797 if (flags & HAMMER2_DELETE_PERMANENT) 4798 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 4799 hammer2_chain_setflush(chain); 4800 } 4801 4802 return error; 4803 } 4804 4805 /* 4806 * Returns the index of the nearest element in the blockref array >= elm. 4807 * Returns (count) if no element could be found. 4808 * 4809 * Sets *key_nextp to the next key for loop purposes but does not modify 4810 * it if the next key would be higher than the current value of *key_nextp. 4811 * Note that *key_nexp can overflow to 0, which should be tested by the 4812 * caller. 4813 * 4814 * WARNING! Must be called with parent's spinlock held. Spinlock remains 4815 * held through the operation. 4816 */ 4817 static int 4818 hammer2_base_find(hammer2_chain_t *parent, 4819 hammer2_blockref_t *base, int count, 4820 hammer2_key_t *key_nextp, 4821 hammer2_key_t key_beg, hammer2_key_t key_end) 4822 { 4823 hammer2_blockref_t *scan; 4824 hammer2_key_t scan_end; 4825 int i; 4826 int limit; 4827 4828 /* 4829 * Require the live chain's already have their core's counted 4830 * so we can optimize operations. 4831 */ 4832 KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS); 4833 4834 /* 4835 * Degenerate case 4836 */ 4837 if (count == 0 || base == NULL) 4838 return(count); 4839 4840 /* 4841 * Sequential optimization using parent->cache_index. This is 4842 * the most likely scenario. 4843 * 4844 * We can avoid trailing empty entries on live chains, otherwise 4845 * we might have to check the whole block array. 
4846 */ 4847 i = parent->cache_index; /* SMP RACE OK */ 4848 cpu_ccfence(); 4849 limit = parent->core.live_zero; 4850 if (i >= limit) 4851 i = limit - 1; 4852 if (i < 0) 4853 i = 0; 4854 KKASSERT(i < count); 4855 4856 /* 4857 * Search backwards 4858 */ 4859 scan = &base[i]; 4860 while (i > 0 && (scan->type == 0 || scan->key > key_beg)) { 4861 --scan; 4862 --i; 4863 } 4864 parent->cache_index = i; 4865 4866 /* 4867 * Search forwards, stop when we find a scan element which 4868 * encloses the key or until we know that there are no further 4869 * elements. 4870 */ 4871 while (i < count) { 4872 if (scan->type != 0) { 4873 scan_end = scan->key + 4874 ((hammer2_key_t)1 << scan->keybits) - 1; 4875 if (scan->key > key_beg || scan_end >= key_beg) 4876 break; 4877 } 4878 if (i >= limit) 4879 return (count); 4880 ++scan; 4881 ++i; 4882 } 4883 if (i != count) { 4884 parent->cache_index = i; 4885 if (i >= limit) { 4886 i = count; 4887 } else { 4888 scan_end = scan->key + 4889 ((hammer2_key_t)1 << scan->keybits); 4890 if (scan_end && (*key_nextp > scan_end || 4891 *key_nextp == 0)) { 4892 *key_nextp = scan_end; 4893 } 4894 } 4895 } 4896 return (i); 4897 } 4898 4899 /* 4900 * Do a combined search and return the next match either from the blockref 4901 * array or from the in-memory chain. Sets *bresp to the returned bref in 4902 * both cases, or sets it to NULL if the search exhausted. Only returns 4903 * a non-NULL chain if the search matched from the in-memory chain. 4904 * 4905 * When no in-memory chain has been found and a non-NULL bref is returned 4906 * in *bresp. 4907 * 4908 * 4909 * The returned chain is not locked or referenced. Use the returned bref 4910 * to determine if the search exhausted or not. Iterate if the base find 4911 * is chosen but matches a deleted chain. 4912 * 4913 * WARNING! Must be called with parent's spinlock held. Spinlock remains 4914 * held through the operation. 
4915 */ 4916 hammer2_chain_t * 4917 hammer2_combined_find(hammer2_chain_t *parent, 4918 hammer2_blockref_t *base, int count, 4919 hammer2_key_t *key_nextp, 4920 hammer2_key_t key_beg, hammer2_key_t key_end, 4921 hammer2_blockref_t **bresp) 4922 { 4923 hammer2_blockref_t *bref; 4924 hammer2_chain_t *chain; 4925 int i; 4926 4927 /* 4928 * Lookup in block array and in rbtree. 4929 */ 4930 *key_nextp = key_end + 1; 4931 i = hammer2_base_find(parent, base, count, key_nextp, 4932 key_beg, key_end); 4933 chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end); 4934 4935 /* 4936 * Neither matched 4937 */ 4938 if (i == count && chain == NULL) { 4939 *bresp = NULL; 4940 return(NULL); 4941 } 4942 4943 /* 4944 * Only chain matched. 4945 */ 4946 if (i == count) { 4947 bref = &chain->bref; 4948 goto found; 4949 } 4950 4951 /* 4952 * Only blockref matched. 4953 */ 4954 if (chain == NULL) { 4955 bref = &base[i]; 4956 goto found; 4957 } 4958 4959 /* 4960 * Both in-memory and blockref matched, select the nearer element. 4961 * 4962 * If both are flush with the left-hand side or both are the 4963 * same distance away, select the chain. In this situation the 4964 * chain must have been loaded from the matching blockmap. 4965 */ 4966 if ((chain->bref.key <= key_beg && base[i].key <= key_beg) || 4967 chain->bref.key == base[i].key) { 4968 KKASSERT(chain->bref.key == base[i].key); 4969 bref = &chain->bref; 4970 goto found; 4971 } 4972 4973 /* 4974 * Select the nearer key 4975 */ 4976 if (chain->bref.key < base[i].key) { 4977 bref = &chain->bref; 4978 } else { 4979 bref = &base[i]; 4980 chain = NULL; 4981 } 4982 4983 /* 4984 * If the bref is out of bounds we've exhausted our search. 4985 */ 4986 found: 4987 if (bref->key > key_end) { 4988 *bresp = NULL; 4989 chain = NULL; 4990 } else { 4991 *bresp = bref; 4992 } 4993 return(chain); 4994 } 4995 4996 /* 4997 * Locate the specified block array element and delete it. The element 4998 * must exist. 
4999 * 5000 * The spin lock on the related chain must be held. 5001 * 5002 * NOTE: live_count was adjusted when the chain was deleted, so it does not 5003 * need to be adjusted when we commit the media change. 5004 */ 5005 void 5006 hammer2_base_delete(hammer2_chain_t *parent, 5007 hammer2_blockref_t *base, int count, 5008 hammer2_chain_t *chain) 5009 { 5010 hammer2_blockref_t *elm = &chain->bref; 5011 hammer2_blockref_t *scan; 5012 hammer2_key_t key_next; 5013 int i; 5014 5015 /* 5016 * Delete element. Expect the element to exist. 5017 * 5018 * XXX see caller, flush code not yet sophisticated enough to prevent 5019 * re-flushed in some cases. 5020 */ 5021 key_next = 0; /* max range */ 5022 i = hammer2_base_find(parent, base, count, &key_next, 5023 elm->key, elm->key); 5024 scan = &base[i]; 5025 if (i == count || scan->type == 0 || 5026 scan->key != elm->key || 5027 ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 && 5028 scan->keybits != elm->keybits)) { 5029 hammer2_spin_unex(&parent->core.spin); 5030 panic("delete base %p element not found at %d/%d elm %p\n", 5031 base, i, count, elm); 5032 return; 5033 } 5034 5035 /* 5036 * Update stats and zero the entry. 5037 * 5038 * NOTE: Handle radix == 0 (0 bytes) case. 
5039 */ 5040 if ((int)(scan->data_off & HAMMER2_OFF_MASK_RADIX)) { 5041 parent->bref.embed.stats.data_count -= (hammer2_off_t)1 << 5042 (int)(scan->data_off & HAMMER2_OFF_MASK_RADIX); 5043 } 5044 switch(scan->type) { 5045 case HAMMER2_BREF_TYPE_INODE: 5046 --parent->bref.embed.stats.inode_count; 5047 /* fall through */ 5048 case HAMMER2_BREF_TYPE_DATA: 5049 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) { 5050 atomic_set_int(&chain->flags, 5051 HAMMER2_CHAIN_HINT_LEAF_COUNT); 5052 } else { 5053 if (parent->bref.leaf_count) 5054 --parent->bref.leaf_count; 5055 } 5056 /* fall through */ 5057 case HAMMER2_BREF_TYPE_INDIRECT: 5058 if (scan->type != HAMMER2_BREF_TYPE_DATA) { 5059 parent->bref.embed.stats.data_count -= 5060 scan->embed.stats.data_count; 5061 parent->bref.embed.stats.inode_count -= 5062 scan->embed.stats.inode_count; 5063 } 5064 if (scan->type == HAMMER2_BREF_TYPE_INODE) 5065 break; 5066 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) { 5067 atomic_set_int(&chain->flags, 5068 HAMMER2_CHAIN_HINT_LEAF_COUNT); 5069 } else { 5070 if (parent->bref.leaf_count <= scan->leaf_count) 5071 parent->bref.leaf_count = 0; 5072 else 5073 parent->bref.leaf_count -= scan->leaf_count; 5074 } 5075 break; 5076 case HAMMER2_BREF_TYPE_DIRENT: 5077 if (parent->bref.leaf_count == HAMMER2_BLOCKREF_LEAF_MAX) { 5078 atomic_set_int(&chain->flags, 5079 HAMMER2_CHAIN_HINT_LEAF_COUNT); 5080 } else { 5081 if (parent->bref.leaf_count) 5082 --parent->bref.leaf_count; 5083 } 5084 default: 5085 break; 5086 } 5087 5088 bzero(scan, sizeof(*scan)); 5089 5090 /* 5091 * We can only optimize parent->core.live_zero for live chains. 5092 */ 5093 if (parent->core.live_zero == i + 1) { 5094 while (--i >= 0 && base[i].type == 0) 5095 ; 5096 parent->core.live_zero = i + 1; 5097 } 5098 5099 /* 5100 * Clear appropriate blockmap flags in chain. 
5101 */ 5102 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED | 5103 HAMMER2_CHAIN_BMAPUPD); 5104 } 5105 5106 /* 5107 * Insert the specified element. The block array must not already have the 5108 * element and must have space available for the insertion. 5109 * 5110 * The spin lock on the related chain must be held. 5111 * 5112 * NOTE: live_count was adjusted when the chain was deleted, so it does not 5113 * need to be adjusted when we commit the media change. 5114 */ 5115 void 5116 hammer2_base_insert(hammer2_chain_t *parent, 5117 hammer2_blockref_t *base, int count, 5118 hammer2_chain_t *chain, hammer2_blockref_t *elm) 5119 { 5120 hammer2_key_t key_next; 5121 hammer2_key_t xkey; 5122 int i; 5123 int j; 5124 int k; 5125 int l; 5126 int u = 1; /* debug tag: 1=down-shift path, 2=up-shift path (reported by validate panic) */ 5127 5128 /* 5129 * Insert new element. Expect the element to not already exist 5130 * unless we are replacing it. 5131 * 5132 * XXX see caller, flush code not yet sophisticated enough to prevent 5133 * re-flushed in some cases. 5134 */ 5135 key_next = 0; /* max range */ 5136 i = hammer2_base_find(parent, base, count, &key_next, 5137 elm->key, elm->key); 5138 5139 /* 5140 * Shortcut fill optimization, typical ordered insertion(s) may not 5141 * require a search.
5142 */ 5143 KKASSERT(i >= 0 && i <= count); 5144 5145 /* 5146 * Set appropriate blockmap flags in chain (if not NULL) 5147 */ 5148 if (chain) 5149 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED); 5150 5151 /* 5152 * Update stats and zero the entry 5153 */ /* data_off low bits encode the allocation radix; account (1 << radix) bytes */ 5154 if ((int)(elm->data_off & HAMMER2_OFF_MASK_RADIX)) { 5155 parent->bref.embed.stats.data_count += (hammer2_off_t)1 << 5156 (int)(elm->data_off & HAMMER2_OFF_MASK_RADIX); 5157 } 5158 switch(elm->type) { 5159 case HAMMER2_BREF_TYPE_INODE: 5160 ++parent->bref.embed.stats.inode_count; 5161 /* fall through */ 5162 case HAMMER2_BREF_TYPE_DATA: 5163 if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX) 5164 ++parent->bref.leaf_count; 5165 /* fall through */ 5166 case HAMMER2_BREF_TYPE_INDIRECT: 5167 if (elm->type != HAMMER2_BREF_TYPE_DATA) { 5168 parent->bref.embed.stats.data_count += 5169 elm->embed.stats.data_count; 5170 parent->bref.embed.stats.inode_count += 5171 elm->embed.stats.inode_count; 5172 } 5173 if (elm->type == HAMMER2_BREF_TYPE_INODE) 5174 break; 5175 if (parent->bref.leaf_count + elm->leaf_count < 5176 HAMMER2_BLOCKREF_LEAF_MAX) { 5177 parent->bref.leaf_count += elm->leaf_count; 5178 } else { 5179 parent->bref.leaf_count = HAMMER2_BLOCKREF_LEAF_MAX; /* saturate, never wrap */ 5180 } 5181 break; 5182 case HAMMER2_BREF_TYPE_DIRENT: 5183 if (parent->bref.leaf_count != HAMMER2_BLOCKREF_LEAF_MAX) 5184 ++parent->bref.leaf_count; 5185 break; 5186 default: 5187 break; 5188 } 5189 5190 5191 /* 5192 * We can only optimize parent->core.live_zero for live chains.
5193 */ 5194 if (i == count && parent->core.live_zero < count) { 5195 i = parent->core.live_zero++; 5196 base[i] = *elm; 5197 return; 5198 } 5199 5200 xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1; /* xkey = last key covered by elm's range */ 5201 if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) { 5202 hammer2_spin_unex(&parent->core.spin); 5203 panic("insert base %p overlapping elements at %d elm %p\n", 5204 base, i, elm); 5205 } 5206 5207 /* 5208 * Try to find an empty slot before or after. 5209 */ 5210 j = i; 5211 k = i; 5212 while (j > 0 || k < count) { 5213 --j; 5214 if (j >= 0 && base[j].type == 0) { 5215 if (j == i - 1) { 5216 base[j] = *elm; 5217 } else { 5218 bcopy(&base[j+1], &base[j], 5219 (i - j - 1) * sizeof(*base)); 5220 base[i - 1] = *elm; 5221 } 5222 goto validate; 5223 } 5224 ++k; 5225 if (k < count && base[k].type == 0) { 5226 bcopy(&base[i], &base[i+1], 5227 (k - i) * sizeof(hammer2_blockref_t)); 5228 base[i] = *elm; 5229 5230 /* 5231 * We can only update parent->core.live_zero for live 5232 * chains. 5233 */ 5234 if (parent->core.live_zero <= k) 5235 parent->core.live_zero = k + 1; 5236 u = 2; 5237 goto validate; 5238 } 5239 } 5240 panic("hammer2_base_insert: no room!"); 5241 5242 /* 5243 * Debugging 5244 */ 5245 validate: /* re-verify base[] is sorted with no overlapping key ranges */ 5246 key_next = 0; 5247 for (l = 0; l < count; ++l) { 5248 if (base[l].type) { 5249 key_next = base[l].key + 5250 ((hammer2_key_t)1 << base[l].keybits) - 1; 5251 break; 5252 } 5253 } 5254 while (++l < count) { 5255 if (base[l].type) { 5256 if (base[l].key <= key_next) 5257 panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l); 5258 key_next = base[l].key + 5259 ((hammer2_key_t)1 << base[l].keybits) - 1; 5260 5261 } 5262 } 5263 5264 } 5265 5266 #if 0 5267 5268 /* 5269 * Sort the blockref array for the chain. Used by the flush code to 5270 * sort the blockref[] array. 5271 * 5272 * The chain must be exclusively locked AND spin-locked.
5273 */ 5274 typedef hammer2_blockref_t *hammer2_blockref_p; 5275 5276 static 5277 int 5278 hammer2_base_sort_callback(const void *v1, const void *v2) 5279 { 5280 hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1; 5281 hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2; 5282 5283 /* 5284 * Make sure empty elements are placed at the end of the array 5285 */ 5286 if (bref1->type == 0) { 5287 if (bref2->type == 0) 5288 return(0); 5289 return(1); 5290 } else if (bref2->type == 0) { 5291 return(-1); 5292 } 5293 5294 /* 5295 * Sort by key 5296 */ 5297 if (bref1->key < bref2->key) 5298 return(-1); 5299 if (bref1->key > bref2->key) 5300 return(1); 5301 return(0); 5302 } 5303 5304 void 5305 hammer2_base_sort(hammer2_chain_t *chain) 5306 { 5307 hammer2_blockref_t *base; 5308 int count; 5309 5310 switch(chain->bref.type) { 5311 case HAMMER2_BREF_TYPE_INODE: 5312 /* 5313 * Special shortcut for embedded data returns the inode 5314 * itself. Callers must detect this condition and access 5315 * the embedded data (the strategy code does this for us). 5316 * 5317 * This is only applicable to regular files and softlinks. 5318 */ 5319 if (chain->data->ipdata.meta.op_flags & 5320 HAMMER2_OPFLAG_DIRECTDATA) { 5321 return; 5322 } 5323 base = &chain->data->ipdata.u.blockset.blockref[0]; 5324 count = HAMMER2_SET_COUNT; 5325 break; 5326 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 5327 case HAMMER2_BREF_TYPE_INDIRECT: 5328 /* 5329 * Optimize indirect blocks in the INITIAL state to avoid 5330 * I/O.
5331 */ 5332 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0); 5333 base = &chain->data->npdata[0]; 5334 count = chain->bytes / sizeof(hammer2_blockref_t); 5335 break; 5336 case HAMMER2_BREF_TYPE_VOLUME: 5337 base = &chain->data->voldata.sroot_blockset.blockref[0]; 5338 count = HAMMER2_SET_COUNT; 5339 break; 5340 case HAMMER2_BREF_TYPE_FREEMAP: 5341 base = &chain->data->blkset.blockref[0]; 5342 count = HAMMER2_SET_COUNT; 5343 break; 5344 default: 5345 kprintf("hammer2_chain_lookup: unrecognized " 5346 "blockref(A) type: %d", 5347 chain->bref.type); 5348 while (1) 5349 tsleep(&base, 0, "dead", 0); 5350 panic("hammer2_chain_lookup: unrecognized " 5351 "blockref(A) type: %d", 5352 chain->bref.type); 5353 base = NULL; /* safety */ 5354 count = 0; /* safety */ 5355 } 5356 kqsort(base, count, sizeof(*base), hammer2_base_sort_callback); 5357 } 5358 5359 #endif 5360 5361 /* 5362 * Chain memory management 5363 */ /* Sleep one tick on the chain address (wmesg "chnflw"). */ 5364 void 5365 hammer2_chain_wait(hammer2_chain_t *chain) 5366 { 5367 tsleep(chain, 0, "chnflw", 1); 5368 } 5369 /* Read-only accessor for the chain's media data; data must already be resolved (non-NULL). */ 5370 const hammer2_media_data_t * 5371 hammer2_chain_rdata(hammer2_chain_t *chain) 5372 { 5373 KKASSERT(chain->data != NULL); 5374 return (chain->data); 5375 } 5376 /* Writable accessor for the chain's media data; data must already be resolved (non-NULL). */ 5377 hammer2_media_data_t * 5378 hammer2_chain_wdata(hammer2_chain_t *chain) 5379 { 5380 KKASSERT(chain->data != NULL); 5381 return (chain->data); 5382 } 5383 5384 /* 5385 * Set the check data for a chain. This can be a heavy-weight operation 5386 * and typically only runs on-flush. For file data check data is calculated 5387 * when the logical buffers are flushed.
5388 */ 5389 void 5390 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata) 5391 { 5392 chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO; 5393 5394 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5395 case HAMMER2_CHECK_NONE: 5396 break; 5397 case HAMMER2_CHECK_DISABLED: 5398 break; 5399 case HAMMER2_CHECK_ISCSI32: 5400 chain->bref.check.iscsi32.value = 5401 hammer2_icrc32(bdata, chain->bytes); 5402 break; 5403 case HAMMER2_CHECK_XXHASH64: 5404 chain->bref.check.xxhash64.value = 5405 XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5406 break; 5407 case HAMMER2_CHECK_SHA192: 5408 { 5409 SHA256_CTX hash_ctx; 5410 union { 5411 uint8_t digest[SHA256_DIGEST_LENGTH]; 5412 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5413 } u; 5414 5415 SHA256_Init(&hash_ctx); 5416 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5417 SHA256_Final(u.digest, &hash_ctx); 5418 u.digest64[2] ^= u.digest64[3]; 5419 bcopy(u.digest, 5420 chain->bref.check.sha192.data, 5421 sizeof(chain->bref.check.sha192.data)); 5422 } 5423 break; 5424 case HAMMER2_CHECK_FREEMAP: 5425 chain->bref.check.freemap.icrc32 = 5426 hammer2_icrc32(bdata, chain->bytes); 5427 break; 5428 default: 5429 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 5430 chain->bref.methods); 5431 break; 5432 } 5433 } 5434 5435 int 5436 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 5437 { 5438 uint32_t check32; 5439 uint64_t check64; 5440 int r; 5441 5442 if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO) 5443 return 1; 5444 5445 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 5446 case HAMMER2_CHECK_NONE: 5447 r = 1; 5448 break; 5449 case HAMMER2_CHECK_DISABLED: 5450 r = 1; 5451 break; 5452 case HAMMER2_CHECK_ISCSI32: 5453 check32 = hammer2_icrc32(bdata, chain->bytes); 5454 r = (chain->bref.check.iscsi32.value == check32); 5455 if (r == 0) { 5456 kprintf("chain %016jx.%02x meth=%02x CHECK FAIL " 5457 "(flags=%08x, bref/data %08x/%08x)\n", 5458 chain->bref.data_off, 5459 chain->bref.type, 5460 chain->bref.methods, 
5461 chain->flags, 5462 chain->bref.check.iscsi32.value, 5463 check32); 5464 } 5465 hammer2_check_icrc32 += chain->bytes; 5466 break; 5467 case HAMMER2_CHECK_XXHASH64: 5468 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 5469 r = (chain->bref.check.xxhash64.value == check64); 5470 if (r == 0) { 5471 kprintf("chain %016jx.%02x key=%016jx " 5472 "meth=%02x CHECK FAIL " 5473 "(flags=%08x, bref/data %016jx/%016jx)\n", 5474 chain->bref.data_off, 5475 chain->bref.type, 5476 chain->bref.key, 5477 chain->bref.methods, 5478 chain->flags, 5479 chain->bref.check.xxhash64.value, 5480 check64); 5481 } 5482 hammer2_check_xxhash64 += chain->bytes; 5483 break; 5484 case HAMMER2_CHECK_SHA192: 5485 { 5486 SHA256_CTX hash_ctx; 5487 union { 5488 uint8_t digest[SHA256_DIGEST_LENGTH]; 5489 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 5490 } u; 5491 5492 SHA256_Init(&hash_ctx); 5493 SHA256_Update(&hash_ctx, bdata, chain->bytes); 5494 SHA256_Final(u.digest, &hash_ctx); 5495 u.digest64[2] ^= u.digest64[3]; 5496 if (bcmp(u.digest, 5497 chain->bref.check.sha192.data, 5498 sizeof(chain->bref.check.sha192.data)) == 0) { 5499 r = 1; 5500 } else { 5501 r = 0; 5502 kprintf("chain %016jx.%02x meth=%02x " 5503 "CHECK FAIL\n", 5504 chain->bref.data_off, 5505 chain->bref.type, 5506 chain->bref.methods); 5507 } 5508 } 5509 break; 5510 case HAMMER2_CHECK_FREEMAP: 5511 r = (chain->bref.check.freemap.icrc32 == 5512 hammer2_icrc32(bdata, chain->bytes)); 5513 if (r == 0) { 5514 kprintf("chain %016jx.%02x meth=%02x " 5515 "CHECK FAIL\n", 5516 chain->bref.data_off, 5517 chain->bref.type, 5518 chain->bref.methods); 5519 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 5520 chain->bref.check.freemap.icrc32, 5521 hammer2_icrc32(bdata, chain->bytes), 5522 chain->bytes); 5523 if (chain->dio) 5524 kprintf("dio %p buf %016jx,%d bdata %p/%p\n", 5525 chain->dio, chain->dio->bp->b_loffset, 5526 chain->dio->bp->b_bufsize, bdata, 5527 chain->dio->bp->b_data); 5528 } 5529 5530 break; 5531 default: 5532 
kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 5533 chain->bref.methods); 5534 r = 1; 5535 break; 5536 } 5537 return r; 5538 } 5539 5540 /* 5541 * Acquire the chain and parent representing the specified inode for the 5542 * device at the specified cluster index. 5543 * 5544 * The flags passed in are LOOKUP flags, not RESOLVE flags. 5545 * 5546 * If we are unable to locate the hardlink, INVAL is returned and *chainp 5547 * will be NULL. *parentp may still be set error or not, or NULL if the 5548 * parent itself could not be resolved. 5549 * 5550 * Caller must pass-in a valid or NULL *parentp or *chainp. The passed-in 5551 * *parentp and *chainp will be unlocked if not NULL. 5552 */ 5553 int 5554 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum, 5555 int clindex, int flags, 5556 hammer2_chain_t **parentp, hammer2_chain_t **chainp) 5557 { 5558 hammer2_chain_t *parent; 5559 hammer2_chain_t *rchain; 5560 hammer2_key_t key_dummy; 5561 int resolve_flags; 5562 int error; 5563 5564 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 5565 HAMMER2_RESOLVE_SHARED : 0; 5566 5567 /* 5568 * Caller expects us to replace these. 5569 */ 5570 if (*chainp) { 5571 hammer2_chain_unlock(*chainp); 5572 hammer2_chain_drop(*chainp); 5573 *chainp = NULL; 5574 } 5575 if (*parentp) { 5576 hammer2_chain_unlock(*parentp); 5577 hammer2_chain_drop(*parentp); 5578 *parentp = NULL; 5579 } 5580 5581 /* 5582 * Inodes hang off of the iroot (bit 63 is clear, differentiating 5583 * inodes from root directory entries in the key lookup). 
5584 */ 5585 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 5586 rchain = NULL; 5587 if (parent) { 5588 rchain = hammer2_chain_lookup(&parent, &key_dummy, 5589 inum, inum, 5590 &error, flags); 5591 } else { 5592 error = HAMMER2_ERROR_EIO; 5593 } 5594 *parentp = parent; 5595 *chainp = rchain; 5596 5597 return error; 5598 } 5599 5600 /* 5601 * Used by the bulkscan code to snapshot the synchronized storage for 5602 * a volume, allowing it to be scanned concurrently against normal 5603 * operation. 5604 */ 5605 hammer2_chain_t * 5606 hammer2_chain_bulksnap(hammer2_dev_t *hmp) 5607 { 5608 hammer2_chain_t *copy; 5609 5610 copy = hammer2_chain_alloc(hmp, hmp->spmp, &hmp->vchain.bref); 5611 copy->data = kmalloc(sizeof(copy->data->voldata), 5612 hmp->mchain, 5613 M_WAITOK | M_ZERO); 5614 hammer2_voldata_lock(hmp); 5615 copy->data->voldata = hmp->volsync; 5616 hammer2_voldata_unlock(hmp); 5617 5618 return copy; 5619 } 5620 5621 void 5622 hammer2_chain_bulkdrop(hammer2_chain_t *copy) 5623 { 5624 KKASSERT(copy->bref.type == HAMMER2_BREF_TYPE_VOLUME); 5625 KKASSERT(copy->data); 5626 kfree(copy->data, copy->hmp->mchain); 5627 copy->data = NULL; 5628 atomic_add_long(&hammer2_chain_allocs, -1); 5629 hammer2_chain_drop(copy); 5630 } 5631 5632 /* 5633 * Returns non-zero if the chain (INODE or DIRENT) matches the 5634 * filename. 
5635 */ 5636 int 5637 hammer2_chain_dirent_test(hammer2_chain_t *chain, const char *name, 5638 size_t name_len) 5639 { 5640 const hammer2_inode_data_t *ripdata; 5641 const hammer2_dirent_head_t *den; 5642 5643 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) { 5644 ripdata = &chain->data->ipdata; 5645 if (ripdata->meta.name_len == name_len && 5646 bcmp(ripdata->filename, name, name_len) == 0) { 5647 return 1; 5648 } 5649 } 5650 if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT && 5651 chain->bref.embed.dirent.namlen == name_len) { 5652 den = &chain->bref.embed.dirent; 5653 if (name_len > sizeof(chain->bref.check.buf) && 5654 bcmp(chain->data->buf, name, name_len) == 0) { 5655 return 1; 5656 } 5657 if (name_len <= sizeof(chain->bref.check.buf) && 5658 bcmp(chain->bref.check.buf, name, name_len) == 0) { 5659 return 1; 5660 } 5661 } 5662 return 0; 5663 } 5664