1 /* 2 * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 38 * 39 * Chains are the in-memory version on media objects (volume header, inodes, 40 * indirect blocks, data blocks, etc). Chains represent a portion of the 41 * HAMMER2 topology. 42 * 43 * Chains are no-longer delete-duplicated. Instead, the original in-memory 44 * chain will be moved along with its block reference (e.g. for things like 45 * renames, hardlink operations, modifications, etc), and will be indexed 46 * on a secondary list for flush handling instead of propagating a flag 47 * upward to the root. 48 * 49 * Concurrent front-end operations can still run against backend flushes 50 * as long as they do not cross the current flush boundary. An operation 51 * running above the current flush (in areas not yet flushed) can become 52 * part of the current flush while ano peration running below the current 53 * flush can become part of the next flush. 54 */ 55 #include <sys/cdefs.h> 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/types.h> 59 #include <sys/lock.h> 60 #include <sys/kern_syscall.h> 61 #include <sys/uuid.h> 62 63 #include <crypto/sha2/sha2.h> 64 65 #include "hammer2.h" 66 67 static int hammer2_indirect_optimize; /* XXX SYSCTL */ 68 69 static hammer2_chain_t *hammer2_chain_create_indirect( 70 hammer2_trans_t *trans, hammer2_chain_t *parent, 71 hammer2_key_t key, int keybits, int for_type, int *errorp); 72 static void hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop); 73 static hammer2_chain_t *hammer2_combined_find( 74 hammer2_chain_t *parent, 75 hammer2_blockref_t *base, int count, 76 int *cache_indexp, hammer2_key_t *key_nextp, 77 hammer2_key_t key_beg, hammer2_key_t key_end, 78 hammer2_blockref_t **bresp); 79 80 /* 81 * Basic RBTree for chains (core->rbtree and core->dbtree). Chains cannot 82 * overlap in the RB trees. Deleted chains are moved from rbtree to either 83 * dbtree or to dbq. 84 * 85 * Chains in delete-duplicate sequences can always iterate through core_entry 86 * to locate the live version of the chain. 87 */ 88 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 89 90 int 91 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 92 { 93 hammer2_key_t c1_beg; 94 hammer2_key_t c1_end; 95 hammer2_key_t c2_beg; 96 hammer2_key_t c2_end; 97 98 /* 99 * Compare chains. Overlaps are not supposed to happen and catch 100 * any software issues early we count overlaps as a match. 101 */ 102 c1_beg = chain1->bref.key; 103 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1; 104 c2_beg = chain2->bref.key; 105 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1; 106 107 if (c1_end < c2_beg) /* fully to the left */ 108 return(-1); 109 if (c1_beg > c2_end) /* fully to the right */ 110 return(1); 111 return(0); /* overlap (must not cross edge boundary) */ 112 } 113 114 static __inline 115 int 116 hammer2_isclusterable(hammer2_chain_t *chain) 117 { 118 if (hammer2_cluster_enable) { 119 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 120 chain->bref.type == HAMMER2_BREF_TYPE_INODE || 121 chain->bref.type == HAMMER2_BREF_TYPE_DATA) { 122 return(1); 123 } 124 } 125 return(0); 126 } 127 128 /* 129 * Make a chain visible to the flusher. The flusher needs to be able to 130 * do flushes of a subdirectory chains or single files so it does a top-down 131 * recursion using the ONFLUSH flag for the recursion. It locates MODIFIED 132 * or UPDATE chains and flushes back up the chain to the root. 133 */ 134 void 135 hammer2_chain_setflush(hammer2_trans_t *trans, hammer2_chain_t *chain) 136 { 137 hammer2_chain_t *parent; 138 139 if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 140 spin_lock(&chain->core.cst.spin); 141 while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 142 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH); 143 if ((parent = chain->parent) == NULL) 144 break; 145 spin_lock(&parent->core.cst.spin); 146 spin_unlock(&chain->core.cst.spin); 147 chain = parent; 148 } 149 spin_unlock(&chain->core.cst.spin); 150 } 151 } 152 153 /* 154 * Allocate a new disconnected chain element representing the specified 155 * bref. chain->refs is set to 1 and the passed bref is copied to 156 * chain->bref. chain->bytes is derived from the bref. 157 * 158 * chain->core is NOT allocated and the media data and bp pointers are left 159 * NULL. The caller must call chain_core_alloc() to allocate or associate 160 * a core with the chain. 161 * 162 * chain->pmp inherits pmp unless the chain is an inode (other than the 163 * super-root inode). 164 * 165 * NOTE: Returns a referenced but unlocked (because there is no core) chain. 166 */ 167 hammer2_chain_t * 168 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp, 169 hammer2_trans_t *trans, hammer2_blockref_t *bref) 170 { 171 hammer2_chain_t *chain; 172 u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 173 174 /* 175 * Construct the appropriate system structure. 176 */ 177 switch(bref->type) { 178 case HAMMER2_BREF_TYPE_INODE: 179 case HAMMER2_BREF_TYPE_INDIRECT: 180 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 181 case HAMMER2_BREF_TYPE_DATA: 182 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 183 /* 184 * Chain's are really only associated with the hmp but we 185 * maintain a pmp association for per-mount memory tracking 186 * purposes. The pmp can be NULL. 187 */ 188 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO); 189 break; 190 case HAMMER2_BREF_TYPE_VOLUME: 191 case HAMMER2_BREF_TYPE_FREEMAP: 192 chain = NULL; 193 panic("hammer2_chain_alloc volume type illegal for op"); 194 default: 195 chain = NULL; 196 panic("hammer2_chain_alloc: unrecognized blockref type: %d", 197 bref->type); 198 } 199 200 /* 201 * Initialize the new chain structure. 202 */ 203 chain->pmp = pmp; 204 chain->hmp = hmp; 205 chain->bref = *bref; 206 chain->bytes = bytes; 207 chain->refs = 1; 208 chain->flags = HAMMER2_CHAIN_ALLOCATED; 209 210 /* 211 * Set the PFS boundary flag if this chain represents a PFS root. 212 */ 213 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT) 214 chain->flags |= HAMMER2_CHAIN_PFSBOUNDARY; 215 216 return (chain); 217 } 218 219 /* 220 * Associate an existing core with the chain or allocate a new core. 221 * 222 * The core is not locked. No additional refs on the chain are made. 223 * (trans) must not be NULL if (core) is not NULL. 224 * 225 * When chains are delete-duplicated during flushes we insert nchain on 226 * the ownerq after ochain instead of at the end in order to give the 227 * drop code visibility in the correct order, otherwise drops can be missed. 228 */ 229 void 230 hammer2_chain_core_alloc(hammer2_trans_t *trans, hammer2_chain_t *chain) 231 { 232 hammer2_chain_core_t *core = &chain->core; 233 234 /* 235 * Fresh core under nchain (no multi-homing of ochain's 236 * sub-tree). 237 */ 238 RB_INIT(&core->rbtree); /* live chains */ 239 ccms_cst_init(&core->cst, chain); 240 } 241 242 /* 243 * Add a reference to a chain element, preventing its destruction. 244 * 245 * (can be called with spinlock held) 246 */ 247 void 248 hammer2_chain_ref(hammer2_chain_t *chain) 249 { 250 atomic_add_int(&chain->refs, 1); 251 } 252 253 /* 254 * Insert the chain in the core rbtree. 255 * 256 * Normal insertions are placed in the live rbtree. Insertion of a deleted 257 * chain is a special case used by the flush code that is placed on the 258 * unstaged deleted list to avoid confusing the live view. 259 */ 260 #define HAMMER2_CHAIN_INSERT_SPIN 0x0001 261 #define HAMMER2_CHAIN_INSERT_LIVE 0x0002 262 #define HAMMER2_CHAIN_INSERT_RACE 0x0004 263 264 static 265 int 266 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain, 267 int flags, int generation) 268 { 269 hammer2_chain_t *xchain; 270 int error = 0; 271 272 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 273 spin_lock(&parent->core.cst.spin); 274 275 /* 276 * Interlocked by spinlock, check for race 277 */ 278 if ((flags & HAMMER2_CHAIN_INSERT_RACE) && 279 parent->core.generation != generation) { 280 error = EAGAIN; 281 goto failed; 282 } 283 284 /* 285 * Insert chain 286 */ 287 xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain); 288 KASSERT(xchain == NULL, 289 ("hammer2_chain_insert: collision %p %p", chain, xchain)); 290 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 291 chain->parent = parent; 292 ++parent->core.chain_count; 293 ++parent->core.generation; /* XXX incs for _get() too, XXX */ 294 295 /* 296 * We have to keep track of the effective live-view blockref count 297 * so the create code knows when to push an indirect block. 298 */ 299 if (flags & HAMMER2_CHAIN_INSERT_LIVE) 300 atomic_add_int(&parent->core.live_count, 1); 301 failed: 302 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 303 spin_unlock(&parent->core.cst.spin); 304 return error; 305 } 306 307 /* 308 * Drop the caller's reference to the chain. When the ref count drops to 309 * zero this function will try to disassociate the chain from its parent and 310 * deallocate it, then recursely drop the parent using the implied ref 311 * from the chain's chain->parent. 312 */ 313 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain); 314 315 void 316 hammer2_chain_drop(hammer2_chain_t *chain) 317 { 318 u_int refs; 319 u_int need = 0; 320 321 if (hammer2_debug & 0x200000) 322 Debugger("drop"); 323 324 if (chain->flags & HAMMER2_CHAIN_UPDATE) 325 ++need; 326 if (chain->flags & HAMMER2_CHAIN_MODIFIED) 327 ++need; 328 KKASSERT(chain->refs > need); 329 330 while (chain) { 331 refs = chain->refs; 332 cpu_ccfence(); 333 KKASSERT(refs > 0); 334 335 if (refs == 1) { 336 chain = hammer2_chain_lastdrop(chain); 337 } else { 338 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 339 break; 340 /* retry the same chain */ 341 } 342 } 343 } 344 345 /* 346 * Safe handling of the 1->0 transition on chain. Returns a chain for 347 * recursive drop or NULL, possibly returning the same chain if the atomic 348 * op fails. 349 * 350 * Whem two chains need to be recursively dropped we use the chain 351 * we would otherwise free to placehold the additional chain. It's a bit 352 * convoluted but we can't just recurse without potentially blowing out 353 * the kernel stack. 354 * 355 * The chain cannot be freed if it has a non-empty core (children) or 356 * it is not at the head of ownerq. 357 * 358 * The cst spinlock is allowed nest child-to-parent (not parent-to-child). 359 */ 360 static 361 hammer2_chain_t * 362 hammer2_chain_lastdrop(hammer2_chain_t *chain) 363 { 364 hammer2_pfsmount_t *pmp; 365 hammer2_mount_t *hmp; 366 hammer2_chain_t *parent; 367 hammer2_chain_t *rdrop; 368 369 /* 370 * Spinlock the core and check to see if it is empty. If it is 371 * not empty we leave chain intact with refs == 0. The elements 372 * in core->rbtree are associated with other chains contemporary 373 * with ours but not with our chain directly. 374 */ 375 spin_lock(&chain->core.cst.spin); 376 377 /* 378 * We can't free non-stale chains with children until we are 379 * able to free the children because there might be a flush 380 * dependency. Flushes of stale children (which should also 381 * have their deleted flag set) short-cut recursive flush 382 * dependencies and can be freed here. Any flushes which run 383 * through stale children due to the flush synchronization 384 * point should have a FLUSH_* bit set in the chain and not 385 * reach lastdrop at this time. 386 * 387 * NOTE: We return (chain) on failure to retry. 388 */ 389 if (chain->core.chain_count) { 390 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 391 spin_unlock(&chain->core.cst.spin); 392 chain = NULL; /* success */ 393 } else { 394 spin_unlock(&chain->core.cst.spin); 395 } 396 return(chain); 397 } 398 /* no chains left under us */ 399 400 /* 401 * chain->core has no children left so no accessors can get to our 402 * chain from there. Now we have to lock the parent core to interlock 403 * remaining possible accessors that might bump chain's refs before 404 * we can safely drop chain's refs with intent to free the chain. 405 */ 406 hmp = chain->hmp; 407 pmp = chain->pmp; /* can be NULL */ 408 rdrop = NULL; 409 410 /* 411 * Spinlock the parent and try to drop the last ref on chain. 412 * On success remove chain from its parent, otherwise return NULL. 413 * 414 * (normal core locks are top-down recursive but we define core 415 * spinlocks as bottom-up recursive, so this is safe). 416 */ 417 if ((parent = chain->parent) != NULL) { 418 spin_lock(&parent->core.cst.spin); 419 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 420 /* 1->0 transition failed */ 421 spin_unlock(&parent->core.cst.spin); 422 spin_unlock(&chain->core.cst.spin); 423 return(chain); /* retry */ 424 } 425 426 /* 427 * 1->0 transition successful, remove chain from its 428 * above core. 429 */ 430 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 431 RB_REMOVE(hammer2_chain_tree, 432 &parent->core.rbtree, chain); 433 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 434 --parent->core.chain_count; 435 chain->parent = NULL; 436 } 437 438 /* 439 * If our chain was the last chain in the parent's core the 440 * core is now empty and its parent might have to be 441 * re-dropped if it has 0 refs. 442 */ 443 if (parent->core.chain_count == 0) { 444 rdrop = parent; 445 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) { 446 rdrop = NULL; 447 } 448 } 449 spin_unlock(&parent->core.cst.spin); 450 parent = NULL; /* safety */ 451 } 452 453 /* 454 * Successful 1->0 transition and the chain can be destroyed now. 455 * 456 * We still have the core spinlock, and core's chain_count is 0. 457 * Any parent spinlock is gone. 458 */ 459 spin_unlock(&chain->core.cst.spin); 460 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 461 chain->core.chain_count == 0); 462 KKASSERT(chain->core.cst.count == 0); 463 KKASSERT(chain->core.cst.upgrade == 0); 464 465 /* 466 * All spin locks are gone, finish freeing stuff. 467 */ 468 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 469 HAMMER2_CHAIN_MODIFIED)) == 0); 470 hammer2_chain_drop_data(chain, 1); 471 472 KKASSERT(chain->dio == NULL); 473 474 /* 475 * Once chain resources are gone we can use the now dead chain 476 * structure to placehold what might otherwise require a recursive 477 * drop, because we have potentially two things to drop and can only 478 * return one directly. 479 */ 480 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 481 chain->flags &= ~HAMMER2_CHAIN_ALLOCATED; 482 chain->hmp = NULL; 483 kfree(chain, hmp->mchain); 484 } 485 486 /* 487 * Possible chaining loop when parent re-drop needed. 488 */ 489 return(rdrop); 490 } 491 492 /* 493 * On either last lock release or last drop 494 */ 495 static void 496 hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop) 497 { 498 /*hammer2_mount_t *hmp = chain->hmp;*/ 499 500 switch(chain->bref.type) { 501 case HAMMER2_BREF_TYPE_VOLUME: 502 case HAMMER2_BREF_TYPE_FREEMAP: 503 if (lastdrop) 504 chain->data = NULL; 505 break; 506 default: 507 KKASSERT(chain->data == NULL); 508 break; 509 } 510 } 511 512 /* 513 * Ref and lock a chain element, acquiring its data with I/O if necessary, 514 * and specify how you would like the data to be resolved. 515 * 516 * Returns 0 on success or an error code if the data could not be acquired. 517 * The chain element is locked on return regardless of whether an error 518 * occurred or not. 519 * 520 * The lock is allowed to recurse, multiple locking ops will aggregate 521 * the requested resolve types. Once data is assigned it will not be 522 * removed until the last unlock. 523 * 524 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 525 * (typically used to avoid device/logical buffer 526 * aliasing for data) 527 * 528 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 529 * the INITIAL-create state (indirect blocks only). 530 * 531 * Do not resolve data elements for DATA chains. 532 * (typically used to avoid device/logical buffer 533 * aliasing for data) 534 * 535 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 536 * 537 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 538 * it will be locked exclusive. 539 * 540 * NOTE: Embedded elements (volume header, inodes) are always resolved 541 * regardless. 542 * 543 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded 544 * element will instantiate and zero its buffer, and flush it on 545 * release. 546 * 547 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE 548 * so as not to instantiate a device buffer, which could alias against 549 * a logical file buffer. However, if ALWAYS is specified the 550 * device buffer will be instantiated anyway. 551 * 552 * WARNING! If data must be fetched a shared lock will temporarily be 553 * upgraded to exclusive. However, a deadlock can occur if 554 * the caller owns more than one shared lock. 555 */ 556 int 557 hammer2_chain_lock(hammer2_chain_t *chain, int how) 558 { 559 hammer2_mount_t *hmp; 560 hammer2_blockref_t *bref; 561 ccms_state_t ostate; 562 char *bdata; 563 int error; 564 565 /* 566 * Ref and lock the element. Recursive locks are allowed. 567 */ 568 if ((how & HAMMER2_RESOLVE_NOREF) == 0) 569 hammer2_chain_ref(chain); 570 atomic_add_int(&chain->lockcnt, 1); 571 572 hmp = chain->hmp; 573 KKASSERT(hmp != NULL); 574 575 /* 576 * Get the appropriate lock. 577 */ 578 if (how & HAMMER2_RESOLVE_SHARED) 579 ccms_thread_lock(&chain->core.cst, CCMS_STATE_SHARED); 580 else 581 ccms_thread_lock(&chain->core.cst, CCMS_STATE_EXCLUSIVE); 582 583 /* 584 * If we already have a valid data pointer no further action is 585 * necessary. 586 */ 587 if (chain->data) 588 return (0); 589 590 /* 591 * Do we have to resolve the data? 592 */ 593 switch(how & HAMMER2_RESOLVE_MASK) { 594 case HAMMER2_RESOLVE_NEVER: 595 return(0); 596 case HAMMER2_RESOLVE_MAYBE: 597 if (chain->flags & HAMMER2_CHAIN_INITIAL) 598 return(0); 599 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) 600 return(0); 601 #if 0 602 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) 603 return(0); 604 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) 605 return(0); 606 #endif 607 /* fall through */ 608 case HAMMER2_RESOLVE_ALWAYS: 609 break; 610 } 611 612 /* 613 * Upgrade to an exclusive lock so we can safely manipulate the 614 * buffer cache. If another thread got to it before us we 615 * can just return. 616 */ 617 ostate = ccms_thread_lock_upgrade(&chain->core.cst); 618 if (chain->data) { 619 ccms_thread_lock_downgrade(&chain->core.cst, ostate); 620 return (0); 621 } 622 623 /* 624 * We must resolve to a device buffer, either by issuing I/O or 625 * by creating a zero-fill element. We do not mark the buffer 626 * dirty when creating a zero-fill element (the hammer2_chain_modify() 627 * API must still be used to do that). 628 * 629 * The device buffer is variable-sized in powers of 2 down 630 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 631 * chunk always contains buffers of the same size. (XXX) 632 * 633 * The minimum physical IO size may be larger than the variable 634 * block size. 635 */ 636 bref = &chain->bref; 637 638 /* 639 * The getblk() optimization can only be used on newly created 640 * elements if the physical block size matches the request. 641 */ 642 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 643 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, 644 &chain->dio); 645 } else { 646 error = hammer2_io_bread(hmp, bref->data_off, chain->bytes, 647 &chain->dio); 648 hammer2_adjreadcounter(&chain->bref, chain->bytes); 649 } 650 651 if (error) { 652 kprintf("hammer2_chain_lock: I/O error %016jx: %d\n", 653 (intmax_t)bref->data_off, error); 654 hammer2_io_bqrelse(&chain->dio); 655 ccms_thread_lock_downgrade(&chain->core.cst, ostate); 656 return (error); 657 } 658 659 #if 0 660 /* 661 * No need for this, always require that hammer2_chain_modify() 662 * be called before any modifying operations, which ensures that 663 * the underlying dio is dirty. 664 */ 665 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) && 666 !hammer2_io_isdirty(chain->dio)) { 667 hammer2_io_setdirty(chain->dio); 668 } 669 #endif 670 671 /* 672 * Clear INITIAL. In this case we used io_new() and the buffer has 673 * been zero'd and marked dirty. 674 */ 675 bdata = hammer2_io_data(chain->dio, chain->bref.data_off); 676 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 677 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 678 chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO; 679 } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 680 /* 681 * check data not currently synchronized due to 682 * modification. XXX assumes data stays in the buffer 683 * cache, which might not be true (need biodep on flush 684 * to calculate crc? or simple crc?). 685 */ 686 } else { 687 if (hammer2_chain_testcheck(chain, bdata) == 0) { 688 kprintf("chain %016jx.%02x meth=%02x CHECK FAIL %08x (flags=%08x)\n", 689 690 chain->bref.data_off, 691 chain->bref.type, 692 chain->bref.methods, 693 hammer2_icrc32(bdata, chain->bytes), 694 chain->flags); 695 } 696 } 697 698 /* 699 * Setup the data pointer, either pointing it to an embedded data 700 * structure and copying the data from the buffer, or pointing it 701 * into the buffer. 702 * 703 * The buffer is not retained when copying to an embedded data 704 * structure in order to avoid potential deadlocks or recursions 705 * on the same physical buffer. 706 */ 707 switch (bref->type) { 708 case HAMMER2_BREF_TYPE_VOLUME: 709 case HAMMER2_BREF_TYPE_FREEMAP: 710 /* 711 * Copy data from bp to embedded buffer 712 */ 713 panic("hammer2_chain_lock: called on unresolved volume header"); 714 break; 715 case HAMMER2_BREF_TYPE_INODE: 716 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 717 case HAMMER2_BREF_TYPE_INDIRECT: 718 case HAMMER2_BREF_TYPE_DATA: 719 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 720 default: 721 /* 722 * Point data at the device buffer and leave dio intact. 723 */ 724 chain->data = (void *)bdata; 725 break; 726 } 727 ccms_thread_lock_downgrade(&chain->core.cst, ostate); 728 return (0); 729 } 730 731 /* 732 * Unlock and deref a chain element. 733 * 734 * On the last lock release any non-embedded data (chain->dio) will be 735 * retired. 736 */ 737 void 738 hammer2_chain_unlock(hammer2_chain_t *chain) 739 { 740 ccms_state_t ostate; 741 long *counterp; 742 u_int lockcnt; 743 744 /* 745 * The core->cst lock can be shared across several chains so we 746 * need to track the per-chain lockcnt separately. 747 * 748 * If multiple locks are present (or being attempted) on this 749 * particular chain we can just unlock, drop refs, and return. 750 * 751 * Otherwise fall-through on the 1->0 transition. 752 */ 753 for (;;) { 754 lockcnt = chain->lockcnt; 755 KKASSERT(lockcnt > 0); 756 cpu_ccfence(); 757 if (lockcnt > 1) { 758 if (atomic_cmpset_int(&chain->lockcnt, 759 lockcnt, lockcnt - 1)) { 760 ccms_thread_unlock(&chain->core.cst); 761 hammer2_chain_drop(chain); 762 return; 763 } 764 } else { 765 if (atomic_cmpset_int(&chain->lockcnt, 1, 0)) 766 break; 767 } 768 /* retry */ 769 } 770 771 /* 772 * On the 1->0 transition we upgrade the core lock (if necessary) 773 * to exclusive for terminal processing. If after upgrading we find 774 * that lockcnt is non-zero, another thread is racing us and will 775 * handle the unload for us later on, so just cleanup and return 776 * leaving the data/io intact 777 * 778 * Otherwise if lockcnt is still 0 it is possible for it to become 779 * non-zero and race, but since we hold the core->cst lock 780 * exclusively all that will happen is that the chain will be 781 * reloaded after we unload it. 782 */ 783 ostate = ccms_thread_lock_upgrade(&chain->core.cst); 784 if (chain->lockcnt) { 785 ccms_thread_unlock_upgraded(&chain->core.cst, ostate); 786 hammer2_chain_drop(chain); 787 return; 788 } 789 790 /* 791 * Shortcut the case if the data is embedded or not resolved. 792 * 793 * Do NOT NULL out chain->data (e.g. inode data), it might be 794 * dirty. 795 */ 796 if (chain->dio == NULL) { 797 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) 798 hammer2_chain_drop_data(chain, 0); 799 ccms_thread_unlock_upgraded(&chain->core.cst, ostate); 800 hammer2_chain_drop(chain); 801 return; 802 } 803 804 /* 805 * Statistics 806 */ 807 if (hammer2_io_isdirty(chain->dio) == 0) { 808 ; 809 } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) { 810 switch(chain->bref.type) { 811 case HAMMER2_BREF_TYPE_DATA: 812 counterp = &hammer2_ioa_file_write; 813 break; 814 case HAMMER2_BREF_TYPE_INODE: 815 counterp = &hammer2_ioa_meta_write; 816 break; 817 case HAMMER2_BREF_TYPE_INDIRECT: 818 counterp = &hammer2_ioa_indr_write; 819 break; 820 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 821 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 822 counterp = &hammer2_ioa_fmap_write; 823 break; 824 default: 825 counterp = &hammer2_ioa_volu_write; 826 break; 827 } 828 *counterp += chain->bytes; 829 } else { 830 switch(chain->bref.type) { 831 case HAMMER2_BREF_TYPE_DATA: 832 counterp = &hammer2_iod_file_write; 833 break; 834 case HAMMER2_BREF_TYPE_INODE: 835 counterp = &hammer2_iod_meta_write; 836 break; 837 case HAMMER2_BREF_TYPE_INDIRECT: 838 counterp = &hammer2_iod_indr_write; 839 break; 840 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 841 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 842 counterp = &hammer2_iod_fmap_write; 843 break; 844 default: 845 counterp = &hammer2_iod_volu_write; 846 break; 847 } 848 *counterp += chain->bytes; 849 } 850 851 /* 852 * Clean out the dio. 853 * 854 * If a device buffer was used for data be sure to destroy the 855 * buffer when we are done to avoid aliases (XXX what about the 856 * underlying VM pages?). 857 * 858 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing 859 * is possible. 860 * 861 * NOTE: The isdirty check tracks whether we have to bdwrite() the 862 * buffer or not. The buffer might already be dirty. The 863 * flag is re-set when chain_modify() is called, even if 864 * MODIFIED is already set, allowing the OS to retire the 865 * buffer independent of a hammer2 flush. 866 */ 867 chain->data = NULL; 868 if ((chain->flags & HAMMER2_CHAIN_IOFLUSH) && 869 hammer2_io_isdirty(chain->dio)) { 870 hammer2_io_bawrite(&chain->dio); 871 } else { 872 hammer2_io_bqrelse(&chain->dio); 873 } 874 ccms_thread_unlock_upgraded(&chain->core.cst, ostate); 875 hammer2_chain_drop(chain); 876 } 877 878 /* 879 * This counts the number of live blockrefs in a block array and 880 * also calculates the point at which all remaining blockrefs are empty. 881 * This routine can only be called on a live chain (DUPLICATED flag not set). 882 * 883 * NOTE: Flag is not set until after the count is complete, allowing 884 * callers to test the flag without holding the spinlock. 885 * 886 * NOTE: If base is NULL the related chain is still in the INITIAL 887 * state and there are no blockrefs to count. 888 * 889 * NOTE: live_count may already have some counts accumulated due to 890 * creation and deletion and could even be initially negative. 891 */ 892 void 893 hammer2_chain_countbrefs(hammer2_chain_t *chain, 894 hammer2_blockref_t *base, int count) 895 { 896 spin_lock(&chain->core.cst.spin); 897 if ((chain->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0) { 898 if (base) { 899 while (--count >= 0) { 900 if (base[count].type) 901 break; 902 } 903 chain->core.live_zero = count + 1; 904 while (count >= 0) { 905 if (base[count].type) 906 atomic_add_int(&chain->core.live_count, 907 1); 908 --count; 909 } 910 } else { 911 chain->core.live_zero = 0; 912 } 913 /* else do not modify live_count */ 914 atomic_set_int(&chain->core.flags, HAMMER2_CORE_COUNTEDBREFS); 915 } 916 spin_unlock(&chain->core.cst.spin); 917 } 918 919 /* 920 * Resize the chain's physical storage allocation in-place. This function does 921 * not adjust the data pointer and must be followed by (typically) a 922 * hammer2_chain_modify() call to copy any old data over and adjust the 923 * data pointer. 924 * 925 * Chains can be resized smaller without reallocating the storage. Resizing 926 * larger will reallocate the storage. Excess or prior storage is reclaimed 927 * asynchronously at a later time. 928 * 929 * Must be passed an exclusively locked parent and chain. 930 * 931 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order 932 * to avoid instantiating a device buffer that conflicts with the vnode data 933 * buffer. However, because H2 can compress or encrypt data, the chain may 934 * have a dio assigned to it in those situations, and they do not conflict. 935 * 936 * XXX return error if cannot resize. 937 */ 938 void 939 hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip, 940 hammer2_chain_t *parent, hammer2_chain_t *chain, 941 int nradix, int flags) 942 { 943 hammer2_mount_t *hmp; 944 size_t obytes; 945 size_t nbytes; 946 947 hmp = chain->hmp; 948 949 /* 950 * Only data and indirect blocks can be resized for now. 951 * (The volu root, inodes, and freemap elements use a fixed size). 952 */ 953 KKASSERT(chain != &hmp->vchain); 954 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA || 955 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT); 956 957 /* 958 * Nothing to do if the element is already the proper size 959 */ 960 obytes = chain->bytes; 961 nbytes = 1U << nradix; 962 if (obytes == nbytes) 963 return; 964 chain->data_count += (ssize_t)(nbytes - obytes); 965 966 /* 967 * Make sure the old data is instantiated so we can copy it. If this 968 * is a data block, the device data may be superfluous since the data 969 * might be in a logical block, but compressed or encrypted data is 970 * another matter. 971 * 972 * NOTE: The modify will set BMAPUPD for us if BMAPPED is set. 973 */ 974 hammer2_chain_modify(trans, chain, 0); 975 976 /* 977 * Relocate the block, even if making it smaller (because different 978 * block sizes may be in different regions). 979 * 980 * (data blocks only, we aren't copying the storage here). 981 */ 982 hammer2_freemap_alloc(trans, chain, nbytes); 983 chain->bytes = nbytes; 984 /*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */ 985 986 /* 987 * We don't want the followup chain_modify() to try to copy data 988 * from the old (wrong-sized) buffer. It won't know how much to 989 * copy. This case should only occur during writes when the 990 * originator already has the data to write in-hand. 991 */ 992 if (chain->dio) { 993 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA); 994 hammer2_io_brelse(&chain->dio); 995 chain->data = NULL; 996 } 997 } 998 999 #if 0 1000 1001 /* 1002 * REMOVED - see cluster code 1003 * 1004 * Set a chain modified, making it read-write and duplicating it if necessary. 1005 * This function will assign a new physical block to the chain if necessary 1006 * 1007 * Duplication of already-modified chains is possible when the modification 1008 * crosses a flush synchronization boundary. 1009 * 1010 * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE 1011 * level or the COW operation will not work. 1012 * 1013 * Data blocks - The chain is usually locked RESOLVE_NEVER so as not to 1014 * run the data through the device buffers. 1015 * 1016 * This function may return a different chain than was passed, in which case 1017 * the old chain will be unlocked and the new chain will be locked. 1018 * 1019 * ip->chain may be adjusted by hammer2_chain_modify_ip(). 1020 */ 1021 hammer2_inode_data_t * 1022 hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip, 1023 hammer2_chain_t **chainp, int flags) 1024 { 1025 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED); 1026 hammer2_chain_modify(trans, chainp, flags); 1027 if (ip->chain != *chainp) 1028 hammer2_inode_repoint(ip, NULL, *chainp); 1029 if (ip->vp) 1030 vsetisdirty(ip->vp); 1031 return(&ip->chain->data->ipdata); 1032 } 1033 1034 #endif 1035 1036 void 1037 hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags) 1038 { 1039 hammer2_blockref_t obref; 1040 hammer2_mount_t *hmp; 1041 hammer2_io_t *dio; 1042 int error; 1043 int wasinitial; 1044 int newmod; 1045 char *bdata; 1046 1047 hmp = chain->hmp; 1048 obref = chain->bref; 1049 1050 /* 1051 * Data is not optional for freemap chains (we must always be sure 1052 * to copy the data on COW storage allocations). 1053 */ 1054 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 1055 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 1056 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) || 1057 (flags & HAMMER2_MODIFY_OPTDATA) == 0); 1058 } 1059 1060 /* 1061 * Data must be resolved if already assigned unless explicitly 1062 * flagged otherwise. 1063 */ 1064 if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 && 1065 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 1066 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); 1067 hammer2_chain_unlock(chain); 1068 } 1069 1070 /* 1071 * Otherwise do initial-chain handling. Set MODIFIED to indicate 1072 * that the chain has been modified. Set UPDATE to ensure that 1073 * the blockref is updated in the parent. 1074 */ 1075 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { 1076 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1077 hammer2_chain_ref(chain); 1078 hammer2_pfs_memory_inc(chain->pmp); 1079 newmod = 1; 1080 } else { 1081 newmod = 0; 1082 } 1083 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) { 1084 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1085 hammer2_chain_ref(chain); 1086 } 1087 1088 /* 1089 * The modification or re-modification requires an allocation and 1090 * possible COW. 1091 * 1092 * We normally always allocate new storage here. If storage exists 1093 * and MODIFY_NOREALLOC is passed in, we do not allocate new storage. 1094 */ 1095 if (chain != &hmp->vchain && chain != &hmp->fchain) { 1096 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 || 1097 ((flags & HAMMER2_MODIFY_NOREALLOC) == 0 && newmod) 1098 ) { 1099 hammer2_freemap_alloc(trans, chain, chain->bytes); 1100 /* XXX failed allocation */ 1101 } 1102 } 1103 1104 /* 1105 * Set BMAPUPD to tell the flush code that an existing blockmap entry 1106 * requires updating as well as to tell the delete code that the 1107 * chain's blockref might not exactly match (in terms of physical size 1108 * or block offset) the one in the parent's blocktable. The base key 1109 * of course will still match. 1110 */ 1111 if (chain->flags & HAMMER2_CHAIN_BMAPPED) 1112 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD); 1113 1114 /* 1115 * Short-cut data blocks which the caller does not need an actual 1116 * data reference to (aka OPTDATA), as long as the chain does not 1117 * already have a data pointer to the data. This generally means 1118 * that the modifications are being done via the logical buffer cache. 1119 * The INITIAL flag relates only to the device data buffer and thus 1120 * remains unchange in this situation. 1121 */ 1122 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA && 1123 (flags & HAMMER2_MODIFY_OPTDATA) && 1124 chain->data == NULL) { 1125 goto skip2; 1126 } 1127 1128 /* 1129 * Clearing the INITIAL flag (for indirect blocks) indicates that 1130 * we've processed the uninitialized storage allocation. 1131 * 1132 * If this flag is already clear we are likely in a copy-on-write 1133 * situation but we have to be sure NOT to bzero the storage if 1134 * no data is present. 1135 */ 1136 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1137 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1138 wasinitial = 1; 1139 } else { 1140 wasinitial = 0; 1141 } 1142 1143 /* 1144 * Instantiate data buffer and possibly execute COW operation 1145 */ 1146 switch(chain->bref.type) { 1147 case HAMMER2_BREF_TYPE_VOLUME: 1148 case HAMMER2_BREF_TYPE_FREEMAP: 1149 /* 1150 * The data is embedded, no copy-on-write operation is 1151 * needed. 1152 */ 1153 KKASSERT(chain->dio == NULL); 1154 break; 1155 case HAMMER2_BREF_TYPE_INODE: 1156 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1157 case HAMMER2_BREF_TYPE_DATA: 1158 case HAMMER2_BREF_TYPE_INDIRECT: 1159 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1160 /* 1161 * Perform the copy-on-write operation 1162 * 1163 * zero-fill or copy-on-write depending on whether 1164 * chain->data exists or not and set the dirty state for 1165 * the new buffer. hammer2_io_new() will handle the 1166 * zero-fill. 1167 */ 1168 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain); 1169 1170 if (wasinitial) { 1171 error = hammer2_io_new(hmp, chain->bref.data_off, 1172 chain->bytes, &dio); 1173 } else { 1174 error = hammer2_io_bread(hmp, chain->bref.data_off, 1175 chain->bytes, &dio); 1176 } 1177 hammer2_adjreadcounter(&chain->bref, chain->bytes); 1178 KKASSERT(error == 0); 1179 1180 bdata = hammer2_io_data(dio, chain->bref.data_off); 1181 1182 if (chain->data) { 1183 KKASSERT(chain->dio != NULL); 1184 if (chain->data != (void *)bdata) { 1185 bcopy(chain->data, bdata, chain->bytes); 1186 } 1187 } else if (wasinitial == 0) { 1188 /* 1189 * We have a problem. We were asked to COW but 1190 * we don't have any data to COW with! 1191 */ 1192 panic("hammer2_chain_modify: having a COW %p\n", 1193 chain); 1194 } 1195 1196 /* 1197 * Retire the old buffer, replace with the new. Dirty or 1198 * redirty the new buffer. 1199 * 1200 * WARNING! The system buffer cache may have already flushed 1201 * the buffer, so we must be sure to [re]dirty it 1202 * for further modification. 1203 */ 1204 if (chain->dio) 1205 hammer2_io_brelse(&chain->dio); 1206 chain->data = (void *)bdata; 1207 chain->dio = dio; 1208 hammer2_io_setdirty(dio); /* modified by bcopy above */ 1209 break; 1210 default: 1211 panic("hammer2_chain_modify: illegal non-embedded type %d", 1212 chain->bref.type); 1213 break; 1214 1215 } 1216 skip2: 1217 /* 1218 * setflush on parent indicating that the parent must recurse down 1219 * to us. Do not call on chain itself which might already have it 1220 * set. 1221 */ 1222 if (chain->parent) 1223 hammer2_chain_setflush(trans, chain->parent); 1224 1225 /* 1226 * Adjust the freemap bitmap to indicate that the related blocks 1227 * MIGHT be freeable. Bulkfree must still determine that the blocks 1228 * are actually freeable. 1229 */ 1230 if (obref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE && 1231 obref.type != HAMMER2_BREF_TYPE_FREEMAP_LEAF && 1232 (obref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 1233 hammer2_freemap_adjust(trans, hmp, 1234 &obref, HAMMER2_FREEMAP_DOMAYFREE); 1235 } 1236 } 1237 1238 /* 1239 * Volume header data locks 1240 */ 1241 void 1242 hammer2_voldata_lock(hammer2_mount_t *hmp) 1243 { 1244 lockmgr(&hmp->vollk, LK_EXCLUSIVE); 1245 } 1246 1247 void 1248 hammer2_voldata_unlock(hammer2_mount_t *hmp) 1249 { 1250 lockmgr(&hmp->vollk, LK_RELEASE); 1251 } 1252 1253 void 1254 hammer2_voldata_modify(hammer2_mount_t *hmp) 1255 { 1256 if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) { 1257 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED); 1258 hammer2_chain_ref(&hmp->vchain); 1259 hammer2_pfs_memory_inc(hmp->vchain.pmp); 1260 } 1261 } 1262 1263 /* 1264 * This function returns the chain at the nearest key within the specified 1265 * range. The returned chain will be referenced but not locked. 1266 * 1267 * This function will recurse through chain->rbtree as necessary and will 1268 * return a *key_nextp suitable for iteration. *key_nextp is only set if 1269 * the iteration value is less than the current value of *key_nextp. 1270 * 1271 * The caller should use (*key_nextp) to calculate the actual range of 1272 * the returned element, which will be (key_beg to *key_nextp - 1), because 1273 * there might be another element which is superior to the returned element 1274 * and overlaps it. 1275 * 1276 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL 1277 * chains continue to be returned. On EOF (*key_nextp) may overflow since 1278 * it will wind up being (key_end + 1). 1279 * 1280 * WARNING! Must be called with child's spinlock held. Spinlock remains 1281 * held through the operation. 1282 */ 1283 struct hammer2_chain_find_info { 1284 hammer2_chain_t *best; 1285 hammer2_key_t key_beg; 1286 hammer2_key_t key_end; 1287 hammer2_key_t key_next; 1288 }; 1289 1290 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data); 1291 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data); 1292 1293 static 1294 hammer2_chain_t * 1295 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp, 1296 hammer2_key_t key_beg, hammer2_key_t key_end) 1297 { 1298 struct hammer2_chain_find_info info; 1299 1300 info.best = NULL; 1301 info.key_beg = key_beg; 1302 info.key_end = key_end; 1303 info.key_next = *key_nextp; 1304 1305 RB_SCAN(hammer2_chain_tree, &parent->core.rbtree, 1306 hammer2_chain_find_cmp, hammer2_chain_find_callback, 1307 &info); 1308 *key_nextp = info.key_next; 1309 #if 0 1310 kprintf("chain_find %p %016jx:%016jx next=%016jx\n", 1311 parent, key_beg, key_end, *key_nextp); 1312 #endif 1313 1314 return (info.best); 1315 } 1316 1317 static 1318 int 1319 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data) 1320 { 1321 struct hammer2_chain_find_info *info = data; 1322 hammer2_key_t child_beg; 1323 hammer2_key_t child_end; 1324 1325 child_beg = child->bref.key; 1326 child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1; 1327 1328 if (child_end < info->key_beg) 1329 return(-1); 1330 if (child_beg > info->key_end) 1331 return(1); 1332 return(0); 1333 } 1334 1335 static 1336 int 1337 hammer2_chain_find_callback(hammer2_chain_t *child, void *data) 1338 { 1339 struct hammer2_chain_find_info *info = data; 1340 hammer2_chain_t *best; 1341 hammer2_key_t child_end; 1342 1343 /* 1344 * WARNING! Do not discard DUPLICATED chains, it is possible that 1345 * we are catching an insertion half-way done. If a 1346 * duplicated chain turns out to be the best choice the 1347 * caller will re-check its flags after locking it. 1348 * 1349 * WARNING! Layerq is scanned forwards, exact matches should keep 1350 * the existing info->best. 1351 */ 1352 if ((best = info->best) == NULL) { 1353 /* 1354 * No previous best. Assign best 1355 */ 1356 info->best = child; 1357 } else if (best->bref.key <= info->key_beg && 1358 child->bref.key <= info->key_beg) { 1359 /* 1360 * Illegal overlap. 1361 */ 1362 KKASSERT(0); 1363 /*info->best = child;*/ 1364 } else if (child->bref.key < best->bref.key) { 1365 /* 1366 * Child has a nearer key and best is not flush with key_beg. 1367 * Set best to child. Truncate key_next to the old best key. 1368 */ 1369 info->best = child; 1370 if (info->key_next > best->bref.key || info->key_next == 0) 1371 info->key_next = best->bref.key; 1372 } else if (child->bref.key == best->bref.key) { 1373 /* 1374 * If our current best is flush with the child then this 1375 * is an illegal overlap. 1376 * 1377 * key_next will automatically be limited to the smaller of 1378 * the two end-points. 1379 */ 1380 KKASSERT(0); 1381 info->best = child; 1382 } else { 1383 /* 1384 * Keep the current best but truncate key_next to the child's 1385 * base. 1386 * 1387 * key_next will also automatically be limited to the smaller 1388 * of the two end-points (probably not necessary for this case 1389 * but we do it anyway). 1390 */ 1391 if (info->key_next > child->bref.key || info->key_next == 0) 1392 info->key_next = child->bref.key; 1393 } 1394 1395 /* 1396 * Always truncate key_next based on child's end-of-range. 1397 */ 1398 child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits); 1399 if (child_end && (info->key_next > child_end || info->key_next == 0)) 1400 info->key_next = child_end; 1401 1402 return(0); 1403 } 1404 1405 /* 1406 * Retrieve the specified chain from a media blockref, creating the 1407 * in-memory chain structure which reflects it. 1408 * 1409 * To handle insertion races pass the INSERT_RACE flag along with the 1410 * generation number of the core. NULL will be returned if the generation 1411 * number changes before we have a chance to insert the chain. Insert 1412 * races can occur because the parent might be held shared. 1413 * 1414 * Caller must hold the parent locked shared or exclusive since we may 1415 * need the parent's bref array to find our block. 1416 * 1417 * WARNING! chain->pmp is left NULL if the bref represents a PFS mount 1418 * point. 1419 */ 1420 hammer2_chain_t * 1421 hammer2_chain_get(hammer2_chain_t *parent, int generation, 1422 hammer2_blockref_t *bref) 1423 { 1424 hammer2_mount_t *hmp = parent->hmp; 1425 hammer2_chain_t *chain; 1426 int error; 1427 1428 /* 1429 * Allocate a chain structure representing the existing media 1430 * entry. Resulting chain has one ref and is not locked. 1431 */ 1432 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT) 1433 chain = hammer2_chain_alloc(hmp, NULL, NULL, bref); 1434 else 1435 chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref); 1436 hammer2_chain_core_alloc(NULL, chain); 1437 /* ref'd chain returned */ 1438 1439 /* 1440 * Flag that the chain is in the parent's blockmap so delete/flush 1441 * knows what to do with it. 1442 */ 1443 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED); 1444 1445 /* 1446 * Link the chain into its parent. A spinlock is required to safely 1447 * access the RBTREE, and it is possible to collide with another 1448 * hammer2_chain_get() operation because the caller might only hold 1449 * a shared lock on the parent. 1450 */ 1451 KKASSERT(parent->refs > 0); 1452 error = hammer2_chain_insert(parent, chain, 1453 HAMMER2_CHAIN_INSERT_SPIN | 1454 HAMMER2_CHAIN_INSERT_RACE, 1455 generation); 1456 if (error) { 1457 KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 1458 kprintf("chain %p get race\n", chain); 1459 hammer2_chain_drop(chain); 1460 chain = NULL; 1461 } else { 1462 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 1463 } 1464 1465 /* 1466 * Return our new chain referenced but not locked, or NULL if 1467 * a race occurred. 1468 */ 1469 return (chain); 1470 } 1471 1472 /* 1473 * Lookup initialization/completion API 1474 */ 1475 hammer2_chain_t * 1476 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags) 1477 { 1478 if (flags & HAMMER2_LOOKUP_SHARED) { 1479 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS | 1480 HAMMER2_RESOLVE_SHARED); 1481 } else { 1482 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 1483 } 1484 return (parent); 1485 } 1486 1487 void 1488 hammer2_chain_lookup_done(hammer2_chain_t *parent) 1489 { 1490 if (parent) 1491 hammer2_chain_unlock(parent); 1492 } 1493 1494 static 1495 hammer2_chain_t * 1496 hammer2_chain_getparent(hammer2_chain_t **parentp, int how) 1497 { 1498 hammer2_chain_t *oparent; 1499 hammer2_chain_t *nparent; 1500 1501 /* 1502 * Be careful of order, oparent must be unlocked before nparent 1503 * is locked below to avoid a deadlock. 1504 */ 1505 oparent = *parentp; 1506 spin_lock(&oparent->core.cst.spin); 1507 nparent = oparent->parent; 1508 hammer2_chain_ref(nparent); 1509 spin_unlock(&oparent->core.cst.spin); 1510 if (oparent) { 1511 hammer2_chain_unlock(oparent); 1512 oparent = NULL; 1513 } 1514 1515 hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF); 1516 *parentp = nparent; 1517 1518 return (nparent); 1519 } 1520 1521 /* 1522 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 1523 * (*parentp) typically points to an inode but can also point to a related 1524 * indirect block and this function will recurse upwards and find the inode 1525 * again. 1526 * 1527 * (*parentp) must be exclusively locked and referenced and can be an inode 1528 * or an existing indirect block within the inode. 1529 * 1530 * On return (*parentp) will be modified to point at the deepest parent chain 1531 * element encountered during the search, as a helper for an insertion or 1532 * deletion. The new (*parentp) will be locked and referenced and the old 1533 * will be unlocked and dereferenced (no change if they are both the same). 1534 * 1535 * The matching chain will be returned exclusively locked. If NOLOCK is 1536 * requested the chain will be returned only referenced. 1537 * 1538 * NULL is returned if no match was found, but (*parentp) will still 1539 * potentially be adjusted. 1540 * 1541 * On return (*key_nextp) will point to an iterative value for key_beg. 1542 * (If NULL is returned (*key_nextp) is set to key_end). 1543 * 1544 * This function will also recurse up the chain if the key is not within the 1545 * current parent's range. (*parentp) can never be set to NULL. An iteration 1546 * can simply allow (*parentp) to float inside the loop. 1547 * 1548 * NOTE! chain->data is not always resolved. By default it will not be 1549 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 1550 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 1551 * BREF_TYPE_DATA as the device buffer can alias the logical file 1552 * buffer). 1553 */ 1554 hammer2_chain_t * 1555 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp, 1556 hammer2_key_t key_beg, hammer2_key_t key_end, 1557 int *cache_indexp, int flags, int *ddflagp) 1558 { 1559 hammer2_mount_t *hmp; 1560 hammer2_chain_t *parent; 1561 hammer2_chain_t *chain; 1562 hammer2_blockref_t *base; 1563 hammer2_blockref_t *bref; 1564 hammer2_blockref_t bcopy; 1565 hammer2_key_t scan_beg; 1566 hammer2_key_t scan_end; 1567 int count = 0; 1568 int how_always = HAMMER2_RESOLVE_ALWAYS; 1569 int how_maybe = HAMMER2_RESOLVE_MAYBE; 1570 int how; 1571 int generation; 1572 int maxloops = 300000; 1573 1574 *ddflagp = 0; 1575 if (flags & HAMMER2_LOOKUP_ALWAYS) { 1576 how_maybe = how_always; 1577 how = HAMMER2_RESOLVE_ALWAYS; 1578 } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) { 1579 how = HAMMER2_RESOLVE_NEVER; 1580 } else { 1581 how = HAMMER2_RESOLVE_MAYBE; 1582 } 1583 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) { 1584 how_maybe |= HAMMER2_RESOLVE_SHARED; 1585 how_always |= HAMMER2_RESOLVE_SHARED; 1586 how |= HAMMER2_RESOLVE_SHARED; 1587 } 1588 1589 /* 1590 * Recurse (*parentp) upward if necessary until the parent completely 1591 * encloses the key range or we hit the inode. 1592 * 1593 * This function handles races against the flusher doing a delete- 1594 * duplicate above us and re-homes the parent to the duplicate in 1595 * that case, otherwise we'd wind up recursing down a stale chain. 1596 */ 1597 parent = *parentp; 1598 hmp = parent->hmp; 1599 1600 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 1601 parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 1602 scan_beg = parent->bref.key; 1603 scan_end = scan_beg + 1604 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 1605 if (key_beg >= scan_beg && key_end <= scan_end) 1606 break; 1607 parent = hammer2_chain_getparent(parentp, how_maybe); 1608 } 1609 1610 again: 1611 if (--maxloops == 0) 1612 panic("hammer2_chain_lookup: maxloops"); 1613 /* 1614 * Locate the blockref array. Currently we do a fully associative 1615 * search through the array. 1616 */ 1617 switch(parent->bref.type) { 1618 case HAMMER2_BREF_TYPE_INODE: 1619 /* 1620 * Special shortcut for embedded data returns the inode 1621 * itself. Callers must detect this condition and access 1622 * the embedded data (the strategy code does this for us). 1623 * 1624 * This is only applicable to regular files and softlinks. 1625 */ 1626 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) { 1627 if (flags & HAMMER2_LOOKUP_NOLOCK) 1628 hammer2_chain_ref(parent); 1629 else 1630 hammer2_chain_lock(parent, how_always); 1631 *key_nextp = key_end + 1; 1632 *ddflagp = 1; 1633 return (parent); 1634 } 1635 base = &parent->data->ipdata.u.blockset.blockref[0]; 1636 count = HAMMER2_SET_COUNT; 1637 break; 1638 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1639 case HAMMER2_BREF_TYPE_INDIRECT: 1640 /* 1641 * Handle MATCHIND on the parent 1642 */ 1643 if (flags & HAMMER2_LOOKUP_MATCHIND) { 1644 scan_beg = parent->bref.key; 1645 scan_end = scan_beg + 1646 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 1647 if (key_beg == scan_beg && key_end == scan_end) { 1648 chain = parent; 1649 hammer2_chain_lock(chain, how_maybe); 1650 *key_nextp = scan_end + 1; 1651 goto done; 1652 } 1653 } 1654 /* 1655 * Optimize indirect blocks in the INITIAL state to avoid 1656 * I/O. 1657 */ 1658 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1659 base = NULL; 1660 } else { 1661 if (parent->data == NULL) 1662 panic("parent->data is NULL"); 1663 base = &parent->data->npdata[0]; 1664 } 1665 count = parent->bytes / sizeof(hammer2_blockref_t); 1666 break; 1667 case HAMMER2_BREF_TYPE_VOLUME: 1668 base = &hmp->voldata.sroot_blockset.blockref[0]; 1669 count = HAMMER2_SET_COUNT; 1670 break; 1671 case HAMMER2_BREF_TYPE_FREEMAP: 1672 base = &hmp->voldata.freemap_blockset.blockref[0]; 1673 count = HAMMER2_SET_COUNT; 1674 break; 1675 default: 1676 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 1677 parent->bref.type); 1678 base = NULL; /* safety */ 1679 count = 0; /* safety */ 1680 } 1681 1682 /* 1683 * Merged scan to find next candidate. 1684 * 1685 * hammer2_base_*() functions require the parent->core.live_* fields 1686 * to be synchronized. 1687 * 1688 * We need to hold the spinlock to access the block array and RB tree 1689 * and to interlock chain creation. 1690 */ 1691 if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0) 1692 hammer2_chain_countbrefs(parent, base, count); 1693 1694 /* 1695 * Combined search 1696 */ 1697 spin_lock(&parent->core.cst.spin); 1698 chain = hammer2_combined_find(parent, base, count, 1699 cache_indexp, key_nextp, 1700 key_beg, key_end, 1701 &bref); 1702 generation = parent->core.generation; 1703 1704 /* 1705 * Exhausted parent chain, iterate. 1706 */ 1707 if (bref == NULL) { 1708 spin_unlock(&parent->core.cst.spin); 1709 if (key_beg == key_end) /* short cut single-key case */ 1710 return (NULL); 1711 1712 /* 1713 * Stop if we reached the end of the iteration. 1714 */ 1715 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 1716 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 1717 return (NULL); 1718 } 1719 1720 /* 1721 * Calculate next key, stop if we reached the end of the 1722 * iteration, otherwise go up one level and loop. 1723 */ 1724 key_beg = parent->bref.key + 1725 ((hammer2_key_t)1 << parent->bref.keybits); 1726 if (key_beg == 0 || key_beg > key_end) 1727 return (NULL); 1728 parent = hammer2_chain_getparent(parentp, how_maybe); 1729 goto again; 1730 } 1731 1732 /* 1733 * Selected from blockref or in-memory chain. 1734 */ 1735 if (chain == NULL) { 1736 bcopy = *bref; 1737 spin_unlock(&parent->core.cst.spin); 1738 chain = hammer2_chain_get(parent, generation, 1739 &bcopy); 1740 if (chain == NULL) { 1741 kprintf("retry lookup parent %p keys %016jx:%016jx\n", 1742 parent, key_beg, key_end); 1743 goto again; 1744 } 1745 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 1746 hammer2_chain_drop(chain); 1747 goto again; 1748 } 1749 } else { 1750 hammer2_chain_ref(chain); 1751 spin_unlock(&parent->core.cst.spin); 1752 } 1753 1754 /* 1755 * chain is referenced but not locked. We must lock the chain 1756 * to obtain definitive DUPLICATED/DELETED state 1757 */ 1758 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 1759 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 1760 hammer2_chain_lock(chain, how_maybe | HAMMER2_RESOLVE_NOREF); 1761 } else { 1762 hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF); 1763 } 1764 1765 /* 1766 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 1767 * 1768 * NOTE: Chain's key range is not relevant as there might be 1769 * one-offs within the range that are not deleted. 1770 * 1771 * NOTE: Lookups can race delete-duplicate because 1772 * delete-duplicate does not lock the parent's core 1773 * (they just use the spinlock on the core). We must 1774 * check for races by comparing the DUPLICATED flag before 1775 * releasing the spinlock with the flag after locking the 1776 * chain. 1777 */ 1778 if (chain->flags & HAMMER2_CHAIN_DELETED) { 1779 hammer2_chain_unlock(chain); 1780 key_beg = *key_nextp; 1781 if (key_beg == 0 || key_beg > key_end) 1782 return(NULL); 1783 goto again; 1784 } 1785 1786 /* 1787 * If the chain element is an indirect block it becomes the new 1788 * parent and we loop on it. We must maintain our top-down locks 1789 * to prevent the flusher from interfering (i.e. doing a 1790 * delete-duplicate and leaving us recursing down a deleted chain). 1791 * 1792 * The parent always has to be locked with at least RESOLVE_MAYBE 1793 * so we can access its data. It might need a fixup if the caller 1794 * passed incompatible flags. Be careful not to cause a deadlock 1795 * as a data-load requires an exclusive lock. 1796 * 1797 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key 1798 * range is within the requested key range we return the indirect 1799 * block and do NOT loop. This is usually only used to acquire 1800 * freemap nodes. 1801 */ 1802 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 1803 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 1804 hammer2_chain_unlock(parent); 1805 *parentp = parent = chain; 1806 goto again; 1807 } 1808 done: 1809 /* 1810 * All done, return the chain 1811 */ 1812 return (chain); 1813 } 1814 1815 /* 1816 * After having issued a lookup we can iterate all matching keys. 1817 * 1818 * If chain is non-NULL we continue the iteration from just after it's index. 1819 * 1820 * If chain is NULL we assume the parent was exhausted and continue the 1821 * iteration at the next parent. 1822 * 1823 * parent must be locked on entry and remains locked throughout. chain's 1824 * lock status must match flags. Chain is always at least referenced. 1825 * 1826 * WARNING! The MATCHIND flag does not apply to this function. 1827 */ 1828 hammer2_chain_t * 1829 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain, 1830 hammer2_key_t *key_nextp, 1831 hammer2_key_t key_beg, hammer2_key_t key_end, 1832 int *cache_indexp, int flags) 1833 { 1834 hammer2_chain_t *parent; 1835 int how_maybe; 1836 int ddflag; 1837 1838 /* 1839 * Calculate locking flags for upward recursion. 1840 */ 1841 how_maybe = HAMMER2_RESOLVE_MAYBE; 1842 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) 1843 how_maybe |= HAMMER2_RESOLVE_SHARED; 1844 1845 parent = *parentp; 1846 1847 /* 1848 * Calculate the next index and recalculate the parent if necessary. 1849 */ 1850 if (chain) { 1851 key_beg = chain->bref.key + 1852 ((hammer2_key_t)1 << chain->bref.keybits); 1853 if (flags & HAMMER2_LOOKUP_NOLOCK) 1854 hammer2_chain_drop(chain); 1855 else 1856 hammer2_chain_unlock(chain); 1857 1858 /* 1859 * Any scan where the lookup returned degenerate data embedded 1860 * in the inode has an invalid index and must terminate. 1861 */ 1862 if (chain == parent) 1863 return(NULL); 1864 if (key_beg == 0 || key_beg > key_end) 1865 return(NULL); 1866 chain = NULL; 1867 } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 1868 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 1869 /* 1870 * We reached the end of the iteration. 1871 */ 1872 return (NULL); 1873 } else { 1874 /* 1875 * Continue iteration with next parent unless the current 1876 * parent covers the range. 1877 */ 1878 key_beg = parent->bref.key + 1879 ((hammer2_key_t)1 << parent->bref.keybits); 1880 if (key_beg == 0 || key_beg > key_end) 1881 return (NULL); 1882 parent = hammer2_chain_getparent(parentp, how_maybe); 1883 } 1884 1885 /* 1886 * And execute 1887 */ 1888 return (hammer2_chain_lookup(parentp, key_nextp, 1889 key_beg, key_end, 1890 cache_indexp, flags, &ddflag)); 1891 } 1892 1893 /* 1894 * The raw scan function is similar to lookup/next but does not seek to a key. 1895 * Blockrefs are iterated via first_chain = (parent, NULL) and 1896 * next_chain = (parent, chain). 1897 * 1898 * The passed-in parent must be locked and its data resolved. The returned 1899 * chain will be locked. Pass chain == NULL to acquire the first sub-chain 1900 * under parent and then iterate with the passed-in chain (which this 1901 * function will unlock). 1902 */ 1903 hammer2_chain_t * 1904 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t *chain, 1905 int *cache_indexp, int flags) 1906 { 1907 hammer2_mount_t *hmp; 1908 hammer2_blockref_t *base; 1909 hammer2_blockref_t *bref; 1910 hammer2_blockref_t bcopy; 1911 hammer2_key_t key; 1912 hammer2_key_t next_key; 1913 int count = 0; 1914 int how_always = HAMMER2_RESOLVE_ALWAYS; 1915 int how_maybe = HAMMER2_RESOLVE_MAYBE; 1916 int how; 1917 int generation; 1918 int maxloops = 300000; 1919 1920 hmp = parent->hmp; 1921 1922 /* 1923 * Scan flags borrowed from lookup 1924 */ 1925 if (flags & HAMMER2_LOOKUP_ALWAYS) { 1926 how_maybe = how_always; 1927 how = HAMMER2_RESOLVE_ALWAYS; 1928 } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) { 1929 how = HAMMER2_RESOLVE_NEVER; 1930 } else { 1931 how = HAMMER2_RESOLVE_MAYBE; 1932 } 1933 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) { 1934 how_maybe |= HAMMER2_RESOLVE_SHARED; 1935 how_always |= HAMMER2_RESOLVE_SHARED; 1936 how |= HAMMER2_RESOLVE_SHARED; 1937 } 1938 1939 /* 1940 * Calculate key to locate first/next element, unlocking the previous 1941 * element as we go. Be careful, the key calculation can overflow. 1942 */ 1943 if (chain) { 1944 key = chain->bref.key + 1945 ((hammer2_key_t)1 << chain->bref.keybits); 1946 hammer2_chain_unlock(chain); 1947 chain = NULL; 1948 if (key == 0) 1949 goto done; 1950 } else { 1951 key = 0; 1952 } 1953 1954 again: 1955 if (--maxloops == 0) 1956 panic("hammer2_chain_scan: maxloops"); 1957 /* 1958 * Locate the blockref array. Currently we do a fully associative 1959 * search through the array. 1960 */ 1961 switch(parent->bref.type) { 1962 case HAMMER2_BREF_TYPE_INODE: 1963 /* 1964 * An inode with embedded data has no sub-chains. 1965 */ 1966 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) 1967 goto done; 1968 base = &parent->data->ipdata.u.blockset.blockref[0]; 1969 count = HAMMER2_SET_COUNT; 1970 break; 1971 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1972 case HAMMER2_BREF_TYPE_INDIRECT: 1973 /* 1974 * Optimize indirect blocks in the INITIAL state to avoid 1975 * I/O. 1976 */ 1977 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1978 base = NULL; 1979 } else { 1980 if (parent->data == NULL) 1981 panic("parent->data is NULL"); 1982 base = &parent->data->npdata[0]; 1983 } 1984 count = parent->bytes / sizeof(hammer2_blockref_t); 1985 break; 1986 case HAMMER2_BREF_TYPE_VOLUME: 1987 base = &hmp->voldata.sroot_blockset.blockref[0]; 1988 count = HAMMER2_SET_COUNT; 1989 break; 1990 case HAMMER2_BREF_TYPE_FREEMAP: 1991 base = &hmp->voldata.freemap_blockset.blockref[0]; 1992 count = HAMMER2_SET_COUNT; 1993 break; 1994 default: 1995 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 1996 parent->bref.type); 1997 base = NULL; /* safety */ 1998 count = 0; /* safety */ 1999 } 2000 2001 /* 2002 * Merged scan to find next candidate. 2003 * 2004 * hammer2_base_*() functions require the parent->core.live_* fields 2005 * to be synchronized. 2006 * 2007 * We need to hold the spinlock to access the block array and RB tree 2008 * and to interlock chain creation. 2009 */ 2010 if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0) 2011 hammer2_chain_countbrefs(parent, base, count); 2012 2013 next_key = 0; 2014 spin_lock(&parent->core.cst.spin); 2015 chain = hammer2_combined_find(parent, base, count, 2016 cache_indexp, &next_key, 2017 key, HAMMER2_KEY_MAX, 2018 &bref); 2019 generation = parent->core.generation; 2020 2021 /* 2022 * Exhausted parent chain, we're done. 2023 */ 2024 if (bref == NULL) { 2025 spin_unlock(&parent->core.cst.spin); 2026 KKASSERT(chain == NULL); 2027 goto done; 2028 } 2029 2030 /* 2031 * Selected from blockref or in-memory chain. 2032 */ 2033 if (chain == NULL) { 2034 bcopy = *bref; 2035 spin_unlock(&parent->core.cst.spin); 2036 chain = hammer2_chain_get(parent, generation, &bcopy); 2037 if (chain == NULL) { 2038 kprintf("retry scan parent %p keys %016jx\n", 2039 parent, key); 2040 goto again; 2041 } 2042 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 2043 hammer2_chain_drop(chain); 2044 chain = NULL; 2045 goto again; 2046 } 2047 } else { 2048 hammer2_chain_ref(chain); 2049 spin_unlock(&parent->core.cst.spin); 2050 } 2051 2052 /* 2053 * chain is referenced but not locked. We must lock the chain 2054 * to obtain definitive DUPLICATED/DELETED state 2055 */ 2056 hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF); 2057 2058 /* 2059 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 2060 * 2061 * NOTE: chain's key range is not relevant as there might be 2062 * one-offs within the range that are not deleted. 2063 * 2064 * NOTE: XXX this could create problems with scans used in 2065 * situations other than mount-time recovery. 2066 * 2067 * NOTE: Lookups can race delete-duplicate because 2068 * delete-duplicate does not lock the parent's core 2069 * (they just use the spinlock on the core). We must 2070 * check for races by comparing the DUPLICATED flag before 2071 * releasing the spinlock with the flag after locking the 2072 * chain. 2073 */ 2074 if (chain->flags & HAMMER2_CHAIN_DELETED) { 2075 hammer2_chain_unlock(chain); 2076 chain = NULL; 2077 2078 key = next_key; 2079 if (key == 0) 2080 goto done; 2081 goto again; 2082 } 2083 2084 done: 2085 /* 2086 * All done, return the chain or NULL 2087 */ 2088 return (chain); 2089 } 2090 2091 /* 2092 * Create and return a new hammer2 system memory structure of the specified 2093 * key, type and size and insert it under (*parentp). This is a full 2094 * insertion, based on the supplied key/keybits, and may involve creating 2095 * indirect blocks and moving other chains around via delete/duplicate. 2096 * 2097 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION 2098 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 2099 * FULL. This typically means that the caller is creating the chain after 2100 * doing a hammer2_chain_lookup(). 2101 * 2102 * (*parentp) must be exclusive locked and may be replaced on return 2103 * depending on how much work the function had to do. 2104 * 2105 * (*chainp) usually starts out NULL and returns the newly created chain, 2106 * but if the caller desires the caller may allocate a disconnected chain 2107 * and pass it in instead. 2108 * 2109 * This function should NOT be used to insert INDIRECT blocks. It is 2110 * typically used to create/insert inodes and data blocks. 2111 * 2112 * Caller must pass-in an exclusively locked parent the new chain is to 2113 * be inserted under, and optionally pass-in a disconnected, exclusively 2114 * locked chain to insert (else we create a new chain). The function will 2115 * adjust (*parentp) as necessary, create or connect the chain, and 2116 * return an exclusively locked chain in *chainp. 2117 */ 2118 int 2119 hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp, 2120 hammer2_chain_t **chainp, hammer2_pfsmount_t *pmp, 2121 hammer2_key_t key, int keybits, int type, size_t bytes, 2122 int flags) 2123 { 2124 hammer2_mount_t *hmp; 2125 hammer2_chain_t *chain; 2126 hammer2_chain_t *parent; 2127 hammer2_blockref_t *base; 2128 hammer2_blockref_t dummy; 2129 int allocated = 0; 2130 int error = 0; 2131 int count; 2132 int maxloops = 300000; 2133 2134 /* 2135 * Topology may be crossing a PFS boundary. 2136 */ 2137 parent = *parentp; 2138 KKASSERT(ccms_thread_lock_owned(&parent->core.cst)); 2139 hmp = parent->hmp; 2140 chain = *chainp; 2141 2142 if (chain == NULL) { 2143 /* 2144 * First allocate media space and construct the dummy bref, 2145 * then allocate the in-memory chain structure. Set the 2146 * INITIAL flag for fresh chains which do not have embedded 2147 * data. 2148 */ 2149 bzero(&dummy, sizeof(dummy)); 2150 dummy.type = type; 2151 dummy.key = key; 2152 dummy.keybits = keybits; 2153 dummy.data_off = hammer2_getradix(bytes); 2154 dummy.methods = parent->bref.methods; 2155 chain = hammer2_chain_alloc(hmp, pmp, trans, &dummy); 2156 hammer2_chain_core_alloc(trans, chain); 2157 2158 /* 2159 * Lock the chain manually, chain_lock will load the chain 2160 * which we do NOT want to do. (note: chain->refs is set 2161 * to 1 by chain_alloc() for us, but lockcnt is not). 2162 */ 2163 chain->lockcnt = 1; 2164 ccms_thread_lock(&chain->core.cst, CCMS_STATE_EXCLUSIVE); 2165 allocated = 1; 2166 2167 /* 2168 * We do NOT set INITIAL here (yet). INITIAL is only 2169 * used for indirect blocks. 2170 * 2171 * Recalculate bytes to reflect the actual media block 2172 * allocation. 2173 */ 2174 bytes = (hammer2_off_t)1 << 2175 (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 2176 chain->bytes = bytes; 2177 2178 switch(type) { 2179 case HAMMER2_BREF_TYPE_VOLUME: 2180 case HAMMER2_BREF_TYPE_FREEMAP: 2181 panic("hammer2_chain_create: called with volume type"); 2182 break; 2183 case HAMMER2_BREF_TYPE_INDIRECT: 2184 panic("hammer2_chain_create: cannot be used to" 2185 "create indirect block"); 2186 break; 2187 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2188 panic("hammer2_chain_create: cannot be used to" 2189 "create freemap root or node"); 2190 break; 2191 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2192 KKASSERT(bytes == sizeof(chain->data->bmdata)); 2193 /* fall through */ 2194 case HAMMER2_BREF_TYPE_INODE: 2195 case HAMMER2_BREF_TYPE_DATA: 2196 default: 2197 /* 2198 * leave chain->data NULL, set INITIAL 2199 */ 2200 KKASSERT(chain->data == NULL); 2201 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 2202 break; 2203 } 2204 2205 /* 2206 * Set statistics for pending updates. These will be 2207 * synchronized by the flush code. 2208 */ 2209 switch(type) { 2210 case HAMMER2_BREF_TYPE_INODE: 2211 chain->inode_count = 1; 2212 break; 2213 case HAMMER2_BREF_TYPE_DATA: 2214 case HAMMER2_BREF_TYPE_INDIRECT: 2215 chain->data_count = chain->bytes; 2216 break; 2217 } 2218 } else { 2219 /* 2220 * We are reattaching a previously deleted chain, possibly 2221 * under a new parent and possibly with a new key/keybits. 2222 * The chain does not have to be in a modified state. The 2223 * UPDATE flag will be set later on in this routine. 2224 * 2225 * Do NOT mess with the current state of the INITIAL flag. 2226 */ 2227 chain->bref.key = key; 2228 chain->bref.keybits = keybits; 2229 if (chain->flags & HAMMER2_CHAIN_DELETED) 2230 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 2231 KKASSERT(chain->parent == NULL); 2232 } 2233 2234 /* 2235 * Calculate how many entries we have in the blockref array and 2236 * determine if an indirect block is required. 2237 */ 2238 again: 2239 if (--maxloops == 0) 2240 panic("hammer2_chain_create: maxloops"); 2241 2242 switch(parent->bref.type) { 2243 case HAMMER2_BREF_TYPE_INODE: 2244 KKASSERT((parent->data->ipdata.op_flags & 2245 HAMMER2_OPFLAG_DIRECTDATA) == 0); 2246 KKASSERT(parent->data != NULL); 2247 base = &parent->data->ipdata.u.blockset.blockref[0]; 2248 count = HAMMER2_SET_COUNT; 2249 break; 2250 case HAMMER2_BREF_TYPE_INDIRECT: 2251 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2252 if (parent->flags & HAMMER2_CHAIN_INITIAL) 2253 base = NULL; 2254 else 2255 base = &parent->data->npdata[0]; 2256 count = parent->bytes / sizeof(hammer2_blockref_t); 2257 break; 2258 case HAMMER2_BREF_TYPE_VOLUME: 2259 KKASSERT(parent->data != NULL); 2260 base = &hmp->voldata.sroot_blockset.blockref[0]; 2261 count = HAMMER2_SET_COUNT; 2262 break; 2263 case HAMMER2_BREF_TYPE_FREEMAP: 2264 KKASSERT(parent->data != NULL); 2265 base = &hmp->voldata.freemap_blockset.blockref[0]; 2266 count = HAMMER2_SET_COUNT; 2267 break; 2268 default: 2269 panic("hammer2_chain_create: unrecognized blockref type: %d", 2270 parent->bref.type); 2271 base = NULL; 2272 count = 0; 2273 break; 2274 } 2275 2276 /* 2277 * Make sure we've counted the brefs 2278 */ 2279 if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0) 2280 hammer2_chain_countbrefs(parent, base, count); 2281 2282 KKASSERT(parent->core.live_count >= 0 && 2283 parent->core.live_count <= count); 2284 2285 /* 2286 * If no free blockref could be found we must create an indirect 2287 * block and move a number of blockrefs into it. With the parent 2288 * locked we can safely lock each child in order to delete+duplicate 2289 * it without causing a deadlock. 2290 * 2291 * This may return the new indirect block or the old parent depending 2292 * on where the key falls. NULL is returned on error. 2293 */ 2294 if (parent->core.live_count == count) { 2295 hammer2_chain_t *nparent; 2296 2297 nparent = hammer2_chain_create_indirect(trans, parent, 2298 key, keybits, 2299 type, &error); 2300 if (nparent == NULL) { 2301 if (allocated) 2302 hammer2_chain_drop(chain); 2303 chain = NULL; 2304 goto done; 2305 } 2306 if (parent != nparent) { 2307 hammer2_chain_unlock(parent); 2308 parent = *parentp = nparent; 2309 } 2310 goto again; 2311 } 2312 2313 /* 2314 * Link the chain into its parent. 2315 */ 2316 if (chain->parent != NULL) 2317 panic("hammer2: hammer2_chain_create: chain already connected"); 2318 KKASSERT(chain->parent == NULL); 2319 hammer2_chain_insert(parent, chain, 2320 HAMMER2_CHAIN_INSERT_SPIN | 2321 HAMMER2_CHAIN_INSERT_LIVE, 2322 0); 2323 2324 if (allocated) { 2325 /* 2326 * Mark the newly created chain modified. This will cause 2327 * UPDATE to be set. 2328 * 2329 * Device buffers are not instantiated for DATA elements 2330 * as these are handled by logical buffers. 2331 * 2332 * Indirect and freemap node indirect blocks are handled 2333 * by hammer2_chain_create_indirect() and not by this 2334 * function. 2335 * 2336 * Data for all other bref types is expected to be 2337 * instantiated (INODE, LEAF). 2338 */ 2339 switch(chain->bref.type) { 2340 case HAMMER2_BREF_TYPE_DATA: 2341 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2342 case HAMMER2_BREF_TYPE_INODE: 2343 hammer2_chain_modify(trans, chain, 2344 HAMMER2_MODIFY_OPTDATA); 2345 break; 2346 default: 2347 /* 2348 * Remaining types are not supported by this function. 2349 * In particular, INDIRECT and LEAF_NODE types are 2350 * handled by create_indirect(). 2351 */ 2352 panic("hammer2_chain_create: bad type: %d", 2353 chain->bref.type); 2354 /* NOT REACHED */ 2355 break; 2356 } 2357 } else { 2358 /* 2359 * When reconnecting a chain we must set UPDATE and 2360 * setflush so the flush recognizes that it must update 2361 * the bref in the parent. 2362 */ 2363 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) { 2364 hammer2_chain_ref(chain); 2365 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 2366 } 2367 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && 2368 (flags & HAMMER2_INSERT_NOSTATS) == 0) { 2369 KKASSERT(chain->data); 2370 chain->inode_count_up += 2371 chain->data->ipdata.inode_count; 2372 chain->data_count_up += 2373 chain->data->ipdata.data_count; 2374 } 2375 } 2376 2377 /* 2378 * We must setflush(parent) to ensure that it recurses through to 2379 * chain. setflush(chain) might not work because ONFLUSH is possibly 2380 * already set in the chain (so it won't recurse up to set it in the 2381 * parent). 2382 */ 2383 hammer2_chain_setflush(trans, parent); 2384 2385 done: 2386 *chainp = chain; 2387 2388 return (error); 2389 } 2390 2391 /* 2392 * Move the chain from its old parent to a new parent. The chain must have 2393 * already been deleted or already disconnected (or never associated) with 2394 * a parent. The chain is reassociated with the new parent and the deleted 2395 * flag will be cleared (no longer deleted). The chain's modification state 2396 * is not altered. 2397 * 2398 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 2399 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 2400 * FULL. This typically means that the caller is creating the chain after 2401 * doing a hammer2_chain_lookup(). 2402 * 2403 * A non-NULL bref is typically passed when key and keybits must be overridden. 2404 * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields 2405 * from a passed-in bref and uses the old chain's bref for everything else. 2406 * 2407 * If (parent) is non-NULL then the new duplicated chain is inserted under 2408 * the parent. 2409 * 2410 * If (parent) is NULL then the newly duplicated chain is not inserted 2411 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 2412 * passing into hammer2_chain_create() after this function returns). 2413 * 2414 * WARNING! This function calls create which means it can insert indirect 2415 * blocks. This can cause other unrelated chains in the parent to 2416 * be moved to a newly inserted indirect block in addition to the 2417 * specific chain. 2418 */ 2419 void 2420 hammer2_chain_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref, 2421 hammer2_chain_t **parentp, hammer2_chain_t *chain, 2422 int flags) 2423 { 2424 hammer2_mount_t *hmp; 2425 hammer2_chain_t *parent; 2426 size_t bytes; 2427 2428 /* 2429 * WARNING! We should never resolve DATA to device buffers 2430 * (XXX allow it if the caller did?), and since 2431 * we currently do not have the logical buffer cache 2432 * buffer in-hand to fix its cached physical offset 2433 * we also force the modify code to not COW it. XXX 2434 */ 2435 hmp = chain->hmp; 2436 KKASSERT(chain->parent == NULL); 2437 2438 /* 2439 * Now create a duplicate of the chain structure, associating 2440 * it with the same core, making it the same size, pointing it 2441 * to the same bref (the same media block). 2442 */ 2443 if (bref == NULL) 2444 bref = &chain->bref; 2445 bytes = (hammer2_off_t)1 << 2446 (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 2447 2448 /* 2449 * If parent is not NULL the duplicated chain will be entered under 2450 * the parent and the UPDATE bit set to tell flush to update 2451 * the blockref. 2452 * 2453 * We must setflush(parent) to ensure that it recurses through to 2454 * chain. setflush(chain) might not work because ONFLUSH is possibly 2455 * already set in the chain (so it won't recurse up to set it in the 2456 * parent). 2457 * 2458 * Having both chains locked is extremely important for atomicy. 2459 */ 2460 if (parentp && (parent = *parentp) != NULL) { 2461 KKASSERT(ccms_thread_lock_owned(&parent->core.cst)); 2462 KKASSERT(parent->refs > 0); 2463 2464 hammer2_chain_create(trans, parentp, &chain, chain->pmp, 2465 bref->key, bref->keybits, bref->type, 2466 chain->bytes, flags); 2467 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE); 2468 hammer2_chain_setflush(trans, *parentp); 2469 } 2470 } 2471 2472 /* 2473 * Helper function for deleting chains. 2474 * 2475 * The chain is removed from the live view (the RBTREE) as well as the parent's 2476 * blockmap. Both chain and its parent must be locked. 2477 */ 2478 static void 2479 _hammer2_chain_delete_helper(hammer2_trans_t *trans, 2480 hammer2_chain_t *parent, hammer2_chain_t *chain, 2481 int flags) 2482 { 2483 hammer2_mount_t *hmp; 2484 2485 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0); 2486 hmp = chain->hmp; 2487 2488 if (chain->flags & HAMMER2_CHAIN_BMAPPED) { 2489 /* 2490 * Chain is blockmapped, so there must be a parent. 2491 * Atomically remove the chain from the parent and remove 2492 * the blockmap entry. 2493 */ 2494 hammer2_blockref_t *base; 2495 int count; 2496 2497 KKASSERT(parent != NULL); 2498 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0); 2499 hammer2_chain_modify(trans, parent, 2500 HAMMER2_MODIFY_OPTDATA); 2501 2502 /* 2503 * Calculate blockmap pointer 2504 */ 2505 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 2506 spin_lock(&parent->core.cst.spin); 2507 2508 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 2509 atomic_add_int(&parent->core.live_count, -1); 2510 ++parent->core.generation; 2511 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 2512 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 2513 --parent->core.chain_count; 2514 chain->parent = NULL; 2515 2516 switch(parent->bref.type) { 2517 case HAMMER2_BREF_TYPE_INODE: 2518 /* 2519 * Access the inode's block array. However, there 2520 * is no block array if the inode is flagged 2521 * DIRECTDATA. The DIRECTDATA case typicaly only 2522 * occurs when a hardlink has been shifted up the 2523 * tree and the original inode gets replaced with 2524 * an OBJTYPE_HARDLINK placeholding inode. 2525 */ 2526 if (parent->data && 2527 (parent->data->ipdata.op_flags & 2528 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 2529 base = 2530 &parent->data->ipdata.u.blockset.blockref[0]; 2531 } else { 2532 base = NULL; 2533 } 2534 count = HAMMER2_SET_COUNT; 2535 break; 2536 case HAMMER2_BREF_TYPE_INDIRECT: 2537 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2538 if (parent->data) 2539 base = &parent->data->npdata[0]; 2540 else 2541 base = NULL; 2542 count = parent->bytes / sizeof(hammer2_blockref_t); 2543 break; 2544 case HAMMER2_BREF_TYPE_VOLUME: 2545 base = &hmp->voldata.sroot_blockset.blockref[0]; 2546 count = HAMMER2_SET_COUNT; 2547 break; 2548 case HAMMER2_BREF_TYPE_FREEMAP: 2549 base = &parent->data->npdata[0]; 2550 count = HAMMER2_SET_COUNT; 2551 break; 2552 default: 2553 base = NULL; 2554 count = 0; 2555 panic("hammer2_flush_pass2: " 2556 "unrecognized blockref type: %d", 2557 parent->bref.type); 2558 } 2559 2560 /* 2561 * delete blockmapped chain from its parent. 2562 * 2563 * The parent is not affected by any statistics in chain 2564 * which are pending synchronization. That is, there is 2565 * nothing to undo in the parent since they have not yet 2566 * been incorporated into the parent. 2567 * 2568 * The parent is affected by statistics stored in inodes. 2569 * Those have already been synchronized, so they must be 2570 * undone. XXX split update possible w/delete in middle? 2571 */ 2572 if (base) { 2573 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && 2574 (flags & HAMMER2_DELETE_NOSTATS) == 0) { 2575 KKASSERT(chain->data != NULL); 2576 parent->data_count -= 2577 chain->data->ipdata.data_count; 2578 parent->inode_count -= 2579 chain->data->ipdata.inode_count; 2580 } 2581 2582 int cache_index = -1; 2583 hammer2_base_delete(trans, parent, base, count, 2584 &cache_index, chain); 2585 } 2586 spin_unlock(&parent->core.cst.spin); 2587 } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 2588 /* 2589 * Chain is not blockmapped but a parent is present. 2590 * Atomically remove the chain from the parent. There is 2591 * no blockmap entry to remove. 2592 * 2593 * Because chain was associated with a parent but not 2594 * synchronized, the chain's *_count_up fields contain 2595 * inode adjustment statistics which must be undone. 2596 */ 2597 spin_lock(&parent->core.cst.spin); 2598 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && 2599 (flags & HAMMER2_DELETE_NOSTATS) == 0) { 2600 KKASSERT(chain->data != NULL); 2601 chain->data_count_up -= 2602 chain->data->ipdata.data_count; 2603 chain->inode_count_up -= 2604 chain->data->ipdata.inode_count; 2605 } 2606 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 2607 atomic_add_int(&parent->core.live_count, -1); 2608 ++parent->core.generation; 2609 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 2610 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 2611 --parent->core.chain_count; 2612 chain->parent = NULL; 2613 spin_unlock(&parent->core.cst.spin); 2614 } else { 2615 /* 2616 * Chain is not blockmapped and has no parent. This 2617 * is a degenerate case. 2618 */ 2619 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 2620 } 2621 2622 /* 2623 * If the deletion is permanent (i.e. the chain is not simply being 2624 * moved within the topology), adjust the freemap to indicate that 2625 * the block *might* be freeable. bulkfree must still determine 2626 * that it is actually freeable. 2627 */ 2628 if ((flags & HAMMER2_DELETE_PERMANENT) && 2629 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE && 2630 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP_LEAF && 2631 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 2632 hammer2_freemap_adjust(trans, hmp, &chain->bref, 2633 HAMMER2_FREEMAP_DOMAYFREE); 2634 } 2635 } 2636 2637 /* 2638 * Create an indirect block that covers one or more of the elements in the 2639 * current parent. Either returns the existing parent with no locking or 2640 * ref changes or returns the new indirect block locked and referenced 2641 * and leaving the original parent lock/ref intact as well. 2642 * 2643 * If an error occurs, NULL is returned and *errorp is set to the error. 2644 * 2645 * The returned chain depends on where the specified key falls. 2646 * 2647 * The key/keybits for the indirect mode only needs to follow three rules: 2648 * 2649 * (1) That all elements underneath it fit within its key space and 2650 * 2651 * (2) That all elements outside it are outside its key space. 2652 * 2653 * (3) When creating the new indirect block any elements in the current 2654 * parent that fit within the new indirect block's keyspace must be 2655 * moved into the new indirect block. 2656 * 2657 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider 2658 * keyspace the the current parent, but lookup/iteration rules will 2659 * ensure (and must ensure) that rule (2) for all parents leading up 2660 * to the nearest inode or the root volume header is adhered to. This 2661 * is accomplished by always recursing through matching keyspaces in 2662 * the hammer2_chain_lookup() and hammer2_chain_next() API. 2663 * 2664 * The current implementation calculates the current worst-case keyspace by 2665 * iterating the current parent and then divides it into two halves, choosing 2666 * whichever half has the most elements (not necessarily the half containing 2667 * the requested key). 2668 * 2669 * We can also opt to use the half with the least number of elements. This 2670 * causes lower-numbered keys (aka logical file offsets) to recurse through 2671 * fewer indirect blocks and higher-numbered keys to recurse through more. 2672 * This also has the risk of not moving enough elements to the new indirect 2673 * block and being forced to create several indirect blocks before the element 2674 * can be inserted. 2675 * 2676 * Must be called with an exclusively locked parent. 2677 */ 2678 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent, 2679 hammer2_key_t *keyp, int keybits, 2680 hammer2_blockref_t *base, int count); 2681 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent, 2682 hammer2_key_t *keyp, int keybits, 2683 hammer2_blockref_t *base, int count); 2684 static 2685 hammer2_chain_t * 2686 hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent, 2687 hammer2_key_t create_key, int create_bits, 2688 int for_type, int *errorp) 2689 { 2690 hammer2_mount_t *hmp; 2691 hammer2_blockref_t *base; 2692 hammer2_blockref_t *bref; 2693 hammer2_blockref_t bcopy; 2694 hammer2_chain_t *chain; 2695 hammer2_chain_t *ichain; 2696 hammer2_chain_t dummy; 2697 hammer2_key_t key = create_key; 2698 hammer2_key_t key_beg; 2699 hammer2_key_t key_end; 2700 hammer2_key_t key_next; 2701 int keybits = create_bits; 2702 int count; 2703 int nbytes; 2704 int cache_index; 2705 int loops; 2706 int reason; 2707 int generation; 2708 int maxloops = 300000; 2709 2710 /* 2711 * Calculate the base blockref pointer or NULL if the chain 2712 * is known to be empty. We need to calculate the array count 2713 * for RB lookups either way. 2714 */ 2715 hmp = parent->hmp; 2716 *errorp = 0; 2717 KKASSERT(ccms_thread_lock_owned(&parent->core.cst)); 2718 2719 /*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/ 2720 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2721 base = NULL; 2722 2723 switch(parent->bref.type) { 2724 case HAMMER2_BREF_TYPE_INODE: 2725 count = HAMMER2_SET_COUNT; 2726 break; 2727 case HAMMER2_BREF_TYPE_INDIRECT: 2728 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2729 count = parent->bytes / sizeof(hammer2_blockref_t); 2730 break; 2731 case HAMMER2_BREF_TYPE_VOLUME: 2732 count = HAMMER2_SET_COUNT; 2733 break; 2734 case HAMMER2_BREF_TYPE_FREEMAP: 2735 count = HAMMER2_SET_COUNT; 2736 break; 2737 default: 2738 panic("hammer2_chain_create_indirect: " 2739 "unrecognized blockref type: %d", 2740 parent->bref.type); 2741 count = 0; 2742 break; 2743 } 2744 } else { 2745 switch(parent->bref.type) { 2746 case HAMMER2_BREF_TYPE_INODE: 2747 base = &parent->data->ipdata.u.blockset.blockref[0]; 2748 count = HAMMER2_SET_COUNT; 2749 break; 2750 case HAMMER2_BREF_TYPE_INDIRECT: 2751 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2752 base = &parent->data->npdata[0]; 2753 count = parent->bytes / sizeof(hammer2_blockref_t); 2754 break; 2755 case HAMMER2_BREF_TYPE_VOLUME: 2756 base = &hmp->voldata.sroot_blockset.blockref[0]; 2757 count = HAMMER2_SET_COUNT; 2758 break; 2759 case HAMMER2_BREF_TYPE_FREEMAP: 2760 base = &hmp->voldata.freemap_blockset.blockref[0]; 2761 count = HAMMER2_SET_COUNT; 2762 break; 2763 default: 2764 panic("hammer2_chain_create_indirect: " 2765 "unrecognized blockref type: %d", 2766 parent->bref.type); 2767 count = 0; 2768 break; 2769 } 2770 } 2771 2772 /* 2773 * dummy used in later chain allocation (no longer used for lookups). 2774 */ 2775 bzero(&dummy, sizeof(dummy)); 2776 2777 /* 2778 * When creating an indirect block for a freemap node or leaf 2779 * the key/keybits must be fitted to static radix levels because 2780 * particular radix levels use particular reserved blocks in the 2781 * related zone. 2782 * 2783 * This routine calculates the key/radix of the indirect block 2784 * we need to create, and whether it is on the high-side or the 2785 * low-side. 2786 */ 2787 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 2788 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 2789 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits, 2790 base, count); 2791 } else { 2792 keybits = hammer2_chain_indkey_normal(parent, &key, keybits, 2793 base, count); 2794 } 2795 2796 /* 2797 * Normalize the key for the radix being represented, keeping the 2798 * high bits and throwing away the low bits. 2799 */ 2800 key &= ~(((hammer2_key_t)1 << keybits) - 1); 2801 2802 /* 2803 * How big should our new indirect block be? It has to be at least 2804 * as large as its parent. 2805 */ 2806 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) 2807 nbytes = HAMMER2_IND_BYTES_MIN; 2808 else 2809 nbytes = HAMMER2_IND_BYTES_MAX; 2810 if (nbytes < count * sizeof(hammer2_blockref_t)) 2811 nbytes = count * sizeof(hammer2_blockref_t); 2812 2813 /* 2814 * Ok, create our new indirect block 2815 */ 2816 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 2817 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 2818 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE; 2819 } else { 2820 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT; 2821 } 2822 dummy.bref.key = key; 2823 dummy.bref.keybits = keybits; 2824 dummy.bref.data_off = hammer2_getradix(nbytes); 2825 dummy.bref.methods = parent->bref.methods; 2826 2827 ichain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy.bref); 2828 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL); 2829 hammer2_chain_core_alloc(trans, ichain); 2830 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE); 2831 hammer2_chain_drop(ichain); /* excess ref from alloc */ 2832 2833 /* 2834 * We have to mark it modified to allocate its block, but use 2835 * OPTDATA to allow it to remain in the INITIAL state. Otherwise 2836 * it won't be acted upon by the flush code. 2837 */ 2838 hammer2_chain_modify(trans, ichain, HAMMER2_MODIFY_OPTDATA); 2839 2840 /* 2841 * Iterate the original parent and move the matching brefs into 2842 * the new indirect block. 2843 * 2844 * XXX handle flushes. 2845 */ 2846 key_beg = 0; 2847 key_end = HAMMER2_KEY_MAX; 2848 cache_index = 0; 2849 spin_lock(&parent->core.cst.spin); 2850 loops = 0; 2851 reason = 0; 2852 2853 for (;;) { 2854 if (++loops > 100000) { 2855 spin_unlock(&parent->core.cst.spin); 2856 panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n", 2857 reason, parent, base, count, key_next); 2858 } 2859 2860 /* 2861 * NOTE: spinlock stays intact, returned chain (if not NULL) 2862 * is not referenced or locked which means that we 2863 * cannot safely check its flagged / deletion status 2864 * until we lock it. 2865 */ 2866 chain = hammer2_combined_find(parent, base, count, 2867 &cache_index, &key_next, 2868 key_beg, key_end, 2869 &bref); 2870 generation = parent->core.generation; 2871 if (bref == NULL) 2872 break; 2873 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 2874 2875 /* 2876 * Skip keys that are not within the key/radix of the new 2877 * indirect block. They stay in the parent. 2878 */ 2879 if ((~(((hammer2_key_t)1 << keybits) - 1) & 2880 (key ^ bref->key)) != 0) { 2881 goto next_key_spinlocked; 2882 } 2883 2884 /* 2885 * Load the new indirect block by acquiring the related 2886 * chains (potentially from media as it might not be 2887 * in-memory). Then move it to the new parent (ichain) 2888 * via DELETE-DUPLICATE. 2889 * 2890 * chain is referenced but not locked. We must lock the 2891 * chain to obtain definitive DUPLICATED/DELETED state 2892 */ 2893 if (chain) { 2894 /* 2895 * Use chain already present in the RBTREE 2896 */ 2897 hammer2_chain_ref(chain); 2898 spin_unlock(&parent->core.cst.spin); 2899 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER | 2900 HAMMER2_RESOLVE_NOREF); 2901 } else { 2902 /* 2903 * Get chain for blockref element. _get returns NULL 2904 * on insertion race. 2905 */ 2906 bcopy = *bref; 2907 spin_unlock(&parent->core.cst.spin); 2908 chain = hammer2_chain_get(parent, generation, &bcopy); 2909 if (chain == NULL) { 2910 reason = 1; 2911 spin_lock(&parent->core.cst.spin); 2912 continue; 2913 } 2914 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 2915 kprintf("REASON 2\n"); 2916 reason = 2; 2917 hammer2_chain_drop(chain); 2918 spin_lock(&parent->core.cst.spin); 2919 continue; 2920 } 2921 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER | 2922 HAMMER2_RESOLVE_NOREF); 2923 } 2924 2925 /* 2926 * This is always live so if the chain has been deleted 2927 * we raced someone and we have to retry. 2928 * 2929 * NOTE: Lookups can race delete-duplicate because 2930 * delete-duplicate does not lock the parent's core 2931 * (they just use the spinlock on the core). We must 2932 * check for races by comparing the DUPLICATED flag before 2933 * releasing the spinlock with the flag after locking the 2934 * chain. 2935 * 2936 * (note reversed logic for this one) 2937 */ 2938 if (chain->flags & HAMMER2_CHAIN_DELETED) { 2939 hammer2_chain_unlock(chain); 2940 goto next_key; 2941 } 2942 2943 /* 2944 * Shift the chain to the indirect block. 2945 * 2946 * WARNING! No reason for us to load chain data, pass NOSTATS 2947 * to prevent delete/insert from trying to access 2948 * inode stats (and thus asserting if there is no 2949 * chain->data loaded). 2950 */ 2951 hammer2_chain_delete(trans, parent, chain, 2952 HAMMER2_DELETE_NOSTATS); 2953 hammer2_chain_rename(trans, NULL, &ichain, chain, 2954 HAMMER2_INSERT_NOSTATS); 2955 hammer2_chain_unlock(chain); 2956 KKASSERT(parent->refs > 0); 2957 chain = NULL; 2958 next_key: 2959 spin_lock(&parent->core.cst.spin); 2960 next_key_spinlocked: 2961 if (--maxloops == 0) 2962 panic("hammer2_chain_create_indirect: maxloops"); 2963 reason = 4; 2964 if (key_next == 0 || key_next > key_end) 2965 break; 2966 key_beg = key_next; 2967 /* loop */ 2968 } 2969 spin_unlock(&parent->core.cst.spin); 2970 2971 /* 2972 * Insert the new indirect block into the parent now that we've 2973 * cleared out some entries in the parent. We calculated a good 2974 * insertion index in the loop above (ichain->index). 2975 * 2976 * We don't have to set UPDATE here because we mark ichain 2977 * modified down below (so the normal modified -> flush -> set-moved 2978 * sequence applies). 2979 * 2980 * The insertion shouldn't race as this is a completely new block 2981 * and the parent is locked. 2982 */ 2983 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 2984 hammer2_chain_insert(parent, ichain, 2985 HAMMER2_CHAIN_INSERT_SPIN | 2986 HAMMER2_CHAIN_INSERT_LIVE, 2987 0); 2988 2989 /* 2990 * Make sure flushes propogate after our manual insertion. 2991 */ 2992 hammer2_chain_setflush(trans, ichain); 2993 hammer2_chain_setflush(trans, parent); 2994 2995 /* 2996 * Figure out what to return. 2997 */ 2998 if (~(((hammer2_key_t)1 << keybits) - 1) & 2999 (create_key ^ key)) { 3000 /* 3001 * Key being created is outside the key range, 3002 * return the original parent. 3003 */ 3004 hammer2_chain_unlock(ichain); 3005 } else { 3006 /* 3007 * Otherwise its in the range, return the new parent. 3008 * (leave both the new and old parent locked). 3009 */ 3010 parent = ichain; 3011 } 3012 3013 return(parent); 3014 } 3015 3016 /* 3017 * Calculate the keybits and highside/lowside of the freemap node the 3018 * caller is creating. 3019 * 3020 * This routine will specify the next higher-level freemap key/radix 3021 * representing the lowest-ordered set. By doing so, eventually all 3022 * low-ordered sets will be moved one level down. 3023 * 3024 * We have to be careful here because the freemap reserves a limited 3025 * number of blocks for a limited number of levels. So we can't just 3026 * push indiscriminately. 3027 */ 3028 int 3029 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp, 3030 int keybits, hammer2_blockref_t *base, int count) 3031 { 3032 hammer2_chain_t *chain; 3033 hammer2_blockref_t *bref; 3034 hammer2_key_t key; 3035 hammer2_key_t key_beg; 3036 hammer2_key_t key_end; 3037 hammer2_key_t key_next; 3038 int cache_index; 3039 int locount; 3040 int hicount; 3041 int maxloops = 300000; 3042 3043 key = *keyp; 3044 locount = 0; 3045 hicount = 0; 3046 keybits = 64; 3047 3048 /* 3049 * Calculate the range of keys in the array being careful to skip 3050 * slots which are overridden with a deletion. 3051 */ 3052 key_beg = 0; 3053 key_end = HAMMER2_KEY_MAX; 3054 cache_index = 0; 3055 spin_lock(&parent->core.cst.spin); 3056 3057 for (;;) { 3058 if (--maxloops == 0) { 3059 panic("indkey_freemap shit %p %p:%d\n", 3060 parent, base, count); 3061 } 3062 chain = hammer2_combined_find(parent, base, count, 3063 &cache_index, &key_next, 3064 key_beg, key_end, 3065 &bref); 3066 3067 /* 3068 * Exhausted search 3069 */ 3070 if (bref == NULL) 3071 break; 3072 3073 /* 3074 * Skip deleted chains. 3075 */ 3076 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3077 if (key_next == 0 || key_next > key_end) 3078 break; 3079 key_beg = key_next; 3080 continue; 3081 } 3082 3083 /* 3084 * Use the full live (not deleted) element for the scan 3085 * iteration. HAMMER2 does not allow partial replacements. 3086 * 3087 * XXX should be built into hammer2_combined_find(). 3088 */ 3089 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3090 3091 if (keybits > bref->keybits) { 3092 key = bref->key; 3093 keybits = bref->keybits; 3094 } else if (keybits == bref->keybits && bref->key < key) { 3095 key = bref->key; 3096 } 3097 if (key_next == 0) 3098 break; 3099 key_beg = key_next; 3100 } 3101 spin_unlock(&parent->core.cst.spin); 3102 3103 /* 3104 * Return the keybits for a higher-level FREEMAP_NODE covering 3105 * this node. 3106 */ 3107 switch(keybits) { 3108 case HAMMER2_FREEMAP_LEVEL0_RADIX: 3109 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX; 3110 break; 3111 case HAMMER2_FREEMAP_LEVEL1_RADIX: 3112 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX; 3113 break; 3114 case HAMMER2_FREEMAP_LEVEL2_RADIX: 3115 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX; 3116 break; 3117 case HAMMER2_FREEMAP_LEVEL3_RADIX: 3118 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX; 3119 break; 3120 case HAMMER2_FREEMAP_LEVEL4_RADIX: 3121 panic("hammer2_chain_indkey_freemap: level too high"); 3122 break; 3123 default: 3124 panic("hammer2_chain_indkey_freemap: bad radix"); 3125 break; 3126 } 3127 *keyp = key; 3128 3129 return (keybits); 3130 } 3131 3132 /* 3133 * Calculate the keybits and highside/lowside of the indirect block the 3134 * caller is creating. 3135 */ 3136 static int 3137 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp, 3138 int keybits, hammer2_blockref_t *base, int count) 3139 { 3140 hammer2_blockref_t *bref; 3141 hammer2_chain_t *chain; 3142 hammer2_key_t key_beg; 3143 hammer2_key_t key_end; 3144 hammer2_key_t key_next; 3145 hammer2_key_t key; 3146 int nkeybits; 3147 int locount; 3148 int hicount; 3149 int cache_index; 3150 int maxloops = 300000; 3151 3152 key = *keyp; 3153 locount = 0; 3154 hicount = 0; 3155 3156 /* 3157 * Calculate the range of keys in the array being careful to skip 3158 * slots which are overridden with a deletion. Once the scan 3159 * completes we will cut the key range in half and shift half the 3160 * range into the new indirect block. 3161 */ 3162 key_beg = 0; 3163 key_end = HAMMER2_KEY_MAX; 3164 cache_index = 0; 3165 spin_lock(&parent->core.cst.spin); 3166 3167 for (;;) { 3168 if (--maxloops == 0) { 3169 panic("indkey_freemap shit %p %p:%d\n", 3170 parent, base, count); 3171 } 3172 chain = hammer2_combined_find(parent, base, count, 3173 &cache_index, &key_next, 3174 key_beg, key_end, 3175 &bref); 3176 3177 /* 3178 * Exhausted search 3179 */ 3180 if (bref == NULL) 3181 break; 3182 3183 /* 3184 * NOTE: No need to check DUPLICATED here because we do 3185 * not release the spinlock. 3186 */ 3187 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3188 if (key_next == 0 || key_next > key_end) 3189 break; 3190 key_beg = key_next; 3191 continue; 3192 } 3193 3194 /* 3195 * Use the full live (not deleted) element for the scan 3196 * iteration. HAMMER2 does not allow partial replacements. 3197 * 3198 * XXX should be built into hammer2_combined_find(). 3199 */ 3200 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3201 3202 /* 3203 * Expand our calculated key range (key, keybits) to fit 3204 * the scanned key. nkeybits represents the full range 3205 * that we will later cut in half (two halves @ nkeybits - 1). 3206 */ 3207 nkeybits = keybits; 3208 if (nkeybits < bref->keybits) { 3209 if (bref->keybits > 64) { 3210 kprintf("bad bref chain %p bref %p\n", 3211 chain, bref); 3212 Debugger("fubar"); 3213 } 3214 nkeybits = bref->keybits; 3215 } 3216 while (nkeybits < 64 && 3217 (~(((hammer2_key_t)1 << nkeybits) - 1) & 3218 (key ^ bref->key)) != 0) { 3219 ++nkeybits; 3220 } 3221 3222 /* 3223 * If the new key range is larger we have to determine 3224 * which side of the new key range the existing keys fall 3225 * under by checking the high bit, then collapsing the 3226 * locount into the hicount or vise-versa. 3227 */ 3228 if (keybits != nkeybits) { 3229 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 3230 hicount += locount; 3231 locount = 0; 3232 } else { 3233 locount += hicount; 3234 hicount = 0; 3235 } 3236 keybits = nkeybits; 3237 } 3238 3239 /* 3240 * The newly scanned key will be in the lower half or the 3241 * upper half of the (new) key range. 3242 */ 3243 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 3244 ++hicount; 3245 else 3246 ++locount; 3247 3248 if (key_next == 0) 3249 break; 3250 key_beg = key_next; 3251 } 3252 spin_unlock(&parent->core.cst.spin); 3253 bref = NULL; /* now invalid (safety) */ 3254 3255 /* 3256 * Adjust keybits to represent half of the full range calculated 3257 * above (radix 63 max) 3258 */ 3259 --keybits; 3260 3261 /* 3262 * Select whichever half contains the most elements. Theoretically 3263 * we can select either side as long as it contains at least one 3264 * element (in order to ensure that a free slot is present to hold 3265 * the indirect block). 3266 */ 3267 if (hammer2_indirect_optimize) { 3268 /* 3269 * Insert node for least number of keys, this will arrange 3270 * the first few blocks of a large file or the first few 3271 * inodes in a directory with fewer indirect blocks when 3272 * created linearly. 3273 */ 3274 if (hicount < locount && hicount != 0) 3275 key |= (hammer2_key_t)1 << keybits; 3276 else 3277 key &= ~(hammer2_key_t)1 << keybits; 3278 } else { 3279 /* 3280 * Insert node for most number of keys, best for heavily 3281 * fragmented files. 3282 */ 3283 if (hicount > locount) 3284 key |= (hammer2_key_t)1 << keybits; 3285 else 3286 key &= ~(hammer2_key_t)1 << keybits; 3287 } 3288 *keyp = key; 3289 3290 return (keybits); 3291 } 3292 3293 /* 3294 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if 3295 * it exists. 3296 * 3297 * Both parent and chain must be locked exclusively. 3298 * 3299 * This function will modify the parent if the blockref requires removal 3300 * from the parent's block table. 3301 * 3302 * This function is NOT recursive. Any entity already pushed into the 3303 * chain (such as an inode) may still need visibility into its contents, 3304 * as well as the ability to read and modify the contents. For example, 3305 * for an unlinked file which is still open. 3306 */ 3307 void 3308 hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent, 3309 hammer2_chain_t *chain, int flags) 3310 { 3311 KKASSERT(ccms_thread_lock_owned(&chain->core.cst)); 3312 3313 /* 3314 * Nothing to do if already marked. 3315 * 3316 * We need the spinlock on the core whos RBTREE contains chain 3317 * to protect against races. 3318 */ 3319 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 3320 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 3321 chain->parent == parent); 3322 _hammer2_chain_delete_helper(trans, parent, chain, flags); 3323 } 3324 3325 if (flags & HAMMER2_DELETE_PERMANENT) { 3326 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 3327 hammer2_flush(trans, chain); 3328 } else { 3329 /* XXX might not be needed */ 3330 hammer2_chain_setflush(trans, chain); 3331 } 3332 } 3333 3334 /* 3335 * Returns the index of the nearest element in the blockref array >= elm. 3336 * Returns (count) if no element could be found. 3337 * 3338 * Sets *key_nextp to the next key for loop purposes but does not modify 3339 * it if the next key would be higher than the current value of *key_nextp. 3340 * Note that *key_nexp can overflow to 0, which should be tested by the 3341 * caller. 3342 * 3343 * (*cache_indexp) is a heuristic and can be any value without effecting 3344 * the result. 3345 * 3346 * WARNING! Must be called with parent's spinlock held. Spinlock remains 3347 * held through the operation. 3348 */ 3349 static int 3350 hammer2_base_find(hammer2_chain_t *parent, 3351 hammer2_blockref_t *base, int count, 3352 int *cache_indexp, hammer2_key_t *key_nextp, 3353 hammer2_key_t key_beg, hammer2_key_t key_end) 3354 { 3355 hammer2_blockref_t *scan; 3356 hammer2_key_t scan_end; 3357 int i; 3358 int limit; 3359 3360 /* 3361 * Require the live chain's already have their core's counted 3362 * so we can optimize operations. 3363 */ 3364 KKASSERT(parent->core.flags & HAMMER2_CORE_COUNTEDBREFS); 3365 3366 /* 3367 * Degenerate case 3368 */ 3369 if (count == 0 || base == NULL) 3370 return(count); 3371 3372 /* 3373 * Sequential optimization using *cache_indexp. This is the most 3374 * likely scenario. 3375 * 3376 * We can avoid trailing empty entries on live chains, otherwise 3377 * we might have to check the whole block array. 3378 */ 3379 i = *cache_indexp; 3380 cpu_ccfence(); 3381 limit = parent->core.live_zero; 3382 if (i >= limit) 3383 i = limit - 1; 3384 if (i < 0) 3385 i = 0; 3386 KKASSERT(i < count); 3387 3388 /* 3389 * Search backwards 3390 */ 3391 scan = &base[i]; 3392 while (i > 0 && (scan->type == 0 || scan->key > key_beg)) { 3393 --scan; 3394 --i; 3395 } 3396 *cache_indexp = i; 3397 3398 /* 3399 * Search forwards, stop when we find a scan element which 3400 * encloses the key or until we know that there are no further 3401 * elements. 3402 */ 3403 while (i < count) { 3404 if (scan->type != 0) { 3405 scan_end = scan->key + 3406 ((hammer2_key_t)1 << scan->keybits) - 1; 3407 if (scan->key > key_beg || scan_end >= key_beg) 3408 break; 3409 } 3410 if (i >= limit) 3411 return (count); 3412 ++scan; 3413 ++i; 3414 } 3415 if (i != count) { 3416 *cache_indexp = i; 3417 if (i >= limit) { 3418 i = count; 3419 } else { 3420 scan_end = scan->key + 3421 ((hammer2_key_t)1 << scan->keybits); 3422 if (scan_end && (*key_nextp > scan_end || 3423 *key_nextp == 0)) { 3424 *key_nextp = scan_end; 3425 } 3426 } 3427 } 3428 return (i); 3429 } 3430 3431 /* 3432 * Do a combined search and return the next match either from the blockref 3433 * array or from the in-memory chain. Sets *bresp to the returned bref in 3434 * both cases, or sets it to NULL if the search exhausted. Only returns 3435 * a non-NULL chain if the search matched from the in-memory chain. 3436 * 3437 * When no in-memory chain has been found and a non-NULL bref is returned 3438 * in *bresp. 3439 * 3440 * 3441 * The returned chain is not locked or referenced. Use the returned bref 3442 * to determine if the search exhausted or not. Iterate if the base find 3443 * is chosen but matches a deleted chain. 3444 * 3445 * WARNING! Must be called with parent's spinlock held. Spinlock remains 3446 * held through the operation. 3447 */ 3448 static hammer2_chain_t * 3449 hammer2_combined_find(hammer2_chain_t *parent, 3450 hammer2_blockref_t *base, int count, 3451 int *cache_indexp, hammer2_key_t *key_nextp, 3452 hammer2_key_t key_beg, hammer2_key_t key_end, 3453 hammer2_blockref_t **bresp) 3454 { 3455 hammer2_blockref_t *bref; 3456 hammer2_chain_t *chain; 3457 int i; 3458 3459 /* 3460 * Lookup in block array and in rbtree. 3461 */ 3462 *key_nextp = key_end + 1; 3463 i = hammer2_base_find(parent, base, count, cache_indexp, 3464 key_nextp, key_beg, key_end); 3465 chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end); 3466 3467 /* 3468 * Neither matched 3469 */ 3470 if (i == count && chain == NULL) { 3471 *bresp = NULL; 3472 return(NULL); 3473 } 3474 3475 /* 3476 * Only chain matched. 3477 */ 3478 if (i == count) { 3479 bref = &chain->bref; 3480 goto found; 3481 } 3482 3483 /* 3484 * Only blockref matched. 3485 */ 3486 if (chain == NULL) { 3487 bref = &base[i]; 3488 goto found; 3489 } 3490 3491 /* 3492 * Both in-memory and blockref matched, select the nearer element. 3493 * 3494 * If both are flush with the left-hand side or both are the 3495 * same distance away, select the chain. In this situation the 3496 * chain must have been loaded from the matching blockmap. 3497 */ 3498 if ((chain->bref.key <= key_beg && base[i].key <= key_beg) || 3499 chain->bref.key == base[i].key) { 3500 KKASSERT(chain->bref.key == base[i].key); 3501 bref = &chain->bref; 3502 goto found; 3503 } 3504 3505 /* 3506 * Select the nearer key 3507 */ 3508 if (chain->bref.key < base[i].key) { 3509 bref = &chain->bref; 3510 } else { 3511 bref = &base[i]; 3512 chain = NULL; 3513 } 3514 3515 /* 3516 * If the bref is out of bounds we've exhausted our search. 3517 */ 3518 found: 3519 if (bref->key > key_end) { 3520 *bresp = NULL; 3521 chain = NULL; 3522 } else { 3523 *bresp = bref; 3524 } 3525 return(chain); 3526 } 3527 3528 /* 3529 * Locate the specified block array element and delete it. The element 3530 * must exist. 3531 * 3532 * The spin lock on the related chain must be held. 3533 * 3534 * NOTE: live_count was adjusted when the chain was deleted, so it does not 3535 * need to be adjusted when we commit the media change. 3536 */ 3537 void 3538 hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *parent, 3539 hammer2_blockref_t *base, int count, 3540 int *cache_indexp, hammer2_chain_t *chain) 3541 { 3542 hammer2_blockref_t *elm = &chain->bref; 3543 hammer2_key_t key_next; 3544 int i; 3545 3546 /* 3547 * Delete element. Expect the element to exist. 3548 * 3549 * XXX see caller, flush code not yet sophisticated enough to prevent 3550 * re-flushed in some cases. 3551 */ 3552 key_next = 0; /* max range */ 3553 i = hammer2_base_find(parent, base, count, cache_indexp, 3554 &key_next, elm->key, elm->key); 3555 if (i == count || base[i].type == 0 || 3556 base[i].key != elm->key || 3557 ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 && 3558 base[i].keybits != elm->keybits)) { 3559 spin_unlock(&parent->core.cst.spin); 3560 panic("delete base %p element not found at %d/%d elm %p\n", 3561 base, i, count, elm); 3562 return; 3563 } 3564 bzero(&base[i], sizeof(*base)); 3565 3566 /* 3567 * We can only optimize parent->core.live_zero for live chains. 3568 */ 3569 if (parent->core.live_zero == i + 1) { 3570 while (--i >= 0 && base[i].type == 0) 3571 ; 3572 parent->core.live_zero = i + 1; 3573 } 3574 3575 /* 3576 * Clear appropriate blockmap flags in chain. 3577 */ 3578 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED | 3579 HAMMER2_CHAIN_BMAPUPD); 3580 } 3581 3582 /* 3583 * Insert the specified element. The block array must not already have the 3584 * element and must have space available for the insertion. 3585 * 3586 * The spin lock on the related chain must be held. 3587 * 3588 * NOTE: live_count was adjusted when the chain was deleted, so it does not 3589 * need to be adjusted when we commit the media change. 3590 */ 3591 void 3592 hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent, 3593 hammer2_blockref_t *base, int count, 3594 int *cache_indexp, hammer2_chain_t *chain) 3595 { 3596 hammer2_blockref_t *elm = &chain->bref; 3597 hammer2_key_t key_next; 3598 hammer2_key_t xkey; 3599 int i; 3600 int j; 3601 int k; 3602 int l; 3603 int u = 1; 3604 3605 /* 3606 * Insert new element. Expect the element to not already exist 3607 * unless we are replacing it. 3608 * 3609 * XXX see caller, flush code not yet sophisticated enough to prevent 3610 * re-flushed in some cases. 3611 */ 3612 key_next = 0; /* max range */ 3613 i = hammer2_base_find(parent, base, count, cache_indexp, 3614 &key_next, elm->key, elm->key); 3615 3616 /* 3617 * Shortcut fill optimization, typical ordered insertion(s) may not 3618 * require a search. 3619 */ 3620 KKASSERT(i >= 0 && i <= count); 3621 3622 /* 3623 * Set appropriate blockmap flags in chain. 3624 */ 3625 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED); 3626 3627 /* 3628 * We can only optimize parent->core.live_zero for live chains. 3629 */ 3630 if (i == count && parent->core.live_zero < count) { 3631 i = parent->core.live_zero++; 3632 base[i] = *elm; 3633 return; 3634 } 3635 3636 xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1; 3637 if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) { 3638 spin_unlock(&parent->core.cst.spin); 3639 panic("insert base %p overlapping elements at %d elm %p\n", 3640 base, i, elm); 3641 } 3642 3643 /* 3644 * Try to find an empty slot before or after. 3645 */ 3646 j = i; 3647 k = i; 3648 while (j > 0 || k < count) { 3649 --j; 3650 if (j >= 0 && base[j].type == 0) { 3651 if (j == i - 1) { 3652 base[j] = *elm; 3653 } else { 3654 bcopy(&base[j+1], &base[j], 3655 (i - j - 1) * sizeof(*base)); 3656 base[i - 1] = *elm; 3657 } 3658 goto validate; 3659 } 3660 ++k; 3661 if (k < count && base[k].type == 0) { 3662 bcopy(&base[i], &base[i+1], 3663 (k - i) * sizeof(hammer2_blockref_t)); 3664 base[i] = *elm; 3665 3666 /* 3667 * We can only update parent->core.live_zero for live 3668 * chains. 3669 */ 3670 if (parent->core.live_zero <= k) 3671 parent->core.live_zero = k + 1; 3672 u = 2; 3673 goto validate; 3674 } 3675 } 3676 panic("hammer2_base_insert: no room!"); 3677 3678 /* 3679 * Debugging 3680 */ 3681 validate: 3682 key_next = 0; 3683 for (l = 0; l < count; ++l) { 3684 if (base[l].type) { 3685 key_next = base[l].key + 3686 ((hammer2_key_t)1 << base[l].keybits) - 1; 3687 break; 3688 } 3689 } 3690 while (++l < count) { 3691 if (base[l].type) { 3692 if (base[l].key <= key_next) 3693 panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l); 3694 key_next = base[l].key + 3695 ((hammer2_key_t)1 << base[l].keybits) - 1; 3696 3697 } 3698 } 3699 3700 } 3701 3702 #if 0 3703 3704 /* 3705 * Sort the blockref array for the chain. Used by the flush code to 3706 * sort the blockref[] array. 3707 * 3708 * The chain must be exclusively locked AND spin-locked. 3709 */ 3710 typedef hammer2_blockref_t *hammer2_blockref_p; 3711 3712 static 3713 int 3714 hammer2_base_sort_callback(const void *v1, const void *v2) 3715 { 3716 hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1; 3717 hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2; 3718 3719 /* 3720 * Make sure empty elements are placed at the end of the array 3721 */ 3722 if (bref1->type == 0) { 3723 if (bref2->type == 0) 3724 return(0); 3725 return(1); 3726 } else if (bref2->type == 0) { 3727 return(-1); 3728 } 3729 3730 /* 3731 * Sort by key 3732 */ 3733 if (bref1->key < bref2->key) 3734 return(-1); 3735 if (bref1->key > bref2->key) 3736 return(1); 3737 return(0); 3738 } 3739 3740 void 3741 hammer2_base_sort(hammer2_chain_t *chain) 3742 { 3743 hammer2_blockref_t *base; 3744 int count; 3745 3746 switch(chain->bref.type) { 3747 case HAMMER2_BREF_TYPE_INODE: 3748 /* 3749 * Special shortcut for embedded data returns the inode 3750 * itself. Callers must detect this condition and access 3751 * the embedded data (the strategy code does this for us). 3752 * 3753 * This is only applicable to regular files and softlinks. 3754 */ 3755 if (chain->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) 3756 return; 3757 base = &chain->data->ipdata.u.blockset.blockref[0]; 3758 count = HAMMER2_SET_COUNT; 3759 break; 3760 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3761 case HAMMER2_BREF_TYPE_INDIRECT: 3762 /* 3763 * Optimize indirect blocks in the INITIAL state to avoid 3764 * I/O. 3765 */ 3766 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0); 3767 base = &chain->data->npdata[0]; 3768 count = chain->bytes / sizeof(hammer2_blockref_t); 3769 break; 3770 case HAMMER2_BREF_TYPE_VOLUME: 3771 base = &chain->hmp->voldata.sroot_blockset.blockref[0]; 3772 count = HAMMER2_SET_COUNT; 3773 break; 3774 case HAMMER2_BREF_TYPE_FREEMAP: 3775 base = &chain->hmp->voldata.freemap_blockset.blockref[0]; 3776 count = HAMMER2_SET_COUNT; 3777 break; 3778 default: 3779 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 3780 chain->bref.type); 3781 base = NULL; /* safety */ 3782 count = 0; /* safety */ 3783 } 3784 kqsort(base, count, sizeof(*base), hammer2_base_sort_callback); 3785 } 3786 3787 #endif 3788 3789 /* 3790 * Chain memory management 3791 */ 3792 void 3793 hammer2_chain_wait(hammer2_chain_t *chain) 3794 { 3795 tsleep(chain, 0, "chnflw", 1); 3796 } 3797 3798 const hammer2_media_data_t * 3799 hammer2_chain_rdata(hammer2_chain_t *chain) 3800 { 3801 KKASSERT(chain->data != NULL); 3802 return (chain->data); 3803 } 3804 3805 hammer2_media_data_t * 3806 hammer2_chain_wdata(hammer2_chain_t *chain) 3807 { 3808 KKASSERT(chain->data != NULL); 3809 return (chain->data); 3810 } 3811 3812 /* 3813 * Set the check data for a chain. This can be a heavy-weight operation 3814 * and typically only runs on-flush. For file data check data is calculated 3815 * when the logical buffers are flushed. 3816 */ 3817 void 3818 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata) 3819 { 3820 chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO; 3821 3822 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 3823 case HAMMER2_CHECK_NONE: 3824 break; 3825 case HAMMER2_CHECK_DISABLED: 3826 break; 3827 case HAMMER2_CHECK_ISCSI32: 3828 chain->bref.check.iscsi32.value = 3829 hammer2_icrc32(bdata, chain->bytes); 3830 break; 3831 case HAMMER2_CHECK_CRC64: 3832 chain->bref.check.crc64.value = 0; 3833 /* XXX */ 3834 break; 3835 case HAMMER2_CHECK_SHA192: 3836 { 3837 SHA256_CTX hash_ctx; 3838 union { 3839 uint8_t digest[SHA256_DIGEST_LENGTH]; 3840 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 3841 } u; 3842 3843 SHA256_Init(&hash_ctx); 3844 SHA256_Update(&hash_ctx, bdata, chain->bytes); 3845 SHA256_Final(u.digest, &hash_ctx); 3846 u.digest64[2] ^= u.digest64[3]; 3847 bcopy(u.digest, 3848 chain->bref.check.sha192.data, 3849 sizeof(chain->bref.check.sha192.data)); 3850 } 3851 break; 3852 case HAMMER2_CHECK_FREEMAP: 3853 chain->bref.check.freemap.icrc32 = 3854 hammer2_icrc32(bdata, chain->bytes); 3855 break; 3856 default: 3857 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 3858 chain->bref.methods); 3859 break; 3860 } 3861 } 3862 3863 int 3864 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 3865 { 3866 int r; 3867 3868 if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO) 3869 return 1; 3870 3871 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 3872 case HAMMER2_CHECK_NONE: 3873 r = 1; 3874 break; 3875 case HAMMER2_CHECK_DISABLED: 3876 r = 1; 3877 break; 3878 case HAMMER2_CHECK_ISCSI32: 3879 r = (chain->bref.check.iscsi32.value == 3880 hammer2_icrc32(bdata, chain->bytes)); 3881 break; 3882 case HAMMER2_CHECK_CRC64: 3883 r = (chain->bref.check.crc64.value == 0); 3884 /* XXX */ 3885 break; 3886 case HAMMER2_CHECK_SHA192: 3887 { 3888 SHA256_CTX hash_ctx; 3889 union { 3890 uint8_t digest[SHA256_DIGEST_LENGTH]; 3891 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 3892 } u; 3893 3894 SHA256_Init(&hash_ctx); 3895 SHA256_Update(&hash_ctx, bdata, chain->bytes); 3896 SHA256_Final(u.digest, &hash_ctx); 3897 u.digest64[2] ^= u.digest64[3]; 3898 if (bcmp(u.digest, 3899 chain->bref.check.sha192.data, 3900 sizeof(chain->bref.check.sha192.data)) == 0) { 3901 r = 1; 3902 } else { 3903 r = 0; 3904 } 3905 } 3906 break; 3907 case HAMMER2_CHECK_FREEMAP: 3908 r = (chain->bref.check.freemap.icrc32 == 3909 hammer2_icrc32(bdata, chain->bytes)); 3910 if (r == 0) { 3911 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 3912 chain->bref.check.freemap.icrc32, 3913 hammer2_icrc32(bdata, chain->bytes), chain->bytes); 3914 if (chain->dio) 3915 kprintf("dio %p buf %016jx,%d bdata %p/%p\n", 3916 chain->dio, chain->dio->bp->b_loffset, chain->dio->bp->b_bufsize, bdata, chain->dio->bp->b_data); 3917 } 3918 3919 break; 3920 default: 3921 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 3922 chain->bref.methods); 3923 r = 1; 3924 break; 3925 } 3926 return r; 3927 } 3928