1 /* 2 * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 38 * 39 * Chains are the in-memory version on media objects (volume header, inodes, 40 * indirect blocks, data blocks, etc). Chains represent a portion of the 41 * HAMMER2 topology. 42 * 43 * Chains are no-longer delete-duplicated. Instead, the original in-memory 44 * chain will be moved along with its block reference (e.g. for things like 45 * renames, hardlink operations, modifications, etc), and will be indexed 46 * on a secondary list for flush handling instead of propagating a flag 47 * upward to the root. 48 * 49 * Concurrent front-end operations can still run against backend flushes 50 * as long as they do not cross the current flush boundary. An operation 51 * running above the current flush (in areas not yet flushed) can become 52 * part of the current flush while ano peration running below the current 53 * flush can become part of the next flush. 54 */ 55 #include <sys/cdefs.h> 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/types.h> 59 #include <sys/lock.h> 60 #include <sys/kern_syscall.h> 61 #include <sys/uuid.h> 62 63 #include <crypto/sha2/sha2.h> 64 65 #include "hammer2.h" 66 67 static int hammer2_indirect_optimize; /* XXX SYSCTL */ 68 69 static hammer2_chain_t *hammer2_chain_create_indirect( 70 hammer2_trans_t *trans, hammer2_chain_t *parent, 71 hammer2_key_t key, int keybits, int for_type, int *errorp); 72 static void hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop); 73 static hammer2_chain_t *hammer2_combined_find( 74 hammer2_chain_t *parent, 75 hammer2_blockref_t *base, int count, 76 int *cache_indexp, hammer2_key_t *key_nextp, 77 hammer2_key_t key_beg, hammer2_key_t key_end, 78 hammer2_blockref_t **bresp); 79 80 /* 81 * Basic RBTree for chains (core->rbtree and core->dbtree). Chains cannot 82 * overlap in the RB trees. Deleted chains are moved from rbtree to either 83 * dbtree or to dbq. 84 * 85 * Chains in delete-duplicate sequences can always iterate through core_entry 86 * to locate the live version of the chain. 87 */ 88 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 89 90 int 91 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 92 { 93 hammer2_key_t c1_beg; 94 hammer2_key_t c1_end; 95 hammer2_key_t c2_beg; 96 hammer2_key_t c2_end; 97 98 /* 99 * Compare chains. Overlaps are not supposed to happen and catch 100 * any software issues early we count overlaps as a match. 101 */ 102 c1_beg = chain1->bref.key; 103 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1; 104 c2_beg = chain2->bref.key; 105 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1; 106 107 if (c1_end < c2_beg) /* fully to the left */ 108 return(-1); 109 if (c1_beg > c2_end) /* fully to the right */ 110 return(1); 111 return(0); /* overlap (must not cross edge boundary) */ 112 } 113 114 static __inline 115 int 116 hammer2_isclusterable(hammer2_chain_t *chain) 117 { 118 if (hammer2_cluster_enable) { 119 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 120 chain->bref.type == HAMMER2_BREF_TYPE_INODE || 121 chain->bref.type == HAMMER2_BREF_TYPE_DATA) { 122 return(1); 123 } 124 } 125 return(0); 126 } 127 128 /* 129 * Make a chain visible to the flusher. The flusher needs to be able to 130 * do flushes of a subdirectory chains or single files so it does a top-down 131 * recursion using the ONFLUSH flag for the recursion. It locates MODIFIED 132 * or UPDATE chains and flushes back up the chain to the root. 133 */ 134 void 135 hammer2_chain_setflush(hammer2_trans_t *trans, hammer2_chain_t *chain) 136 { 137 hammer2_chain_t *parent; 138 139 if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 140 spin_lock(&chain->core.cst.spin); 141 while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 142 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH); 143 if ((parent = chain->parent) == NULL) 144 break; 145 spin_lock(&parent->core.cst.spin); 146 spin_unlock(&chain->core.cst.spin); 147 chain = parent; 148 } 149 spin_unlock(&chain->core.cst.spin); 150 } 151 } 152 153 /* 154 * Allocate a new disconnected chain element representing the specified 155 * bref. chain->refs is set to 1 and the passed bref is copied to 156 * chain->bref. chain->bytes is derived from the bref. 157 * 158 * chain->core is NOT allocated and the media data and bp pointers are left 159 * NULL. The caller must call chain_core_alloc() to allocate or associate 160 * a core with the chain. 161 * 162 * chain->pmp inherits pmp unless the chain is an inode (other than the 163 * super-root inode). 164 * 165 * NOTE: Returns a referenced but unlocked (because there is no core) chain. 166 */ 167 hammer2_chain_t * 168 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp, 169 hammer2_trans_t *trans, hammer2_blockref_t *bref) 170 { 171 hammer2_chain_t *chain; 172 u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 173 174 /* 175 * Construct the appropriate system structure. 176 */ 177 switch(bref->type) { 178 case HAMMER2_BREF_TYPE_INODE: 179 case HAMMER2_BREF_TYPE_INDIRECT: 180 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 181 case HAMMER2_BREF_TYPE_DATA: 182 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 183 /* 184 * Chain's are really only associated with the hmp but we 185 * maintain a pmp association for per-mount memory tracking 186 * purposes. The pmp can be NULL. 187 */ 188 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO); 189 break; 190 case HAMMER2_BREF_TYPE_VOLUME: 191 case HAMMER2_BREF_TYPE_FREEMAP: 192 chain = NULL; 193 panic("hammer2_chain_alloc volume type illegal for op"); 194 default: 195 chain = NULL; 196 panic("hammer2_chain_alloc: unrecognized blockref type: %d", 197 bref->type); 198 } 199 200 /* 201 * Initialize the new chain structure. 202 */ 203 chain->pmp = pmp; 204 chain->hmp = hmp; 205 chain->bref = *bref; 206 chain->bytes = bytes; 207 chain->refs = 1; 208 chain->flags = HAMMER2_CHAIN_ALLOCATED; 209 210 /* 211 * Set the PFS boundary flag if this chain represents a PFS root. 212 */ 213 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT) 214 chain->flags |= HAMMER2_CHAIN_PFSBOUNDARY; 215 216 return (chain); 217 } 218 219 /* 220 * Associate an existing core with the chain or allocate a new core. 221 * 222 * The core is not locked. No additional refs on the chain are made. 223 * (trans) must not be NULL if (core) is not NULL. 224 * 225 * When chains are delete-duplicated during flushes we insert nchain on 226 * the ownerq after ochain instead of at the end in order to give the 227 * drop code visibility in the correct order, otherwise drops can be missed. 228 */ 229 void 230 hammer2_chain_core_alloc(hammer2_trans_t *trans, hammer2_chain_t *chain) 231 { 232 hammer2_chain_core_t *core = &chain->core; 233 234 /* 235 * Fresh core under nchain (no multi-homing of ochain's 236 * sub-tree). 237 */ 238 RB_INIT(&core->rbtree); /* live chains */ 239 ccms_cst_init(&core->cst, chain); 240 } 241 242 /* 243 * Add a reference to a chain element, preventing its destruction. 244 * 245 * (can be called with spinlock held) 246 */ 247 void 248 hammer2_chain_ref(hammer2_chain_t *chain) 249 { 250 atomic_add_int(&chain->refs, 1); 251 } 252 253 /* 254 * Insert the chain in the core rbtree. 255 * 256 * Normal insertions are placed in the live rbtree. Insertion of a deleted 257 * chain is a special case used by the flush code that is placed on the 258 * unstaged deleted list to avoid confusing the live view. 259 */ 260 #define HAMMER2_CHAIN_INSERT_SPIN 0x0001 261 #define HAMMER2_CHAIN_INSERT_LIVE 0x0002 262 #define HAMMER2_CHAIN_INSERT_RACE 0x0004 263 264 static 265 int 266 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain, 267 int flags, int generation) 268 { 269 hammer2_chain_t *xchain; 270 int error = 0; 271 272 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 273 spin_lock(&parent->core.cst.spin); 274 275 /* 276 * Interlocked by spinlock, check for race 277 */ 278 if ((flags & HAMMER2_CHAIN_INSERT_RACE) && 279 parent->core.generation != generation) { 280 error = EAGAIN; 281 goto failed; 282 } 283 284 /* 285 * Insert chain 286 */ 287 xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain); 288 KASSERT(xchain == NULL, 289 ("hammer2_chain_insert: collision %p %p", chain, xchain)); 290 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 291 chain->parent = parent; 292 ++parent->core.chain_count; 293 ++parent->core.generation; /* XXX incs for _get() too, XXX */ 294 295 /* 296 * We have to keep track of the effective live-view blockref count 297 * so the create code knows when to push an indirect block. 298 */ 299 if (flags & HAMMER2_CHAIN_INSERT_LIVE) 300 atomic_add_int(&parent->core.live_count, 1); 301 failed: 302 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 303 spin_unlock(&parent->core.cst.spin); 304 return error; 305 } 306 307 /* 308 * Drop the caller's reference to the chain. When the ref count drops to 309 * zero this function will try to disassociate the chain from its parent and 310 * deallocate it, then recursely drop the parent using the implied ref 311 * from the chain's chain->parent. 312 */ 313 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain); 314 315 void 316 hammer2_chain_drop(hammer2_chain_t *chain) 317 { 318 u_int refs; 319 u_int need = 0; 320 321 if (hammer2_debug & 0x200000) 322 Debugger("drop"); 323 324 if (chain->flags & HAMMER2_CHAIN_UPDATE) 325 ++need; 326 if (chain->flags & HAMMER2_CHAIN_MODIFIED) 327 ++need; 328 KKASSERT(chain->refs > need); 329 330 while (chain) { 331 refs = chain->refs; 332 cpu_ccfence(); 333 KKASSERT(refs > 0); 334 335 if (refs == 1) { 336 chain = hammer2_chain_lastdrop(chain); 337 } else { 338 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 339 break; 340 /* retry the same chain */ 341 } 342 } 343 } 344 345 /* 346 * Safe handling of the 1->0 transition on chain. Returns a chain for 347 * recursive drop or NULL, possibly returning the same chain if the atomic 348 * op fails. 349 * 350 * Whem two chains need to be recursively dropped we use the chain 351 * we would otherwise free to placehold the additional chain. It's a bit 352 * convoluted but we can't just recurse without potentially blowing out 353 * the kernel stack. 354 * 355 * The chain cannot be freed if it has a non-empty core (children) or 356 * it is not at the head of ownerq. 357 * 358 * The cst spinlock is allowed nest child-to-parent (not parent-to-child). 359 */ 360 static 361 hammer2_chain_t * 362 hammer2_chain_lastdrop(hammer2_chain_t *chain) 363 { 364 hammer2_pfsmount_t *pmp; 365 hammer2_mount_t *hmp; 366 hammer2_chain_t *parent; 367 hammer2_chain_t *rdrop; 368 369 /* 370 * Spinlock the core and check to see if it is empty. If it is 371 * not empty we leave chain intact with refs == 0. The elements 372 * in core->rbtree are associated with other chains contemporary 373 * with ours but not with our chain directly. 374 */ 375 spin_lock(&chain->core.cst.spin); 376 377 /* 378 * We can't free non-stale chains with children until we are 379 * able to free the children because there might be a flush 380 * dependency. Flushes of stale children (which should also 381 * have their deleted flag set) short-cut recursive flush 382 * dependencies and can be freed here. Any flushes which run 383 * through stale children due to the flush synchronization 384 * point should have a FLUSH_* bit set in the chain and not 385 * reach lastdrop at this time. 386 * 387 * NOTE: We return (chain) on failure to retry. 388 */ 389 if (chain->core.chain_count) { 390 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 391 spin_unlock(&chain->core.cst.spin); 392 chain = NULL; /* success */ 393 } else { 394 spin_unlock(&chain->core.cst.spin); 395 } 396 return(chain); 397 } 398 /* no chains left under us */ 399 400 /* 401 * chain->core has no children left so no accessors can get to our 402 * chain from there. Now we have to lock the parent core to interlock 403 * remaining possible accessors that might bump chain's refs before 404 * we can safely drop chain's refs with intent to free the chain. 405 */ 406 hmp = chain->hmp; 407 pmp = chain->pmp; /* can be NULL */ 408 rdrop = NULL; 409 410 /* 411 * Spinlock the parent and try to drop the last ref on chain. 412 * On success remove chain from its parent, otherwise return NULL. 413 * 414 * (normal core locks are top-down recursive but we define core 415 * spinlocks as bottom-up recursive, so this is safe). 416 */ 417 if ((parent = chain->parent) != NULL) { 418 spin_lock(&parent->core.cst.spin); 419 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 420 /* 1->0 transition failed */ 421 spin_unlock(&parent->core.cst.spin); 422 spin_unlock(&chain->core.cst.spin); 423 return(chain); /* retry */ 424 } 425 426 /* 427 * 1->0 transition successful, remove chain from its 428 * above core. 429 */ 430 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 431 RB_REMOVE(hammer2_chain_tree, 432 &parent->core.rbtree, chain); 433 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 434 --parent->core.chain_count; 435 chain->parent = NULL; 436 } 437 438 /* 439 * If our chain was the last chain in the parent's core the 440 * core is now empty and its parent might have to be 441 * re-dropped if it has 0 refs. 442 */ 443 if (parent->core.chain_count == 0) { 444 rdrop = parent; 445 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) { 446 rdrop = NULL; 447 } 448 } 449 spin_unlock(&parent->core.cst.spin); 450 parent = NULL; /* safety */ 451 } 452 453 /* 454 * Successful 1->0 transition and the chain can be destroyed now. 455 * 456 * We still have the core spinlock, and core's chain_count is 0. 457 * Any parent spinlock is gone. 458 */ 459 spin_unlock(&chain->core.cst.spin); 460 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 461 chain->core.chain_count == 0); 462 KKASSERT(chain->core.cst.count == 0); 463 KKASSERT(chain->core.cst.upgrade == 0); 464 465 /* 466 * All spin locks are gone, finish freeing stuff. 467 */ 468 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 469 HAMMER2_CHAIN_MODIFIED)) == 0); 470 hammer2_chain_drop_data(chain, 1); 471 472 KKASSERT(chain->dio == NULL); 473 474 /* 475 * Once chain resources are gone we can use the now dead chain 476 * structure to placehold what might otherwise require a recursive 477 * drop, because we have potentially two things to drop and can only 478 * return one directly. 479 */ 480 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 481 chain->flags &= ~HAMMER2_CHAIN_ALLOCATED; 482 chain->hmp = NULL; 483 kfree(chain, hmp->mchain); 484 } 485 486 /* 487 * Possible chaining loop when parent re-drop needed. 488 */ 489 return(rdrop); 490 } 491 492 /* 493 * On either last lock release or last drop 494 */ 495 static void 496 hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop) 497 { 498 /*hammer2_mount_t *hmp = chain->hmp;*/ 499 500 switch(chain->bref.type) { 501 case HAMMER2_BREF_TYPE_VOLUME: 502 case HAMMER2_BREF_TYPE_FREEMAP: 503 if (lastdrop) 504 chain->data = NULL; 505 break; 506 default: 507 KKASSERT(chain->data == NULL); 508 break; 509 } 510 } 511 512 /* 513 * Ref and lock a chain element, acquiring its data with I/O if necessary, 514 * and specify how you would like the data to be resolved. 515 * 516 * Returns 0 on success or an error code if the data could not be acquired. 517 * The chain element is locked on return regardless of whether an error 518 * occurred or not. 519 * 520 * The lock is allowed to recurse, multiple locking ops will aggregate 521 * the requested resolve types. Once data is assigned it will not be 522 * removed until the last unlock. 523 * 524 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 525 * (typically used to avoid device/logical buffer 526 * aliasing for data) 527 * 528 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 529 * the INITIAL-create state (indirect blocks only). 530 * 531 * Do not resolve data elements for DATA chains. 532 * (typically used to avoid device/logical buffer 533 * aliasing for data) 534 * 535 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 536 * 537 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 538 * it will be locked exclusive. 539 * 540 * NOTE: Embedded elements (volume header, inodes) are always resolved 541 * regardless. 542 * 543 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded 544 * element will instantiate and zero its buffer, and flush it on 545 * release. 546 * 547 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE 548 * so as not to instantiate a device buffer, which could alias against 549 * a logical file buffer. However, if ALWAYS is specified the 550 * device buffer will be instantiated anyway. 551 * 552 * WARNING! If data must be fetched a shared lock will temporarily be 553 * upgraded to exclusive. However, a deadlock can occur if 554 * the caller owns more than one shared lock. 555 */ 556 int 557 hammer2_chain_lock(hammer2_chain_t *chain, int how) 558 { 559 hammer2_mount_t *hmp; 560 hammer2_blockref_t *bref; 561 ccms_state_t ostate; 562 char *bdata; 563 int error; 564 565 /* 566 * Ref and lock the element. Recursive locks are allowed. 567 */ 568 if ((how & HAMMER2_RESOLVE_NOREF) == 0) 569 hammer2_chain_ref(chain); 570 atomic_add_int(&chain->lockcnt, 1); 571 572 hmp = chain->hmp; 573 KKASSERT(hmp != NULL); 574 575 /* 576 * Get the appropriate lock. 577 */ 578 if (how & HAMMER2_RESOLVE_SHARED) 579 ccms_thread_lock(&chain->core.cst, CCMS_STATE_SHARED); 580 else 581 ccms_thread_lock(&chain->core.cst, CCMS_STATE_EXCLUSIVE); 582 583 /* 584 * If we already have a valid data pointer no further action is 585 * necessary. 586 */ 587 if (chain->data) 588 return (0); 589 590 /* 591 * Do we have to resolve the data? 592 */ 593 switch(how & HAMMER2_RESOLVE_MASK) { 594 case HAMMER2_RESOLVE_NEVER: 595 return(0); 596 case HAMMER2_RESOLVE_MAYBE: 597 if (chain->flags & HAMMER2_CHAIN_INITIAL) 598 return(0); 599 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) 600 return(0); 601 #if 0 602 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) 603 return(0); 604 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) 605 return(0); 606 #endif 607 /* fall through */ 608 case HAMMER2_RESOLVE_ALWAYS: 609 break; 610 } 611 612 /* 613 * Upgrade to an exclusive lock so we can safely manipulate the 614 * buffer cache. If another thread got to it before us we 615 * can just return. 616 */ 617 ostate = ccms_thread_lock_upgrade(&chain->core.cst); 618 if (chain->data) { 619 ccms_thread_lock_downgrade(&chain->core.cst, ostate); 620 return (0); 621 } 622 623 /* 624 * We must resolve to a device buffer, either by issuing I/O or 625 * by creating a zero-fill element. We do not mark the buffer 626 * dirty when creating a zero-fill element (the hammer2_chain_modify() 627 * API must still be used to do that). 628 * 629 * The device buffer is variable-sized in powers of 2 down 630 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 631 * chunk always contains buffers of the same size. (XXX) 632 * 633 * The minimum physical IO size may be larger than the variable 634 * block size. 635 */ 636 bref = &chain->bref; 637 638 /* 639 * The getblk() optimization can only be used on newly created 640 * elements if the physical block size matches the request. 641 */ 642 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 643 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, 644 &chain->dio); 645 } else { 646 error = hammer2_io_bread(hmp, bref->data_off, chain->bytes, 647 &chain->dio); 648 hammer2_adjreadcounter(&chain->bref, chain->bytes); 649 } 650 651 if (error) { 652 kprintf("hammer2_chain_lock: I/O error %016jx: %d\n", 653 (intmax_t)bref->data_off, error); 654 hammer2_io_bqrelse(&chain->dio); 655 ccms_thread_lock_downgrade(&chain->core.cst, ostate); 656 return (error); 657 } 658 659 #if 0 660 /* 661 * No need for this, always require that hammer2_chain_modify() 662 * be called before any modifying operations, which ensures that 663 * the underlying dio is dirty. 664 */ 665 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) && 666 !hammer2_io_isdirty(chain->dio)) { 667 hammer2_io_setdirty(chain->dio); 668 } 669 #endif 670 671 /* 672 * Clear INITIAL. In this case we used io_new() and the buffer has 673 * been zero'd and marked dirty. 674 */ 675 bdata = hammer2_io_data(chain->dio, chain->bref.data_off); 676 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 677 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 678 chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO; 679 } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 680 /* 681 * check data not currently synchronized due to 682 * modification. XXX assumes data stays in the buffer 683 * cache, which might not be true (need biodep on flush 684 * to calculate crc? or simple crc?). 685 */ 686 } else { 687 if (hammer2_chain_testcheck(chain, bdata) == 0) { 688 kprintf("chain %016jx.%02x meth=%02x CHECK FAIL %08x (flags=%08x)\n", 689 690 chain->bref.data_off, 691 chain->bref.type, 692 chain->bref.methods, 693 hammer2_icrc32(bdata, chain->bytes), 694 chain->flags); 695 } 696 } 697 698 /* 699 * Setup the data pointer, either pointing it to an embedded data 700 * structure and copying the data from the buffer, or pointing it 701 * into the buffer. 702 * 703 * The buffer is not retained when copying to an embedded data 704 * structure in order to avoid potential deadlocks or recursions 705 * on the same physical buffer. 706 */ 707 switch (bref->type) { 708 case HAMMER2_BREF_TYPE_VOLUME: 709 case HAMMER2_BREF_TYPE_FREEMAP: 710 /* 711 * Copy data from bp to embedded buffer 712 */ 713 panic("hammer2_chain_lock: called on unresolved volume header"); 714 break; 715 case HAMMER2_BREF_TYPE_INODE: 716 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 717 case HAMMER2_BREF_TYPE_INDIRECT: 718 case HAMMER2_BREF_TYPE_DATA: 719 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 720 default: 721 /* 722 * Point data at the device buffer and leave dio intact. 723 */ 724 chain->data = (void *)bdata; 725 break; 726 } 727 ccms_thread_lock_downgrade(&chain->core.cst, ostate); 728 return (0); 729 } 730 731 /* 732 * Unlock and deref a chain element. 733 * 734 * On the last lock release any non-embedded data (chain->dio) will be 735 * retired. 736 */ 737 void 738 hammer2_chain_unlock(hammer2_chain_t *chain) 739 { 740 ccms_state_t ostate; 741 long *counterp; 742 u_int lockcnt; 743 744 /* 745 * The core->cst lock can be shared across several chains so we 746 * need to track the per-chain lockcnt separately. 747 * 748 * If multiple locks are present (or being attempted) on this 749 * particular chain we can just unlock, drop refs, and return. 750 * 751 * Otherwise fall-through on the 1->0 transition. 752 */ 753 for (;;) { 754 lockcnt = chain->lockcnt; 755 KKASSERT(lockcnt > 0); 756 cpu_ccfence(); 757 if (lockcnt > 1) { 758 if (atomic_cmpset_int(&chain->lockcnt, 759 lockcnt, lockcnt - 1)) { 760 ccms_thread_unlock(&chain->core.cst); 761 hammer2_chain_drop(chain); 762 return; 763 } 764 } else { 765 if (atomic_cmpset_int(&chain->lockcnt, 1, 0)) 766 break; 767 } 768 /* retry */ 769 } 770 771 /* 772 * On the 1->0 transition we upgrade the core lock (if necessary) 773 * to exclusive for terminal processing. If after upgrading we find 774 * that lockcnt is non-zero, another thread is racing us and will 775 * handle the unload for us later on, so just cleanup and return 776 * leaving the data/io intact 777 * 778 * Otherwise if lockcnt is still 0 it is possible for it to become 779 * non-zero and race, but since we hold the core->cst lock 780 * exclusively all that will happen is that the chain will be 781 * reloaded after we unload it. 782 */ 783 ostate = ccms_thread_lock_upgrade(&chain->core.cst); 784 if (chain->lockcnt) { 785 ccms_thread_unlock_upgraded(&chain->core.cst, ostate); 786 hammer2_chain_drop(chain); 787 return; 788 } 789 790 /* 791 * Shortcut the case if the data is embedded or not resolved. 792 * 793 * Do NOT NULL out chain->data (e.g. inode data), it might be 794 * dirty. 795 */ 796 if (chain->dio == NULL) { 797 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) 798 hammer2_chain_drop_data(chain, 0); 799 ccms_thread_unlock_upgraded(&chain->core.cst, ostate); 800 hammer2_chain_drop(chain); 801 return; 802 } 803 804 /* 805 * Statistics 806 */ 807 if (hammer2_io_isdirty(chain->dio) == 0) { 808 ; 809 } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) { 810 switch(chain->bref.type) { 811 case HAMMER2_BREF_TYPE_DATA: 812 counterp = &hammer2_ioa_file_write; 813 break; 814 case HAMMER2_BREF_TYPE_INODE: 815 counterp = &hammer2_ioa_meta_write; 816 break; 817 case HAMMER2_BREF_TYPE_INDIRECT: 818 counterp = &hammer2_ioa_indr_write; 819 break; 820 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 821 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 822 counterp = &hammer2_ioa_fmap_write; 823 break; 824 default: 825 counterp = &hammer2_ioa_volu_write; 826 break; 827 } 828 *counterp += chain->bytes; 829 } else { 830 switch(chain->bref.type) { 831 case HAMMER2_BREF_TYPE_DATA: 832 counterp = &hammer2_iod_file_write; 833 break; 834 case HAMMER2_BREF_TYPE_INODE: 835 counterp = &hammer2_iod_meta_write; 836 break; 837 case HAMMER2_BREF_TYPE_INDIRECT: 838 counterp = &hammer2_iod_indr_write; 839 break; 840 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 841 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 842 counterp = &hammer2_iod_fmap_write; 843 break; 844 default: 845 counterp = &hammer2_iod_volu_write; 846 break; 847 } 848 *counterp += chain->bytes; 849 } 850 851 /* 852 * Clean out the dio. 853 * 854 * If a device buffer was used for data be sure to destroy the 855 * buffer when we are done to avoid aliases (XXX what about the 856 * underlying VM pages?). 857 * 858 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing 859 * is possible. 860 * 861 * NOTE: The isdirty check tracks whether we have to bdwrite() the 862 * buffer or not. The buffer might already be dirty. The 863 * flag is re-set when chain_modify() is called, even if 864 * MODIFIED is already set, allowing the OS to retire the 865 * buffer independent of a hammer2 flush. 866 */ 867 chain->data = NULL; 868 if ((chain->flags & HAMMER2_CHAIN_IOFLUSH) && 869 hammer2_io_isdirty(chain->dio)) { 870 hammer2_io_bawrite(&chain->dio); 871 } else { 872 hammer2_io_bqrelse(&chain->dio); 873 } 874 ccms_thread_unlock_upgraded(&chain->core.cst, ostate); 875 hammer2_chain_drop(chain); 876 } 877 878 /* 879 * This counts the number of live blockrefs in a block array and 880 * also calculates the point at which all remaining blockrefs are empty. 881 * This routine can only be called on a live chain (DUPLICATED flag not set). 882 * 883 * NOTE: Flag is not set until after the count is complete, allowing 884 * callers to test the flag without holding the spinlock. 885 * 886 * NOTE: If base is NULL the related chain is still in the INITIAL 887 * state and there are no blockrefs to count. 888 * 889 * NOTE: live_count may already have some counts accumulated due to 890 * creation and deletion and could even be initially negative. 891 */ 892 void 893 hammer2_chain_countbrefs(hammer2_chain_t *chain, 894 hammer2_blockref_t *base, int count) 895 { 896 spin_lock(&chain->core.cst.spin); 897 if ((chain->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0) { 898 if (base) { 899 while (--count >= 0) { 900 if (base[count].type) 901 break; 902 } 903 chain->core.live_zero = count + 1; 904 while (count >= 0) { 905 if (base[count].type) 906 atomic_add_int(&chain->core.live_count, 907 1); 908 --count; 909 } 910 } else { 911 chain->core.live_zero = 0; 912 } 913 /* else do not modify live_count */ 914 atomic_set_int(&chain->core.flags, HAMMER2_CORE_COUNTEDBREFS); 915 } 916 spin_unlock(&chain->core.cst.spin); 917 } 918 919 /* 920 * Resize the chain's physical storage allocation in-place. This function does 921 * not adjust the data pointer and must be followed by (typically) a 922 * hammer2_chain_modify() call to copy any old data over and adjust the 923 * data pointer. 924 * 925 * Chains can be resized smaller without reallocating the storage. Resizing 926 * larger will reallocate the storage. Excess or prior storage is reclaimed 927 * asynchronously at a later time. 928 * 929 * Must be passed an exclusively locked parent and chain. 930 * 931 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order 932 * to avoid instantiating a device buffer that conflicts with the vnode data 933 * buffer. However, because H2 can compress or encrypt data, the chain may 934 * have a dio assigned to it in those situations, and they do not conflict. 935 * 936 * XXX return error if cannot resize. 937 */ 938 void 939 hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip, 940 hammer2_chain_t *parent, hammer2_chain_t *chain, 941 int nradix, int flags) 942 { 943 hammer2_mount_t *hmp; 944 size_t obytes; 945 size_t nbytes; 946 947 hmp = chain->hmp; 948 949 /* 950 * Only data and indirect blocks can be resized for now. 951 * (The volu root, inodes, and freemap elements use a fixed size). 952 */ 953 KKASSERT(chain != &hmp->vchain); 954 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA || 955 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT); 956 957 /* 958 * Nothing to do if the element is already the proper size 959 */ 960 obytes = chain->bytes; 961 nbytes = 1U << nradix; 962 if (obytes == nbytes) 963 return; 964 chain->data_count += (ssize_t)(nbytes - obytes); 965 966 /* 967 * Make sure the old data is instantiated so we can copy it. If this 968 * is a data block, the device data may be superfluous since the data 969 * might be in a logical block, but compressed or encrypted data is 970 * another matter. 971 * 972 * NOTE: The modify will set BMAPUPD for us if BMAPPED is set. 973 */ 974 hammer2_chain_modify(trans, chain, 0); 975 976 /* 977 * Relocate the block, even if making it smaller (because different 978 * block sizes may be in different regions). 979 * 980 * (data blocks only, we aren't copying the storage here). 981 */ 982 hammer2_freemap_alloc(trans, chain, nbytes); 983 chain->bytes = nbytes; 984 /*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */ 985 986 /* 987 * We don't want the followup chain_modify() to try to copy data 988 * from the old (wrong-sized) buffer. It won't know how much to 989 * copy. This case should only occur during writes when the 990 * originator already has the data to write in-hand. 991 */ 992 if (chain->dio) { 993 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA); 994 hammer2_io_brelse(&chain->dio); 995 chain->data = NULL; 996 } 997 } 998 999 #if 0 1000 1001 /* 1002 * REMOVED - see cluster code 1003 * 1004 * Set a chain modified, making it read-write and duplicating it if necessary. 1005 * This function will assign a new physical block to the chain if necessary 1006 * 1007 * Duplication of already-modified chains is possible when the modification 1008 * crosses a flush synchronization boundary. 1009 * 1010 * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE 1011 * level or the COW operation will not work. 1012 * 1013 * Data blocks - The chain is usually locked RESOLVE_NEVER so as not to 1014 * run the data through the device buffers. 1015 * 1016 * This function may return a different chain than was passed, in which case 1017 * the old chain will be unlocked and the new chain will be locked. 1018 * 1019 * ip->chain may be adjusted by hammer2_chain_modify_ip(). 1020 */ 1021 hammer2_inode_data_t * 1022 hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip, 1023 hammer2_chain_t **chainp, int flags) 1024 { 1025 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED); 1026 hammer2_chain_modify(trans, chainp, flags); 1027 if (ip->chain != *chainp) 1028 hammer2_inode_repoint(ip, NULL, *chainp); 1029 if (ip->vp) 1030 vsetisdirty(ip->vp); 1031 return(&ip->chain->data->ipdata); 1032 } 1033 1034 #endif 1035 1036 void 1037 hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags) 1038 { 1039 hammer2_blockref_t obref; 1040 hammer2_mount_t *hmp; 1041 hammer2_io_t *dio; 1042 int error; 1043 int wasinitial; 1044 int newmod; 1045 char *bdata; 1046 1047 hmp = chain->hmp; 1048 obref = chain->bref; 1049 1050 /* 1051 * Data is not optional for freemap chains (we must always be sure 1052 * to copy the data on COW storage allocations). 1053 */ 1054 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 1055 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 1056 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) || 1057 (flags & HAMMER2_MODIFY_OPTDATA) == 0); 1058 } 1059 1060 /* 1061 * Data must be resolved if already assigned unless explicitly 1062 * flagged otherwise. 1063 */ 1064 if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 && 1065 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 1066 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); 1067 hammer2_chain_unlock(chain); 1068 } 1069 1070 /* 1071 * Otherwise do initial-chain handling. Set MODIFIED to indicate 1072 * that the chain has been modified. Set UPDATE to ensure that 1073 * the blockref is updated in the parent. 1074 */ 1075 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { 1076 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1077 hammer2_chain_ref(chain); 1078 hammer2_pfs_memory_inc(chain->pmp); 1079 newmod = 1; 1080 } else { 1081 newmod = 0; 1082 } 1083 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) { 1084 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1085 hammer2_chain_ref(chain); 1086 } 1087 1088 /* 1089 * The modification or re-modification requires an allocation and 1090 * possible COW. 1091 * 1092 * We normally always allocate new storage here. If storage exists 1093 * and MODIFY_NOREALLOC is passed in, we do not allocate new storage. 1094 */ 1095 if (chain != &hmp->vchain && chain != &hmp->fchain) { 1096 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 || 1097 ((flags & HAMMER2_MODIFY_NOREALLOC) == 0 && newmod) 1098 ) { 1099 hammer2_freemap_alloc(trans, chain, chain->bytes); 1100 /* XXX failed allocation */ 1101 } 1102 } 1103 1104 /* 1105 * Set BMAPUPD to tell the flush code that an existing blockmap entry 1106 * requires updating as well as to tell the delete code that the 1107 * chain's blockref might not exactly match (in terms of physical size 1108 * or block offset) the one in the parent's blocktable. The base key 1109 * of course will still match. 1110 */ 1111 if (chain->flags & HAMMER2_CHAIN_BMAPPED) 1112 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD); 1113 1114 /* 1115 * Short-cut data blocks which the caller does not need an actual 1116 * data reference to (aka OPTDATA), as long as the chain does not 1117 * already have a data pointer to the data. This generally means 1118 * that the modifications are being done via the logical buffer cache. 1119 * The INITIAL flag relates only to the device data buffer and thus 1120 * remains unchange in this situation. 1121 */ 1122 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA && 1123 (flags & HAMMER2_MODIFY_OPTDATA) && 1124 chain->data == NULL) { 1125 goto skip2; 1126 } 1127 1128 /* 1129 * Clearing the INITIAL flag (for indirect blocks) indicates that 1130 * we've processed the uninitialized storage allocation. 1131 * 1132 * If this flag is already clear we are likely in a copy-on-write 1133 * situation but we have to be sure NOT to bzero the storage if 1134 * no data is present. 1135 */ 1136 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1137 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1138 wasinitial = 1; 1139 } else { 1140 wasinitial = 0; 1141 } 1142 1143 /* 1144 * Instantiate data buffer and possibly execute COW operation 1145 */ 1146 switch(chain->bref.type) { 1147 case HAMMER2_BREF_TYPE_VOLUME: 1148 case HAMMER2_BREF_TYPE_FREEMAP: 1149 /* 1150 * The data is embedded, no copy-on-write operation is 1151 * needed. 1152 */ 1153 KKASSERT(chain->dio == NULL); 1154 break; 1155 case HAMMER2_BREF_TYPE_INODE: 1156 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1157 case HAMMER2_BREF_TYPE_DATA: 1158 case HAMMER2_BREF_TYPE_INDIRECT: 1159 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1160 /* 1161 * Perform the copy-on-write operation 1162 * 1163 * zero-fill or copy-on-write depending on whether 1164 * chain->data exists or not and set the dirty state for 1165 * the new buffer. hammer2_io_new() will handle the 1166 * zero-fill. 1167 */ 1168 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain); 1169 1170 if (wasinitial) { 1171 error = hammer2_io_new(hmp, chain->bref.data_off, 1172 chain->bytes, &dio); 1173 } else { 1174 error = hammer2_io_bread(hmp, chain->bref.data_off, 1175 chain->bytes, &dio); 1176 } 1177 hammer2_adjreadcounter(&chain->bref, chain->bytes); 1178 KKASSERT(error == 0); 1179 1180 bdata = hammer2_io_data(dio, chain->bref.data_off); 1181 1182 if (chain->data) { 1183 KKASSERT(chain->dio != NULL); 1184 if (chain->data != (void *)bdata) { 1185 bcopy(chain->data, bdata, chain->bytes); 1186 } 1187 } else if (wasinitial == 0) { 1188 /* 1189 * We have a problem. We were asked to COW but 1190 * we don't have any data to COW with! 1191 */ 1192 panic("hammer2_chain_modify: having a COW %p\n", 1193 chain); 1194 } 1195 1196 /* 1197 * Retire the old buffer, replace with the new. Dirty or 1198 * redirty the new buffer. 1199 * 1200 * WARNING! The system buffer cache may have already flushed 1201 * the buffer, so we must be sure to [re]dirty it 1202 * for further modification. 1203 */ 1204 if (chain->dio) 1205 hammer2_io_brelse(&chain->dio); 1206 chain->data = (void *)bdata; 1207 chain->dio = dio; 1208 hammer2_io_setdirty(dio); /* modified by bcopy above */ 1209 break; 1210 default: 1211 panic("hammer2_chain_modify: illegal non-embedded type %d", 1212 chain->bref.type); 1213 break; 1214 1215 } 1216 skip2: 1217 /* 1218 * setflush on parent indicating that the parent must recurse down 1219 * to us. Do not call on chain itself which might already have it 1220 * set. 1221 */ 1222 if (chain->parent) 1223 hammer2_chain_setflush(trans, chain->parent); 1224 1225 #if 0 1226 /* 1227 * Adjust the freemap bitmap to indicate that the related blocks 1228 * MIGHT be freeable. Bulkfree must still determine that the blocks 1229 * are actually freeable. 1230 * 1231 * We no longer do this in the normal filesystem operations path 1232 * as it interferes with the bulkfree algorithm. 1233 */ 1234 if (obref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE && 1235 obref.type != HAMMER2_BREF_TYPE_FREEMAP_LEAF && 1236 (obref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 1237 hammer2_freemap_adjust(trans, hmp, 1238 &obref, HAMMER2_FREEMAP_DOMAYFREE); 1239 } 1240 #endif 1241 } 1242 1243 /* 1244 * Volume header data locks 1245 */ 1246 void 1247 hammer2_voldata_lock(hammer2_mount_t *hmp) 1248 { 1249 lockmgr(&hmp->vollk, LK_EXCLUSIVE); 1250 } 1251 1252 void 1253 hammer2_voldata_unlock(hammer2_mount_t *hmp) 1254 { 1255 lockmgr(&hmp->vollk, LK_RELEASE); 1256 } 1257 1258 void 1259 hammer2_voldata_modify(hammer2_mount_t *hmp) 1260 { 1261 if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) { 1262 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED); 1263 hammer2_chain_ref(&hmp->vchain); 1264 hammer2_pfs_memory_inc(hmp->vchain.pmp); 1265 } 1266 } 1267 1268 /* 1269 * This function returns the chain at the nearest key within the specified 1270 * range. The returned chain will be referenced but not locked. 1271 * 1272 * This function will recurse through chain->rbtree as necessary and will 1273 * return a *key_nextp suitable for iteration. *key_nextp is only set if 1274 * the iteration value is less than the current value of *key_nextp. 1275 * 1276 * The caller should use (*key_nextp) to calculate the actual range of 1277 * the returned element, which will be (key_beg to *key_nextp - 1), because 1278 * there might be another element which is superior to the returned element 1279 * and overlaps it. 1280 * 1281 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL 1282 * chains continue to be returned. On EOF (*key_nextp) may overflow since 1283 * it will wind up being (key_end + 1). 1284 * 1285 * WARNING! Must be called with child's spinlock held. Spinlock remains 1286 * held through the operation. 1287 */ 1288 struct hammer2_chain_find_info { 1289 hammer2_chain_t *best; 1290 hammer2_key_t key_beg; 1291 hammer2_key_t key_end; 1292 hammer2_key_t key_next; 1293 }; 1294 1295 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data); 1296 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data); 1297 1298 static 1299 hammer2_chain_t * 1300 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp, 1301 hammer2_key_t key_beg, hammer2_key_t key_end) 1302 { 1303 struct hammer2_chain_find_info info; 1304 1305 info.best = NULL; 1306 info.key_beg = key_beg; 1307 info.key_end = key_end; 1308 info.key_next = *key_nextp; 1309 1310 RB_SCAN(hammer2_chain_tree, &parent->core.rbtree, 1311 hammer2_chain_find_cmp, hammer2_chain_find_callback, 1312 &info); 1313 *key_nextp = info.key_next; 1314 #if 0 1315 kprintf("chain_find %p %016jx:%016jx next=%016jx\n", 1316 parent, key_beg, key_end, *key_nextp); 1317 #endif 1318 1319 return (info.best); 1320 } 1321 1322 static 1323 int 1324 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data) 1325 { 1326 struct hammer2_chain_find_info *info = data; 1327 hammer2_key_t child_beg; 1328 hammer2_key_t child_end; 1329 1330 child_beg = child->bref.key; 1331 child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1; 1332 1333 if (child_end < info->key_beg) 1334 return(-1); 1335 if (child_beg > info->key_end) 1336 return(1); 1337 return(0); 1338 } 1339 1340 static 1341 int 1342 hammer2_chain_find_callback(hammer2_chain_t *child, void *data) 1343 { 1344 struct hammer2_chain_find_info *info = data; 1345 hammer2_chain_t *best; 1346 hammer2_key_t child_end; 1347 1348 /* 1349 * WARNING! Do not discard DUPLICATED chains, it is possible that 1350 * we are catching an insertion half-way done. If a 1351 * duplicated chain turns out to be the best choice the 1352 * caller will re-check its flags after locking it. 1353 * 1354 * WARNING! Layerq is scanned forwards, exact matches should keep 1355 * the existing info->best. 1356 */ 1357 if ((best = info->best) == NULL) { 1358 /* 1359 * No previous best. Assign best 1360 */ 1361 info->best = child; 1362 } else if (best->bref.key <= info->key_beg && 1363 child->bref.key <= info->key_beg) { 1364 /* 1365 * Illegal overlap. 1366 */ 1367 KKASSERT(0); 1368 /*info->best = child;*/ 1369 } else if (child->bref.key < best->bref.key) { 1370 /* 1371 * Child has a nearer key and best is not flush with key_beg. 1372 * Set best to child. Truncate key_next to the old best key. 1373 */ 1374 info->best = child; 1375 if (info->key_next > best->bref.key || info->key_next == 0) 1376 info->key_next = best->bref.key; 1377 } else if (child->bref.key == best->bref.key) { 1378 /* 1379 * If our current best is flush with the child then this 1380 * is an illegal overlap. 1381 * 1382 * key_next will automatically be limited to the smaller of 1383 * the two end-points. 1384 */ 1385 KKASSERT(0); 1386 info->best = child; 1387 } else { 1388 /* 1389 * Keep the current best but truncate key_next to the child's 1390 * base. 1391 * 1392 * key_next will also automatically be limited to the smaller 1393 * of the two end-points (probably not necessary for this case 1394 * but we do it anyway). 1395 */ 1396 if (info->key_next > child->bref.key || info->key_next == 0) 1397 info->key_next = child->bref.key; 1398 } 1399 1400 /* 1401 * Always truncate key_next based on child's end-of-range. 1402 */ 1403 child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits); 1404 if (child_end && (info->key_next > child_end || info->key_next == 0)) 1405 info->key_next = child_end; 1406 1407 return(0); 1408 } 1409 1410 /* 1411 * Retrieve the specified chain from a media blockref, creating the 1412 * in-memory chain structure which reflects it. 1413 * 1414 * To handle insertion races pass the INSERT_RACE flag along with the 1415 * generation number of the core. NULL will be returned if the generation 1416 * number changes before we have a chance to insert the chain. Insert 1417 * races can occur because the parent might be held shared. 1418 * 1419 * Caller must hold the parent locked shared or exclusive since we may 1420 * need the parent's bref array to find our block. 1421 * 1422 * WARNING! chain->pmp is left NULL if the bref represents a PFS mount 1423 * point. 1424 */ 1425 hammer2_chain_t * 1426 hammer2_chain_get(hammer2_chain_t *parent, int generation, 1427 hammer2_blockref_t *bref) 1428 { 1429 hammer2_mount_t *hmp = parent->hmp; 1430 hammer2_chain_t *chain; 1431 int error; 1432 1433 /* 1434 * Allocate a chain structure representing the existing media 1435 * entry. Resulting chain has one ref and is not locked. 1436 */ 1437 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT) 1438 chain = hammer2_chain_alloc(hmp, NULL, NULL, bref); 1439 else 1440 chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref); 1441 hammer2_chain_core_alloc(NULL, chain); 1442 /* ref'd chain returned */ 1443 1444 /* 1445 * Flag that the chain is in the parent's blockmap so delete/flush 1446 * knows what to do with it. 1447 */ 1448 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED); 1449 1450 /* 1451 * Link the chain into its parent. A spinlock is required to safely 1452 * access the RBTREE, and it is possible to collide with another 1453 * hammer2_chain_get() operation because the caller might only hold 1454 * a shared lock on the parent. 1455 */ 1456 KKASSERT(parent->refs > 0); 1457 error = hammer2_chain_insert(parent, chain, 1458 HAMMER2_CHAIN_INSERT_SPIN | 1459 HAMMER2_CHAIN_INSERT_RACE, 1460 generation); 1461 if (error) { 1462 KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 1463 kprintf("chain %p get race\n", chain); 1464 hammer2_chain_drop(chain); 1465 chain = NULL; 1466 } else { 1467 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 1468 } 1469 1470 /* 1471 * Return our new chain referenced but not locked, or NULL if 1472 * a race occurred. 1473 */ 1474 return (chain); 1475 } 1476 1477 /* 1478 * Lookup initialization/completion API 1479 */ 1480 hammer2_chain_t * 1481 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags) 1482 { 1483 if (flags & HAMMER2_LOOKUP_SHARED) { 1484 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS | 1485 HAMMER2_RESOLVE_SHARED); 1486 } else { 1487 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 1488 } 1489 return (parent); 1490 } 1491 1492 void 1493 hammer2_chain_lookup_done(hammer2_chain_t *parent) 1494 { 1495 if (parent) 1496 hammer2_chain_unlock(parent); 1497 } 1498 1499 static 1500 hammer2_chain_t * 1501 hammer2_chain_getparent(hammer2_chain_t **parentp, int how) 1502 { 1503 hammer2_chain_t *oparent; 1504 hammer2_chain_t *nparent; 1505 1506 /* 1507 * Be careful of order, oparent must be unlocked before nparent 1508 * is locked below to avoid a deadlock. 1509 */ 1510 oparent = *parentp; 1511 spin_lock(&oparent->core.cst.spin); 1512 nparent = oparent->parent; 1513 hammer2_chain_ref(nparent); 1514 spin_unlock(&oparent->core.cst.spin); 1515 if (oparent) { 1516 hammer2_chain_unlock(oparent); 1517 oparent = NULL; 1518 } 1519 1520 hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF); 1521 *parentp = nparent; 1522 1523 return (nparent); 1524 } 1525 1526 /* 1527 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 1528 * (*parentp) typically points to an inode but can also point to a related 1529 * indirect block and this function will recurse upwards and find the inode 1530 * again. 1531 * 1532 * (*parentp) must be exclusively locked and referenced and can be an inode 1533 * or an existing indirect block within the inode. 1534 * 1535 * On return (*parentp) will be modified to point at the deepest parent chain 1536 * element encountered during the search, as a helper for an insertion or 1537 * deletion. The new (*parentp) will be locked and referenced and the old 1538 * will be unlocked and dereferenced (no change if they are both the same). 1539 * 1540 * The matching chain will be returned exclusively locked. If NOLOCK is 1541 * requested the chain will be returned only referenced. 1542 * 1543 * NULL is returned if no match was found, but (*parentp) will still 1544 * potentially be adjusted. 1545 * 1546 * On return (*key_nextp) will point to an iterative value for key_beg. 1547 * (If NULL is returned (*key_nextp) is set to key_end). 1548 * 1549 * This function will also recurse up the chain if the key is not within the 1550 * current parent's range. (*parentp) can never be set to NULL. An iteration 1551 * can simply allow (*parentp) to float inside the loop. 1552 * 1553 * NOTE! chain->data is not always resolved. By default it will not be 1554 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 1555 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 1556 * BREF_TYPE_DATA as the device buffer can alias the logical file 1557 * buffer). 1558 */ 1559 hammer2_chain_t * 1560 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp, 1561 hammer2_key_t key_beg, hammer2_key_t key_end, 1562 int *cache_indexp, int flags, int *ddflagp) 1563 { 1564 hammer2_mount_t *hmp; 1565 hammer2_chain_t *parent; 1566 hammer2_chain_t *chain; 1567 hammer2_blockref_t *base; 1568 hammer2_blockref_t *bref; 1569 hammer2_blockref_t bcopy; 1570 hammer2_key_t scan_beg; 1571 hammer2_key_t scan_end; 1572 int count = 0; 1573 int how_always = HAMMER2_RESOLVE_ALWAYS; 1574 int how_maybe = HAMMER2_RESOLVE_MAYBE; 1575 int how; 1576 int generation; 1577 int maxloops = 300000; 1578 1579 *ddflagp = 0; 1580 if (flags & HAMMER2_LOOKUP_ALWAYS) { 1581 how_maybe = how_always; 1582 how = HAMMER2_RESOLVE_ALWAYS; 1583 } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) { 1584 how = HAMMER2_RESOLVE_NEVER; 1585 } else { 1586 how = HAMMER2_RESOLVE_MAYBE; 1587 } 1588 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) { 1589 how_maybe |= HAMMER2_RESOLVE_SHARED; 1590 how_always |= HAMMER2_RESOLVE_SHARED; 1591 how |= HAMMER2_RESOLVE_SHARED; 1592 } 1593 1594 /* 1595 * Recurse (*parentp) upward if necessary until the parent completely 1596 * encloses the key range or we hit the inode. 1597 * 1598 * This function handles races against the flusher doing a delete- 1599 * duplicate above us and re-homes the parent to the duplicate in 1600 * that case, otherwise we'd wind up recursing down a stale chain. 1601 */ 1602 parent = *parentp; 1603 hmp = parent->hmp; 1604 1605 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 1606 parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 1607 scan_beg = parent->bref.key; 1608 scan_end = scan_beg + 1609 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 1610 if (key_beg >= scan_beg && key_end <= scan_end) 1611 break; 1612 parent = hammer2_chain_getparent(parentp, how_maybe); 1613 } 1614 1615 again: 1616 if (--maxloops == 0) 1617 panic("hammer2_chain_lookup: maxloops"); 1618 /* 1619 * Locate the blockref array. Currently we do a fully associative 1620 * search through the array. 1621 */ 1622 switch(parent->bref.type) { 1623 case HAMMER2_BREF_TYPE_INODE: 1624 /* 1625 * Special shortcut for embedded data returns the inode 1626 * itself. Callers must detect this condition and access 1627 * the embedded data (the strategy code does this for us). 1628 * 1629 * This is only applicable to regular files and softlinks. 1630 */ 1631 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) { 1632 if (flags & HAMMER2_LOOKUP_NOLOCK) 1633 hammer2_chain_ref(parent); 1634 else 1635 hammer2_chain_lock(parent, how_always); 1636 *key_nextp = key_end + 1; 1637 *ddflagp = 1; 1638 return (parent); 1639 } 1640 base = &parent->data->ipdata.u.blockset.blockref[0]; 1641 count = HAMMER2_SET_COUNT; 1642 break; 1643 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1644 case HAMMER2_BREF_TYPE_INDIRECT: 1645 /* 1646 * Handle MATCHIND on the parent 1647 */ 1648 if (flags & HAMMER2_LOOKUP_MATCHIND) { 1649 scan_beg = parent->bref.key; 1650 scan_end = scan_beg + 1651 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 1652 if (key_beg == scan_beg && key_end == scan_end) { 1653 chain = parent; 1654 hammer2_chain_lock(chain, how_maybe); 1655 *key_nextp = scan_end + 1; 1656 goto done; 1657 } 1658 } 1659 /* 1660 * Optimize indirect blocks in the INITIAL state to avoid 1661 * I/O. 1662 */ 1663 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1664 base = NULL; 1665 } else { 1666 if (parent->data == NULL) 1667 panic("parent->data is NULL"); 1668 base = &parent->data->npdata[0]; 1669 } 1670 count = parent->bytes / sizeof(hammer2_blockref_t); 1671 break; 1672 case HAMMER2_BREF_TYPE_VOLUME: 1673 base = &hmp->voldata.sroot_blockset.blockref[0]; 1674 count = HAMMER2_SET_COUNT; 1675 break; 1676 case HAMMER2_BREF_TYPE_FREEMAP: 1677 base = &hmp->voldata.freemap_blockset.blockref[0]; 1678 count = HAMMER2_SET_COUNT; 1679 break; 1680 default: 1681 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 1682 parent->bref.type); 1683 base = NULL; /* safety */ 1684 count = 0; /* safety */ 1685 } 1686 1687 /* 1688 * Merged scan to find next candidate. 1689 * 1690 * hammer2_base_*() functions require the parent->core.live_* fields 1691 * to be synchronized. 1692 * 1693 * We need to hold the spinlock to access the block array and RB tree 1694 * and to interlock chain creation. 1695 */ 1696 if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0) 1697 hammer2_chain_countbrefs(parent, base, count); 1698 1699 /* 1700 * Combined search 1701 */ 1702 spin_lock(&parent->core.cst.spin); 1703 chain = hammer2_combined_find(parent, base, count, 1704 cache_indexp, key_nextp, 1705 key_beg, key_end, 1706 &bref); 1707 generation = parent->core.generation; 1708 1709 /* 1710 * Exhausted parent chain, iterate. 1711 */ 1712 if (bref == NULL) { 1713 spin_unlock(&parent->core.cst.spin); 1714 if (key_beg == key_end) /* short cut single-key case */ 1715 return (NULL); 1716 1717 /* 1718 * Stop if we reached the end of the iteration. 1719 */ 1720 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 1721 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 1722 return (NULL); 1723 } 1724 1725 /* 1726 * Calculate next key, stop if we reached the end of the 1727 * iteration, otherwise go up one level and loop. 1728 */ 1729 key_beg = parent->bref.key + 1730 ((hammer2_key_t)1 << parent->bref.keybits); 1731 if (key_beg == 0 || key_beg > key_end) 1732 return (NULL); 1733 parent = hammer2_chain_getparent(parentp, how_maybe); 1734 goto again; 1735 } 1736 1737 /* 1738 * Selected from blockref or in-memory chain. 1739 */ 1740 if (chain == NULL) { 1741 bcopy = *bref; 1742 spin_unlock(&parent->core.cst.spin); 1743 chain = hammer2_chain_get(parent, generation, 1744 &bcopy); 1745 if (chain == NULL) { 1746 kprintf("retry lookup parent %p keys %016jx:%016jx\n", 1747 parent, key_beg, key_end); 1748 goto again; 1749 } 1750 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 1751 hammer2_chain_drop(chain); 1752 goto again; 1753 } 1754 } else { 1755 hammer2_chain_ref(chain); 1756 spin_unlock(&parent->core.cst.spin); 1757 } 1758 1759 /* 1760 * chain is referenced but not locked. We must lock the chain 1761 * to obtain definitive DUPLICATED/DELETED state 1762 */ 1763 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 1764 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 1765 hammer2_chain_lock(chain, how_maybe | HAMMER2_RESOLVE_NOREF); 1766 } else { 1767 hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF); 1768 } 1769 1770 /* 1771 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 1772 * 1773 * NOTE: Chain's key range is not relevant as there might be 1774 * one-offs within the range that are not deleted. 1775 * 1776 * NOTE: Lookups can race delete-duplicate because 1777 * delete-duplicate does not lock the parent's core 1778 * (they just use the spinlock on the core). We must 1779 * check for races by comparing the DUPLICATED flag before 1780 * releasing the spinlock with the flag after locking the 1781 * chain. 1782 */ 1783 if (chain->flags & HAMMER2_CHAIN_DELETED) { 1784 hammer2_chain_unlock(chain); 1785 key_beg = *key_nextp; 1786 if (key_beg == 0 || key_beg > key_end) 1787 return(NULL); 1788 goto again; 1789 } 1790 1791 /* 1792 * If the chain element is an indirect block it becomes the new 1793 * parent and we loop on it. We must maintain our top-down locks 1794 * to prevent the flusher from interfering (i.e. doing a 1795 * delete-duplicate and leaving us recursing down a deleted chain). 1796 * 1797 * The parent always has to be locked with at least RESOLVE_MAYBE 1798 * so we can access its data. It might need a fixup if the caller 1799 * passed incompatible flags. Be careful not to cause a deadlock 1800 * as a data-load requires an exclusive lock. 1801 * 1802 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key 1803 * range is within the requested key range we return the indirect 1804 * block and do NOT loop. This is usually only used to acquire 1805 * freemap nodes. 1806 */ 1807 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 1808 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 1809 hammer2_chain_unlock(parent); 1810 *parentp = parent = chain; 1811 goto again; 1812 } 1813 done: 1814 /* 1815 * All done, return the chain 1816 */ 1817 return (chain); 1818 } 1819 1820 /* 1821 * After having issued a lookup we can iterate all matching keys. 1822 * 1823 * If chain is non-NULL we continue the iteration from just after it's index. 1824 * 1825 * If chain is NULL we assume the parent was exhausted and continue the 1826 * iteration at the next parent. 1827 * 1828 * parent must be locked on entry and remains locked throughout. chain's 1829 * lock status must match flags. Chain is always at least referenced. 1830 * 1831 * WARNING! The MATCHIND flag does not apply to this function. 1832 */ 1833 hammer2_chain_t * 1834 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain, 1835 hammer2_key_t *key_nextp, 1836 hammer2_key_t key_beg, hammer2_key_t key_end, 1837 int *cache_indexp, int flags) 1838 { 1839 hammer2_chain_t *parent; 1840 int how_maybe; 1841 int ddflag; 1842 1843 /* 1844 * Calculate locking flags for upward recursion. 1845 */ 1846 how_maybe = HAMMER2_RESOLVE_MAYBE; 1847 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) 1848 how_maybe |= HAMMER2_RESOLVE_SHARED; 1849 1850 parent = *parentp; 1851 1852 /* 1853 * Calculate the next index and recalculate the parent if necessary. 1854 */ 1855 if (chain) { 1856 key_beg = chain->bref.key + 1857 ((hammer2_key_t)1 << chain->bref.keybits); 1858 if (flags & HAMMER2_LOOKUP_NOLOCK) 1859 hammer2_chain_drop(chain); 1860 else 1861 hammer2_chain_unlock(chain); 1862 1863 /* 1864 * Any scan where the lookup returned degenerate data embedded 1865 * in the inode has an invalid index and must terminate. 1866 */ 1867 if (chain == parent) 1868 return(NULL); 1869 if (key_beg == 0 || key_beg > key_end) 1870 return(NULL); 1871 chain = NULL; 1872 } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 1873 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 1874 /* 1875 * We reached the end of the iteration. 1876 */ 1877 return (NULL); 1878 } else { 1879 /* 1880 * Continue iteration with next parent unless the current 1881 * parent covers the range. 1882 */ 1883 key_beg = parent->bref.key + 1884 ((hammer2_key_t)1 << parent->bref.keybits); 1885 if (key_beg == 0 || key_beg > key_end) 1886 return (NULL); 1887 parent = hammer2_chain_getparent(parentp, how_maybe); 1888 } 1889 1890 /* 1891 * And execute 1892 */ 1893 return (hammer2_chain_lookup(parentp, key_nextp, 1894 key_beg, key_end, 1895 cache_indexp, flags, &ddflag)); 1896 } 1897 1898 /* 1899 * The raw scan function is similar to lookup/next but does not seek to a key. 1900 * Blockrefs are iterated via first_chain = (parent, NULL) and 1901 * next_chain = (parent, chain). 1902 * 1903 * The passed-in parent must be locked and its data resolved. The returned 1904 * chain will be locked. Pass chain == NULL to acquire the first sub-chain 1905 * under parent and then iterate with the passed-in chain (which this 1906 * function will unlock). 1907 */ 1908 hammer2_chain_t * 1909 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t *chain, 1910 int *cache_indexp, int flags) 1911 { 1912 hammer2_mount_t *hmp; 1913 hammer2_blockref_t *base; 1914 hammer2_blockref_t *bref; 1915 hammer2_blockref_t bcopy; 1916 hammer2_key_t key; 1917 hammer2_key_t next_key; 1918 int count = 0; 1919 int how_always = HAMMER2_RESOLVE_ALWAYS; 1920 int how_maybe = HAMMER2_RESOLVE_MAYBE; 1921 int how; 1922 int generation; 1923 int maxloops = 300000; 1924 1925 hmp = parent->hmp; 1926 1927 /* 1928 * Scan flags borrowed from lookup 1929 */ 1930 if (flags & HAMMER2_LOOKUP_ALWAYS) { 1931 how_maybe = how_always; 1932 how = HAMMER2_RESOLVE_ALWAYS; 1933 } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) { 1934 how = HAMMER2_RESOLVE_NEVER; 1935 } else { 1936 how = HAMMER2_RESOLVE_MAYBE; 1937 } 1938 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) { 1939 how_maybe |= HAMMER2_RESOLVE_SHARED; 1940 how_always |= HAMMER2_RESOLVE_SHARED; 1941 how |= HAMMER2_RESOLVE_SHARED; 1942 } 1943 1944 /* 1945 * Calculate key to locate first/next element, unlocking the previous 1946 * element as we go. Be careful, the key calculation can overflow. 1947 */ 1948 if (chain) { 1949 key = chain->bref.key + 1950 ((hammer2_key_t)1 << chain->bref.keybits); 1951 hammer2_chain_unlock(chain); 1952 chain = NULL; 1953 if (key == 0) 1954 goto done; 1955 } else { 1956 key = 0; 1957 } 1958 1959 again: 1960 if (--maxloops == 0) 1961 panic("hammer2_chain_scan: maxloops"); 1962 /* 1963 * Locate the blockref array. Currently we do a fully associative 1964 * search through the array. 1965 */ 1966 switch(parent->bref.type) { 1967 case HAMMER2_BREF_TYPE_INODE: 1968 /* 1969 * An inode with embedded data has no sub-chains. 1970 */ 1971 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) 1972 goto done; 1973 base = &parent->data->ipdata.u.blockset.blockref[0]; 1974 count = HAMMER2_SET_COUNT; 1975 break; 1976 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1977 case HAMMER2_BREF_TYPE_INDIRECT: 1978 /* 1979 * Optimize indirect blocks in the INITIAL state to avoid 1980 * I/O. 1981 */ 1982 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1983 base = NULL; 1984 } else { 1985 if (parent->data == NULL) 1986 panic("parent->data is NULL"); 1987 base = &parent->data->npdata[0]; 1988 } 1989 count = parent->bytes / sizeof(hammer2_blockref_t); 1990 break; 1991 case HAMMER2_BREF_TYPE_VOLUME: 1992 base = &hmp->voldata.sroot_blockset.blockref[0]; 1993 count = HAMMER2_SET_COUNT; 1994 break; 1995 case HAMMER2_BREF_TYPE_FREEMAP: 1996 base = &hmp->voldata.freemap_blockset.blockref[0]; 1997 count = HAMMER2_SET_COUNT; 1998 break; 1999 default: 2000 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 2001 parent->bref.type); 2002 base = NULL; /* safety */ 2003 count = 0; /* safety */ 2004 } 2005 2006 /* 2007 * Merged scan to find next candidate. 2008 * 2009 * hammer2_base_*() functions require the parent->core.live_* fields 2010 * to be synchronized. 2011 * 2012 * We need to hold the spinlock to access the block array and RB tree 2013 * and to interlock chain creation. 2014 */ 2015 if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0) 2016 hammer2_chain_countbrefs(parent, base, count); 2017 2018 next_key = 0; 2019 spin_lock(&parent->core.cst.spin); 2020 chain = hammer2_combined_find(parent, base, count, 2021 cache_indexp, &next_key, 2022 key, HAMMER2_KEY_MAX, 2023 &bref); 2024 generation = parent->core.generation; 2025 2026 /* 2027 * Exhausted parent chain, we're done. 2028 */ 2029 if (bref == NULL) { 2030 spin_unlock(&parent->core.cst.spin); 2031 KKASSERT(chain == NULL); 2032 goto done; 2033 } 2034 2035 /* 2036 * Selected from blockref or in-memory chain. 2037 */ 2038 if (chain == NULL) { 2039 bcopy = *bref; 2040 spin_unlock(&parent->core.cst.spin); 2041 chain = hammer2_chain_get(parent, generation, &bcopy); 2042 if (chain == NULL) { 2043 kprintf("retry scan parent %p keys %016jx\n", 2044 parent, key); 2045 goto again; 2046 } 2047 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 2048 hammer2_chain_drop(chain); 2049 chain = NULL; 2050 goto again; 2051 } 2052 } else { 2053 hammer2_chain_ref(chain); 2054 spin_unlock(&parent->core.cst.spin); 2055 } 2056 2057 /* 2058 * chain is referenced but not locked. We must lock the chain 2059 * to obtain definitive DUPLICATED/DELETED state 2060 */ 2061 hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF); 2062 2063 /* 2064 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 2065 * 2066 * NOTE: chain's key range is not relevant as there might be 2067 * one-offs within the range that are not deleted. 2068 * 2069 * NOTE: XXX this could create problems with scans used in 2070 * situations other than mount-time recovery. 2071 * 2072 * NOTE: Lookups can race delete-duplicate because 2073 * delete-duplicate does not lock the parent's core 2074 * (they just use the spinlock on the core). We must 2075 * check for races by comparing the DUPLICATED flag before 2076 * releasing the spinlock with the flag after locking the 2077 * chain. 2078 */ 2079 if (chain->flags & HAMMER2_CHAIN_DELETED) { 2080 hammer2_chain_unlock(chain); 2081 chain = NULL; 2082 2083 key = next_key; 2084 if (key == 0) 2085 goto done; 2086 goto again; 2087 } 2088 2089 done: 2090 /* 2091 * All done, return the chain or NULL 2092 */ 2093 return (chain); 2094 } 2095 2096 /* 2097 * Create and return a new hammer2 system memory structure of the specified 2098 * key, type and size and insert it under (*parentp). This is a full 2099 * insertion, based on the supplied key/keybits, and may involve creating 2100 * indirect blocks and moving other chains around via delete/duplicate. 2101 * 2102 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION 2103 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 2104 * FULL. This typically means that the caller is creating the chain after 2105 * doing a hammer2_chain_lookup(). 2106 * 2107 * (*parentp) must be exclusive locked and may be replaced on return 2108 * depending on how much work the function had to do. 2109 * 2110 * (*chainp) usually starts out NULL and returns the newly created chain, 2111 * but if the caller desires the caller may allocate a disconnected chain 2112 * and pass it in instead. 2113 * 2114 * This function should NOT be used to insert INDIRECT blocks. It is 2115 * typically used to create/insert inodes and data blocks. 2116 * 2117 * Caller must pass-in an exclusively locked parent the new chain is to 2118 * be inserted under, and optionally pass-in a disconnected, exclusively 2119 * locked chain to insert (else we create a new chain). The function will 2120 * adjust (*parentp) as necessary, create or connect the chain, and 2121 * return an exclusively locked chain in *chainp. 2122 */ 2123 int 2124 hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp, 2125 hammer2_chain_t **chainp, hammer2_pfsmount_t *pmp, 2126 hammer2_key_t key, int keybits, int type, size_t bytes, 2127 int flags) 2128 { 2129 hammer2_mount_t *hmp; 2130 hammer2_chain_t *chain; 2131 hammer2_chain_t *parent; 2132 hammer2_blockref_t *base; 2133 hammer2_blockref_t dummy; 2134 int allocated = 0; 2135 int error = 0; 2136 int count; 2137 int maxloops = 300000; 2138 2139 /* 2140 * Topology may be crossing a PFS boundary. 2141 */ 2142 parent = *parentp; 2143 KKASSERT(ccms_thread_lock_owned(&parent->core.cst)); 2144 hmp = parent->hmp; 2145 chain = *chainp; 2146 2147 if (chain == NULL) { 2148 /* 2149 * First allocate media space and construct the dummy bref, 2150 * then allocate the in-memory chain structure. Set the 2151 * INITIAL flag for fresh chains which do not have embedded 2152 * data. 2153 */ 2154 bzero(&dummy, sizeof(dummy)); 2155 dummy.type = type; 2156 dummy.key = key; 2157 dummy.keybits = keybits; 2158 dummy.data_off = hammer2_getradix(bytes); 2159 dummy.methods = parent->bref.methods; 2160 chain = hammer2_chain_alloc(hmp, pmp, trans, &dummy); 2161 hammer2_chain_core_alloc(trans, chain); 2162 2163 /* 2164 * Lock the chain manually, chain_lock will load the chain 2165 * which we do NOT want to do. (note: chain->refs is set 2166 * to 1 by chain_alloc() for us, but lockcnt is not). 2167 */ 2168 chain->lockcnt = 1; 2169 ccms_thread_lock(&chain->core.cst, CCMS_STATE_EXCLUSIVE); 2170 allocated = 1; 2171 2172 /* 2173 * We do NOT set INITIAL here (yet). INITIAL is only 2174 * used for indirect blocks. 2175 * 2176 * Recalculate bytes to reflect the actual media block 2177 * allocation. 2178 */ 2179 bytes = (hammer2_off_t)1 << 2180 (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 2181 chain->bytes = bytes; 2182 2183 switch(type) { 2184 case HAMMER2_BREF_TYPE_VOLUME: 2185 case HAMMER2_BREF_TYPE_FREEMAP: 2186 panic("hammer2_chain_create: called with volume type"); 2187 break; 2188 case HAMMER2_BREF_TYPE_INDIRECT: 2189 panic("hammer2_chain_create: cannot be used to" 2190 "create indirect block"); 2191 break; 2192 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2193 panic("hammer2_chain_create: cannot be used to" 2194 "create freemap root or node"); 2195 break; 2196 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2197 KKASSERT(bytes == sizeof(chain->data->bmdata)); 2198 /* fall through */ 2199 case HAMMER2_BREF_TYPE_INODE: 2200 case HAMMER2_BREF_TYPE_DATA: 2201 default: 2202 /* 2203 * leave chain->data NULL, set INITIAL 2204 */ 2205 KKASSERT(chain->data == NULL); 2206 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 2207 break; 2208 } 2209 2210 /* 2211 * Set statistics for pending updates. These will be 2212 * synchronized by the flush code. 2213 */ 2214 switch(type) { 2215 case HAMMER2_BREF_TYPE_INODE: 2216 chain->inode_count = 1; 2217 break; 2218 case HAMMER2_BREF_TYPE_DATA: 2219 case HAMMER2_BREF_TYPE_INDIRECT: 2220 chain->data_count = chain->bytes; 2221 break; 2222 } 2223 } else { 2224 /* 2225 * We are reattaching a previously deleted chain, possibly 2226 * under a new parent and possibly with a new key/keybits. 2227 * The chain does not have to be in a modified state. The 2228 * UPDATE flag will be set later on in this routine. 2229 * 2230 * Do NOT mess with the current state of the INITIAL flag. 2231 */ 2232 chain->bref.key = key; 2233 chain->bref.keybits = keybits; 2234 if (chain->flags & HAMMER2_CHAIN_DELETED) 2235 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 2236 KKASSERT(chain->parent == NULL); 2237 } 2238 2239 /* 2240 * Calculate how many entries we have in the blockref array and 2241 * determine if an indirect block is required. 2242 */ 2243 again: 2244 if (--maxloops == 0) 2245 panic("hammer2_chain_create: maxloops"); 2246 2247 switch(parent->bref.type) { 2248 case HAMMER2_BREF_TYPE_INODE: 2249 KKASSERT((parent->data->ipdata.op_flags & 2250 HAMMER2_OPFLAG_DIRECTDATA) == 0); 2251 KKASSERT(parent->data != NULL); 2252 base = &parent->data->ipdata.u.blockset.blockref[0]; 2253 count = HAMMER2_SET_COUNT; 2254 break; 2255 case HAMMER2_BREF_TYPE_INDIRECT: 2256 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2257 if (parent->flags & HAMMER2_CHAIN_INITIAL) 2258 base = NULL; 2259 else 2260 base = &parent->data->npdata[0]; 2261 count = parent->bytes / sizeof(hammer2_blockref_t); 2262 break; 2263 case HAMMER2_BREF_TYPE_VOLUME: 2264 KKASSERT(parent->data != NULL); 2265 base = &hmp->voldata.sroot_blockset.blockref[0]; 2266 count = HAMMER2_SET_COUNT; 2267 break; 2268 case HAMMER2_BREF_TYPE_FREEMAP: 2269 KKASSERT(parent->data != NULL); 2270 base = &hmp->voldata.freemap_blockset.blockref[0]; 2271 count = HAMMER2_SET_COUNT; 2272 break; 2273 default: 2274 panic("hammer2_chain_create: unrecognized blockref type: %d", 2275 parent->bref.type); 2276 base = NULL; 2277 count = 0; 2278 break; 2279 } 2280 2281 /* 2282 * Make sure we've counted the brefs 2283 */ 2284 if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0) 2285 hammer2_chain_countbrefs(parent, base, count); 2286 2287 KKASSERT(parent->core.live_count >= 0 && 2288 parent->core.live_count <= count); 2289 2290 /* 2291 * If no free blockref could be found we must create an indirect 2292 * block and move a number of blockrefs into it. With the parent 2293 * locked we can safely lock each child in order to delete+duplicate 2294 * it without causing a deadlock. 2295 * 2296 * This may return the new indirect block or the old parent depending 2297 * on where the key falls. NULL is returned on error. 2298 */ 2299 if (parent->core.live_count == count) { 2300 hammer2_chain_t *nparent; 2301 2302 nparent = hammer2_chain_create_indirect(trans, parent, 2303 key, keybits, 2304 type, &error); 2305 if (nparent == NULL) { 2306 if (allocated) 2307 hammer2_chain_drop(chain); 2308 chain = NULL; 2309 goto done; 2310 } 2311 if (parent != nparent) { 2312 hammer2_chain_unlock(parent); 2313 parent = *parentp = nparent; 2314 } 2315 goto again; 2316 } 2317 2318 /* 2319 * Link the chain into its parent. 2320 */ 2321 if (chain->parent != NULL) 2322 panic("hammer2: hammer2_chain_create: chain already connected"); 2323 KKASSERT(chain->parent == NULL); 2324 hammer2_chain_insert(parent, chain, 2325 HAMMER2_CHAIN_INSERT_SPIN | 2326 HAMMER2_CHAIN_INSERT_LIVE, 2327 0); 2328 2329 if (allocated) { 2330 /* 2331 * Mark the newly created chain modified. This will cause 2332 * UPDATE to be set. 2333 * 2334 * Device buffers are not instantiated for DATA elements 2335 * as these are handled by logical buffers. 2336 * 2337 * Indirect and freemap node indirect blocks are handled 2338 * by hammer2_chain_create_indirect() and not by this 2339 * function. 2340 * 2341 * Data for all other bref types is expected to be 2342 * instantiated (INODE, LEAF). 2343 */ 2344 switch(chain->bref.type) { 2345 case HAMMER2_BREF_TYPE_DATA: 2346 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2347 case HAMMER2_BREF_TYPE_INODE: 2348 hammer2_chain_modify(trans, chain, 2349 HAMMER2_MODIFY_OPTDATA); 2350 break; 2351 default: 2352 /* 2353 * Remaining types are not supported by this function. 2354 * In particular, INDIRECT and LEAF_NODE types are 2355 * handled by create_indirect(). 2356 */ 2357 panic("hammer2_chain_create: bad type: %d", 2358 chain->bref.type); 2359 /* NOT REACHED */ 2360 break; 2361 } 2362 } else { 2363 /* 2364 * When reconnecting a chain we must set UPDATE and 2365 * setflush so the flush recognizes that it must update 2366 * the bref in the parent. 2367 */ 2368 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) { 2369 hammer2_chain_ref(chain); 2370 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 2371 } 2372 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && 2373 (flags & HAMMER2_INSERT_NOSTATS) == 0) { 2374 KKASSERT(chain->data); 2375 chain->inode_count_up += 2376 chain->data->ipdata.inode_count; 2377 chain->data_count_up += 2378 chain->data->ipdata.data_count; 2379 } 2380 } 2381 2382 /* 2383 * We must setflush(parent) to ensure that it recurses through to 2384 * chain. setflush(chain) might not work because ONFLUSH is possibly 2385 * already set in the chain (so it won't recurse up to set it in the 2386 * parent). 2387 */ 2388 hammer2_chain_setflush(trans, parent); 2389 2390 done: 2391 *chainp = chain; 2392 2393 return (error); 2394 } 2395 2396 /* 2397 * Move the chain from its old parent to a new parent. The chain must have 2398 * already been deleted or already disconnected (or never associated) with 2399 * a parent. The chain is reassociated with the new parent and the deleted 2400 * flag will be cleared (no longer deleted). The chain's modification state 2401 * is not altered. 2402 * 2403 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 2404 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 2405 * FULL. This typically means that the caller is creating the chain after 2406 * doing a hammer2_chain_lookup(). 2407 * 2408 * A non-NULL bref is typically passed when key and keybits must be overridden. 2409 * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields 2410 * from a passed-in bref and uses the old chain's bref for everything else. 2411 * 2412 * If (parent) is non-NULL then the new duplicated chain is inserted under 2413 * the parent. 2414 * 2415 * If (parent) is NULL then the newly duplicated chain is not inserted 2416 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 2417 * passing into hammer2_chain_create() after this function returns). 2418 * 2419 * WARNING! This function calls create which means it can insert indirect 2420 * blocks. This can cause other unrelated chains in the parent to 2421 * be moved to a newly inserted indirect block in addition to the 2422 * specific chain. 2423 */ 2424 void 2425 hammer2_chain_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref, 2426 hammer2_chain_t **parentp, hammer2_chain_t *chain, 2427 int flags) 2428 { 2429 hammer2_mount_t *hmp; 2430 hammer2_chain_t *parent; 2431 size_t bytes; 2432 2433 /* 2434 * WARNING! We should never resolve DATA to device buffers 2435 * (XXX allow it if the caller did?), and since 2436 * we currently do not have the logical buffer cache 2437 * buffer in-hand to fix its cached physical offset 2438 * we also force the modify code to not COW it. XXX 2439 */ 2440 hmp = chain->hmp; 2441 KKASSERT(chain->parent == NULL); 2442 2443 /* 2444 * Now create a duplicate of the chain structure, associating 2445 * it with the same core, making it the same size, pointing it 2446 * to the same bref (the same media block). 2447 */ 2448 if (bref == NULL) 2449 bref = &chain->bref; 2450 bytes = (hammer2_off_t)1 << 2451 (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 2452 2453 /* 2454 * If parent is not NULL the duplicated chain will be entered under 2455 * the parent and the UPDATE bit set to tell flush to update 2456 * the blockref. 2457 * 2458 * We must setflush(parent) to ensure that it recurses through to 2459 * chain. setflush(chain) might not work because ONFLUSH is possibly 2460 * already set in the chain (so it won't recurse up to set it in the 2461 * parent). 2462 * 2463 * Having both chains locked is extremely important for atomicy. 2464 */ 2465 if (parentp && (parent = *parentp) != NULL) { 2466 KKASSERT(ccms_thread_lock_owned(&parent->core.cst)); 2467 KKASSERT(parent->refs > 0); 2468 2469 hammer2_chain_create(trans, parentp, &chain, chain->pmp, 2470 bref->key, bref->keybits, bref->type, 2471 chain->bytes, flags); 2472 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE); 2473 hammer2_chain_setflush(trans, *parentp); 2474 } 2475 } 2476 2477 /* 2478 * Helper function for deleting chains. 2479 * 2480 * The chain is removed from the live view (the RBTREE) as well as the parent's 2481 * blockmap. Both chain and its parent must be locked. 2482 */ 2483 static void 2484 _hammer2_chain_delete_helper(hammer2_trans_t *trans, 2485 hammer2_chain_t *parent, hammer2_chain_t *chain, 2486 int flags) 2487 { 2488 hammer2_mount_t *hmp; 2489 2490 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0); 2491 hmp = chain->hmp; 2492 2493 if (chain->flags & HAMMER2_CHAIN_BMAPPED) { 2494 /* 2495 * Chain is blockmapped, so there must be a parent. 2496 * Atomically remove the chain from the parent and remove 2497 * the blockmap entry. 2498 */ 2499 hammer2_blockref_t *base; 2500 int count; 2501 2502 KKASSERT(parent != NULL); 2503 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0); 2504 hammer2_chain_modify(trans, parent, 2505 HAMMER2_MODIFY_OPTDATA); 2506 2507 /* 2508 * Calculate blockmap pointer 2509 */ 2510 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 2511 spin_lock(&parent->core.cst.spin); 2512 2513 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 2514 atomic_add_int(&parent->core.live_count, -1); 2515 ++parent->core.generation; 2516 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 2517 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 2518 --parent->core.chain_count; 2519 chain->parent = NULL; 2520 2521 switch(parent->bref.type) { 2522 case HAMMER2_BREF_TYPE_INODE: 2523 /* 2524 * Access the inode's block array. However, there 2525 * is no block array if the inode is flagged 2526 * DIRECTDATA. The DIRECTDATA case typicaly only 2527 * occurs when a hardlink has been shifted up the 2528 * tree and the original inode gets replaced with 2529 * an OBJTYPE_HARDLINK placeholding inode. 2530 */ 2531 if (parent->data && 2532 (parent->data->ipdata.op_flags & 2533 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 2534 base = 2535 &parent->data->ipdata.u.blockset.blockref[0]; 2536 } else { 2537 base = NULL; 2538 } 2539 count = HAMMER2_SET_COUNT; 2540 break; 2541 case HAMMER2_BREF_TYPE_INDIRECT: 2542 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2543 if (parent->data) 2544 base = &parent->data->npdata[0]; 2545 else 2546 base = NULL; 2547 count = parent->bytes / sizeof(hammer2_blockref_t); 2548 break; 2549 case HAMMER2_BREF_TYPE_VOLUME: 2550 base = &hmp->voldata.sroot_blockset.blockref[0]; 2551 count = HAMMER2_SET_COUNT; 2552 break; 2553 case HAMMER2_BREF_TYPE_FREEMAP: 2554 base = &parent->data->npdata[0]; 2555 count = HAMMER2_SET_COUNT; 2556 break; 2557 default: 2558 base = NULL; 2559 count = 0; 2560 panic("hammer2_flush_pass2: " 2561 "unrecognized blockref type: %d", 2562 parent->bref.type); 2563 } 2564 2565 /* 2566 * delete blockmapped chain from its parent. 2567 * 2568 * The parent is not affected by any statistics in chain 2569 * which are pending synchronization. That is, there is 2570 * nothing to undo in the parent since they have not yet 2571 * been incorporated into the parent. 2572 * 2573 * The parent is affected by statistics stored in inodes. 2574 * Those have already been synchronized, so they must be 2575 * undone. XXX split update possible w/delete in middle? 2576 */ 2577 if (base) { 2578 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && 2579 (flags & HAMMER2_DELETE_NOSTATS) == 0) { 2580 KKASSERT(chain->data != NULL); 2581 parent->data_count -= 2582 chain->data->ipdata.data_count; 2583 parent->inode_count -= 2584 chain->data->ipdata.inode_count; 2585 } 2586 2587 int cache_index = -1; 2588 hammer2_base_delete(trans, parent, base, count, 2589 &cache_index, chain); 2590 } 2591 spin_unlock(&parent->core.cst.spin); 2592 } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 2593 /* 2594 * Chain is not blockmapped but a parent is present. 2595 * Atomically remove the chain from the parent. There is 2596 * no blockmap entry to remove. 2597 * 2598 * Because chain was associated with a parent but not 2599 * synchronized, the chain's *_count_up fields contain 2600 * inode adjustment statistics which must be undone. 2601 */ 2602 spin_lock(&parent->core.cst.spin); 2603 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && 2604 (flags & HAMMER2_DELETE_NOSTATS) == 0) { 2605 KKASSERT(chain->data != NULL); 2606 chain->data_count_up -= 2607 chain->data->ipdata.data_count; 2608 chain->inode_count_up -= 2609 chain->data->ipdata.inode_count; 2610 } 2611 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 2612 atomic_add_int(&parent->core.live_count, -1); 2613 ++parent->core.generation; 2614 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 2615 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 2616 --parent->core.chain_count; 2617 chain->parent = NULL; 2618 spin_unlock(&parent->core.cst.spin); 2619 } else { 2620 /* 2621 * Chain is not blockmapped and has no parent. This 2622 * is a degenerate case. 2623 */ 2624 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 2625 } 2626 2627 #if 0 2628 /* 2629 * If the deletion is permanent (i.e. the chain is not simply being 2630 * moved within the topology), adjust the freemap to indicate that 2631 * the block *might* be freeable. bulkfree must still determine 2632 * that it is actually freeable. 2633 * 2634 * We no longer do this in the normal filesystem operations path 2635 * as it interferes with the bulkfree algorithm. 2636 */ 2637 if ((flags & HAMMER2_DELETE_PERMANENT) && 2638 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE && 2639 chain->bref.type != HAMMER2_BREF_TYPE_FREEMAP_LEAF && 2640 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 2641 hammer2_freemap_adjust(trans, hmp, &chain->bref, 2642 HAMMER2_FREEMAP_DOMAYFREE); 2643 } 2644 #endif 2645 } 2646 2647 /* 2648 * Create an indirect block that covers one or more of the elements in the 2649 * current parent. Either returns the existing parent with no locking or 2650 * ref changes or returns the new indirect block locked and referenced 2651 * and leaving the original parent lock/ref intact as well. 2652 * 2653 * If an error occurs, NULL is returned and *errorp is set to the error. 2654 * 2655 * The returned chain depends on where the specified key falls. 2656 * 2657 * The key/keybits for the indirect mode only needs to follow three rules: 2658 * 2659 * (1) That all elements underneath it fit within its key space and 2660 * 2661 * (2) That all elements outside it are outside its key space. 2662 * 2663 * (3) When creating the new indirect block any elements in the current 2664 * parent that fit within the new indirect block's keyspace must be 2665 * moved into the new indirect block. 2666 * 2667 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider 2668 * keyspace the the current parent, but lookup/iteration rules will 2669 * ensure (and must ensure) that rule (2) for all parents leading up 2670 * to the nearest inode or the root volume header is adhered to. This 2671 * is accomplished by always recursing through matching keyspaces in 2672 * the hammer2_chain_lookup() and hammer2_chain_next() API. 2673 * 2674 * The current implementation calculates the current worst-case keyspace by 2675 * iterating the current parent and then divides it into two halves, choosing 2676 * whichever half has the most elements (not necessarily the half containing 2677 * the requested key). 2678 * 2679 * We can also opt to use the half with the least number of elements. This 2680 * causes lower-numbered keys (aka logical file offsets) to recurse through 2681 * fewer indirect blocks and higher-numbered keys to recurse through more. 2682 * This also has the risk of not moving enough elements to the new indirect 2683 * block and being forced to create several indirect blocks before the element 2684 * can be inserted. 2685 * 2686 * Must be called with an exclusively locked parent. 2687 */ 2688 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent, 2689 hammer2_key_t *keyp, int keybits, 2690 hammer2_blockref_t *base, int count); 2691 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent, 2692 hammer2_key_t *keyp, int keybits, 2693 hammer2_blockref_t *base, int count); 2694 static 2695 hammer2_chain_t * 2696 hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent, 2697 hammer2_key_t create_key, int create_bits, 2698 int for_type, int *errorp) 2699 { 2700 hammer2_mount_t *hmp; 2701 hammer2_blockref_t *base; 2702 hammer2_blockref_t *bref; 2703 hammer2_blockref_t bcopy; 2704 hammer2_chain_t *chain; 2705 hammer2_chain_t *ichain; 2706 hammer2_chain_t dummy; 2707 hammer2_key_t key = create_key; 2708 hammer2_key_t key_beg; 2709 hammer2_key_t key_end; 2710 hammer2_key_t key_next; 2711 int keybits = create_bits; 2712 int count; 2713 int nbytes; 2714 int cache_index; 2715 int loops; 2716 int reason; 2717 int generation; 2718 int maxloops = 300000; 2719 2720 /* 2721 * Calculate the base blockref pointer or NULL if the chain 2722 * is known to be empty. We need to calculate the array count 2723 * for RB lookups either way. 2724 */ 2725 hmp = parent->hmp; 2726 *errorp = 0; 2727 KKASSERT(ccms_thread_lock_owned(&parent->core.cst)); 2728 2729 /*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/ 2730 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2731 base = NULL; 2732 2733 switch(parent->bref.type) { 2734 case HAMMER2_BREF_TYPE_INODE: 2735 count = HAMMER2_SET_COUNT; 2736 break; 2737 case HAMMER2_BREF_TYPE_INDIRECT: 2738 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2739 count = parent->bytes / sizeof(hammer2_blockref_t); 2740 break; 2741 case HAMMER2_BREF_TYPE_VOLUME: 2742 count = HAMMER2_SET_COUNT; 2743 break; 2744 case HAMMER2_BREF_TYPE_FREEMAP: 2745 count = HAMMER2_SET_COUNT; 2746 break; 2747 default: 2748 panic("hammer2_chain_create_indirect: " 2749 "unrecognized blockref type: %d", 2750 parent->bref.type); 2751 count = 0; 2752 break; 2753 } 2754 } else { 2755 switch(parent->bref.type) { 2756 case HAMMER2_BREF_TYPE_INODE: 2757 base = &parent->data->ipdata.u.blockset.blockref[0]; 2758 count = HAMMER2_SET_COUNT; 2759 break; 2760 case HAMMER2_BREF_TYPE_INDIRECT: 2761 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2762 base = &parent->data->npdata[0]; 2763 count = parent->bytes / sizeof(hammer2_blockref_t); 2764 break; 2765 case HAMMER2_BREF_TYPE_VOLUME: 2766 base = &hmp->voldata.sroot_blockset.blockref[0]; 2767 count = HAMMER2_SET_COUNT; 2768 break; 2769 case HAMMER2_BREF_TYPE_FREEMAP: 2770 base = &hmp->voldata.freemap_blockset.blockref[0]; 2771 count = HAMMER2_SET_COUNT; 2772 break; 2773 default: 2774 panic("hammer2_chain_create_indirect: " 2775 "unrecognized blockref type: %d", 2776 parent->bref.type); 2777 count = 0; 2778 break; 2779 } 2780 } 2781 2782 /* 2783 * dummy used in later chain allocation (no longer used for lookups). 2784 */ 2785 bzero(&dummy, sizeof(dummy)); 2786 2787 /* 2788 * When creating an indirect block for a freemap node or leaf 2789 * the key/keybits must be fitted to static radix levels because 2790 * particular radix levels use particular reserved blocks in the 2791 * related zone. 2792 * 2793 * This routine calculates the key/radix of the indirect block 2794 * we need to create, and whether it is on the high-side or the 2795 * low-side. 2796 */ 2797 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 2798 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 2799 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits, 2800 base, count); 2801 } else { 2802 keybits = hammer2_chain_indkey_normal(parent, &key, keybits, 2803 base, count); 2804 } 2805 2806 /* 2807 * Normalize the key for the radix being represented, keeping the 2808 * high bits and throwing away the low bits. 2809 */ 2810 key &= ~(((hammer2_key_t)1 << keybits) - 1); 2811 2812 /* 2813 * How big should our new indirect block be? It has to be at least 2814 * as large as its parent. 2815 */ 2816 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) 2817 nbytes = HAMMER2_IND_BYTES_MIN; 2818 else 2819 nbytes = HAMMER2_IND_BYTES_MAX; 2820 if (nbytes < count * sizeof(hammer2_blockref_t)) 2821 nbytes = count * sizeof(hammer2_blockref_t); 2822 2823 /* 2824 * Ok, create our new indirect block 2825 */ 2826 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 2827 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 2828 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE; 2829 } else { 2830 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT; 2831 } 2832 dummy.bref.key = key; 2833 dummy.bref.keybits = keybits; 2834 dummy.bref.data_off = hammer2_getradix(nbytes); 2835 dummy.bref.methods = parent->bref.methods; 2836 2837 ichain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy.bref); 2838 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL); 2839 hammer2_chain_core_alloc(trans, ichain); 2840 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE); 2841 hammer2_chain_drop(ichain); /* excess ref from alloc */ 2842 2843 /* 2844 * We have to mark it modified to allocate its block, but use 2845 * OPTDATA to allow it to remain in the INITIAL state. Otherwise 2846 * it won't be acted upon by the flush code. 2847 */ 2848 hammer2_chain_modify(trans, ichain, HAMMER2_MODIFY_OPTDATA); 2849 2850 /* 2851 * Iterate the original parent and move the matching brefs into 2852 * the new indirect block. 2853 * 2854 * XXX handle flushes. 2855 */ 2856 key_beg = 0; 2857 key_end = HAMMER2_KEY_MAX; 2858 cache_index = 0; 2859 spin_lock(&parent->core.cst.spin); 2860 loops = 0; 2861 reason = 0; 2862 2863 for (;;) { 2864 if (++loops > 100000) { 2865 spin_unlock(&parent->core.cst.spin); 2866 panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n", 2867 reason, parent, base, count, key_next); 2868 } 2869 2870 /* 2871 * NOTE: spinlock stays intact, returned chain (if not NULL) 2872 * is not referenced or locked which means that we 2873 * cannot safely check its flagged / deletion status 2874 * until we lock it. 2875 */ 2876 chain = hammer2_combined_find(parent, base, count, 2877 &cache_index, &key_next, 2878 key_beg, key_end, 2879 &bref); 2880 generation = parent->core.generation; 2881 if (bref == NULL) 2882 break; 2883 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 2884 2885 /* 2886 * Skip keys that are not within the key/radix of the new 2887 * indirect block. They stay in the parent. 2888 */ 2889 if ((~(((hammer2_key_t)1 << keybits) - 1) & 2890 (key ^ bref->key)) != 0) { 2891 goto next_key_spinlocked; 2892 } 2893 2894 /* 2895 * Load the new indirect block by acquiring the related 2896 * chains (potentially from media as it might not be 2897 * in-memory). Then move it to the new parent (ichain) 2898 * via DELETE-DUPLICATE. 2899 * 2900 * chain is referenced but not locked. We must lock the 2901 * chain to obtain definitive DUPLICATED/DELETED state 2902 */ 2903 if (chain) { 2904 /* 2905 * Use chain already present in the RBTREE 2906 */ 2907 hammer2_chain_ref(chain); 2908 spin_unlock(&parent->core.cst.spin); 2909 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER | 2910 HAMMER2_RESOLVE_NOREF); 2911 } else { 2912 /* 2913 * Get chain for blockref element. _get returns NULL 2914 * on insertion race. 2915 */ 2916 bcopy = *bref; 2917 spin_unlock(&parent->core.cst.spin); 2918 chain = hammer2_chain_get(parent, generation, &bcopy); 2919 if (chain == NULL) { 2920 reason = 1; 2921 spin_lock(&parent->core.cst.spin); 2922 continue; 2923 } 2924 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 2925 kprintf("REASON 2\n"); 2926 reason = 2; 2927 hammer2_chain_drop(chain); 2928 spin_lock(&parent->core.cst.spin); 2929 continue; 2930 } 2931 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER | 2932 HAMMER2_RESOLVE_NOREF); 2933 } 2934 2935 /* 2936 * This is always live so if the chain has been deleted 2937 * we raced someone and we have to retry. 2938 * 2939 * NOTE: Lookups can race delete-duplicate because 2940 * delete-duplicate does not lock the parent's core 2941 * (they just use the spinlock on the core). We must 2942 * check for races by comparing the DUPLICATED flag before 2943 * releasing the spinlock with the flag after locking the 2944 * chain. 2945 * 2946 * (note reversed logic for this one) 2947 */ 2948 if (chain->flags & HAMMER2_CHAIN_DELETED) { 2949 hammer2_chain_unlock(chain); 2950 goto next_key; 2951 } 2952 2953 /* 2954 * Shift the chain to the indirect block. 2955 * 2956 * WARNING! No reason for us to load chain data, pass NOSTATS 2957 * to prevent delete/insert from trying to access 2958 * inode stats (and thus asserting if there is no 2959 * chain->data loaded). 2960 */ 2961 hammer2_chain_delete(trans, parent, chain, 2962 HAMMER2_DELETE_NOSTATS); 2963 hammer2_chain_rename(trans, NULL, &ichain, chain, 2964 HAMMER2_INSERT_NOSTATS); 2965 hammer2_chain_unlock(chain); 2966 KKASSERT(parent->refs > 0); 2967 chain = NULL; 2968 next_key: 2969 spin_lock(&parent->core.cst.spin); 2970 next_key_spinlocked: 2971 if (--maxloops == 0) 2972 panic("hammer2_chain_create_indirect: maxloops"); 2973 reason = 4; 2974 if (key_next == 0 || key_next > key_end) 2975 break; 2976 key_beg = key_next; 2977 /* loop */ 2978 } 2979 spin_unlock(&parent->core.cst.spin); 2980 2981 /* 2982 * Insert the new indirect block into the parent now that we've 2983 * cleared out some entries in the parent. We calculated a good 2984 * insertion index in the loop above (ichain->index). 2985 * 2986 * We don't have to set UPDATE here because we mark ichain 2987 * modified down below (so the normal modified -> flush -> set-moved 2988 * sequence applies). 2989 * 2990 * The insertion shouldn't race as this is a completely new block 2991 * and the parent is locked. 2992 */ 2993 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 2994 hammer2_chain_insert(parent, ichain, 2995 HAMMER2_CHAIN_INSERT_SPIN | 2996 HAMMER2_CHAIN_INSERT_LIVE, 2997 0); 2998 2999 /* 3000 * Make sure flushes propogate after our manual insertion. 3001 */ 3002 hammer2_chain_setflush(trans, ichain); 3003 hammer2_chain_setflush(trans, parent); 3004 3005 /* 3006 * Figure out what to return. 3007 */ 3008 if (~(((hammer2_key_t)1 << keybits) - 1) & 3009 (create_key ^ key)) { 3010 /* 3011 * Key being created is outside the key range, 3012 * return the original parent. 3013 */ 3014 hammer2_chain_unlock(ichain); 3015 } else { 3016 /* 3017 * Otherwise its in the range, return the new parent. 3018 * (leave both the new and old parent locked). 3019 */ 3020 parent = ichain; 3021 } 3022 3023 return(parent); 3024 } 3025 3026 /* 3027 * Calculate the keybits and highside/lowside of the freemap node the 3028 * caller is creating. 3029 * 3030 * This routine will specify the next higher-level freemap key/radix 3031 * representing the lowest-ordered set. By doing so, eventually all 3032 * low-ordered sets will be moved one level down. 3033 * 3034 * We have to be careful here because the freemap reserves a limited 3035 * number of blocks for a limited number of levels. So we can't just 3036 * push indiscriminately. 3037 */ 3038 int 3039 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp, 3040 int keybits, hammer2_blockref_t *base, int count) 3041 { 3042 hammer2_chain_t *chain; 3043 hammer2_blockref_t *bref; 3044 hammer2_key_t key; 3045 hammer2_key_t key_beg; 3046 hammer2_key_t key_end; 3047 hammer2_key_t key_next; 3048 int cache_index; 3049 int locount; 3050 int hicount; 3051 int maxloops = 300000; 3052 3053 key = *keyp; 3054 locount = 0; 3055 hicount = 0; 3056 keybits = 64; 3057 3058 /* 3059 * Calculate the range of keys in the array being careful to skip 3060 * slots which are overridden with a deletion. 3061 */ 3062 key_beg = 0; 3063 key_end = HAMMER2_KEY_MAX; 3064 cache_index = 0; 3065 spin_lock(&parent->core.cst.spin); 3066 3067 for (;;) { 3068 if (--maxloops == 0) { 3069 panic("indkey_freemap shit %p %p:%d\n", 3070 parent, base, count); 3071 } 3072 chain = hammer2_combined_find(parent, base, count, 3073 &cache_index, &key_next, 3074 key_beg, key_end, 3075 &bref); 3076 3077 /* 3078 * Exhausted search 3079 */ 3080 if (bref == NULL) 3081 break; 3082 3083 /* 3084 * Skip deleted chains. 3085 */ 3086 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3087 if (key_next == 0 || key_next > key_end) 3088 break; 3089 key_beg = key_next; 3090 continue; 3091 } 3092 3093 /* 3094 * Use the full live (not deleted) element for the scan 3095 * iteration. HAMMER2 does not allow partial replacements. 3096 * 3097 * XXX should be built into hammer2_combined_find(). 3098 */ 3099 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3100 3101 if (keybits > bref->keybits) { 3102 key = bref->key; 3103 keybits = bref->keybits; 3104 } else if (keybits == bref->keybits && bref->key < key) { 3105 key = bref->key; 3106 } 3107 if (key_next == 0) 3108 break; 3109 key_beg = key_next; 3110 } 3111 spin_unlock(&parent->core.cst.spin); 3112 3113 /* 3114 * Return the keybits for a higher-level FREEMAP_NODE covering 3115 * this node. 3116 */ 3117 switch(keybits) { 3118 case HAMMER2_FREEMAP_LEVEL0_RADIX: 3119 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX; 3120 break; 3121 case HAMMER2_FREEMAP_LEVEL1_RADIX: 3122 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX; 3123 break; 3124 case HAMMER2_FREEMAP_LEVEL2_RADIX: 3125 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX; 3126 break; 3127 case HAMMER2_FREEMAP_LEVEL3_RADIX: 3128 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX; 3129 break; 3130 case HAMMER2_FREEMAP_LEVEL4_RADIX: 3131 panic("hammer2_chain_indkey_freemap: level too high"); 3132 break; 3133 default: 3134 panic("hammer2_chain_indkey_freemap: bad radix"); 3135 break; 3136 } 3137 *keyp = key; 3138 3139 return (keybits); 3140 } 3141 3142 /* 3143 * Calculate the keybits and highside/lowside of the indirect block the 3144 * caller is creating. 3145 */ 3146 static int 3147 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp, 3148 int keybits, hammer2_blockref_t *base, int count) 3149 { 3150 hammer2_blockref_t *bref; 3151 hammer2_chain_t *chain; 3152 hammer2_key_t key_beg; 3153 hammer2_key_t key_end; 3154 hammer2_key_t key_next; 3155 hammer2_key_t key; 3156 int nkeybits; 3157 int locount; 3158 int hicount; 3159 int cache_index; 3160 int maxloops = 300000; 3161 3162 key = *keyp; 3163 locount = 0; 3164 hicount = 0; 3165 3166 /* 3167 * Calculate the range of keys in the array being careful to skip 3168 * slots which are overridden with a deletion. Once the scan 3169 * completes we will cut the key range in half and shift half the 3170 * range into the new indirect block. 3171 */ 3172 key_beg = 0; 3173 key_end = HAMMER2_KEY_MAX; 3174 cache_index = 0; 3175 spin_lock(&parent->core.cst.spin); 3176 3177 for (;;) { 3178 if (--maxloops == 0) { 3179 panic("indkey_freemap shit %p %p:%d\n", 3180 parent, base, count); 3181 } 3182 chain = hammer2_combined_find(parent, base, count, 3183 &cache_index, &key_next, 3184 key_beg, key_end, 3185 &bref); 3186 3187 /* 3188 * Exhausted search 3189 */ 3190 if (bref == NULL) 3191 break; 3192 3193 /* 3194 * NOTE: No need to check DUPLICATED here because we do 3195 * not release the spinlock. 3196 */ 3197 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3198 if (key_next == 0 || key_next > key_end) 3199 break; 3200 key_beg = key_next; 3201 continue; 3202 } 3203 3204 /* 3205 * Use the full live (not deleted) element for the scan 3206 * iteration. HAMMER2 does not allow partial replacements. 3207 * 3208 * XXX should be built into hammer2_combined_find(). 3209 */ 3210 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3211 3212 /* 3213 * Expand our calculated key range (key, keybits) to fit 3214 * the scanned key. nkeybits represents the full range 3215 * that we will later cut in half (two halves @ nkeybits - 1). 3216 */ 3217 nkeybits = keybits; 3218 if (nkeybits < bref->keybits) { 3219 if (bref->keybits > 64) { 3220 kprintf("bad bref chain %p bref %p\n", 3221 chain, bref); 3222 Debugger("fubar"); 3223 } 3224 nkeybits = bref->keybits; 3225 } 3226 while (nkeybits < 64 && 3227 (~(((hammer2_key_t)1 << nkeybits) - 1) & 3228 (key ^ bref->key)) != 0) { 3229 ++nkeybits; 3230 } 3231 3232 /* 3233 * If the new key range is larger we have to determine 3234 * which side of the new key range the existing keys fall 3235 * under by checking the high bit, then collapsing the 3236 * locount into the hicount or vise-versa. 3237 */ 3238 if (keybits != nkeybits) { 3239 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 3240 hicount += locount; 3241 locount = 0; 3242 } else { 3243 locount += hicount; 3244 hicount = 0; 3245 } 3246 keybits = nkeybits; 3247 } 3248 3249 /* 3250 * The newly scanned key will be in the lower half or the 3251 * upper half of the (new) key range. 3252 */ 3253 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 3254 ++hicount; 3255 else 3256 ++locount; 3257 3258 if (key_next == 0) 3259 break; 3260 key_beg = key_next; 3261 } 3262 spin_unlock(&parent->core.cst.spin); 3263 bref = NULL; /* now invalid (safety) */ 3264 3265 /* 3266 * Adjust keybits to represent half of the full range calculated 3267 * above (radix 63 max) 3268 */ 3269 --keybits; 3270 3271 /* 3272 * Select whichever half contains the most elements. Theoretically 3273 * we can select either side as long as it contains at least one 3274 * element (in order to ensure that a free slot is present to hold 3275 * the indirect block). 3276 */ 3277 if (hammer2_indirect_optimize) { 3278 /* 3279 * Insert node for least number of keys, this will arrange 3280 * the first few blocks of a large file or the first few 3281 * inodes in a directory with fewer indirect blocks when 3282 * created linearly. 3283 */ 3284 if (hicount < locount && hicount != 0) 3285 key |= (hammer2_key_t)1 << keybits; 3286 else 3287 key &= ~(hammer2_key_t)1 << keybits; 3288 } else { 3289 /* 3290 * Insert node for most number of keys, best for heavily 3291 * fragmented files. 3292 */ 3293 if (hicount > locount) 3294 key |= (hammer2_key_t)1 << keybits; 3295 else 3296 key &= ~(hammer2_key_t)1 << keybits; 3297 } 3298 *keyp = key; 3299 3300 return (keybits); 3301 } 3302 3303 /* 3304 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if 3305 * it exists. 3306 * 3307 * Both parent and chain must be locked exclusively. 3308 * 3309 * This function will modify the parent if the blockref requires removal 3310 * from the parent's block table. 3311 * 3312 * This function is NOT recursive. Any entity already pushed into the 3313 * chain (such as an inode) may still need visibility into its contents, 3314 * as well as the ability to read and modify the contents. For example, 3315 * for an unlinked file which is still open. 3316 */ 3317 void 3318 hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent, 3319 hammer2_chain_t *chain, int flags) 3320 { 3321 KKASSERT(ccms_thread_lock_owned(&chain->core.cst)); 3322 3323 /* 3324 * Nothing to do if already marked. 3325 * 3326 * We need the spinlock on the core whos RBTREE contains chain 3327 * to protect against races. 3328 */ 3329 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 3330 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 3331 chain->parent == parent); 3332 _hammer2_chain_delete_helper(trans, parent, chain, flags); 3333 } 3334 3335 if (flags & HAMMER2_DELETE_PERMANENT) { 3336 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 3337 hammer2_flush(trans, chain); 3338 } else { 3339 /* XXX might not be needed */ 3340 hammer2_chain_setflush(trans, chain); 3341 } 3342 } 3343 3344 /* 3345 * Returns the index of the nearest element in the blockref array >= elm. 3346 * Returns (count) if no element could be found. 3347 * 3348 * Sets *key_nextp to the next key for loop purposes but does not modify 3349 * it if the next key would be higher than the current value of *key_nextp. 3350 * Note that *key_nexp can overflow to 0, which should be tested by the 3351 * caller. 3352 * 3353 * (*cache_indexp) is a heuristic and can be any value without effecting 3354 * the result. 3355 * 3356 * WARNING! Must be called with parent's spinlock held. Spinlock remains 3357 * held through the operation. 3358 */ 3359 static int 3360 hammer2_base_find(hammer2_chain_t *parent, 3361 hammer2_blockref_t *base, int count, 3362 int *cache_indexp, hammer2_key_t *key_nextp, 3363 hammer2_key_t key_beg, hammer2_key_t key_end) 3364 { 3365 hammer2_blockref_t *scan; 3366 hammer2_key_t scan_end; 3367 int i; 3368 int limit; 3369 3370 /* 3371 * Require the live chain's already have their core's counted 3372 * so we can optimize operations. 3373 */ 3374 KKASSERT(parent->core.flags & HAMMER2_CORE_COUNTEDBREFS); 3375 3376 /* 3377 * Degenerate case 3378 */ 3379 if (count == 0 || base == NULL) 3380 return(count); 3381 3382 /* 3383 * Sequential optimization using *cache_indexp. This is the most 3384 * likely scenario. 3385 * 3386 * We can avoid trailing empty entries on live chains, otherwise 3387 * we might have to check the whole block array. 3388 */ 3389 i = *cache_indexp; 3390 cpu_ccfence(); 3391 limit = parent->core.live_zero; 3392 if (i >= limit) 3393 i = limit - 1; 3394 if (i < 0) 3395 i = 0; 3396 KKASSERT(i < count); 3397 3398 /* 3399 * Search backwards 3400 */ 3401 scan = &base[i]; 3402 while (i > 0 && (scan->type == 0 || scan->key > key_beg)) { 3403 --scan; 3404 --i; 3405 } 3406 *cache_indexp = i; 3407 3408 /* 3409 * Search forwards, stop when we find a scan element which 3410 * encloses the key or until we know that there are no further 3411 * elements. 3412 */ 3413 while (i < count) { 3414 if (scan->type != 0) { 3415 scan_end = scan->key + 3416 ((hammer2_key_t)1 << scan->keybits) - 1; 3417 if (scan->key > key_beg || scan_end >= key_beg) 3418 break; 3419 } 3420 if (i >= limit) 3421 return (count); 3422 ++scan; 3423 ++i; 3424 } 3425 if (i != count) { 3426 *cache_indexp = i; 3427 if (i >= limit) { 3428 i = count; 3429 } else { 3430 scan_end = scan->key + 3431 ((hammer2_key_t)1 << scan->keybits); 3432 if (scan_end && (*key_nextp > scan_end || 3433 *key_nextp == 0)) { 3434 *key_nextp = scan_end; 3435 } 3436 } 3437 } 3438 return (i); 3439 } 3440 3441 /* 3442 * Do a combined search and return the next match either from the blockref 3443 * array or from the in-memory chain. Sets *bresp to the returned bref in 3444 * both cases, or sets it to NULL if the search exhausted. Only returns 3445 * a non-NULL chain if the search matched from the in-memory chain. 3446 * 3447 * When no in-memory chain has been found and a non-NULL bref is returned 3448 * in *bresp. 3449 * 3450 * 3451 * The returned chain is not locked or referenced. Use the returned bref 3452 * to determine if the search exhausted or not. Iterate if the base find 3453 * is chosen but matches a deleted chain. 3454 * 3455 * WARNING! Must be called with parent's spinlock held. Spinlock remains 3456 * held through the operation. 3457 */ 3458 static hammer2_chain_t * 3459 hammer2_combined_find(hammer2_chain_t *parent, 3460 hammer2_blockref_t *base, int count, 3461 int *cache_indexp, hammer2_key_t *key_nextp, 3462 hammer2_key_t key_beg, hammer2_key_t key_end, 3463 hammer2_blockref_t **bresp) 3464 { 3465 hammer2_blockref_t *bref; 3466 hammer2_chain_t *chain; 3467 int i; 3468 3469 /* 3470 * Lookup in block array and in rbtree. 3471 */ 3472 *key_nextp = key_end + 1; 3473 i = hammer2_base_find(parent, base, count, cache_indexp, 3474 key_nextp, key_beg, key_end); 3475 chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end); 3476 3477 /* 3478 * Neither matched 3479 */ 3480 if (i == count && chain == NULL) { 3481 *bresp = NULL; 3482 return(NULL); 3483 } 3484 3485 /* 3486 * Only chain matched. 3487 */ 3488 if (i == count) { 3489 bref = &chain->bref; 3490 goto found; 3491 } 3492 3493 /* 3494 * Only blockref matched. 3495 */ 3496 if (chain == NULL) { 3497 bref = &base[i]; 3498 goto found; 3499 } 3500 3501 /* 3502 * Both in-memory and blockref matched, select the nearer element. 3503 * 3504 * If both are flush with the left-hand side or both are the 3505 * same distance away, select the chain. In this situation the 3506 * chain must have been loaded from the matching blockmap. 3507 */ 3508 if ((chain->bref.key <= key_beg && base[i].key <= key_beg) || 3509 chain->bref.key == base[i].key) { 3510 KKASSERT(chain->bref.key == base[i].key); 3511 bref = &chain->bref; 3512 goto found; 3513 } 3514 3515 /* 3516 * Select the nearer key 3517 */ 3518 if (chain->bref.key < base[i].key) { 3519 bref = &chain->bref; 3520 } else { 3521 bref = &base[i]; 3522 chain = NULL; 3523 } 3524 3525 /* 3526 * If the bref is out of bounds we've exhausted our search. 3527 */ 3528 found: 3529 if (bref->key > key_end) { 3530 *bresp = NULL; 3531 chain = NULL; 3532 } else { 3533 *bresp = bref; 3534 } 3535 return(chain); 3536 } 3537 3538 /* 3539 * Locate the specified block array element and delete it. The element 3540 * must exist. 3541 * 3542 * The spin lock on the related chain must be held. 3543 * 3544 * NOTE: live_count was adjusted when the chain was deleted, so it does not 3545 * need to be adjusted when we commit the media change. 3546 */ 3547 void 3548 hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *parent, 3549 hammer2_blockref_t *base, int count, 3550 int *cache_indexp, hammer2_chain_t *chain) 3551 { 3552 hammer2_blockref_t *elm = &chain->bref; 3553 hammer2_key_t key_next; 3554 int i; 3555 3556 /* 3557 * Delete element. Expect the element to exist. 3558 * 3559 * XXX see caller, flush code not yet sophisticated enough to prevent 3560 * re-flushed in some cases. 3561 */ 3562 key_next = 0; /* max range */ 3563 i = hammer2_base_find(parent, base, count, cache_indexp, 3564 &key_next, elm->key, elm->key); 3565 if (i == count || base[i].type == 0 || 3566 base[i].key != elm->key || 3567 ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 && 3568 base[i].keybits != elm->keybits)) { 3569 spin_unlock(&parent->core.cst.spin); 3570 panic("delete base %p element not found at %d/%d elm %p\n", 3571 base, i, count, elm); 3572 return; 3573 } 3574 bzero(&base[i], sizeof(*base)); 3575 3576 /* 3577 * We can only optimize parent->core.live_zero for live chains. 3578 */ 3579 if (parent->core.live_zero == i + 1) { 3580 while (--i >= 0 && base[i].type == 0) 3581 ; 3582 parent->core.live_zero = i + 1; 3583 } 3584 3585 /* 3586 * Clear appropriate blockmap flags in chain. 3587 */ 3588 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED | 3589 HAMMER2_CHAIN_BMAPUPD); 3590 } 3591 3592 /* 3593 * Insert the specified element. The block array must not already have the 3594 * element and must have space available for the insertion. 3595 * 3596 * The spin lock on the related chain must be held. 3597 * 3598 * NOTE: live_count was adjusted when the chain was deleted, so it does not 3599 * need to be adjusted when we commit the media change. 3600 */ 3601 void 3602 hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent, 3603 hammer2_blockref_t *base, int count, 3604 int *cache_indexp, hammer2_chain_t *chain) 3605 { 3606 hammer2_blockref_t *elm = &chain->bref; 3607 hammer2_key_t key_next; 3608 hammer2_key_t xkey; 3609 int i; 3610 int j; 3611 int k; 3612 int l; 3613 int u = 1; 3614 3615 /* 3616 * Insert new element. Expect the element to not already exist 3617 * unless we are replacing it. 3618 * 3619 * XXX see caller, flush code not yet sophisticated enough to prevent 3620 * re-flushed in some cases. 3621 */ 3622 key_next = 0; /* max range */ 3623 i = hammer2_base_find(parent, base, count, cache_indexp, 3624 &key_next, elm->key, elm->key); 3625 3626 /* 3627 * Shortcut fill optimization, typical ordered insertion(s) may not 3628 * require a search. 3629 */ 3630 KKASSERT(i >= 0 && i <= count); 3631 3632 /* 3633 * Set appropriate blockmap flags in chain. 3634 */ 3635 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED); 3636 3637 /* 3638 * We can only optimize parent->core.live_zero for live chains. 3639 */ 3640 if (i == count && parent->core.live_zero < count) { 3641 i = parent->core.live_zero++; 3642 base[i] = *elm; 3643 return; 3644 } 3645 3646 xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1; 3647 if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) { 3648 spin_unlock(&parent->core.cst.spin); 3649 panic("insert base %p overlapping elements at %d elm %p\n", 3650 base, i, elm); 3651 } 3652 3653 /* 3654 * Try to find an empty slot before or after. 3655 */ 3656 j = i; 3657 k = i; 3658 while (j > 0 || k < count) { 3659 --j; 3660 if (j >= 0 && base[j].type == 0) { 3661 if (j == i - 1) { 3662 base[j] = *elm; 3663 } else { 3664 bcopy(&base[j+1], &base[j], 3665 (i - j - 1) * sizeof(*base)); 3666 base[i - 1] = *elm; 3667 } 3668 goto validate; 3669 } 3670 ++k; 3671 if (k < count && base[k].type == 0) { 3672 bcopy(&base[i], &base[i+1], 3673 (k - i) * sizeof(hammer2_blockref_t)); 3674 base[i] = *elm; 3675 3676 /* 3677 * We can only update parent->core.live_zero for live 3678 * chains. 3679 */ 3680 if (parent->core.live_zero <= k) 3681 parent->core.live_zero = k + 1; 3682 u = 2; 3683 goto validate; 3684 } 3685 } 3686 panic("hammer2_base_insert: no room!"); 3687 3688 /* 3689 * Debugging 3690 */ 3691 validate: 3692 key_next = 0; 3693 for (l = 0; l < count; ++l) { 3694 if (base[l].type) { 3695 key_next = base[l].key + 3696 ((hammer2_key_t)1 << base[l].keybits) - 1; 3697 break; 3698 } 3699 } 3700 while (++l < count) { 3701 if (base[l].type) { 3702 if (base[l].key <= key_next) 3703 panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l); 3704 key_next = base[l].key + 3705 ((hammer2_key_t)1 << base[l].keybits) - 1; 3706 3707 } 3708 } 3709 3710 } 3711 3712 #if 0 3713 3714 /* 3715 * Sort the blockref array for the chain. Used by the flush code to 3716 * sort the blockref[] array. 3717 * 3718 * The chain must be exclusively locked AND spin-locked. 3719 */ 3720 typedef hammer2_blockref_t *hammer2_blockref_p; 3721 3722 static 3723 int 3724 hammer2_base_sort_callback(const void *v1, const void *v2) 3725 { 3726 hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1; 3727 hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2; 3728 3729 /* 3730 * Make sure empty elements are placed at the end of the array 3731 */ 3732 if (bref1->type == 0) { 3733 if (bref2->type == 0) 3734 return(0); 3735 return(1); 3736 } else if (bref2->type == 0) { 3737 return(-1); 3738 } 3739 3740 /* 3741 * Sort by key 3742 */ 3743 if (bref1->key < bref2->key) 3744 return(-1); 3745 if (bref1->key > bref2->key) 3746 return(1); 3747 return(0); 3748 } 3749 3750 void 3751 hammer2_base_sort(hammer2_chain_t *chain) 3752 { 3753 hammer2_blockref_t *base; 3754 int count; 3755 3756 switch(chain->bref.type) { 3757 case HAMMER2_BREF_TYPE_INODE: 3758 /* 3759 * Special shortcut for embedded data returns the inode 3760 * itself. Callers must detect this condition and access 3761 * the embedded data (the strategy code does this for us). 3762 * 3763 * This is only applicable to regular files and softlinks. 3764 */ 3765 if (chain->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) 3766 return; 3767 base = &chain->data->ipdata.u.blockset.blockref[0]; 3768 count = HAMMER2_SET_COUNT; 3769 break; 3770 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3771 case HAMMER2_BREF_TYPE_INDIRECT: 3772 /* 3773 * Optimize indirect blocks in the INITIAL state to avoid 3774 * I/O. 3775 */ 3776 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0); 3777 base = &chain->data->npdata[0]; 3778 count = chain->bytes / sizeof(hammer2_blockref_t); 3779 break; 3780 case HAMMER2_BREF_TYPE_VOLUME: 3781 base = &chain->hmp->voldata.sroot_blockset.blockref[0]; 3782 count = HAMMER2_SET_COUNT; 3783 break; 3784 case HAMMER2_BREF_TYPE_FREEMAP: 3785 base = &chain->hmp->voldata.freemap_blockset.blockref[0]; 3786 count = HAMMER2_SET_COUNT; 3787 break; 3788 default: 3789 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 3790 chain->bref.type); 3791 base = NULL; /* safety */ 3792 count = 0; /* safety */ 3793 } 3794 kqsort(base, count, sizeof(*base), hammer2_base_sort_callback); 3795 } 3796 3797 #endif 3798 3799 /* 3800 * Chain memory management 3801 */ 3802 void 3803 hammer2_chain_wait(hammer2_chain_t *chain) 3804 { 3805 tsleep(chain, 0, "chnflw", 1); 3806 } 3807 3808 const hammer2_media_data_t * 3809 hammer2_chain_rdata(hammer2_chain_t *chain) 3810 { 3811 KKASSERT(chain->data != NULL); 3812 return (chain->data); 3813 } 3814 3815 hammer2_media_data_t * 3816 hammer2_chain_wdata(hammer2_chain_t *chain) 3817 { 3818 KKASSERT(chain->data != NULL); 3819 return (chain->data); 3820 } 3821 3822 /* 3823 * Set the check data for a chain. This can be a heavy-weight operation 3824 * and typically only runs on-flush. For file data check data is calculated 3825 * when the logical buffers are flushed. 3826 */ 3827 void 3828 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata) 3829 { 3830 chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO; 3831 3832 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 3833 case HAMMER2_CHECK_NONE: 3834 break; 3835 case HAMMER2_CHECK_DISABLED: 3836 break; 3837 case HAMMER2_CHECK_ISCSI32: 3838 chain->bref.check.iscsi32.value = 3839 hammer2_icrc32(bdata, chain->bytes); 3840 break; 3841 case HAMMER2_CHECK_CRC64: 3842 chain->bref.check.crc64.value = 0; 3843 /* XXX */ 3844 break; 3845 case HAMMER2_CHECK_SHA192: 3846 { 3847 SHA256_CTX hash_ctx; 3848 union { 3849 uint8_t digest[SHA256_DIGEST_LENGTH]; 3850 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 3851 } u; 3852 3853 SHA256_Init(&hash_ctx); 3854 SHA256_Update(&hash_ctx, bdata, chain->bytes); 3855 SHA256_Final(u.digest, &hash_ctx); 3856 u.digest64[2] ^= u.digest64[3]; 3857 bcopy(u.digest, 3858 chain->bref.check.sha192.data, 3859 sizeof(chain->bref.check.sha192.data)); 3860 } 3861 break; 3862 case HAMMER2_CHECK_FREEMAP: 3863 chain->bref.check.freemap.icrc32 = 3864 hammer2_icrc32(bdata, chain->bytes); 3865 break; 3866 default: 3867 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 3868 chain->bref.methods); 3869 break; 3870 } 3871 } 3872 3873 int 3874 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 3875 { 3876 int r; 3877 3878 if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO) 3879 return 1; 3880 3881 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 3882 case HAMMER2_CHECK_NONE: 3883 r = 1; 3884 break; 3885 case HAMMER2_CHECK_DISABLED: 3886 r = 1; 3887 break; 3888 case HAMMER2_CHECK_ISCSI32: 3889 r = (chain->bref.check.iscsi32.value == 3890 hammer2_icrc32(bdata, chain->bytes)); 3891 break; 3892 case HAMMER2_CHECK_CRC64: 3893 r = (chain->bref.check.crc64.value == 0); 3894 /* XXX */ 3895 break; 3896 case HAMMER2_CHECK_SHA192: 3897 { 3898 SHA256_CTX hash_ctx; 3899 union { 3900 uint8_t digest[SHA256_DIGEST_LENGTH]; 3901 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 3902 } u; 3903 3904 SHA256_Init(&hash_ctx); 3905 SHA256_Update(&hash_ctx, bdata, chain->bytes); 3906 SHA256_Final(u.digest, &hash_ctx); 3907 u.digest64[2] ^= u.digest64[3]; 3908 if (bcmp(u.digest, 3909 chain->bref.check.sha192.data, 3910 sizeof(chain->bref.check.sha192.data)) == 0) { 3911 r = 1; 3912 } else { 3913 r = 0; 3914 } 3915 } 3916 break; 3917 case HAMMER2_CHECK_FREEMAP: 3918 r = (chain->bref.check.freemap.icrc32 == 3919 hammer2_icrc32(bdata, chain->bytes)); 3920 if (r == 0) { 3921 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 3922 chain->bref.check.freemap.icrc32, 3923 hammer2_icrc32(bdata, chain->bytes), chain->bytes); 3924 if (chain->dio) 3925 kprintf("dio %p buf %016jx,%d bdata %p/%p\n", 3926 chain->dio, chain->dio->bp->b_loffset, chain->dio->bp->b_bufsize, bdata, chain->dio->bp->b_data); 3927 } 3928 3929 break; 3930 default: 3931 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 3932 chain->bref.methods); 3933 r = 1; 3934 break; 3935 } 3936 return r; 3937 } 3938