1 /* 2 * Copyright (c) 2011-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 38 * 39 * Chains are the in-memory version on media objects (volume header, inodes, 40 * indirect blocks, data blocks, etc). Chains represent a portion of the 41 * HAMMER2 topology. 42 * 43 * Chains are no-longer delete-duplicated. Instead, the original in-memory 44 * chain will be moved along with its block reference (e.g. for things like 45 * renames, hardlink operations, modifications, etc), and will be indexed 46 * on a secondary list for flush handling instead of propagating a flag 47 * upward to the root. 48 * 49 * Concurrent front-end operations can still run against backend flushes 50 * as long as they do not cross the current flush boundary. An operation 51 * running above the current flush (in areas not yet flushed) can become 52 * part of the current flush while ano peration running below the current 53 * flush can become part of the next flush. 54 */ 55 #include <sys/cdefs.h> 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/types.h> 59 #include <sys/lock.h> 60 #include <sys/kern_syscall.h> 61 #include <sys/uuid.h> 62 63 #include <crypto/sha2/sha2.h> 64 65 #include "hammer2.h" 66 67 static hammer2_chain_t *hammer2_chain_create_indirect( 68 hammer2_chain_t *parent, 69 hammer2_key_t key, int keybits, 70 hammer2_tid_t mtid, int for_type, int *errorp); 71 static hammer2_io_t *hammer2_chain_drop_data(hammer2_chain_t *chain, 72 int lastdrop); 73 static hammer2_chain_t *hammer2_combined_find( 74 hammer2_chain_t *parent, 75 hammer2_blockref_t *base, int count, 76 int *cache_indexp, hammer2_key_t *key_nextp, 77 hammer2_key_t key_beg, hammer2_key_t key_end, 78 hammer2_blockref_t **bresp); 79 80 /* 81 * Basic RBTree for chains (core->rbtree and core->dbtree). Chains cannot 82 * overlap in the RB trees. Deleted chains are moved from rbtree to either 83 * dbtree or to dbq. 84 * 85 * Chains in delete-duplicate sequences can always iterate through core_entry 86 * to locate the live version of the chain. 87 */ 88 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 89 90 extern int h2timer[32]; 91 extern int h2last; 92 extern int h2lid; 93 94 #define TIMER(which) do { \ 95 if (h2last) \ 96 h2timer[h2lid] += (int)(ticks - h2last);\ 97 h2last = ticks; \ 98 h2lid = which; \ 99 } while(0) 100 101 int 102 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 103 { 104 hammer2_key_t c1_beg; 105 hammer2_key_t c1_end; 106 hammer2_key_t c2_beg; 107 hammer2_key_t c2_end; 108 109 /* 110 * Compare chains. Overlaps are not supposed to happen and catch 111 * any software issues early we count overlaps as a match. 112 */ 113 c1_beg = chain1->bref.key; 114 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1; 115 c2_beg = chain2->bref.key; 116 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1; 117 118 if (c1_end < c2_beg) /* fully to the left */ 119 return(-1); 120 if (c1_beg > c2_end) /* fully to the right */ 121 return(1); 122 return(0); /* overlap (must not cross edge boundary) */ 123 } 124 125 /* 126 * Make a chain visible to the flusher. The flusher needs to be able to 127 * do flushes of subdirectory chains or single files so it does a top-down 128 * recursion using the ONFLUSH flag for the recursion. It locates MODIFIED 129 * or UPDATE chains and flushes back up the chain to the volume root. 130 * 131 * This routine sets ONFLUSH upward until it hits the volume root. For 132 * simplicity we ignore PFSROOT boundaries whos rules can be complex. 133 * Extra ONFLUSH flagging doesn't hurt the filesystem. 134 */ 135 void 136 hammer2_chain_setflush(hammer2_chain_t *chain) 137 { 138 hammer2_chain_t *parent; 139 140 if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 141 hammer2_spin_sh(&chain->core.spin); 142 while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) { 143 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH); 144 if ((parent = chain->parent) == NULL) 145 break; 146 hammer2_spin_sh(&parent->core.spin); 147 hammer2_spin_unsh(&chain->core.spin); 148 chain = parent; 149 } 150 hammer2_spin_unsh(&chain->core.spin); 151 } 152 } 153 154 /* 155 * Allocate a new disconnected chain element representing the specified 156 * bref. chain->refs is set to 1 and the passed bref is copied to 157 * chain->bref. chain->bytes is derived from the bref. 158 * 159 * chain->pmp inherits pmp unless the chain is an inode (other than the 160 * super-root inode). 161 * 162 * NOTE: Returns a referenced but unlocked (because there is no core) chain. 163 */ 164 hammer2_chain_t * 165 hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp, 166 hammer2_blockref_t *bref) 167 { 168 hammer2_chain_t *chain; 169 u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 170 171 atomic_add_long(&hammer2_chain_allocs, 1); 172 173 /* 174 * Construct the appropriate system structure. 175 */ 176 switch(bref->type) { 177 case HAMMER2_BREF_TYPE_INODE: 178 case HAMMER2_BREF_TYPE_INDIRECT: 179 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 180 case HAMMER2_BREF_TYPE_DATA: 181 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 182 /* 183 * Chain's are really only associated with the hmp but we 184 * maintain a pmp association for per-mount memory tracking 185 * purposes. The pmp can be NULL. 186 */ 187 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO); 188 break; 189 case HAMMER2_BREF_TYPE_VOLUME: 190 case HAMMER2_BREF_TYPE_FREEMAP: 191 /* 192 * Only hammer2_chain_bulksnap() calls this function with these 193 * types. 194 */ 195 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO); 196 break; 197 default: 198 chain = NULL; 199 panic("hammer2_chain_alloc: unrecognized blockref type: %d", 200 bref->type); 201 } 202 203 /* 204 * Initialize the new chain structure. pmp must be set to NULL for 205 * chains belonging to the super-root topology of a device mount. 206 */ 207 if (pmp == hmp->spmp) 208 chain->pmp = NULL; 209 else 210 chain->pmp = pmp; 211 chain->hmp = hmp; 212 chain->bref = *bref; 213 chain->bytes = bytes; 214 chain->refs = 1; 215 chain->flags = HAMMER2_CHAIN_ALLOCATED; 216 217 /* 218 * Set the PFS boundary flag if this chain represents a PFS root. 219 */ 220 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT) 221 atomic_set_int(&chain->flags, HAMMER2_CHAIN_PFSBOUNDARY); 222 hammer2_chain_core_init(chain); 223 224 return (chain); 225 } 226 227 /* 228 * Initialize a chain's core structure. This structure used to be allocated 229 * but is now embedded. 230 * 231 * The core is not locked. No additional refs on the chain are made. 232 * (trans) must not be NULL if (core) is not NULL. 233 */ 234 void 235 hammer2_chain_core_init(hammer2_chain_t *chain) 236 { 237 /* 238 * Fresh core under nchain (no multi-homing of ochain's 239 * sub-tree). 240 */ 241 RB_INIT(&chain->core.rbtree); /* live chains */ 242 hammer2_mtx_init(&chain->lock, "h2chain"); 243 } 244 245 /* 246 * Add a reference to a chain element, preventing its destruction. 247 * 248 * (can be called with spinlock held) 249 */ 250 void 251 hammer2_chain_ref(hammer2_chain_t *chain) 252 { 253 if (atomic_fetchadd_int(&chain->refs, 1) == 0) { 254 /* 255 * 0->non-zero transition must ensure that chain is removed 256 * from the LRU list. 257 * 258 * NOTE: Already holding lru_spin here so we cannot call 259 * hammer2_chain_ref() to get it off lru_list, do 260 * it manually. 261 */ 262 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 263 hammer2_pfs_t *pmp = chain->pmp; 264 hammer2_spin_ex(&pmp->lru_spin); 265 if (chain->flags & HAMMER2_CHAIN_ONLRU) { 266 atomic_add_int(&pmp->lru_count, -1); 267 atomic_clear_int(&chain->flags, 268 HAMMER2_CHAIN_ONLRU); 269 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node); 270 } 271 hammer2_spin_unex(&pmp->lru_spin); 272 } 273 } 274 #if 0 275 kprintf("REFC %p %d %08x\n", chain, chain->refs - 1, chain->flags); 276 print_backtrace(8); 277 #endif 278 } 279 280 /* 281 * Ref a locked chain and force the data to be held across an unlock. 282 * Chain must be currently locked. The user of the chain who desires 283 * to release the hold must call hammer2_chain_lock_unhold() to lock 284 * and unhold the chain, then unlock normally, or may simply call 285 * hammer2_chain_drop_unhold() (which is safer against deadlocks). 286 */ 287 void 288 hammer2_chain_ref_hold(hammer2_chain_t *chain) 289 { 290 atomic_add_int(&chain->persist_refs, 1); 291 hammer2_chain_ref(chain); 292 } 293 294 /* 295 * Insert the chain in the core rbtree. 296 * 297 * Normal insertions are placed in the live rbtree. Insertion of a deleted 298 * chain is a special case used by the flush code that is placed on the 299 * unstaged deleted list to avoid confusing the live view. 300 */ 301 #define HAMMER2_CHAIN_INSERT_SPIN 0x0001 302 #define HAMMER2_CHAIN_INSERT_LIVE 0x0002 303 #define HAMMER2_CHAIN_INSERT_RACE 0x0004 304 305 static 306 int 307 hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain, 308 int flags, int generation) 309 { 310 hammer2_chain_t *xchain; 311 int error = 0; 312 313 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 314 hammer2_spin_ex(&parent->core.spin); 315 316 /* 317 * Interlocked by spinlock, check for race 318 */ 319 if ((flags & HAMMER2_CHAIN_INSERT_RACE) && 320 parent->core.generation != generation) { 321 error = EAGAIN; 322 goto failed; 323 } 324 325 /* 326 * Insert chain 327 */ 328 xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain); 329 KASSERT(xchain == NULL, 330 ("hammer2_chain_insert: collision %p %p (key=%016jx)", 331 chain, xchain, chain->bref.key)); 332 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 333 chain->parent = parent; 334 ++parent->core.chain_count; 335 ++parent->core.generation; /* XXX incs for _get() too, XXX */ 336 337 /* 338 * We have to keep track of the effective live-view blockref count 339 * so the create code knows when to push an indirect block. 340 */ 341 if (flags & HAMMER2_CHAIN_INSERT_LIVE) 342 atomic_add_int(&parent->core.live_count, 1); 343 failed: 344 if (flags & HAMMER2_CHAIN_INSERT_SPIN) 345 hammer2_spin_unex(&parent->core.spin); 346 return error; 347 } 348 349 /* 350 * Drop the caller's reference to the chain. When the ref count drops to 351 * zero this function will try to disassociate the chain from its parent and 352 * deallocate it, then recursely drop the parent using the implied ref 353 * from the chain's chain->parent. 354 */ 355 static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain); 356 357 void 358 hammer2_chain_drop(hammer2_chain_t *chain) 359 { 360 u_int refs; 361 362 if (hammer2_debug & 0x200000) 363 Debugger("drop"); 364 #if 0 365 kprintf("DROP %p %d %08x\n", chain, chain->refs - 1, chain->flags); 366 print_backtrace(8); 367 #endif 368 369 KKASSERT(chain->refs > 0); 370 371 while (chain) { 372 refs = chain->refs; 373 cpu_ccfence(); 374 KKASSERT(refs > 0); 375 376 if (refs == 1) { 377 chain = hammer2_chain_lastdrop(chain); 378 } else { 379 if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) 380 break; 381 /* retry the same chain */ 382 } 383 } 384 } 385 386 /* 387 * Unhold a held and probably not-locked chain. To ensure that the data 388 * is properly dropped we check lockcnt. If lockcnt is 0 we unconditionally 389 * interlock the chain to release its data. We must obtain the lock 390 * unconditionally becuase it is possible for the chain to still be 391 * temporarily locked by a hammer2_chain_unlock() call in a race. 392 */ 393 void 394 hammer2_chain_drop_unhold(hammer2_chain_t *chain) 395 { 396 hammer2_io_t *dio; 397 398 atomic_add_int(&chain->persist_refs, -1); 399 cpu_lfence(); 400 if (chain->lockcnt == 0) { 401 hammer2_mtx_ex(&chain->lock); 402 if (chain->lockcnt == 0 && chain->persist_refs == 0) { 403 dio = hammer2_chain_drop_data(chain, 0); 404 if (dio) 405 hammer2_io_bqrelse(&dio); 406 } 407 hammer2_mtx_unlock(&chain->lock); 408 } 409 hammer2_chain_drop(chain); 410 } 411 412 /* 413 * Safe handling of the 1->0 transition on chain. Returns a chain for 414 * recursive drop or NULL, possibly returning the same chain if the atomic 415 * op fails. 416 * 417 * When two chains need to be recursively dropped we use the chain we 418 * would otherwise free to placehold the additional chain. It's a bit 419 * convoluted but we can't just recurse without potentially blowing out 420 * the kernel stack. 421 * 422 * The chain cannot be freed if it has any children. 423 * The chain cannot be freed if flagged MODIFIED unless we can dispose of that. 424 * The chain cannot be freed if flagged UPDATE unless we can dispose of that. 425 * 426 * The core spinlock is allowed nest child-to-parent (not parent-to-child). 427 */ 428 static 429 hammer2_chain_t * 430 hammer2_chain_lastdrop(hammer2_chain_t *chain) 431 { 432 hammer2_pfs_t *pmp; 433 hammer2_dev_t *hmp; 434 hammer2_chain_t *parent; 435 hammer2_chain_t *rdrop; 436 hammer2_io_t *dio; 437 438 /* 439 * Critical field access. 440 */ 441 hammer2_spin_ex(&chain->core.spin); 442 443 if ((parent = chain->parent) != NULL) { 444 /* 445 * If the chain has a parent the UPDATE bit prevents scrapping 446 * as the chain is needed to properly flush the parent. Try 447 * to complete the 1->0 transition and return NULL. Retry 448 * (return chain) if we are unable to complete the 1->0 449 * transition, else return NULL (nothing more to do). 450 * 451 * If the chain has a parent the MODIFIED bit prevents 452 * scrapping. 453 * 454 * Chains with UPDATE/MODIFIED are *not* put on the LRU list! 455 */ 456 if (chain->flags & (HAMMER2_CHAIN_UPDATE | 457 HAMMER2_CHAIN_MODIFIED)) { 458 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 459 dio = hammer2_chain_drop_data(chain, 0); 460 hammer2_spin_unex(&chain->core.spin); 461 if (dio) 462 hammer2_io_bqrelse(&dio); 463 chain = NULL; 464 } else { 465 hammer2_spin_unex(&chain->core.spin); 466 } 467 return (chain); 468 } 469 /* spinlock still held */ 470 } else { 471 /* 472 * The chain has no parent and can be flagged for destruction. 473 * Since it has no parent, UPDATE can also be cleared. 474 */ 475 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 476 if (chain->flags & HAMMER2_CHAIN_UPDATE) 477 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 478 479 /* 480 * If the chain has children or if it has been MODIFIED and 481 * also recorded for DEDUP, we must still flush the chain. 482 * 483 * In the case where it has children, the DESTROY flag test 484 * in the flush code will prevent unnecessary flushes of 485 * MODIFIED chains that are not flagged DEDUP so don't worry 486 * about that here. 487 */ 488 if (chain->core.chain_count || 489 (chain->flags & (HAMMER2_CHAIN_MODIFIED | 490 HAMMER2_CHAIN_DEDUP)) == 491 (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_DEDUP)) { 492 /* 493 * Put on flushq (should ensure refs > 1), retry 494 * the drop. 495 */ 496 hammer2_spin_unex(&chain->core.spin); 497 hammer2_delayed_flush(chain); 498 return(chain); /* retry drop */ 499 } 500 501 /* 502 * Otherwise we can scrap the MODIFIED bit if it is set, 503 * and continue along the freeing path. 504 */ 505 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 506 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 507 atomic_add_long(&hammer2_count_modified_chains, -1); 508 if (chain->pmp) 509 hammer2_pfs_memory_wakeup(chain->pmp); 510 } 511 /* spinlock still held */ 512 } 513 514 /* spinlock still held */ 515 dio = NULL; 516 517 /* 518 * If any children exist we must leave the chain intact with refs == 0. 519 * They exist because chains are retained below us which have refs or 520 * may require flushing. This case can occur when parent != NULL. 521 * 522 * Retry (return chain) if we fail to transition the refs to 0, else 523 * return NULL indication nothing more to do. 524 * 525 * Chains with children are NOT put on the LRU list. 526 */ 527 if (chain->core.chain_count) { 528 if (parent) 529 hammer2_spin_ex(&parent->core.spin); 530 if (atomic_cmpset_int(&chain->refs, 1, 0)) { 531 dio = hammer2_chain_drop_data(chain, 1); 532 hammer2_spin_unex(&chain->core.spin); 533 if (parent) 534 hammer2_spin_unex(&parent->core.spin); 535 chain = NULL; 536 if (dio) 537 hammer2_io_bqrelse(&dio); 538 } else { 539 hammer2_spin_unex(&chain->core.spin); 540 if (parent) 541 hammer2_spin_unex(&parent->core.spin); 542 } 543 return (chain); 544 } 545 /* spinlock still held */ 546 /* no chains left under us */ 547 548 /* 549 * chain->core has no children left so no accessors can get to our 550 * chain from there. Now we have to lock the parent core to interlock 551 * remaining possible accessors that might bump chain's refs before 552 * we can safely drop chain's refs with intent to free the chain. 553 */ 554 hmp = chain->hmp; 555 pmp = chain->pmp; /* can be NULL */ 556 rdrop = NULL; 557 558 parent = chain->parent; 559 560 /* 561 * WARNING! chain's spin lock is still held here, and other spinlocks 562 * will be acquired and released in the code below. We 563 * cannot be making fancy procedure calls! 564 */ 565 566 /* 567 * We can cache the chain if it is associated with a pmp 568 * and not flagged as being destroyed or requesting a full 569 * release. In this situation the chain is not removed 570 * from its parent, i.e. it can still be looked up. 571 * 572 * We intentionally do not cache DATA chains because these 573 * were likely used to load data into the logical buffer cache 574 * and will not be accessed again for some time. 575 */ 576 if ((chain->flags & 577 (HAMMER2_CHAIN_DESTROY | HAMMER2_CHAIN_RELEASE)) == 0 && 578 chain->pmp && 579 chain->bref.type != HAMMER2_BREF_TYPE_DATA) { 580 if (parent) 581 hammer2_spin_ex(&parent->core.spin); 582 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 583 /* 584 * 1->0 transition failed, retry. Do not drop 585 * the chain's data yet! 586 */ 587 if (parent) 588 hammer2_spin_unex(&parent->core.spin); 589 hammer2_spin_unex(&chain->core.spin); 590 591 return(chain); 592 } 593 594 /* 595 * Success, be sure to clean out the chain's data 596 * before putting it on a queue that it might be 597 * reused from. 598 */ 599 dio = hammer2_chain_drop_data(chain, 1); 600 601 KKASSERT((chain->flags & HAMMER2_CHAIN_ONLRU) == 0); 602 hammer2_spin_ex(&pmp->lru_spin); 603 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONLRU); 604 TAILQ_INSERT_TAIL(&pmp->lru_list, chain, lru_node); 605 606 /* 607 * If we are over the LRU limit we need to drop something. 608 */ 609 if (pmp->lru_count > HAMMER2_LRU_LIMIT) { 610 rdrop = TAILQ_FIRST(&pmp->lru_list); 611 atomic_clear_int(&rdrop->flags, HAMMER2_CHAIN_ONLRU); 612 TAILQ_REMOVE(&pmp->lru_list, rdrop, lru_node); 613 atomic_add_int(&rdrop->refs, 1); 614 atomic_set_int(&rdrop->flags, HAMMER2_CHAIN_RELEASE); 615 } else { 616 atomic_add_int(&pmp->lru_count, 1); 617 } 618 hammer2_spin_unex(&pmp->lru_spin); 619 if (parent) { 620 hammer2_spin_unex(&parent->core.spin); 621 parent = NULL; /* safety */ 622 } 623 hammer2_spin_unex(&chain->core.spin); 624 if (dio) 625 hammer2_io_bqrelse(&dio); 626 627 return rdrop; 628 /* NOT REACHED */ 629 } 630 631 /* 632 * Spinlock the parent and try to drop the last ref on chain. 633 * On success determine if we should dispose of the chain 634 * (remove the chain from its parent, etc). 635 * 636 * (normal core locks are top-down recursive but we define 637 * core spinlocks as bottom-up recursive, so this is safe). 638 */ 639 if (parent) { 640 hammer2_spin_ex(&parent->core.spin); 641 if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) { 642 #if 0 643 /* XXX remove, don't try to drop data on fail */ 644 hammer2_spin_unex(&parent->core.spin); 645 dio = hammer2_chain_drop_data(chain, 0); 646 hammer2_spin_unex(&chain->core.spin); 647 if (dio) 648 hammer2_io_bqrelse(&dio); 649 #endif 650 /* 651 * 1->0 transition failed, retry. 652 */ 653 hammer2_spin_unex(&parent->core.spin); 654 hammer2_spin_unex(&chain->core.spin); 655 656 return(chain); 657 } 658 659 /* 660 * 1->0 transition successful, remove chain from the 661 * parent. 662 */ 663 if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 664 RB_REMOVE(hammer2_chain_tree, 665 &parent->core.rbtree, chain); 666 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 667 --parent->core.chain_count; 668 chain->parent = NULL; 669 } 670 671 /* 672 * If our chain was the last chain in the parent's core the 673 * core is now empty and its parent might have to be 674 * re-dropped if it has 0 refs. 675 */ 676 if (parent->core.chain_count == 0) { 677 rdrop = parent; 678 atomic_add_int(&rdrop->refs, 1); 679 /* 680 if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) 681 rdrop = NULL; 682 */ 683 } 684 hammer2_spin_unex(&parent->core.spin); 685 parent = NULL; /* safety */ 686 /* FALL THROUGH */ 687 } 688 689 /* 690 * Successful 1->0 transition and the chain can be destroyed now. 691 * 692 * We still have the core spinlock, and core's chain_count is 0. 693 * Any parent spinlock is gone. 694 */ 695 hammer2_spin_unex(&chain->core.spin); 696 KKASSERT(RB_EMPTY(&chain->core.rbtree) && 697 chain->core.chain_count == 0); 698 699 /* 700 * All spin locks are gone, no pointers remain to the chain, finish 701 * freeing it. 702 */ 703 KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE | 704 HAMMER2_CHAIN_MODIFIED)) == 0); 705 dio = hammer2_chain_drop_data(chain, 1); 706 if (dio) 707 hammer2_io_bqrelse(&dio); 708 709 /* 710 * Once chain resources are gone we can use the now dead chain 711 * structure to placehold what might otherwise require a recursive 712 * drop, because we have potentially two things to drop and can only 713 * return one directly. 714 */ 715 if (chain->flags & HAMMER2_CHAIN_ALLOCATED) { 716 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ALLOCATED); 717 chain->hmp = NULL; 718 kfree(chain, hmp->mchain); 719 } 720 721 /* 722 * Possible chaining loop when parent re-drop needed. 723 */ 724 return(rdrop); 725 } 726 727 /* 728 * On either last lock release or last drop 729 */ 730 static hammer2_io_t * 731 hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop) 732 { 733 hammer2_io_t *dio; 734 735 if ((dio = chain->dio) != NULL) { 736 chain->dio = NULL; 737 chain->data = NULL; 738 } else { 739 switch(chain->bref.type) { 740 case HAMMER2_BREF_TYPE_VOLUME: 741 case HAMMER2_BREF_TYPE_FREEMAP: 742 if (lastdrop) 743 chain->data = NULL; 744 break; 745 default: 746 if (chain->data != NULL) { 747 hammer2_spin_unex(&chain->core.spin); 748 panic("chain data not null"); 749 } 750 KKASSERT(chain->data == NULL); 751 break; 752 } 753 } 754 return dio; 755 } 756 757 /* 758 * Lock a referenced chain element, acquiring its data with I/O if necessary, 759 * and specify how you would like the data to be resolved. 760 * 761 * If an I/O or other fatal error occurs, chain->error will be set to non-zero. 762 * 763 * The lock is allowed to recurse, multiple locking ops will aggregate 764 * the requested resolve types. Once data is assigned it will not be 765 * removed until the last unlock. 766 * 767 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element. 768 * (typically used to avoid device/logical buffer 769 * aliasing for data) 770 * 771 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in 772 * the INITIAL-create state (indirect blocks only). 773 * 774 * Do not resolve data elements for DATA chains. 775 * (typically used to avoid device/logical buffer 776 * aliasing for data) 777 * 778 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element. 779 * 780 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise 781 * it will be locked exclusive. 782 * 783 * NOTE: Embedded elements (volume header, inodes) are always resolved 784 * regardless. 785 * 786 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded 787 * element will instantiate and zero its buffer, and flush it on 788 * release. 789 * 790 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE 791 * so as not to instantiate a device buffer, which could alias against 792 * a logical file buffer. However, if ALWAYS is specified the 793 * device buffer will be instantiated anyway. 794 * 795 * WARNING! This function blocks on I/O if data needs to be fetched. This 796 * blocking can run concurrent with other compatible lock holders 797 * who do not need data returning. The lock is not upgraded to 798 * exclusive during a data fetch, a separate bit is used to 799 * interlock I/O. However, an exclusive lock holder can still count 800 * on being interlocked against an I/O fetch managed by a shared 801 * lock holder. 802 */ 803 void 804 hammer2_chain_lock(hammer2_chain_t *chain, int how) 805 { 806 /* 807 * Ref and lock the element. Recursive locks are allowed. 808 */ 809 KKASSERT(chain->refs > 0); 810 atomic_add_int(&chain->lockcnt, 1); 811 812 TIMER(20); 813 814 /* 815 * Get the appropriate lock. If LOCKAGAIN is flagged with SHARED 816 * the caller expects a shared lock to already be present and we 817 * are giving it another ref. This case must importantly not block 818 * if there is a pending exclusive lock request. 819 */ 820 if (how & HAMMER2_RESOLVE_SHARED) { 821 if (how & HAMMER2_RESOLVE_LOCKAGAIN) { 822 hammer2_mtx_sh_again(&chain->lock); 823 } else { 824 hammer2_mtx_sh(&chain->lock); 825 } 826 } else { 827 hammer2_mtx_ex(&chain->lock); 828 } 829 ++curthread->td_tracker; 830 TIMER(21); 831 832 /* 833 * If we already have a valid data pointer no further action is 834 * necessary. 835 */ 836 if (chain->data) 837 return; 838 TIMER(22); 839 840 /* 841 * Do we have to resolve the data? 842 */ 843 switch(how & HAMMER2_RESOLVE_MASK) { 844 case HAMMER2_RESOLVE_NEVER: 845 return; 846 case HAMMER2_RESOLVE_MAYBE: 847 if (chain->flags & HAMMER2_CHAIN_INITIAL) 848 return; 849 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) 850 return; 851 #if 0 852 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) 853 return; 854 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) 855 return; 856 #endif 857 /* fall through */ 858 case HAMMER2_RESOLVE_ALWAYS: 859 default: 860 break; 861 } 862 863 /* 864 * Caller requires data 865 */ 866 hammer2_chain_load_data(chain); 867 } 868 869 /* 870 * Lock the chain and remove the data hold (matches against 871 * hammer2_chain_unlock_hold()). The data remains valid because 872 * the chain is now locked, but will be dropped as per-normal when 873 * the caller does a normal unlock. 874 */ 875 void 876 hammer2_chain_lock_unhold(hammer2_chain_t *chain, int how) 877 { 878 atomic_add_int(&chain->persist_refs, -1); 879 hammer2_chain_lock(chain, how); 880 } 881 882 #if 0 883 /* 884 * Downgrade an exclusive chain lock to a shared chain lock. 885 * 886 * NOTE: There is no upgrade equivalent due to the ease of 887 * deadlocks in that direction. 888 */ 889 void 890 hammer2_chain_lock_downgrade(hammer2_chain_t *chain) 891 { 892 hammer2_mtx_downgrade(&chain->lock); 893 } 894 #endif 895 896 #if 0 897 /* 898 * Obtains a second shared lock on the chain, does not account the second 899 * shared lock as being owned by the current thread. 900 * 901 * Caller must already own a shared lock on this chain. 902 * 903 * The lock function is required to obtain the second shared lock without 904 * blocking on pending exclusive requests. 905 */ 906 void 907 hammer2_chain_push_shared_lock(hammer2_chain_t *chain) 908 { 909 hammer2_mtx_sh_again(&chain->lock); 910 atomic_add_int(&chain->lockcnt, 1); 911 /* do not count in td_tracker for this thread */ 912 } 913 914 /* 915 * Accounts for a shared lock that was pushed to us as being owned by our 916 * thread. 917 */ 918 void 919 hammer2_chain_pull_shared_lock(hammer2_chain_t *chain) 920 { 921 ++curthread->td_tracker; 922 } 923 #endif 924 925 /* 926 * Issue I/O and install chain->data. Caller must hold a chain lock, lock 927 * may be of any type. 928 * 929 * Once chain->data is set it cannot be disposed of until all locks are 930 * released. 931 */ 932 void 933 hammer2_chain_load_data(hammer2_chain_t *chain) 934 { 935 hammer2_blockref_t *bref; 936 hammer2_dev_t *hmp; 937 hammer2_io_t *dio; 938 char *bdata; 939 int error; 940 941 /* 942 * Degenerate case, data already present. 943 */ 944 if (chain->data) 945 return; 946 TIMER(23); 947 948 hmp = chain->hmp; 949 KKASSERT(hmp != NULL); 950 951 /* 952 * Gain the IOINPROG bit, interlocked block. 953 */ 954 for (;;) { 955 u_int oflags; 956 u_int nflags; 957 958 oflags = chain->flags; 959 cpu_ccfence(); 960 if (oflags & HAMMER2_CHAIN_IOINPROG) { 961 nflags = oflags | HAMMER2_CHAIN_IOSIGNAL; 962 tsleep_interlock(&chain->flags, 0); 963 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 964 tsleep(&chain->flags, PINTERLOCKED, 965 "h2iocw", 0); 966 } 967 /* retry */ 968 } else { 969 nflags = oflags | HAMMER2_CHAIN_IOINPROG; 970 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 971 break; 972 } 973 /* retry */ 974 } 975 } 976 TIMER(24); 977 978 /* 979 * We own CHAIN_IOINPROG 980 * 981 * Degenerate case if we raced another load. 982 */ 983 if (chain->data) 984 goto done; 985 986 /* 987 * We must resolve to a device buffer, either by issuing I/O or 988 * by creating a zero-fill element. We do not mark the buffer 989 * dirty when creating a zero-fill element (the hammer2_chain_modify() 990 * API must still be used to do that). 991 * 992 * The device buffer is variable-sized in powers of 2 down 993 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 994 * chunk always contains buffers of the same size. (XXX) 995 * 996 * The minimum physical IO size may be larger than the variable 997 * block size. 998 */ 999 bref = &chain->bref; 1000 1001 /* 1002 * The getblk() optimization can only be used on newly created 1003 * elements if the physical block size matches the request. 1004 */ 1005 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1006 error = hammer2_io_new(hmp, bref->type, 1007 bref->data_off, chain->bytes, 1008 &chain->dio); 1009 } else { 1010 error = hammer2_io_bread(hmp, bref->type, 1011 bref->data_off, chain->bytes, 1012 &chain->dio); 1013 hammer2_adjreadcounter(&chain->bref, chain->bytes); 1014 } 1015 TIMER(25); 1016 if (error) { 1017 chain->error = HAMMER2_ERROR_IO; 1018 kprintf("hammer2_chain_lock: I/O error %016jx: %d\n", 1019 (intmax_t)bref->data_off, error); 1020 hammer2_io_bqrelse(&chain->dio); 1021 goto done; 1022 } 1023 chain->error = 0; 1024 1025 /* 1026 * This isn't perfect and can be ignored on OSs which do not have 1027 * an indication as to whether a buffer is coming from cache or 1028 * if I/O was actually issued for the read. TESTEDGOOD will work 1029 * pretty well without the B_IOISSUED logic because chains are 1030 * cached. 1031 * 1032 * If the underlying kernel buffer covers the entire chain we can 1033 * use the B_IOISSUED indication to determine if we have to re-run 1034 * the CRC on chain data for chains that managed to stay cached 1035 * across the kernel disposal of the original buffer. 1036 */ 1037 if ((dio = chain->dio) != NULL && dio->bp) { 1038 struct buf *bp = dio->bp; 1039 1040 if (dio->psize == chain->bytes && 1041 (bp->b_flags & B_IOISSUED)) { 1042 atomic_clear_int(&chain->flags, 1043 HAMMER2_CHAIN_TESTEDGOOD); 1044 bp->b_flags &= ~B_IOISSUED; 1045 } 1046 } 1047 1048 /* 1049 * NOTE: A locked chain's data cannot be modified without first 1050 * calling hammer2_chain_modify(). 1051 */ 1052 1053 /* 1054 * Clear INITIAL. In this case we used io_new() and the buffer has 1055 * been zero'd and marked dirty. 1056 */ 1057 bdata = hammer2_io_data(chain->dio, chain->bref.data_off); 1058 1059 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1060 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1061 chain->bref.flags |= HAMMER2_BREF_FLAG_ZERO; 1062 } else if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 1063 /* 1064 * check data not currently synchronized due to 1065 * modification. XXX assumes data stays in the buffer 1066 * cache, which might not be true (need biodep on flush 1067 * to calculate crc? or simple crc?). 1068 */ 1069 } else if ((chain->flags & HAMMER2_CHAIN_TESTEDGOOD) == 0) { 1070 TIMER(26); 1071 if (hammer2_chain_testcheck(chain, bdata) == 0) { 1072 chain->error = HAMMER2_ERROR_CHECK; 1073 } else { 1074 atomic_set_int(&chain->flags, HAMMER2_CHAIN_TESTEDGOOD); 1075 } 1076 } 1077 TIMER(27); 1078 1079 /* 1080 * Setup the data pointer, either pointing it to an embedded data 1081 * structure and copying the data from the buffer, or pointing it 1082 * into the buffer. 1083 * 1084 * The buffer is not retained when copying to an embedded data 1085 * structure in order to avoid potential deadlocks or recursions 1086 * on the same physical buffer. 1087 * 1088 * WARNING! Other threads can start using the data the instant we 1089 * set chain->data non-NULL. 1090 */ 1091 switch (bref->type) { 1092 case HAMMER2_BREF_TYPE_VOLUME: 1093 case HAMMER2_BREF_TYPE_FREEMAP: 1094 /* 1095 * Copy data from bp to embedded buffer 1096 */ 1097 panic("hammer2_chain_lock: called on unresolved volume header"); 1098 break; 1099 case HAMMER2_BREF_TYPE_INODE: 1100 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1101 case HAMMER2_BREF_TYPE_INDIRECT: 1102 case HAMMER2_BREF_TYPE_DATA: 1103 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1104 default: 1105 /* 1106 * Point data at the device buffer and leave dio intact. 1107 */ 1108 chain->data = (void *)bdata; 1109 break; 1110 } 1111 1112 /* 1113 * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested. 1114 */ 1115 done: 1116 for (;;) { 1117 u_int oflags; 1118 u_int nflags; 1119 1120 oflags = chain->flags; 1121 nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG | 1122 HAMMER2_CHAIN_IOSIGNAL); 1123 KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG); 1124 if (atomic_cmpset_int(&chain->flags, oflags, nflags)) { 1125 if (oflags & HAMMER2_CHAIN_IOSIGNAL) 1126 wakeup(&chain->flags); 1127 break; 1128 } 1129 } 1130 TIMER(28); 1131 } 1132 1133 /* 1134 * Unlock and deref a chain element. 1135 * 1136 * Remember that the presence of children under chain prevent the chain's 1137 * destruction but do not add additional references, so the dio will still 1138 * be dropped. 1139 */ 1140 void 1141 hammer2_chain_unlock(hammer2_chain_t *chain) 1142 { 1143 u_int lockcnt; 1144 1145 --curthread->td_tracker; 1146 /* 1147 * If multiple locks are present (or being attempted) on this 1148 * particular chain we can just unlock, drop refs, and return. 1149 * 1150 * Otherwise fall-through on the 1->0 transition. 1151 */ 1152 for (;;) { 1153 lockcnt = chain->lockcnt; 1154 KKASSERT(lockcnt > 0); 1155 cpu_ccfence(); 1156 if (lockcnt > 1) { 1157 if (atomic_cmpset_int(&chain->lockcnt, 1158 lockcnt, lockcnt - 1)) { 1159 hammer2_mtx_unlock(&chain->lock); 1160 return; 1161 } 1162 } else { 1163 if (atomic_cmpset_int(&chain->lockcnt, 1, 0)) 1164 break; 1165 } 1166 /* retry */ 1167 } 1168 1169 /* 1170 * Normally we want to disassociate the data on the last unlock, 1171 * but leave it intact if persist_refs is non-zero. The persist-data 1172 * user modifies persist_refs only while holding the chain locked 1173 * so there should be no race on the last unlock here. 1174 * 1175 * NOTE: If this was a shared lock we have to temporarily upgrade it 1176 * to prevent data load races. We can only do this non-blocking, 1177 * and unlock/relock-excl can deadlock. If the try fails it 1178 * means someone else got a shared or exclusive lock while we 1179 * we bandying about. 1180 */ 1181 if (chain->persist_refs == 0) { 1182 hammer2_io_t *dio; 1183 1184 if (hammer2_mtx_upgrade_try(&chain->lock) == 0 && 1185 chain->lockcnt == 0 && chain->persist_refs == 0) { 1186 dio = hammer2_chain_drop_data(chain, 0); 1187 if (dio) 1188 hammer2_io_bqrelse(&dio); 1189 } 1190 } 1191 hammer2_mtx_unlock(&chain->lock); 1192 } 1193 1194 /* 1195 * Unlock and hold chain data intact 1196 */ 1197 void 1198 hammer2_chain_unlock_hold(hammer2_chain_t *chain) 1199 { 1200 atomic_add_int(&chain->persist_refs, 1); 1201 hammer2_chain_unlock(chain); 1202 } 1203 1204 /* 1205 * Helper to obtain the blockref[] array base and count for a chain. 1206 * 1207 * XXX Not widely used yet, various use cases need to be validated and 1208 * converted to use this function. 1209 */ 1210 static 1211 hammer2_blockref_t * 1212 hammer2_chain_base_and_count(hammer2_chain_t *parent, int *countp) 1213 { 1214 hammer2_blockref_t *base; 1215 int count; 1216 1217 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1218 base = NULL; 1219 1220 switch(parent->bref.type) { 1221 case HAMMER2_BREF_TYPE_INODE: 1222 count = HAMMER2_SET_COUNT; 1223 break; 1224 case HAMMER2_BREF_TYPE_INDIRECT: 1225 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1226 count = parent->bytes / sizeof(hammer2_blockref_t); 1227 break; 1228 case HAMMER2_BREF_TYPE_VOLUME: 1229 count = HAMMER2_SET_COUNT; 1230 break; 1231 case HAMMER2_BREF_TYPE_FREEMAP: 1232 count = HAMMER2_SET_COUNT; 1233 break; 1234 default: 1235 panic("hammer2_chain_create_indirect: " 1236 "unrecognized blockref type: %d", 1237 parent->bref.type); 1238 count = 0; 1239 break; 1240 } 1241 } else { 1242 switch(parent->bref.type) { 1243 case HAMMER2_BREF_TYPE_INODE: 1244 base = &parent->data->ipdata.u.blockset.blockref[0]; 1245 count = HAMMER2_SET_COUNT; 1246 break; 1247 case HAMMER2_BREF_TYPE_INDIRECT: 1248 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1249 base = &parent->data->npdata[0]; 1250 count = parent->bytes / sizeof(hammer2_blockref_t); 1251 break; 1252 case HAMMER2_BREF_TYPE_VOLUME: 1253 base = &parent->data->voldata. 1254 sroot_blockset.blockref[0]; 1255 count = HAMMER2_SET_COUNT; 1256 break; 1257 case HAMMER2_BREF_TYPE_FREEMAP: 1258 base = &parent->data->blkset.blockref[0]; 1259 count = HAMMER2_SET_COUNT; 1260 break; 1261 default: 1262 panic("hammer2_chain_create_indirect: " 1263 "unrecognized blockref type: %d", 1264 parent->bref.type); 1265 count = 0; 1266 break; 1267 } 1268 } 1269 *countp = count; 1270 1271 return base; 1272 } 1273 1274 /* 1275 * This counts the number of live blockrefs in a block array and 1276 * also calculates the point at which all remaining blockrefs are empty. 1277 * This routine can only be called on a live chain. 1278 * 1279 * NOTE: Flag is not set until after the count is complete, allowing 1280 * callers to test the flag without holding the spinlock. 1281 * 1282 * NOTE: If base is NULL the related chain is still in the INITIAL 1283 * state and there are no blockrefs to count. 1284 * 1285 * NOTE: live_count may already have some counts accumulated due to 1286 * creation and deletion and could even be initially negative. 1287 */ 1288 void 1289 hammer2_chain_countbrefs(hammer2_chain_t *chain, 1290 hammer2_blockref_t *base, int count) 1291 { 1292 hammer2_spin_ex(&chain->core.spin); 1293 if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) { 1294 if (base) { 1295 while (--count >= 0) { 1296 if (base[count].type) 1297 break; 1298 } 1299 chain->core.live_zero = count + 1; 1300 while (count >= 0) { 1301 if (base[count].type) 1302 atomic_add_int(&chain->core.live_count, 1303 1); 1304 --count; 1305 } 1306 } else { 1307 chain->core.live_zero = 0; 1308 } 1309 /* else do not modify live_count */ 1310 atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS); 1311 } 1312 hammer2_spin_unex(&chain->core.spin); 1313 } 1314 1315 /* 1316 * Resize the chain's physical storage allocation in-place. This function does 1317 * not adjust the data pointer and must be followed by (typically) a 1318 * hammer2_chain_modify() call to copy any old data over and adjust the 1319 * data pointer. 1320 * 1321 * Chains can be resized smaller without reallocating the storage. Resizing 1322 * larger will reallocate the storage. Excess or prior storage is reclaimed 1323 * asynchronously at a later time. 1324 * 1325 * Must be passed an exclusively locked parent and chain. 1326 * 1327 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order 1328 * to avoid instantiating a device buffer that conflicts with the vnode data 1329 * buffer. However, because H2 can compress or encrypt data, the chain may 1330 * have a dio assigned to it in those situations, and they do not conflict. 1331 * 1332 * XXX return error if cannot resize. 1333 */ 1334 void 1335 hammer2_chain_resize(hammer2_inode_t *ip, 1336 hammer2_chain_t *parent, hammer2_chain_t *chain, 1337 hammer2_tid_t mtid, hammer2_off_t dedup_off, 1338 int nradix, int flags) 1339 { 1340 hammer2_dev_t *hmp; 1341 size_t obytes; 1342 size_t nbytes; 1343 1344 hmp = chain->hmp; 1345 1346 /* 1347 * Only data and indirect blocks can be resized for now. 1348 * (The volu root, inodes, and freemap elements use a fixed size). 1349 */ 1350 KKASSERT(chain != &hmp->vchain); 1351 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA || 1352 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT); 1353 KKASSERT(chain->parent == parent); 1354 1355 /* 1356 * Nothing to do if the element is already the proper size 1357 */ 1358 obytes = chain->bytes; 1359 nbytes = 1U << nradix; 1360 if (obytes == nbytes) 1361 return; 1362 1363 /* 1364 * Make sure the old data is instantiated so we can copy it. If this 1365 * is a data block, the device data may be superfluous since the data 1366 * might be in a logical block, but compressed or encrypted data is 1367 * another matter. 1368 * 1369 * NOTE: The modify will set BMAPUPD for us if BMAPPED is set. 1370 */ 1371 hammer2_chain_modify(chain, mtid, dedup_off, 0); 1372 1373 /* 1374 * Relocate the block, even if making it smaller (because different 1375 * block sizes may be in different regions). 1376 * 1377 * (data blocks only, we aren't copying the storage here). 1378 */ 1379 hammer2_freemap_alloc(chain, nbytes); 1380 chain->bytes = nbytes; 1381 /*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */ 1382 1383 /* 1384 * We don't want the followup chain_modify() to try to copy data 1385 * from the old (wrong-sized) buffer. It won't know how much to 1386 * copy. This case should only occur during writes when the 1387 * originator already has the data to write in-hand. 1388 */ 1389 if (chain->dio) { 1390 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA); 1391 hammer2_io_brelse(&chain->dio); 1392 chain->data = NULL; 1393 } 1394 } 1395 1396 /* 1397 * Helper for chains already flagged as MODIFIED. A new allocation may 1398 * still be required if the existing one has already been used in a de-dup. 1399 */ 1400 static __inline 1401 int 1402 modified_needs_new_allocation(hammer2_chain_t *chain) 1403 { 1404 hammer2_io_t *dio; 1405 1406 /* 1407 * We only live-dedup data, we do not live-dedup meta-data. 1408 */ 1409 if (chain->bref.type != HAMMER2_BREF_TYPE_DATA) 1410 return 0; 1411 1412 /* 1413 * If this flag is not set the current modification has not been 1414 * recorded for dedup so a new allocation is not needed. The 1415 * recording occurs when dirty file data is flushed from the frontend 1416 * to the backend. 1417 */ 1418 if (chain->flags & HAMMER2_CHAIN_DEDUP) 1419 return 1; 1420 1421 /* 1422 * If the DEDUP flag is set we have one final line of defense to 1423 * allow re-use of a modified buffer, and that is if the DIO_INVALOK 1424 * flag is still set on the underlying DIO. This flag is only set 1425 * for hammer2_io_new() buffers which cover the whole buffer (64KB), 1426 * and is cleared when a dedup operation actually decides to use 1427 * the buffer. 1428 */ 1429 1430 if ((dio = chain->dio) != NULL) { 1431 if (dio->refs & HAMMER2_DIO_INVALOK) 1432 return 0; 1433 } else { 1434 dio = hammer2_io_getquick(chain->hmp, chain->bref.data_off, 1435 chain->bytes); 1436 if (dio) { 1437 if (dio->refs & HAMMER2_DIO_INVALOK) { 1438 hammer2_io_putblk(&dio); 1439 return 0; 1440 } 1441 hammer2_io_putblk(&dio); 1442 } 1443 } 1444 return 1; 1445 } 1446 1447 /* 1448 * Set the chain modified so its data can be changed by the caller. 1449 * 1450 * Sets bref.modify_tid to mtid only if mtid != 0. Note that bref.modify_tid 1451 * is a CLC (cluster level change) field and is not updated by parent 1452 * propagation during a flush. 1453 * 1454 * If the caller passes a non-zero dedup_off we assign data_off to that 1455 * instead of allocating a ne block. Caller must not modify the data already 1456 * present at the target offset. 1457 */ 1458 void 1459 hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid, 1460 hammer2_off_t dedup_off, int flags) 1461 { 1462 hammer2_blockref_t obref; 1463 hammer2_dev_t *hmp; 1464 hammer2_io_t *dio; 1465 int error; 1466 int wasinitial; 1467 int newmod; 1468 char *bdata; 1469 1470 hmp = chain->hmp; 1471 obref = chain->bref; 1472 KKASSERT((chain->flags & HAMMER2_CHAIN_FICTITIOUS) == 0); 1473 1474 /* 1475 * Data is not optional for freemap chains (we must always be sure 1476 * to copy the data on COW storage allocations). 1477 */ 1478 if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 1479 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 1480 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) || 1481 (flags & HAMMER2_MODIFY_OPTDATA) == 0); 1482 } 1483 1484 /* 1485 * Data must be resolved if already assigned, unless explicitly 1486 * flagged otherwise. 1487 */ 1488 if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 && 1489 (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { 1490 hammer2_chain_load_data(chain); 1491 } 1492 1493 /* 1494 * Set MODIFIED to indicate that the chain has been modified. 1495 * Set UPDATE to ensure that the blockref is updated in the parent. 1496 * 1497 * If MODIFIED is already set determine if we can reuse the assigned 1498 * data block or if we need a new data block. The assigned data block 1499 * can be reused if HAMMER2_DIO_INVALOK is set on the dio. 1500 */ 1501 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) && 1502 modified_needs_new_allocation(chain)) { 1503 newmod = 1; 1504 } else if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { 1505 /* 1506 * Must set modified bit. 1507 */ 1508 atomic_add_long(&hammer2_count_modified_chains, 1); 1509 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 1510 hammer2_pfs_memory_inc(chain->pmp); /* can be NULL */ 1511 1512 /* 1513 * We may be able to avoid a copy-on-write if the chain's 1514 * check mode is set to NONE and the chain's current 1515 * modify_tid is beyond the last explicit snapshot tid. 1516 * 1517 * This implements HAMMER2's overwrite-in-place feature. 1518 * 1519 * NOTE! This data-block cannot be used as a de-duplication 1520 * source when the check mode is set to NONE. 1521 */ 1522 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA && 1523 (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 && 1524 HAMMER2_DEC_CHECK(chain->bref.methods) == 1525 HAMMER2_CHECK_NONE && 1526 chain->pmp && 1527 chain->bref.modify_tid > 1528 chain->pmp->iroot->meta.pfs_lsnap_tid && 1529 modified_needs_new_allocation(chain) == 0) { 1530 /* 1531 * Sector overwrite allowed. 1532 */ 1533 newmod = 0; 1534 } else { 1535 /* 1536 * Sector overwrite not allowed, must copy-on-write. 1537 */ 1538 newmod = 1; 1539 } 1540 } else { 1541 /* 1542 * Already flagged modified, no new allocation is needed. 1543 */ 1544 newmod = 0; 1545 } 1546 1547 /* 1548 * Flag parent update required. 1549 */ 1550 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) 1551 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1552 1553 /* 1554 * The modification or re-modification requires an allocation and 1555 * possible COW. 1556 * 1557 * If dedup_off is non-zero, caller already has a data offset 1558 * containing the caller's desired data. The dedup offset is 1559 * allowed to be in a partially free state and we must be sure 1560 * to reset it to a fully allocated state to force two bulkfree 1561 * passes to free it again. 1562 * 1563 * XXX can a chain already be marked MODIFIED without a data 1564 * assignment? If not, assert here instead of testing the case. 1565 */ 1566 if (chain != &hmp->vchain && chain != &hmp->fchain) { 1567 if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 || 1568 newmod 1569 ) { 1570 if (dedup_off) { 1571 chain->bref.data_off = dedup_off; 1572 chain->bytes = 1 << (dedup_off & 1573 HAMMER2_OFF_MASK_RADIX); 1574 atomic_set_int(&chain->flags, 1575 HAMMER2_CHAIN_DEDUP); 1576 hammer2_freemap_adjust(hmp, &chain->bref, 1577 HAMMER2_FREEMAP_DORECOVER); 1578 } else { 1579 hammer2_freemap_alloc(chain, chain->bytes); 1580 atomic_clear_int(&chain->flags, 1581 HAMMER2_CHAIN_DEDUP); 1582 } 1583 /* XXX failed allocation */ 1584 } 1585 } 1586 1587 /* 1588 * Update mirror_tid and modify_tid. modify_tid is only updated 1589 * if not passed as zero (during flushes, parent propagation passes 1590 * the value 0). 1591 * 1592 * NOTE: chain->pmp could be the device spmp. 1593 */ 1594 chain->bref.mirror_tid = hmp->voldata.mirror_tid + 1; 1595 if (mtid) 1596 chain->bref.modify_tid = mtid; 1597 1598 /* 1599 * Set BMAPUPD to tell the flush code that an existing blockmap entry 1600 * requires updating as well as to tell the delete code that the 1601 * chain's blockref might not exactly match (in terms of physical size 1602 * or block offset) the one in the parent's blocktable. The base key 1603 * of course will still match. 1604 */ 1605 if (chain->flags & HAMMER2_CHAIN_BMAPPED) 1606 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD); 1607 1608 /* 1609 * Short-cut data blocks which the caller does not need an actual 1610 * data reference to (aka OPTDATA), as long as the chain does not 1611 * already have a data pointer to the data. This generally means 1612 * that the modifications are being done via the logical buffer cache. 1613 * The INITIAL flag relates only to the device data buffer and thus 1614 * remains unchange in this situation. 1615 */ 1616 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA && 1617 (flags & HAMMER2_MODIFY_OPTDATA) && 1618 chain->data == NULL) { 1619 KKASSERT(chain->dio == NULL); 1620 goto skip2; 1621 } 1622 1623 /* 1624 * Clearing the INITIAL flag (for indirect blocks) indicates that 1625 * we've processed the uninitialized storage allocation. 1626 * 1627 * If this flag is already clear we are likely in a copy-on-write 1628 * situation but we have to be sure NOT to bzero the storage if 1629 * no data is present. 1630 */ 1631 if (chain->flags & HAMMER2_CHAIN_INITIAL) { 1632 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 1633 wasinitial = 1; 1634 } else { 1635 wasinitial = 0; 1636 } 1637 1638 /* 1639 * Instantiate data buffer and possibly execute COW operation 1640 */ 1641 switch(chain->bref.type) { 1642 case HAMMER2_BREF_TYPE_VOLUME: 1643 case HAMMER2_BREF_TYPE_FREEMAP: 1644 /* 1645 * The data is embedded, no copy-on-write operation is 1646 * needed. 1647 */ 1648 KKASSERT(chain->dio == NULL); 1649 break; 1650 case HAMMER2_BREF_TYPE_INODE: 1651 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1652 case HAMMER2_BREF_TYPE_DATA: 1653 case HAMMER2_BREF_TYPE_INDIRECT: 1654 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1655 /* 1656 * Perform the copy-on-write operation 1657 * 1658 * zero-fill or copy-on-write depending on whether 1659 * chain->data exists or not and set the dirty state for 1660 * the new buffer. hammer2_io_new() will handle the 1661 * zero-fill. 1662 * 1663 * If a dedup_off was supplied this is an existing block 1664 * and no COW, copy, or further modification is required. 1665 */ 1666 KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain); 1667 1668 if (wasinitial && dedup_off == 0) { 1669 error = hammer2_io_new(hmp, chain->bref.type, 1670 chain->bref.data_off, 1671 chain->bytes, &dio); 1672 } else { 1673 error = hammer2_io_bread(hmp, chain->bref.type, 1674 chain->bref.data_off, 1675 chain->bytes, &dio); 1676 } 1677 hammer2_adjreadcounter(&chain->bref, chain->bytes); 1678 1679 /* 1680 * If an I/O error occurs make sure callers cannot accidently 1681 * modify the old buffer's contents and corrupt the filesystem. 1682 */ 1683 if (error) { 1684 kprintf("hammer2_chain_modify: hmp=%p I/O error\n", 1685 hmp); 1686 chain->error = HAMMER2_ERROR_IO; 1687 hammer2_io_brelse(&dio); 1688 hammer2_io_brelse(&chain->dio); 1689 chain->data = NULL; 1690 break; 1691 } 1692 chain->error = 0; 1693 bdata = hammer2_io_data(dio, chain->bref.data_off); 1694 1695 if (chain->data) { 1696 /* 1697 * COW (unless a dedup). 1698 */ 1699 KKASSERT(chain->dio != NULL); 1700 if (chain->data != (void *)bdata && dedup_off == 0) { 1701 bcopy(chain->data, bdata, chain->bytes); 1702 } 1703 } else if (wasinitial == 0) { 1704 /* 1705 * We have a problem. We were asked to COW but 1706 * we don't have any data to COW with! 1707 */ 1708 panic("hammer2_chain_modify: having a COW %p\n", 1709 chain); 1710 } 1711 1712 /* 1713 * Retire the old buffer, replace with the new. Dirty or 1714 * redirty the new buffer. 1715 * 1716 * WARNING! The system buffer cache may have already flushed 1717 * the buffer, so we must be sure to [re]dirty it 1718 * for further modification. 1719 * 1720 * If dedup_off was supplied, the caller is not 1721 * expected to make any further modification to the 1722 * buffer. 1723 */ 1724 if (chain->dio) 1725 hammer2_io_bqrelse(&chain->dio); 1726 chain->data = (void *)bdata; 1727 chain->dio = dio; 1728 if (dedup_off == 0) 1729 hammer2_io_setdirty(dio); 1730 break; 1731 default: 1732 panic("hammer2_chain_modify: illegal non-embedded type %d", 1733 chain->bref.type); 1734 break; 1735 1736 } 1737 skip2: 1738 /* 1739 * setflush on parent indicating that the parent must recurse down 1740 * to us. Do not call on chain itself which might already have it 1741 * set. 1742 */ 1743 if (chain->parent) 1744 hammer2_chain_setflush(chain->parent); 1745 } 1746 1747 /* 1748 * Modify the chain associated with an inode. 1749 */ 1750 void 1751 hammer2_chain_modify_ip(hammer2_inode_t *ip, hammer2_chain_t *chain, 1752 hammer2_tid_t mtid, int flags) 1753 { 1754 hammer2_inode_modify(ip); 1755 hammer2_chain_modify(chain, mtid, 0, flags); 1756 } 1757 1758 /* 1759 * Volume header data locks 1760 */ 1761 void 1762 hammer2_voldata_lock(hammer2_dev_t *hmp) 1763 { 1764 lockmgr(&hmp->vollk, LK_EXCLUSIVE); 1765 } 1766 1767 void 1768 hammer2_voldata_unlock(hammer2_dev_t *hmp) 1769 { 1770 lockmgr(&hmp->vollk, LK_RELEASE); 1771 } 1772 1773 void 1774 hammer2_voldata_modify(hammer2_dev_t *hmp) 1775 { 1776 if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) { 1777 atomic_add_long(&hammer2_count_modified_chains, 1); 1778 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED); 1779 hammer2_pfs_memory_inc(hmp->vchain.pmp); 1780 } 1781 } 1782 1783 /* 1784 * This function returns the chain at the nearest key within the specified 1785 * range. The returned chain will be referenced but not locked. 1786 * 1787 * This function will recurse through chain->rbtree as necessary and will 1788 * return a *key_nextp suitable for iteration. *key_nextp is only set if 1789 * the iteration value is less than the current value of *key_nextp. 1790 * 1791 * The caller should use (*key_nextp) to calculate the actual range of 1792 * the returned element, which will be (key_beg to *key_nextp - 1), because 1793 * there might be another element which is superior to the returned element 1794 * and overlaps it. 1795 * 1796 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL 1797 * chains continue to be returned. On EOF (*key_nextp) may overflow since 1798 * it will wind up being (key_end + 1). 1799 * 1800 * WARNING! Must be called with child's spinlock held. Spinlock remains 1801 * held through the operation. 1802 */ 1803 struct hammer2_chain_find_info { 1804 hammer2_chain_t *best; 1805 hammer2_key_t key_beg; 1806 hammer2_key_t key_end; 1807 hammer2_key_t key_next; 1808 }; 1809 1810 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data); 1811 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data); 1812 1813 static 1814 hammer2_chain_t * 1815 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp, 1816 hammer2_key_t key_beg, hammer2_key_t key_end) 1817 { 1818 struct hammer2_chain_find_info info; 1819 1820 info.best = NULL; 1821 info.key_beg = key_beg; 1822 info.key_end = key_end; 1823 info.key_next = *key_nextp; 1824 1825 RB_SCAN(hammer2_chain_tree, &parent->core.rbtree, 1826 hammer2_chain_find_cmp, hammer2_chain_find_callback, 1827 &info); 1828 *key_nextp = info.key_next; 1829 #if 0 1830 kprintf("chain_find %p %016jx:%016jx next=%016jx\n", 1831 parent, key_beg, key_end, *key_nextp); 1832 #endif 1833 1834 return (info.best); 1835 } 1836 1837 static 1838 int 1839 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data) 1840 { 1841 struct hammer2_chain_find_info *info = data; 1842 hammer2_key_t child_beg; 1843 hammer2_key_t child_end; 1844 1845 child_beg = child->bref.key; 1846 child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1; 1847 1848 if (child_end < info->key_beg) 1849 return(-1); 1850 if (child_beg > info->key_end) 1851 return(1); 1852 return(0); 1853 } 1854 1855 static 1856 int 1857 hammer2_chain_find_callback(hammer2_chain_t *child, void *data) 1858 { 1859 struct hammer2_chain_find_info *info = data; 1860 hammer2_chain_t *best; 1861 hammer2_key_t child_end; 1862 1863 /* 1864 * WARNING! Layerq is scanned forwards, exact matches should keep 1865 * the existing info->best. 1866 */ 1867 if ((best = info->best) == NULL) { 1868 /* 1869 * No previous best. Assign best 1870 */ 1871 info->best = child; 1872 } else if (best->bref.key <= info->key_beg && 1873 child->bref.key <= info->key_beg) { 1874 /* 1875 * Illegal overlap. 1876 */ 1877 KKASSERT(0); 1878 /*info->best = child;*/ 1879 } else if (child->bref.key < best->bref.key) { 1880 /* 1881 * Child has a nearer key and best is not flush with key_beg. 1882 * Set best to child. Truncate key_next to the old best key. 1883 */ 1884 info->best = child; 1885 if (info->key_next > best->bref.key || info->key_next == 0) 1886 info->key_next = best->bref.key; 1887 } else if (child->bref.key == best->bref.key) { 1888 /* 1889 * If our current best is flush with the child then this 1890 * is an illegal overlap. 1891 * 1892 * key_next will automatically be limited to the smaller of 1893 * the two end-points. 1894 */ 1895 KKASSERT(0); 1896 info->best = child; 1897 } else { 1898 /* 1899 * Keep the current best but truncate key_next to the child's 1900 * base. 1901 * 1902 * key_next will also automatically be limited to the smaller 1903 * of the two end-points (probably not necessary for this case 1904 * but we do it anyway). 1905 */ 1906 if (info->key_next > child->bref.key || info->key_next == 0) 1907 info->key_next = child->bref.key; 1908 } 1909 1910 /* 1911 * Always truncate key_next based on child's end-of-range. 1912 */ 1913 child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits); 1914 if (child_end && (info->key_next > child_end || info->key_next == 0)) 1915 info->key_next = child_end; 1916 1917 return(0); 1918 } 1919 1920 /* 1921 * Retrieve the specified chain from a media blockref, creating the 1922 * in-memory chain structure which reflects it. 1923 * 1924 * To handle insertion races pass the INSERT_RACE flag along with the 1925 * generation number of the core. NULL will be returned if the generation 1926 * number changes before we have a chance to insert the chain. Insert 1927 * races can occur because the parent might be held shared. 1928 * 1929 * Caller must hold the parent locked shared or exclusive since we may 1930 * need the parent's bref array to find our block. 1931 * 1932 * WARNING! chain->pmp is always set to NULL for any chain representing 1933 * part of the super-root topology. 1934 */ 1935 hammer2_chain_t * 1936 hammer2_chain_get(hammer2_chain_t *parent, int generation, 1937 hammer2_blockref_t *bref) 1938 { 1939 hammer2_dev_t *hmp = parent->hmp; 1940 hammer2_chain_t *chain; 1941 int error; 1942 1943 /* 1944 * Allocate a chain structure representing the existing media 1945 * entry. Resulting chain has one ref and is not locked. 1946 */ 1947 if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT) 1948 chain = hammer2_chain_alloc(hmp, NULL, bref); 1949 else 1950 chain = hammer2_chain_alloc(hmp, parent->pmp, bref); 1951 /* ref'd chain returned */ 1952 1953 /* 1954 * Flag that the chain is in the parent's blockmap so delete/flush 1955 * knows what to do with it. 1956 */ 1957 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED); 1958 1959 /* 1960 * Link the chain into its parent. A spinlock is required to safely 1961 * access the RBTREE, and it is possible to collide with another 1962 * hammer2_chain_get() operation because the caller might only hold 1963 * a shared lock on the parent. 1964 * 1965 * NOTE: Get races can occur quite often when we distribute 1966 * asynchronous read-aheads across multiple threads. 1967 */ 1968 KKASSERT(parent->refs > 0); 1969 error = hammer2_chain_insert(parent, chain, 1970 HAMMER2_CHAIN_INSERT_SPIN | 1971 HAMMER2_CHAIN_INSERT_RACE, 1972 generation); 1973 if (error) { 1974 KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 1975 /*kprintf("chain %p get race\n", chain);*/ 1976 hammer2_chain_drop(chain); 1977 chain = NULL; 1978 } else { 1979 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 1980 } 1981 1982 /* 1983 * Return our new chain referenced but not locked, or NULL if 1984 * a race occurred. 1985 */ 1986 return (chain); 1987 } 1988 1989 /* 1990 * Lookup initialization/completion API 1991 */ 1992 hammer2_chain_t * 1993 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags) 1994 { 1995 hammer2_chain_ref(parent); 1996 if (flags & HAMMER2_LOOKUP_SHARED) { 1997 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS | 1998 HAMMER2_RESOLVE_SHARED); 1999 } else { 2000 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 2001 } 2002 return (parent); 2003 } 2004 2005 void 2006 hammer2_chain_lookup_done(hammer2_chain_t *parent) 2007 { 2008 if (parent) { 2009 hammer2_chain_unlock(parent); 2010 hammer2_chain_drop(parent); 2011 } 2012 } 2013 2014 hammer2_chain_t * 2015 hammer2_chain_getparent(hammer2_chain_t **parentp, int how) 2016 { 2017 hammer2_chain_t *oparent; 2018 hammer2_chain_t *nparent; 2019 2020 /* 2021 * Be careful of order, oparent must be unlocked before nparent 2022 * is locked below to avoid a deadlock. 2023 */ 2024 oparent = *parentp; 2025 hammer2_spin_ex(&oparent->core.spin); 2026 nparent = oparent->parent; 2027 if (nparent == NULL) { 2028 hammer2_spin_unex(&oparent->core.spin); 2029 panic("hammer2_chain_getparent: no parent"); 2030 } 2031 hammer2_chain_ref(nparent); 2032 hammer2_spin_unex(&oparent->core.spin); 2033 if (oparent) { 2034 hammer2_chain_unlock(oparent); 2035 hammer2_chain_drop(oparent); 2036 oparent = NULL; 2037 } 2038 2039 hammer2_chain_lock(nparent, how); 2040 *parentp = nparent; 2041 2042 return (nparent); 2043 } 2044 2045 /* 2046 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 2047 * (*parentp) typically points to an inode but can also point to a related 2048 * indirect block and this function will recurse upwards and find the inode 2049 * again. 2050 * 2051 * (*parentp) must be exclusively locked and referenced and can be an inode 2052 * or an existing indirect block within the inode. 2053 * 2054 * On return (*parentp) will be modified to point at the deepest parent chain 2055 * element encountered during the search, as a helper for an insertion or 2056 * deletion. The new (*parentp) will be locked and referenced and the old 2057 * will be unlocked and dereferenced (no change if they are both the same). 2058 * 2059 * The matching chain will be returned exclusively locked. If NOLOCK is 2060 * requested the chain will be returned only referenced. Note that the 2061 * parent chain must always be locked shared or exclusive, matching the 2062 * HAMMER2_LOOKUP_SHARED flag. We can conceivably lock it SHARED temporarily 2063 * when NOLOCK is specified but that complicates matters if *parentp must 2064 * inherit the chain. 2065 * 2066 * NOLOCK also implies NODATA, since an unlocked chain usually has a NULL 2067 * data pointer or can otherwise be in flux. 2068 * 2069 * NULL is returned if no match was found, but (*parentp) will still 2070 * potentially be adjusted. 2071 * 2072 * If a fatal error occurs (typically an I/O error), a dummy chain is 2073 * returned with chain->error and error-identifying information set. This 2074 * chain will assert if you try to do anything fancy with it. 2075 * 2076 * XXX Depending on where the error occurs we should allow continued iteration. 2077 * 2078 * On return (*key_nextp) will point to an iterative value for key_beg. 2079 * (If NULL is returned (*key_nextp) is set to (key_end + 1)). 2080 * 2081 * This function will also recurse up the chain if the key is not within the 2082 * current parent's range. (*parentp) can never be set to NULL. An iteration 2083 * can simply allow (*parentp) to float inside the loop. 2084 * 2085 * NOTE! chain->data is not always resolved. By default it will not be 2086 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 2087 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 2088 * BREF_TYPE_DATA as the device buffer can alias the logical file 2089 * buffer). 2090 */ 2091 2092 hammer2_chain_t * 2093 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp, 2094 hammer2_key_t key_beg, hammer2_key_t key_end, 2095 int *cache_indexp, int flags) 2096 { 2097 hammer2_dev_t *hmp; 2098 hammer2_chain_t *parent; 2099 hammer2_chain_t *chain; 2100 hammer2_blockref_t *base; 2101 hammer2_blockref_t *bref; 2102 hammer2_blockref_t bcopy; 2103 hammer2_key_t scan_beg; 2104 hammer2_key_t scan_end; 2105 int count = 0; 2106 int how_always = HAMMER2_RESOLVE_ALWAYS; 2107 int how_maybe = HAMMER2_RESOLVE_MAYBE; 2108 int how; 2109 int generation; 2110 int maxloops = 300000; 2111 2112 TIMER(8); 2113 2114 if (flags & HAMMER2_LOOKUP_ALWAYS) { 2115 how_maybe = how_always; 2116 how = HAMMER2_RESOLVE_ALWAYS; 2117 } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) { 2118 how = HAMMER2_RESOLVE_NEVER; 2119 } else { 2120 how = HAMMER2_RESOLVE_MAYBE; 2121 } 2122 if (flags & HAMMER2_LOOKUP_SHARED) { 2123 how_maybe |= HAMMER2_RESOLVE_SHARED; 2124 how_always |= HAMMER2_RESOLVE_SHARED; 2125 how |= HAMMER2_RESOLVE_SHARED; 2126 } 2127 2128 /* 2129 * Recurse (*parentp) upward if necessary until the parent completely 2130 * encloses the key range or we hit the inode. 2131 * 2132 * Handle races against the flusher deleting indirect nodes on its 2133 * way back up by continuing to recurse upward past the deletion. 2134 */ 2135 parent = *parentp; 2136 hmp = parent->hmp; 2137 2138 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2139 parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2140 scan_beg = parent->bref.key; 2141 scan_end = scan_beg + 2142 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 2143 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT || 2144 (parent->flags & HAMMER2_CHAIN_DELETED) == 0) { 2145 if (key_beg >= scan_beg && key_end <= scan_end) 2146 break; 2147 } 2148 parent = hammer2_chain_getparent(parentp, how_maybe); 2149 } 2150 again: 2151 2152 TIMER(9); 2153 if (--maxloops == 0) 2154 panic("hammer2_chain_lookup: maxloops"); 2155 /* 2156 * Locate the blockref array. Currently we do a fully associative 2157 * search through the array. 2158 */ 2159 switch(parent->bref.type) { 2160 case HAMMER2_BREF_TYPE_INODE: 2161 /* 2162 * Special shortcut for embedded data returns the inode 2163 * itself. Callers must detect this condition and access 2164 * the embedded data (the strategy code does this for us). 2165 * 2166 * This is only applicable to regular files and softlinks. 2167 * 2168 * We need a second lock on parent. Since we already have 2169 * a lock we must pass LOCKAGAIN to prevent unexpected 2170 * blocking (we don't want to block on a second shared 2171 * ref if an exclusive lock is pending) 2172 */ 2173 if (parent->data->ipdata.meta.op_flags & 2174 HAMMER2_OPFLAG_DIRECTDATA) { 2175 if (flags & HAMMER2_LOOKUP_NODIRECT) { 2176 chain = NULL; 2177 *key_nextp = key_end + 1; 2178 goto done; 2179 } 2180 hammer2_chain_ref(parent); 2181 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) 2182 hammer2_chain_lock(parent, 2183 how_always | 2184 HAMMER2_RESOLVE_LOCKAGAIN); 2185 *key_nextp = key_end + 1; 2186 return (parent); 2187 } 2188 base = &parent->data->ipdata.u.blockset.blockref[0]; 2189 count = HAMMER2_SET_COUNT; 2190 break; 2191 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2192 case HAMMER2_BREF_TYPE_INDIRECT: 2193 /* 2194 * Handle MATCHIND on the parent 2195 */ 2196 if (flags & HAMMER2_LOOKUP_MATCHIND) { 2197 scan_beg = parent->bref.key; 2198 scan_end = scan_beg + 2199 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 2200 if (key_beg == scan_beg && key_end == scan_end) { 2201 chain = parent; 2202 hammer2_chain_ref(chain); 2203 hammer2_chain_lock(chain, how_maybe); 2204 *key_nextp = scan_end + 1; 2205 goto done; 2206 } 2207 } 2208 2209 /* 2210 * Optimize indirect blocks in the INITIAL state to avoid 2211 * I/O. 2212 */ 2213 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2214 base = NULL; 2215 } else { 2216 if (parent->data == NULL) { 2217 kprintf("parent->data is NULL %p\n", parent); 2218 while (1) 2219 tsleep(parent, 0, "xxx", 0); 2220 } 2221 base = &parent->data->npdata[0]; 2222 } 2223 count = parent->bytes / sizeof(hammer2_blockref_t); 2224 break; 2225 case HAMMER2_BREF_TYPE_VOLUME: 2226 base = &parent->data->voldata.sroot_blockset.blockref[0]; 2227 count = HAMMER2_SET_COUNT; 2228 break; 2229 case HAMMER2_BREF_TYPE_FREEMAP: 2230 base = &parent->data->blkset.blockref[0]; 2231 count = HAMMER2_SET_COUNT; 2232 break; 2233 default: 2234 kprintf("hammer2_chain_lookup: unrecognized " 2235 "blockref(B) type: %d", 2236 parent->bref.type); 2237 while (1) 2238 tsleep(&base, 0, "dead", 0); 2239 panic("hammer2_chain_lookup: unrecognized " 2240 "blockref(B) type: %d", 2241 parent->bref.type); 2242 base = NULL; /* safety */ 2243 count = 0; /* safety */ 2244 } 2245 TIMER(10); 2246 2247 /* 2248 * Merged scan to find next candidate. 2249 * 2250 * hammer2_base_*() functions require the parent->core.live_* fields 2251 * to be synchronized. 2252 * 2253 * We need to hold the spinlock to access the block array and RB tree 2254 * and to interlock chain creation. 2255 */ 2256 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 2257 hammer2_chain_countbrefs(parent, base, count); 2258 2259 TIMER(11); 2260 2261 /* 2262 * Combined search 2263 */ 2264 hammer2_spin_ex(&parent->core.spin); 2265 chain = hammer2_combined_find(parent, base, count, 2266 cache_indexp, key_nextp, 2267 key_beg, key_end, 2268 &bref); 2269 generation = parent->core.generation; 2270 2271 TIMER(12); 2272 2273 /* 2274 * Exhausted parent chain, iterate. 2275 */ 2276 if (bref == NULL) { 2277 TIMER(13); 2278 hammer2_spin_unex(&parent->core.spin); 2279 if (key_beg == key_end) /* short cut single-key case */ 2280 return (NULL); 2281 2282 /* 2283 * Stop if we reached the end of the iteration. 2284 */ 2285 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 2286 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2287 return (NULL); 2288 } 2289 2290 /* 2291 * Calculate next key, stop if we reached the end of the 2292 * iteration, otherwise go up one level and loop. 2293 */ 2294 key_beg = parent->bref.key + 2295 ((hammer2_key_t)1 << parent->bref.keybits); 2296 if (key_beg == 0 || key_beg > key_end) 2297 return (NULL); 2298 parent = hammer2_chain_getparent(parentp, how_maybe); 2299 goto again; 2300 } 2301 2302 /* 2303 * Selected from blockref or in-memory chain. 2304 */ 2305 if (chain == NULL) { 2306 TIMER(14); 2307 bcopy = *bref; 2308 hammer2_spin_unex(&parent->core.spin); 2309 chain = hammer2_chain_get(parent, generation, 2310 &bcopy); 2311 if (chain == NULL) { 2312 /* 2313 kprintf("retry lookup parent %p keys %016jx:%016jx\n", 2314 parent, key_beg, key_end); 2315 */ 2316 goto again; 2317 } 2318 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 2319 hammer2_chain_drop(chain); 2320 goto again; 2321 } 2322 } else { 2323 TIMER(15); 2324 hammer2_chain_ref(chain); 2325 hammer2_spin_unex(&parent->core.spin); 2326 } 2327 2328 TIMER(16); 2329 /* 2330 * chain is referenced but not locked. We must lock the chain 2331 * to obtain definitive state. 2332 */ 2333 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2334 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2335 hammer2_chain_lock(chain, how_maybe); 2336 } else { 2337 hammer2_chain_lock(chain, how); 2338 } 2339 KKASSERT(chain->parent == parent); 2340 TIMER(17); 2341 2342 /* 2343 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 2344 * 2345 * NOTE: Chain's key range is not relevant as there might be 2346 * one-offs within the range that are not deleted. 2347 * 2348 * NOTE: Lookups can race delete-duplicate because 2349 * delete-duplicate does not lock the parent's core 2350 * (they just use the spinlock on the core). 2351 */ 2352 if (chain->flags & HAMMER2_CHAIN_DELETED) { 2353 kprintf("skip deleted chain %016jx.%02x key=%016jx\n", 2354 chain->bref.data_off, chain->bref.type, 2355 chain->bref.key); 2356 hammer2_chain_unlock(chain); 2357 hammer2_chain_drop(chain); 2358 key_beg = *key_nextp; 2359 if (key_beg == 0 || key_beg > key_end) 2360 return(NULL); 2361 goto again; 2362 } 2363 TIMER(18); 2364 2365 /* 2366 * If the chain element is an indirect block it becomes the new 2367 * parent and we loop on it. We must maintain our top-down locks 2368 * to prevent the flusher from interfering (i.e. doing a 2369 * delete-duplicate and leaving us recursing down a deleted chain). 2370 * 2371 * The parent always has to be locked with at least RESOLVE_MAYBE 2372 * so we can access its data. It might need a fixup if the caller 2373 * passed incompatible flags. Be careful not to cause a deadlock 2374 * as a data-load requires an exclusive lock. 2375 * 2376 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key 2377 * range is within the requested key range we return the indirect 2378 * block and do NOT loop. This is usually only used to acquire 2379 * freemap nodes. 2380 */ 2381 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2382 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2383 hammer2_chain_unlock(parent); 2384 hammer2_chain_drop(parent); 2385 *parentp = parent = chain; 2386 goto again; 2387 } 2388 TIMER(19); 2389 done: 2390 /* 2391 * All done, return the chain. 2392 * 2393 * If the caller does not want a locked chain, replace the lock with 2394 * a ref. Perhaps this can eventually be optimized to not obtain the 2395 * lock in the first place for situations where the data does not 2396 * need to be resolved. 2397 */ 2398 if (chain) { 2399 if (flags & HAMMER2_LOOKUP_NOLOCK) 2400 hammer2_chain_unlock(chain); 2401 } 2402 TIMER(20); 2403 2404 return (chain); 2405 } 2406 2407 /* 2408 * After having issued a lookup we can iterate all matching keys. 2409 * 2410 * If chain is non-NULL we continue the iteration from just after it's index. 2411 * 2412 * If chain is NULL we assume the parent was exhausted and continue the 2413 * iteration at the next parent. 2414 * 2415 * If a fatal error occurs (typically an I/O error), a dummy chain is 2416 * returned with chain->error and error-identifying information set. This 2417 * chain will assert if you try to do anything fancy with it. 2418 * 2419 * XXX Depending on where the error occurs we should allow continued iteration. 2420 * 2421 * parent must be locked on entry and remains locked throughout. chain's 2422 * lock status must match flags. Chain is always at least referenced. 2423 * 2424 * WARNING! The MATCHIND flag does not apply to this function. 2425 */ 2426 hammer2_chain_t * 2427 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain, 2428 hammer2_key_t *key_nextp, 2429 hammer2_key_t key_beg, hammer2_key_t key_end, 2430 int *cache_indexp, int flags) 2431 { 2432 hammer2_chain_t *parent; 2433 int how_maybe; 2434 2435 /* 2436 * Calculate locking flags for upward recursion. 2437 */ 2438 how_maybe = HAMMER2_RESOLVE_MAYBE; 2439 if (flags & HAMMER2_LOOKUP_SHARED) 2440 how_maybe |= HAMMER2_RESOLVE_SHARED; 2441 2442 parent = *parentp; 2443 2444 /* 2445 * Calculate the next index and recalculate the parent if necessary. 2446 */ 2447 if (chain) { 2448 key_beg = chain->bref.key + 2449 ((hammer2_key_t)1 << chain->bref.keybits); 2450 if ((flags & (HAMMER2_LOOKUP_NOLOCK | 2451 HAMMER2_LOOKUP_NOUNLOCK)) == 0) { 2452 hammer2_chain_unlock(chain); 2453 } 2454 hammer2_chain_drop(chain); 2455 2456 /* 2457 * chain invalid past this point, but we can still do a 2458 * pointer comparison w/parent. 2459 * 2460 * Any scan where the lookup returned degenerate data embedded 2461 * in the inode has an invalid index and must terminate. 2462 */ 2463 if (chain == parent) 2464 return(NULL); 2465 if (key_beg == 0 || key_beg > key_end) 2466 return(NULL); 2467 chain = NULL; 2468 } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 2469 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2470 /* 2471 * We reached the end of the iteration. 2472 */ 2473 return (NULL); 2474 } else { 2475 /* 2476 * Continue iteration with next parent unless the current 2477 * parent covers the range. 2478 * 2479 * (This also handles the case of a deleted, empty indirect 2480 * node). 2481 */ 2482 key_beg = parent->bref.key + 2483 ((hammer2_key_t)1 << parent->bref.keybits); 2484 if (key_beg == 0 || key_beg > key_end) 2485 return (NULL); 2486 parent = hammer2_chain_getparent(parentp, how_maybe); 2487 } 2488 2489 /* 2490 * And execute 2491 */ 2492 return (hammer2_chain_lookup(parentp, key_nextp, 2493 key_beg, key_end, 2494 cache_indexp, flags)); 2495 } 2496 2497 /* 2498 * The raw scan function is similar to lookup/next but does not seek to a key. 2499 * Blockrefs are iterated via first_bref = (parent, NULL) and 2500 * next_chain = (parent, bref). 2501 * 2502 * The passed-in parent must be locked and its data resolved. The function 2503 * nominally returns a locked and referenced *chainp != NULL for chains 2504 * the caller might need to recurse on (and will dipose of any *chainp passed 2505 * in). The caller must check the chain->bref.type either way. 2506 * 2507 * *chainp is not set for leaf elements. 2508 * 2509 * This function takes a pointer to a stack-based bref structure whos 2510 * contents is updated for each iteration. The same pointer is returned, 2511 * or NULL when the iteration is complete. *firstp must be set to 1 for 2512 * the first ieration. This function will set it to 0. 2513 */ 2514 hammer2_blockref_t * 2515 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp, 2516 hammer2_blockref_t *bref, int *firstp, 2517 int *cache_indexp, int flags) 2518 { 2519 hammer2_dev_t *hmp; 2520 hammer2_blockref_t *base; 2521 hammer2_blockref_t *bref_ptr; 2522 hammer2_key_t key; 2523 hammer2_key_t next_key; 2524 hammer2_chain_t *chain = NULL; 2525 int count = 0; 2526 int how_always = HAMMER2_RESOLVE_ALWAYS; 2527 int how_maybe = HAMMER2_RESOLVE_MAYBE; 2528 int how; 2529 int generation; 2530 int maxloops = 300000; 2531 2532 hmp = parent->hmp; 2533 2534 /* 2535 * Scan flags borrowed from lookup. 2536 */ 2537 if (flags & HAMMER2_LOOKUP_ALWAYS) { 2538 how_maybe = how_always; 2539 how = HAMMER2_RESOLVE_ALWAYS; 2540 } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) { 2541 how = HAMMER2_RESOLVE_NEVER; 2542 } else { 2543 how = HAMMER2_RESOLVE_MAYBE; 2544 } 2545 if (flags & HAMMER2_LOOKUP_SHARED) { 2546 how_maybe |= HAMMER2_RESOLVE_SHARED; 2547 how_always |= HAMMER2_RESOLVE_SHARED; 2548 how |= HAMMER2_RESOLVE_SHARED; 2549 } 2550 2551 /* 2552 * Calculate key to locate first/next element, unlocking the previous 2553 * element as we go. Be careful, the key calculation can overflow. 2554 * 2555 * (also reset bref to NULL) 2556 */ 2557 if (*firstp) { 2558 key = 0; 2559 *firstp = 0; 2560 } else { 2561 key = bref->key + ((hammer2_key_t)1 << bref->keybits); 2562 if ((chain = *chainp) != NULL) { 2563 *chainp = NULL; 2564 hammer2_chain_unlock(chain); 2565 hammer2_chain_drop(chain); 2566 chain = NULL; 2567 } 2568 if (key == 0) { 2569 bref = NULL; 2570 goto done; 2571 } 2572 } 2573 2574 again: 2575 KKASSERT(parent->error == 0); /* XXX case not handled yet */ 2576 if (--maxloops == 0) 2577 panic("hammer2_chain_scan: maxloops"); 2578 /* 2579 * Locate the blockref array. Currently we do a fully associative 2580 * search through the array. 2581 */ 2582 switch(parent->bref.type) { 2583 case HAMMER2_BREF_TYPE_INODE: 2584 /* 2585 * An inode with embedded data has no sub-chains. 2586 * 2587 * WARNING! Bulk scan code may pass a static chain marked 2588 * as BREF_TYPE_INODE with a copy of the volume 2589 * root blockset to snapshot the volume. 2590 */ 2591 if (parent->data->ipdata.meta.op_flags & 2592 HAMMER2_OPFLAG_DIRECTDATA) { 2593 bref = NULL; 2594 goto done; 2595 } 2596 base = &parent->data->ipdata.u.blockset.blockref[0]; 2597 count = HAMMER2_SET_COUNT; 2598 break; 2599 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2600 case HAMMER2_BREF_TYPE_INDIRECT: 2601 /* 2602 * Optimize indirect blocks in the INITIAL state to avoid 2603 * I/O. 2604 */ 2605 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2606 base = NULL; 2607 } else { 2608 if (parent->data == NULL) 2609 panic("parent->data is NULL"); 2610 base = &parent->data->npdata[0]; 2611 } 2612 count = parent->bytes / sizeof(hammer2_blockref_t); 2613 break; 2614 case HAMMER2_BREF_TYPE_VOLUME: 2615 base = &parent->data->voldata.sroot_blockset.blockref[0]; 2616 count = HAMMER2_SET_COUNT; 2617 break; 2618 case HAMMER2_BREF_TYPE_FREEMAP: 2619 base = &parent->data->blkset.blockref[0]; 2620 count = HAMMER2_SET_COUNT; 2621 break; 2622 default: 2623 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 2624 parent->bref.type); 2625 base = NULL; /* safety */ 2626 count = 0; /* safety */ 2627 } 2628 2629 /* 2630 * Merged scan to find next candidate. 2631 * 2632 * hammer2_base_*() functions require the parent->core.live_* fields 2633 * to be synchronized. 2634 * 2635 * We need to hold the spinlock to access the block array and RB tree 2636 * and to interlock chain creation. 2637 */ 2638 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 2639 hammer2_chain_countbrefs(parent, base, count); 2640 2641 next_key = 0; 2642 bref_ptr = NULL; 2643 hammer2_spin_ex(&parent->core.spin); 2644 chain = hammer2_combined_find(parent, base, count, 2645 cache_indexp, &next_key, 2646 key, HAMMER2_KEY_MAX, 2647 &bref_ptr); 2648 generation = parent->core.generation; 2649 2650 /* 2651 * Exhausted parent chain, we're done. 2652 */ 2653 if (bref_ptr == NULL) { 2654 hammer2_spin_unex(&parent->core.spin); 2655 KKASSERT(chain == NULL); 2656 bref = NULL; 2657 goto done; 2658 } 2659 2660 /* 2661 * Copy into the supplied stack-based blockref. 2662 */ 2663 *bref = *bref_ptr; 2664 2665 /* 2666 * Selected from blockref or in-memory chain. 2667 */ 2668 if (chain == NULL) { 2669 switch(bref->type) { 2670 case HAMMER2_BREF_TYPE_INODE: 2671 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2672 case HAMMER2_BREF_TYPE_INDIRECT: 2673 case HAMMER2_BREF_TYPE_VOLUME: 2674 case HAMMER2_BREF_TYPE_FREEMAP: 2675 /* 2676 * Recursion, always get the chain 2677 */ 2678 hammer2_spin_unex(&parent->core.spin); 2679 chain = hammer2_chain_get(parent, generation, bref); 2680 if (chain == NULL) { 2681 kprintf("retry scan parent %p keys %016jx\n", 2682 parent, key); 2683 goto again; 2684 } 2685 if (bcmp(bref, bref_ptr, sizeof(*bref))) { 2686 hammer2_chain_drop(chain); 2687 chain = NULL; 2688 goto again; 2689 } 2690 break; 2691 default: 2692 /* 2693 * No recursion, do not waste time instantiating 2694 * a chain, just iterate using the bref. 2695 */ 2696 hammer2_spin_unex(&parent->core.spin); 2697 break; 2698 } 2699 } else { 2700 /* 2701 * Recursion or not we need the chain in order to supply 2702 * the bref. 2703 */ 2704 hammer2_chain_ref(chain); 2705 hammer2_spin_unex(&parent->core.spin); 2706 } 2707 2708 /* 2709 * chain is referenced but not locked. We must lock the chain 2710 * to obtain definitive state. 2711 */ 2712 if (chain) 2713 hammer2_chain_lock(chain, how); 2714 2715 /* 2716 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 2717 * 2718 * NOTE: chain's key range is not relevant as there might be 2719 * one-offs within the range that are not deleted. 2720 * 2721 * NOTE: XXX this could create problems with scans used in 2722 * situations other than mount-time recovery. 2723 * 2724 * NOTE: Lookups can race delete-duplicate because 2725 * delete-duplicate does not lock the parent's core 2726 * (they just use the spinlock on the core). 2727 */ 2728 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 2729 hammer2_chain_unlock(chain); 2730 hammer2_chain_drop(chain); 2731 chain = NULL; 2732 2733 key = next_key; 2734 if (key == 0) { 2735 bref = NULL; 2736 goto done; 2737 } 2738 goto again; 2739 } 2740 2741 done: 2742 /* 2743 * All done, return the bref or NULL, supply chain if necessary. 2744 */ 2745 if (chain) 2746 *chainp = chain; 2747 return (bref); 2748 } 2749 2750 /* 2751 * Create and return a new hammer2 system memory structure of the specified 2752 * key, type and size and insert it under (*parentp). This is a full 2753 * insertion, based on the supplied key/keybits, and may involve creating 2754 * indirect blocks and moving other chains around via delete/duplicate. 2755 * 2756 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION 2757 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 2758 * FULL. This typically means that the caller is creating the chain after 2759 * doing a hammer2_chain_lookup(). 2760 * 2761 * (*parentp) must be exclusive locked and may be replaced on return 2762 * depending on how much work the function had to do. 2763 * 2764 * (*parentp) must not be errored or this function will assert. 2765 * 2766 * (*chainp) usually starts out NULL and returns the newly created chain, 2767 * but if the caller desires the caller may allocate a disconnected chain 2768 * and pass it in instead. 2769 * 2770 * This function should NOT be used to insert INDIRECT blocks. It is 2771 * typically used to create/insert inodes and data blocks. 2772 * 2773 * Caller must pass-in an exclusively locked parent the new chain is to 2774 * be inserted under, and optionally pass-in a disconnected, exclusively 2775 * locked chain to insert (else we create a new chain). The function will 2776 * adjust (*parentp) as necessary, create or connect the chain, and 2777 * return an exclusively locked chain in *chainp. 2778 * 2779 * When creating a PFSROOT inode under the super-root, pmp is typically NULL 2780 * and will be reassigned. 2781 */ 2782 int 2783 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp, 2784 hammer2_pfs_t *pmp, int methods, 2785 hammer2_key_t key, int keybits, int type, size_t bytes, 2786 hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags) 2787 { 2788 hammer2_dev_t *hmp; 2789 hammer2_chain_t *chain; 2790 hammer2_chain_t *parent; 2791 hammer2_blockref_t *base; 2792 hammer2_blockref_t dummy; 2793 int allocated = 0; 2794 int error = 0; 2795 int count; 2796 int maxloops = 300000; 2797 2798 /* 2799 * Topology may be crossing a PFS boundary. 2800 */ 2801 parent = *parentp; 2802 KKASSERT(hammer2_mtx_owned(&parent->lock)); 2803 KKASSERT(parent->error == 0); 2804 hmp = parent->hmp; 2805 chain = *chainp; 2806 2807 if (chain == NULL) { 2808 /* 2809 * First allocate media space and construct the dummy bref, 2810 * then allocate the in-memory chain structure. Set the 2811 * INITIAL flag for fresh chains which do not have embedded 2812 * data. 2813 * 2814 * XXX for now set the check mode of the child based on 2815 * the parent or, if the parent is an inode, the 2816 * specification in the inode. 2817 */ 2818 bzero(&dummy, sizeof(dummy)); 2819 dummy.type = type; 2820 dummy.key = key; 2821 dummy.keybits = keybits; 2822 dummy.data_off = hammer2_getradix(bytes); 2823 2824 /* 2825 * Inherit methods from parent by default. Primarily used 2826 * for BREF_TYPE_DATA. Non-data types *must* be set to 2827 * a non-NONE check algorithm. 2828 */ 2829 if (methods == -1) 2830 dummy.methods = parent->bref.methods; 2831 else 2832 dummy.methods = (uint8_t)methods; 2833 2834 if (type != HAMMER2_BREF_TYPE_DATA && 2835 HAMMER2_DEC_CHECK(dummy.methods) == HAMMER2_CHECK_NONE) { 2836 dummy.methods |= 2837 HAMMER2_ENC_CHECK(HAMMER2_CHECK_DEFAULT); 2838 } 2839 2840 chain = hammer2_chain_alloc(hmp, pmp, &dummy); 2841 2842 /* 2843 * Lock the chain manually, chain_lock will load the chain 2844 * which we do NOT want to do. (note: chain->refs is set 2845 * to 1 by chain_alloc() for us, but lockcnt is not). 2846 */ 2847 chain->lockcnt = 1; 2848 hammer2_mtx_ex(&chain->lock); 2849 allocated = 1; 2850 ++curthread->td_tracker; 2851 2852 /* 2853 * Set INITIAL to optimize I/O. The flag will generally be 2854 * processed when we call hammer2_chain_modify(). 2855 * 2856 * Recalculate bytes to reflect the actual media block 2857 * allocation. 2858 */ 2859 bytes = (hammer2_off_t)1 << 2860 (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 2861 chain->bytes = bytes; 2862 2863 switch(type) { 2864 case HAMMER2_BREF_TYPE_VOLUME: 2865 case HAMMER2_BREF_TYPE_FREEMAP: 2866 panic("hammer2_chain_create: called with volume type"); 2867 break; 2868 case HAMMER2_BREF_TYPE_INDIRECT: 2869 panic("hammer2_chain_create: cannot be used to" 2870 "create indirect block"); 2871 break; 2872 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2873 panic("hammer2_chain_create: cannot be used to" 2874 "create freemap root or node"); 2875 break; 2876 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2877 KKASSERT(bytes == sizeof(chain->data->bmdata)); 2878 /* fall through */ 2879 case HAMMER2_BREF_TYPE_INODE: 2880 case HAMMER2_BREF_TYPE_DATA: 2881 default: 2882 /* 2883 * leave chain->data NULL, set INITIAL 2884 */ 2885 KKASSERT(chain->data == NULL); 2886 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 2887 break; 2888 } 2889 } else { 2890 /* 2891 * We are reattaching a previously deleted chain, possibly 2892 * under a new parent and possibly with a new key/keybits. 2893 * The chain does not have to be in a modified state. The 2894 * UPDATE flag will be set later on in this routine. 2895 * 2896 * Do NOT mess with the current state of the INITIAL flag. 2897 */ 2898 chain->bref.key = key; 2899 chain->bref.keybits = keybits; 2900 if (chain->flags & HAMMER2_CHAIN_DELETED) 2901 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED); 2902 KKASSERT(chain->parent == NULL); 2903 } 2904 if (flags & HAMMER2_INSERT_PFSROOT) 2905 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 2906 else 2907 chain->bref.flags &= ~HAMMER2_BREF_FLAG_PFSROOT; 2908 2909 /* 2910 * Calculate how many entries we have in the blockref array and 2911 * determine if an indirect block is required. 2912 */ 2913 again: 2914 if (--maxloops == 0) 2915 panic("hammer2_chain_create: maxloops"); 2916 2917 switch(parent->bref.type) { 2918 case HAMMER2_BREF_TYPE_INODE: 2919 if ((parent->data->ipdata.meta.op_flags & 2920 HAMMER2_OPFLAG_DIRECTDATA) != 0) { 2921 kprintf("hammer2: parent set for direct-data! " 2922 "pkey=%016jx ckey=%016jx\n", 2923 parent->bref.key, 2924 chain->bref.key); 2925 } 2926 KKASSERT((parent->data->ipdata.meta.op_flags & 2927 HAMMER2_OPFLAG_DIRECTDATA) == 0); 2928 KKASSERT(parent->data != NULL); 2929 base = &parent->data->ipdata.u.blockset.blockref[0]; 2930 count = HAMMER2_SET_COUNT; 2931 break; 2932 case HAMMER2_BREF_TYPE_INDIRECT: 2933 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2934 if (parent->flags & HAMMER2_CHAIN_INITIAL) 2935 base = NULL; 2936 else 2937 base = &parent->data->npdata[0]; 2938 count = parent->bytes / sizeof(hammer2_blockref_t); 2939 break; 2940 case HAMMER2_BREF_TYPE_VOLUME: 2941 KKASSERT(parent->data != NULL); 2942 base = &parent->data->voldata.sroot_blockset.blockref[0]; 2943 count = HAMMER2_SET_COUNT; 2944 break; 2945 case HAMMER2_BREF_TYPE_FREEMAP: 2946 KKASSERT(parent->data != NULL); 2947 base = &parent->data->blkset.blockref[0]; 2948 count = HAMMER2_SET_COUNT; 2949 break; 2950 default: 2951 panic("hammer2_chain_create: unrecognized blockref type: %d", 2952 parent->bref.type); 2953 base = NULL; 2954 count = 0; 2955 break; 2956 } 2957 2958 /* 2959 * Make sure we've counted the brefs 2960 */ 2961 if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) 2962 hammer2_chain_countbrefs(parent, base, count); 2963 2964 KASSERT(parent->core.live_count >= 0 && 2965 parent->core.live_count <= count, 2966 ("bad live_count %d/%d (%02x, %d)", 2967 parent->core.live_count, count, 2968 parent->bref.type, parent->bytes)); 2969 2970 /* 2971 * If no free blockref could be found we must create an indirect 2972 * block and move a number of blockrefs into it. With the parent 2973 * locked we can safely lock each child in order to delete+duplicate 2974 * it without causing a deadlock. 2975 * 2976 * This may return the new indirect block or the old parent depending 2977 * on where the key falls. NULL is returned on error. 2978 */ 2979 if (parent->core.live_count == count) { 2980 hammer2_chain_t *nparent; 2981 2982 nparent = hammer2_chain_create_indirect(parent, key, keybits, 2983 mtid, type, &error); 2984 if (nparent == NULL) { 2985 if (allocated) 2986 hammer2_chain_drop(chain); 2987 chain = NULL; 2988 goto done; 2989 } 2990 if (parent != nparent) { 2991 hammer2_chain_unlock(parent); 2992 hammer2_chain_drop(parent); 2993 parent = *parentp = nparent; 2994 } 2995 goto again; 2996 } 2997 2998 if (chain->flags & HAMMER2_CHAIN_DELETED) 2999 kprintf("Inserting deleted chain @%016jx\n", 3000 chain->bref.key); 3001 3002 /* 3003 * Link the chain into its parent. 3004 */ 3005 if (chain->parent != NULL) 3006 panic("hammer2: hammer2_chain_create: chain already connected"); 3007 KKASSERT(chain->parent == NULL); 3008 hammer2_chain_insert(parent, chain, 3009 HAMMER2_CHAIN_INSERT_SPIN | 3010 HAMMER2_CHAIN_INSERT_LIVE, 3011 0); 3012 3013 if (allocated) { 3014 /* 3015 * Mark the newly created chain modified. This will cause 3016 * UPDATE to be set and process the INITIAL flag. 3017 * 3018 * Device buffers are not instantiated for DATA elements 3019 * as these are handled by logical buffers. 3020 * 3021 * Indirect and freemap node indirect blocks are handled 3022 * by hammer2_chain_create_indirect() and not by this 3023 * function. 3024 * 3025 * Data for all other bref types is expected to be 3026 * instantiated (INODE, LEAF). 3027 */ 3028 switch(chain->bref.type) { 3029 case HAMMER2_BREF_TYPE_DATA: 3030 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3031 case HAMMER2_BREF_TYPE_INODE: 3032 hammer2_chain_modify(chain, mtid, dedup_off, 3033 HAMMER2_MODIFY_OPTDATA); 3034 break; 3035 default: 3036 /* 3037 * Remaining types are not supported by this function. 3038 * In particular, INDIRECT and LEAF_NODE types are 3039 * handled by create_indirect(). 3040 */ 3041 panic("hammer2_chain_create: bad type: %d", 3042 chain->bref.type); 3043 /* NOT REACHED */ 3044 break; 3045 } 3046 } else { 3047 /* 3048 * When reconnecting a chain we must set UPDATE and 3049 * setflush so the flush recognizes that it must update 3050 * the bref in the parent. 3051 */ 3052 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) 3053 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 3054 } 3055 3056 /* 3057 * We must setflush(parent) to ensure that it recurses through to 3058 * chain. setflush(chain) might not work because ONFLUSH is possibly 3059 * already set in the chain (so it won't recurse up to set it in the 3060 * parent). 3061 */ 3062 hammer2_chain_setflush(parent); 3063 3064 done: 3065 *chainp = chain; 3066 3067 return (error); 3068 } 3069 3070 /* 3071 * Move the chain from its old parent to a new parent. The chain must have 3072 * already been deleted or already disconnected (or never associated) with 3073 * a parent. The chain is reassociated with the new parent and the deleted 3074 * flag will be cleared (no longer deleted). The chain's modification state 3075 * is not altered. 3076 * 3077 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION 3078 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 3079 * FULL. This typically means that the caller is creating the chain after 3080 * doing a hammer2_chain_lookup(). 3081 * 3082 * A non-NULL bref is typically passed when key and keybits must be overridden. 3083 * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields 3084 * from a passed-in bref and uses the old chain's bref for everything else. 3085 * 3086 * Neither (parent) or (chain) can be errored. 3087 * 3088 * If (parent) is non-NULL then the chain is inserted under the parent. 3089 * 3090 * If (parent) is NULL then the newly duplicated chain is not inserted 3091 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 3092 * passing into hammer2_chain_create() after this function returns). 3093 * 3094 * WARNING! This function calls create which means it can insert indirect 3095 * blocks. This can cause other unrelated chains in the parent to 3096 * be moved to a newly inserted indirect block in addition to the 3097 * specific chain. 3098 */ 3099 void 3100 hammer2_chain_rename(hammer2_blockref_t *bref, 3101 hammer2_chain_t **parentp, hammer2_chain_t *chain, 3102 hammer2_tid_t mtid, int flags) 3103 { 3104 hammer2_dev_t *hmp; 3105 hammer2_chain_t *parent; 3106 size_t bytes; 3107 3108 /* 3109 * WARNING! We should never resolve DATA to device buffers 3110 * (XXX allow it if the caller did?), and since 3111 * we currently do not have the logical buffer cache 3112 * buffer in-hand to fix its cached physical offset 3113 * we also force the modify code to not COW it. XXX 3114 */ 3115 hmp = chain->hmp; 3116 KKASSERT(chain->parent == NULL); 3117 KKASSERT(chain->error == 0); 3118 3119 /* 3120 * Now create a duplicate of the chain structure, associating 3121 * it with the same core, making it the same size, pointing it 3122 * to the same bref (the same media block). 3123 */ 3124 if (bref == NULL) 3125 bref = &chain->bref; 3126 bytes = (hammer2_off_t)1 << 3127 (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 3128 3129 /* 3130 * If parent is not NULL the duplicated chain will be entered under 3131 * the parent and the UPDATE bit set to tell flush to update 3132 * the blockref. 3133 * 3134 * We must setflush(parent) to ensure that it recurses through to 3135 * chain. setflush(chain) might not work because ONFLUSH is possibly 3136 * already set in the chain (so it won't recurse up to set it in the 3137 * parent). 3138 * 3139 * Having both chains locked is extremely important for atomicy. 3140 */ 3141 if (parentp && (parent = *parentp) != NULL) { 3142 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3143 KKASSERT(parent->refs > 0); 3144 KKASSERT(parent->error == 0); 3145 3146 hammer2_chain_create(parentp, &chain, 3147 chain->pmp, HAMMER2_METH_DEFAULT, 3148 bref->key, bref->keybits, bref->type, 3149 chain->bytes, mtid, 0, flags); 3150 KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE); 3151 hammer2_chain_setflush(*parentp); 3152 } 3153 } 3154 3155 /* 3156 * Helper function for deleting chains. 3157 * 3158 * The chain is removed from the live view (the RBTREE) as well as the parent's 3159 * blockmap. Both chain and its parent must be locked. 3160 * 3161 * parent may not be errored. chain can be errored. 3162 */ 3163 static void 3164 _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain, 3165 hammer2_tid_t mtid, int flags) 3166 { 3167 hammer2_dev_t *hmp; 3168 3169 KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED | 3170 HAMMER2_CHAIN_FICTITIOUS)) == 0); 3171 KKASSERT(chain->parent == parent); 3172 hmp = chain->hmp; 3173 3174 if (chain->flags & HAMMER2_CHAIN_BMAPPED) { 3175 /* 3176 * Chain is blockmapped, so there must be a parent. 3177 * Atomically remove the chain from the parent and remove 3178 * the blockmap entry. The parent must be set modified 3179 * to remove the blockmap entry. 3180 */ 3181 hammer2_blockref_t *base; 3182 int count; 3183 3184 KKASSERT(parent != NULL); 3185 KKASSERT(parent->error == 0); 3186 KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0); 3187 hammer2_chain_modify(parent, mtid, 0, HAMMER2_MODIFY_OPTDATA); 3188 3189 /* 3190 * Calculate blockmap pointer 3191 */ 3192 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 3193 hammer2_spin_ex(&parent->core.spin); 3194 3195 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3196 atomic_add_int(&parent->core.live_count, -1); 3197 ++parent->core.generation; 3198 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 3199 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 3200 --parent->core.chain_count; 3201 chain->parent = NULL; 3202 3203 switch(parent->bref.type) { 3204 case HAMMER2_BREF_TYPE_INODE: 3205 /* 3206 * Access the inode's block array. However, there 3207 * is no block array if the inode is flagged 3208 * DIRECTDATA. The DIRECTDATA case typicaly only 3209 * occurs when a hardlink has been shifted up the 3210 * tree and the original inode gets replaced with 3211 * an OBJTYPE_HARDLINK placeholding inode. 3212 */ 3213 if (parent->data && 3214 (parent->data->ipdata.meta.op_flags & 3215 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 3216 base = 3217 &parent->data->ipdata.u.blockset.blockref[0]; 3218 } else { 3219 base = NULL; 3220 } 3221 count = HAMMER2_SET_COUNT; 3222 break; 3223 case HAMMER2_BREF_TYPE_INDIRECT: 3224 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3225 if (parent->data) 3226 base = &parent->data->npdata[0]; 3227 else 3228 base = NULL; 3229 count = parent->bytes / sizeof(hammer2_blockref_t); 3230 break; 3231 case HAMMER2_BREF_TYPE_VOLUME: 3232 base = &parent->data->voldata. 3233 sroot_blockset.blockref[0]; 3234 count = HAMMER2_SET_COUNT; 3235 break; 3236 case HAMMER2_BREF_TYPE_FREEMAP: 3237 base = &parent->data->blkset.blockref[0]; 3238 count = HAMMER2_SET_COUNT; 3239 break; 3240 default: 3241 base = NULL; 3242 count = 0; 3243 panic("hammer2_flush_pass2: " 3244 "unrecognized blockref type: %d", 3245 parent->bref.type); 3246 } 3247 3248 /* 3249 * delete blockmapped chain from its parent. 3250 * 3251 * The parent is not affected by any statistics in chain 3252 * which are pending synchronization. That is, there is 3253 * nothing to undo in the parent since they have not yet 3254 * been incorporated into the parent. 3255 * 3256 * The parent is affected by statistics stored in inodes. 3257 * Those have already been synchronized, so they must be 3258 * undone. XXX split update possible w/delete in middle? 3259 */ 3260 if (base) { 3261 int cache_index = -1; 3262 hammer2_base_delete(parent, base, count, 3263 &cache_index, chain); 3264 } 3265 hammer2_spin_unex(&parent->core.spin); 3266 } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) { 3267 /* 3268 * Chain is not blockmapped but a parent is present. 3269 * Atomically remove the chain from the parent. There is 3270 * no blockmap entry to remove. 3271 * 3272 * Because chain was associated with a parent but not 3273 * synchronized, the chain's *_count_up fields contain 3274 * inode adjustment statistics which must be undone. 3275 */ 3276 hammer2_spin_ex(&parent->core.spin); 3277 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3278 atomic_add_int(&parent->core.live_count, -1); 3279 ++parent->core.generation; 3280 RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain); 3281 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 3282 --parent->core.chain_count; 3283 chain->parent = NULL; 3284 hammer2_spin_unex(&parent->core.spin); 3285 } else { 3286 /* 3287 * Chain is not blockmapped and has no parent. This 3288 * is a degenerate case. 3289 */ 3290 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED); 3291 } 3292 } 3293 3294 /* 3295 * Create an indirect block that covers one or more of the elements in the 3296 * current parent. Either returns the existing parent with no locking or 3297 * ref changes or returns the new indirect block locked and referenced 3298 * and leaving the original parent lock/ref intact as well. 3299 * 3300 * If an error occurs, NULL is returned and *errorp is set to the error. 3301 * 3302 * The returned chain depends on where the specified key falls. 3303 * 3304 * The key/keybits for the indirect mode only needs to follow three rules: 3305 * 3306 * (1) That all elements underneath it fit within its key space and 3307 * 3308 * (2) That all elements outside it are outside its key space. 3309 * 3310 * (3) When creating the new indirect block any elements in the current 3311 * parent that fit within the new indirect block's keyspace must be 3312 * moved into the new indirect block. 3313 * 3314 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider 3315 * keyspace the the current parent, but lookup/iteration rules will 3316 * ensure (and must ensure) that rule (2) for all parents leading up 3317 * to the nearest inode or the root volume header is adhered to. This 3318 * is accomplished by always recursing through matching keyspaces in 3319 * the hammer2_chain_lookup() and hammer2_chain_next() API. 3320 * 3321 * The current implementation calculates the current worst-case keyspace by 3322 * iterating the current parent and then divides it into two halves, choosing 3323 * whichever half has the most elements (not necessarily the half containing 3324 * the requested key). 3325 * 3326 * We can also opt to use the half with the least number of elements. This 3327 * causes lower-numbered keys (aka logical file offsets) to recurse through 3328 * fewer indirect blocks and higher-numbered keys to recurse through more. 3329 * This also has the risk of not moving enough elements to the new indirect 3330 * block and being forced to create several indirect blocks before the element 3331 * can be inserted. 3332 * 3333 * Must be called with an exclusively locked parent. 3334 */ 3335 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent, 3336 hammer2_key_t *keyp, int keybits, 3337 hammer2_blockref_t *base, int count); 3338 static int hammer2_chain_indkey_file(hammer2_chain_t *parent, 3339 hammer2_key_t *keyp, int keybits, 3340 hammer2_blockref_t *base, int count, 3341 int ncount); 3342 static int hammer2_chain_indkey_dir(hammer2_chain_t *parent, 3343 hammer2_key_t *keyp, int keybits, 3344 hammer2_blockref_t *base, int count, 3345 int ncount); 3346 static 3347 hammer2_chain_t * 3348 hammer2_chain_create_indirect(hammer2_chain_t *parent, 3349 hammer2_key_t create_key, int create_bits, 3350 hammer2_tid_t mtid, int for_type, int *errorp) 3351 { 3352 hammer2_dev_t *hmp; 3353 hammer2_blockref_t *base; 3354 hammer2_blockref_t *bref; 3355 hammer2_blockref_t bcopy; 3356 hammer2_chain_t *chain; 3357 hammer2_chain_t *ichain; 3358 hammer2_chain_t dummy; 3359 hammer2_key_t key = create_key; 3360 hammer2_key_t key_beg; 3361 hammer2_key_t key_end; 3362 hammer2_key_t key_next; 3363 int keybits = create_bits; 3364 int count; 3365 int ncount; 3366 int nbytes; 3367 int cache_index; 3368 int loops; 3369 int reason; 3370 int generation; 3371 int maxloops = 300000; 3372 3373 /* 3374 * Calculate the base blockref pointer or NULL if the chain 3375 * is known to be empty. We need to calculate the array count 3376 * for RB lookups either way. 3377 */ 3378 hmp = parent->hmp; 3379 *errorp = 0; 3380 KKASSERT(hammer2_mtx_owned(&parent->lock)); 3381 3382 /*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/ 3383 base = hammer2_chain_base_and_count(parent, &count); 3384 3385 /* 3386 * dummy used in later chain allocation (no longer used for lookups). 3387 */ 3388 bzero(&dummy, sizeof(dummy)); 3389 3390 /* 3391 * How big should our new indirect block be? It has to be at least 3392 * as large as its parent. 3393 * 3394 * The freemap uses a specific indirect block size. The number of 3395 * levels are built dynamically and ultimately depend on the size 3396 * volume. Because freemap blocks are taken from the reserved areas 3397 * of the volume our goal is efficiency (fewer levels) and not so 3398 * much to save disk space. 3399 * 3400 * The first indirect block level for a directory usually uses 3401 * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries). 3402 * (the 4 entries built-into the inode can handle 4 directory 3403 * entries) 3404 * 3405 * The first indirect block level for a file usually uses 3406 * HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs = ~8MB file). 3407 * (the 4 entries built-into the inode can handle a 256KB file). 3408 * 3409 * The first indirect block level down from an inode typically 3410 * uses LBUFSIZE (16384), else it uses PBUFSIZE (65536). 3411 */ 3412 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 3413 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 3414 nbytes = HAMMER2_FREEMAP_LEVELN_PSIZE; 3415 } else if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) { 3416 if (parent->data->ipdata.meta.type == 3417 HAMMER2_OBJTYPE_DIRECTORY) 3418 nbytes = HAMMER2_IND_BYTES_MIN; /* 4KB = 32 entries */ 3419 else 3420 nbytes = HAMMER2_IND_BYTES_NOM; /* 16KB = ~8MB file */ 3421 3422 } else { 3423 nbytes = HAMMER2_IND_BYTES_MAX; 3424 } 3425 if (nbytes < count * sizeof(hammer2_blockref_t)) { 3426 KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE && 3427 for_type != HAMMER2_BREF_TYPE_FREEMAP_LEAF); 3428 nbytes = count * sizeof(hammer2_blockref_t); 3429 } 3430 ncount = nbytes / sizeof(hammer2_blockref_t); 3431 3432 /* 3433 * When creating an indirect block for a freemap node or leaf 3434 * the key/keybits must be fitted to static radix levels because 3435 * particular radix levels use particular reserved blocks in the 3436 * related zone. 3437 * 3438 * This routine calculates the key/radix of the indirect block 3439 * we need to create, and whether it is on the high-side or the 3440 * low-side. 3441 */ 3442 switch(for_type) { 3443 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3444 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3445 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits, 3446 base, count); 3447 break; 3448 case HAMMER2_BREF_TYPE_DATA: 3449 keybits = hammer2_chain_indkey_file(parent, &key, keybits, 3450 base, count, ncount); 3451 break; 3452 case HAMMER2_BREF_TYPE_INODE: 3453 keybits = hammer2_chain_indkey_dir(parent, &key, keybits, 3454 base, count, ncount); 3455 break; 3456 } 3457 3458 /* 3459 * Normalize the key for the radix being represented, keeping the 3460 * high bits and throwing away the low bits. 3461 */ 3462 key &= ~(((hammer2_key_t)1 << keybits) - 1); 3463 3464 /* 3465 * Ok, create our new indirect block 3466 */ 3467 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 3468 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 3469 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE; 3470 } else { 3471 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT; 3472 } 3473 dummy.bref.key = key; 3474 dummy.bref.keybits = keybits; 3475 dummy.bref.data_off = hammer2_getradix(nbytes); 3476 dummy.bref.methods = 3477 HAMMER2_ENC_CHECK(HAMMER2_DEC_CHECK(parent->bref.methods)) | 3478 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE); 3479 3480 ichain = hammer2_chain_alloc(hmp, parent->pmp, &dummy.bref); 3481 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL); 3482 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE); 3483 /* ichain has one ref at this point */ 3484 3485 /* 3486 * We have to mark it modified to allocate its block, but use 3487 * OPTDATA to allow it to remain in the INITIAL state. Otherwise 3488 * it won't be acted upon by the flush code. 3489 */ 3490 hammer2_chain_modify(ichain, mtid, 0, HAMMER2_MODIFY_OPTDATA); 3491 3492 /* 3493 * Iterate the original parent and move the matching brefs into 3494 * the new indirect block. 3495 * 3496 * XXX handle flushes. 3497 */ 3498 key_beg = 0; 3499 key_end = HAMMER2_KEY_MAX; 3500 key_next = 0; /* avoid gcc warnings */ 3501 cache_index = 0; 3502 hammer2_spin_ex(&parent->core.spin); 3503 loops = 0; 3504 reason = 0; 3505 3506 for (;;) { 3507 /* 3508 * Parent may have been modified, relocating its block array. 3509 * Reload the base pointer. 3510 */ 3511 base = hammer2_chain_base_and_count(parent, &count); 3512 3513 if (++loops > 100000) { 3514 hammer2_spin_unex(&parent->core.spin); 3515 panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n", 3516 reason, parent, base, count, key_next); 3517 } 3518 3519 /* 3520 * NOTE: spinlock stays intact, returned chain (if not NULL) 3521 * is not referenced or locked which means that we 3522 * cannot safely check its flagged / deletion status 3523 * until we lock it. 3524 */ 3525 chain = hammer2_combined_find(parent, base, count, 3526 &cache_index, &key_next, 3527 key_beg, key_end, 3528 &bref); 3529 generation = parent->core.generation; 3530 if (bref == NULL) 3531 break; 3532 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3533 3534 /* 3535 * Skip keys that are not within the key/radix of the new 3536 * indirect block. They stay in the parent. 3537 */ 3538 if ((~(((hammer2_key_t)1 << keybits) - 1) & 3539 (key ^ bref->key)) != 0) { 3540 goto next_key_spinlocked; 3541 } 3542 3543 /* 3544 * Load the new indirect block by acquiring the related 3545 * chains (potentially from media as it might not be 3546 * in-memory). Then move it to the new parent (ichain). 3547 * 3548 * chain is referenced but not locked. We must lock the 3549 * chain to obtain definitive state. 3550 */ 3551 if (chain) { 3552 /* 3553 * Use chain already present in the RBTREE 3554 */ 3555 hammer2_chain_ref(chain); 3556 hammer2_spin_unex(&parent->core.spin); 3557 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER); 3558 } else { 3559 /* 3560 * Get chain for blockref element. _get returns NULL 3561 * on insertion race. 3562 */ 3563 bcopy = *bref; 3564 hammer2_spin_unex(&parent->core.spin); 3565 chain = hammer2_chain_get(parent, generation, &bcopy); 3566 if (chain == NULL) { 3567 reason = 1; 3568 hammer2_spin_ex(&parent->core.spin); 3569 continue; 3570 } 3571 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 3572 kprintf("REASON 2\n"); 3573 reason = 2; 3574 hammer2_chain_drop(chain); 3575 hammer2_spin_ex(&parent->core.spin); 3576 continue; 3577 } 3578 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER); 3579 } 3580 3581 /* 3582 * This is always live so if the chain has been deleted 3583 * we raced someone and we have to retry. 3584 * 3585 * NOTE: Lookups can race delete-duplicate because 3586 * delete-duplicate does not lock the parent's core 3587 * (they just use the spinlock on the core). 3588 * 3589 * (note reversed logic for this one) 3590 */ 3591 if (chain->flags & HAMMER2_CHAIN_DELETED) { 3592 hammer2_chain_unlock(chain); 3593 hammer2_chain_drop(chain); 3594 goto next_key; 3595 } 3596 3597 /* 3598 * Shift the chain to the indirect block. 3599 * 3600 * WARNING! No reason for us to load chain data, pass NOSTATS 3601 * to prevent delete/insert from trying to access 3602 * inode stats (and thus asserting if there is no 3603 * chain->data loaded). 3604 * 3605 * WARNING! The (parent, chain) deletion may modify the parent 3606 * and invalidate the base pointer. 3607 */ 3608 hammer2_chain_delete(parent, chain, mtid, 0); 3609 hammer2_chain_rename(NULL, &ichain, chain, mtid, 0); 3610 hammer2_chain_unlock(chain); 3611 hammer2_chain_drop(chain); 3612 KKASSERT(parent->refs > 0); 3613 chain = NULL; 3614 base = NULL; /* safety */ 3615 next_key: 3616 hammer2_spin_ex(&parent->core.spin); 3617 next_key_spinlocked: 3618 if (--maxloops == 0) 3619 panic("hammer2_chain_create_indirect: maxloops"); 3620 reason = 4; 3621 if (key_next == 0 || key_next > key_end) 3622 break; 3623 key_beg = key_next; 3624 /* loop */ 3625 } 3626 hammer2_spin_unex(&parent->core.spin); 3627 3628 /* 3629 * Insert the new indirect block into the parent now that we've 3630 * cleared out some entries in the parent. We calculated a good 3631 * insertion index in the loop above (ichain->index). 3632 * 3633 * We don't have to set UPDATE here because we mark ichain 3634 * modified down below (so the normal modified -> flush -> set-moved 3635 * sequence applies). 3636 * 3637 * The insertion shouldn't race as this is a completely new block 3638 * and the parent is locked. 3639 */ 3640 base = NULL; /* safety, parent modify may change address */ 3641 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 3642 hammer2_chain_insert(parent, ichain, 3643 HAMMER2_CHAIN_INSERT_SPIN | 3644 HAMMER2_CHAIN_INSERT_LIVE, 3645 0); 3646 3647 /* 3648 * Make sure flushes propogate after our manual insertion. 3649 */ 3650 hammer2_chain_setflush(ichain); 3651 hammer2_chain_setflush(parent); 3652 3653 /* 3654 * Figure out what to return. 3655 */ 3656 if (~(((hammer2_key_t)1 << keybits) - 1) & 3657 (create_key ^ key)) { 3658 /* 3659 * Key being created is outside the key range, 3660 * return the original parent. 3661 */ 3662 hammer2_chain_unlock(ichain); 3663 hammer2_chain_drop(ichain); 3664 } else { 3665 /* 3666 * Otherwise its in the range, return the new parent. 3667 * (leave both the new and old parent locked). 3668 */ 3669 parent = ichain; 3670 } 3671 3672 return(parent); 3673 } 3674 3675 /* 3676 * Freemap indirect blocks 3677 * 3678 * Calculate the keybits and highside/lowside of the freemap node the 3679 * caller is creating. 3680 * 3681 * This routine will specify the next higher-level freemap key/radix 3682 * representing the lowest-ordered set. By doing so, eventually all 3683 * low-ordered sets will be moved one level down. 3684 * 3685 * We have to be careful here because the freemap reserves a limited 3686 * number of blocks for a limited number of levels. So we can't just 3687 * push indiscriminately. 3688 */ 3689 int 3690 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp, 3691 int keybits, hammer2_blockref_t *base, int count) 3692 { 3693 hammer2_chain_t *chain; 3694 hammer2_blockref_t *bref; 3695 hammer2_key_t key; 3696 hammer2_key_t key_beg; 3697 hammer2_key_t key_end; 3698 hammer2_key_t key_next; 3699 int cache_index; 3700 int locount; 3701 int hicount; 3702 int maxloops = 300000; 3703 3704 key = *keyp; 3705 locount = 0; 3706 hicount = 0; 3707 keybits = 64; 3708 3709 /* 3710 * Calculate the range of keys in the array being careful to skip 3711 * slots which are overridden with a deletion. 3712 */ 3713 key_beg = 0; 3714 key_end = HAMMER2_KEY_MAX; 3715 cache_index = 0; 3716 hammer2_spin_ex(&parent->core.spin); 3717 3718 for (;;) { 3719 if (--maxloops == 0) { 3720 panic("indkey_freemap shit %p %p:%d\n", 3721 parent, base, count); 3722 } 3723 chain = hammer2_combined_find(parent, base, count, 3724 &cache_index, &key_next, 3725 key_beg, key_end, 3726 &bref); 3727 3728 /* 3729 * Exhausted search 3730 */ 3731 if (bref == NULL) 3732 break; 3733 3734 /* 3735 * Skip deleted chains. 3736 */ 3737 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3738 if (key_next == 0 || key_next > key_end) 3739 break; 3740 key_beg = key_next; 3741 continue; 3742 } 3743 3744 /* 3745 * Use the full live (not deleted) element for the scan 3746 * iteration. HAMMER2 does not allow partial replacements. 3747 * 3748 * XXX should be built into hammer2_combined_find(). 3749 */ 3750 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3751 3752 if (keybits > bref->keybits) { 3753 key = bref->key; 3754 keybits = bref->keybits; 3755 } else if (keybits == bref->keybits && bref->key < key) { 3756 key = bref->key; 3757 } 3758 if (key_next == 0) 3759 break; 3760 key_beg = key_next; 3761 } 3762 hammer2_spin_unex(&parent->core.spin); 3763 3764 /* 3765 * Return the keybits for a higher-level FREEMAP_NODE covering 3766 * this node. 3767 */ 3768 switch(keybits) { 3769 case HAMMER2_FREEMAP_LEVEL0_RADIX: 3770 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX; 3771 break; 3772 case HAMMER2_FREEMAP_LEVEL1_RADIX: 3773 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX; 3774 break; 3775 case HAMMER2_FREEMAP_LEVEL2_RADIX: 3776 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX; 3777 break; 3778 case HAMMER2_FREEMAP_LEVEL3_RADIX: 3779 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX; 3780 break; 3781 case HAMMER2_FREEMAP_LEVEL4_RADIX: 3782 keybits = HAMMER2_FREEMAP_LEVEL5_RADIX; 3783 break; 3784 case HAMMER2_FREEMAP_LEVEL5_RADIX: 3785 panic("hammer2_chain_indkey_freemap: level too high"); 3786 break; 3787 default: 3788 panic("hammer2_chain_indkey_freemap: bad radix"); 3789 break; 3790 } 3791 *keyp = key; 3792 3793 return (keybits); 3794 } 3795 3796 /* 3797 * File indirect blocks 3798 * 3799 * Calculate the key/keybits for the indirect block to create by scanning 3800 * existing keys. The key being created is also passed in *keyp and can be 3801 * inside or outside the indirect block. Regardless, the indirect block 3802 * must hold at least two keys in order to guarantee sufficient space. 3803 * 3804 * We use a modified version of the freemap's fixed radix tree, but taylored 3805 * for file data. Basically we configure an indirect block encompassing the 3806 * smallest key. 3807 */ 3808 static int 3809 hammer2_chain_indkey_file(hammer2_chain_t *parent, hammer2_key_t *keyp, 3810 int keybits, hammer2_blockref_t *base, int count, 3811 int ncount) 3812 { 3813 hammer2_chain_t *chain; 3814 hammer2_blockref_t *bref; 3815 hammer2_key_t key; 3816 hammer2_key_t key_beg; 3817 hammer2_key_t key_end; 3818 hammer2_key_t key_next; 3819 int nradix; 3820 int cache_index; 3821 int locount; 3822 int hicount; 3823 int maxloops = 300000; 3824 3825 key = *keyp; 3826 locount = 0; 3827 hicount = 0; 3828 keybits = 64; 3829 3830 /* 3831 * Calculate the range of keys in the array being careful to skip 3832 * slots which are overridden with a deletion. 3833 * 3834 * Locate the smallest key. 3835 */ 3836 key_beg = 0; 3837 key_end = HAMMER2_KEY_MAX; 3838 cache_index = 0; 3839 hammer2_spin_ex(&parent->core.spin); 3840 3841 for (;;) { 3842 if (--maxloops == 0) { 3843 panic("indkey_freemap shit %p %p:%d\n", 3844 parent, base, count); 3845 } 3846 chain = hammer2_combined_find(parent, base, count, 3847 &cache_index, &key_next, 3848 key_beg, key_end, 3849 &bref); 3850 3851 /* 3852 * Exhausted search 3853 */ 3854 if (bref == NULL) 3855 break; 3856 3857 /* 3858 * Skip deleted chains. 3859 */ 3860 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3861 if (key_next == 0 || key_next > key_end) 3862 break; 3863 key_beg = key_next; 3864 continue; 3865 } 3866 3867 /* 3868 * Use the full live (not deleted) element for the scan 3869 * iteration. HAMMER2 does not allow partial replacements. 3870 * 3871 * XXX should be built into hammer2_combined_find(). 3872 */ 3873 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3874 3875 if (keybits > bref->keybits) { 3876 key = bref->key; 3877 keybits = bref->keybits; 3878 } else if (keybits == bref->keybits && bref->key < key) { 3879 key = bref->key; 3880 } 3881 if (key_next == 0) 3882 break; 3883 key_beg = key_next; 3884 } 3885 hammer2_spin_unex(&parent->core.spin); 3886 3887 /* 3888 * Calculate the static keybits for a higher-level indirect block 3889 * that contains the key. 3890 */ 3891 *keyp = key; 3892 3893 switch(ncount) { 3894 case HAMMER2_IND_BYTES_MIN / sizeof(hammer2_blockref_t): 3895 nradix = HAMMER2_IND_RADIX_MIN - HAMMER2_BLOCKREF_RADIX; 3896 break; 3897 case HAMMER2_IND_BYTES_NOM / sizeof(hammer2_blockref_t): 3898 nradix = HAMMER2_IND_RADIX_NOM - HAMMER2_BLOCKREF_RADIX; 3899 break; 3900 case HAMMER2_IND_BYTES_MAX / sizeof(hammer2_blockref_t): 3901 nradix = HAMMER2_IND_RADIX_MAX - HAMMER2_BLOCKREF_RADIX; 3902 break; 3903 default: 3904 panic("bad ncount %d\n", ncount); 3905 nradix = 0; 3906 break; 3907 } 3908 3909 /* 3910 * The largest radix that can be returned for an indirect block is 3911 * 63 bits. (The largest practical indirect block radix is actually 3912 * 62 bits because the top-level inode or volume root contains four 3913 * entries, but allow 63 to be returned). 3914 */ 3915 if (nradix >= 64) 3916 nradix = 63; 3917 3918 return keybits + nradix; 3919 } 3920 3921 #if 1 3922 3923 /* 3924 * Directory indirect blocks. 3925 * 3926 * Covers both the inode index (directory of inodes), and directory contents 3927 * (filenames hardlinked to inodes). 3928 * 3929 * Because directory keys are hashed we generally try to cut the space in 3930 * half. We accomodate the inode index (which tends to have linearly 3931 * increasing inode numbers) by ensuring that the keyspace is at least large 3932 * enough to fill up the indirect block being created. 3933 */ 3934 static int 3935 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp, 3936 int keybits, hammer2_blockref_t *base, int count, 3937 int ncount) 3938 { 3939 hammer2_blockref_t *bref; 3940 hammer2_chain_t *chain; 3941 hammer2_key_t key_beg; 3942 hammer2_key_t key_end; 3943 hammer2_key_t key_next; 3944 hammer2_key_t key; 3945 int nkeybits; 3946 int locount; 3947 int hicount; 3948 int cache_index; 3949 int maxloops = 300000; 3950 3951 /* 3952 * Shortcut if the parent is the inode. In this situation the 3953 * parent has 4+1 directory entries and we are creating an indirect 3954 * block capable of holding many more. 3955 */ 3956 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) { 3957 return 63; 3958 } 3959 3960 key = *keyp; 3961 locount = 0; 3962 hicount = 0; 3963 3964 /* 3965 * Calculate the range of keys in the array being careful to skip 3966 * slots which are overridden with a deletion. 3967 */ 3968 key_beg = 0; 3969 key_end = HAMMER2_KEY_MAX; 3970 cache_index = 0; 3971 hammer2_spin_ex(&parent->core.spin); 3972 3973 for (;;) { 3974 if (--maxloops == 0) { 3975 panic("indkey_freemap shit %p %p:%d\n", 3976 parent, base, count); 3977 } 3978 chain = hammer2_combined_find(parent, base, count, 3979 &cache_index, &key_next, 3980 key_beg, key_end, 3981 &bref); 3982 3983 /* 3984 * Exhausted search 3985 */ 3986 if (bref == NULL) 3987 break; 3988 3989 /* 3990 * Deleted object 3991 */ 3992 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3993 if (key_next == 0 || key_next > key_end) 3994 break; 3995 key_beg = key_next; 3996 continue; 3997 } 3998 3999 /* 4000 * Use the full live (not deleted) element for the scan 4001 * iteration. HAMMER2 does not allow partial replacements. 4002 * 4003 * XXX should be built into hammer2_combined_find(). 4004 */ 4005 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4006 4007 /* 4008 * Expand our calculated key range (key, keybits) to fit 4009 * the scanned key. nkeybits represents the full range 4010 * that we will later cut in half (two halves @ nkeybits - 1). 4011 */ 4012 nkeybits = keybits; 4013 if (nkeybits < bref->keybits) { 4014 if (bref->keybits > 64) { 4015 kprintf("bad bref chain %p bref %p\n", 4016 chain, bref); 4017 Debugger("fubar"); 4018 } 4019 nkeybits = bref->keybits; 4020 } 4021 while (nkeybits < 64 && 4022 (~(((hammer2_key_t)1 << nkeybits) - 1) & 4023 (key ^ bref->key)) != 0) { 4024 ++nkeybits; 4025 } 4026 4027 /* 4028 * If the new key range is larger we have to determine 4029 * which side of the new key range the existing keys fall 4030 * under by checking the high bit, then collapsing the 4031 * locount into the hicount or vise-versa. 4032 */ 4033 if (keybits != nkeybits) { 4034 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 4035 hicount += locount; 4036 locount = 0; 4037 } else { 4038 locount += hicount; 4039 hicount = 0; 4040 } 4041 keybits = nkeybits; 4042 } 4043 4044 /* 4045 * The newly scanned key will be in the lower half or the 4046 * upper half of the (new) key range. 4047 */ 4048 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 4049 ++hicount; 4050 else 4051 ++locount; 4052 4053 if (key_next == 0) 4054 break; 4055 key_beg = key_next; 4056 } 4057 hammer2_spin_unex(&parent->core.spin); 4058 bref = NULL; /* now invalid (safety) */ 4059 4060 /* 4061 * Adjust keybits to represent half of the full range calculated 4062 * above (radix 63 max) for our new indirect block. 4063 */ 4064 --keybits; 4065 4066 /* 4067 * Expand keybits to hold at least ncount elements. ncount will be 4068 * a power of 2. This is to try to completely fill leaf nodes (at 4069 * least for keys which are not hashes). 4070 * 4071 * We aren't counting 'in' or 'out', we are counting 'high side' 4072 * and 'low side' based on the bit at (1LL << keybits). We want 4073 * everything to be inside in these cases so shift it all to 4074 * the low or high side depending on the new high bit. 4075 */ 4076 while (((hammer2_key_t)1 << keybits) < ncount) { 4077 ++keybits; 4078 if (key & ((hammer2_key_t)1 << keybits)) { 4079 hicount += locount; 4080 locount = 0; 4081 } else { 4082 locount += hicount; 4083 hicount = 0; 4084 } 4085 } 4086 4087 if (hicount > locount) 4088 key |= (hammer2_key_t)1 << keybits; 4089 else 4090 key &= ~(hammer2_key_t)1 << keybits; 4091 4092 *keyp = key; 4093 4094 return (keybits); 4095 } 4096 4097 #else 4098 4099 /* 4100 * Directory indirect blocks. 4101 * 4102 * Covers both the inode index (directory of inodes), and directory contents 4103 * (filenames hardlinked to inodes). 4104 * 4105 * Because directory keys are hashed we generally try to cut the space in 4106 * half. We accomodate the inode index (which tends to have linearly 4107 * increasing inode numbers) by ensuring that the keyspace is at least large 4108 * enough to fill up the indirect block being created. 4109 */ 4110 static int 4111 hammer2_chain_indkey_dir(hammer2_chain_t *parent, hammer2_key_t *keyp, 4112 int keybits, hammer2_blockref_t *base, int count, 4113 int ncount) 4114 { 4115 hammer2_blockref_t *bref; 4116 hammer2_chain_t *chain; 4117 hammer2_key_t key_beg; 4118 hammer2_key_t key_end; 4119 hammer2_key_t key_next; 4120 hammer2_key_t key; 4121 int nkeybits; 4122 int locount; 4123 int hicount; 4124 int cache_index; 4125 int maxloops = 300000; 4126 4127 /* 4128 * Shortcut if the parent is the inode. In this situation the 4129 * parent has 4+1 directory entries and we are creating an indirect 4130 * block capable of holding many more. 4131 */ 4132 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) { 4133 return 63; 4134 } 4135 4136 key = *keyp; 4137 locount = 0; 4138 hicount = 0; 4139 4140 /* 4141 * Calculate the range of keys in the array being careful to skip 4142 * slots which are overridden with a deletion. 4143 */ 4144 key_beg = 0; 4145 key_end = HAMMER2_KEY_MAX; 4146 cache_index = 0; 4147 hammer2_spin_ex(&parent->core.spin); 4148 4149 for (;;) { 4150 if (--maxloops == 0) { 4151 panic("indkey_freemap shit %p %p:%d\n", 4152 parent, base, count); 4153 } 4154 chain = hammer2_combined_find(parent, base, count, 4155 &cache_index, &key_next, 4156 key_beg, key_end, 4157 &bref); 4158 4159 /* 4160 * Exhausted search 4161 */ 4162 if (bref == NULL) 4163 break; 4164 4165 /* 4166 * Deleted object 4167 */ 4168 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 4169 if (key_next == 0 || key_next > key_end) 4170 break; 4171 key_beg = key_next; 4172 continue; 4173 } 4174 4175 /* 4176 * Use the full live (not deleted) element for the scan 4177 * iteration. HAMMER2 does not allow partial replacements. 4178 * 4179 * XXX should be built into hammer2_combined_find(). 4180 */ 4181 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 4182 4183 /* 4184 * Expand our calculated key range (key, keybits) to fit 4185 * the scanned key. nkeybits represents the full range 4186 * that we will later cut in half (two halves @ nkeybits - 1). 4187 */ 4188 nkeybits = keybits; 4189 if (nkeybits < bref->keybits) { 4190 if (bref->keybits > 64) { 4191 kprintf("bad bref chain %p bref %p\n", 4192 chain, bref); 4193 Debugger("fubar"); 4194 } 4195 nkeybits = bref->keybits; 4196 } 4197 while (nkeybits < 64 && 4198 (~(((hammer2_key_t)1 << nkeybits) - 1) & 4199 (key ^ bref->key)) != 0) { 4200 ++nkeybits; 4201 } 4202 4203 /* 4204 * If the new key range is larger we have to determine 4205 * which side of the new key range the existing keys fall 4206 * under by checking the high bit, then collapsing the 4207 * locount into the hicount or vise-versa. 4208 */ 4209 if (keybits != nkeybits) { 4210 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 4211 hicount += locount; 4212 locount = 0; 4213 } else { 4214 locount += hicount; 4215 hicount = 0; 4216 } 4217 keybits = nkeybits; 4218 } 4219 4220 /* 4221 * The newly scanned key will be in the lower half or the 4222 * upper half of the (new) key range. 4223 */ 4224 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 4225 ++hicount; 4226 else 4227 ++locount; 4228 4229 if (key_next == 0) 4230 break; 4231 key_beg = key_next; 4232 } 4233 hammer2_spin_unex(&parent->core.spin); 4234 bref = NULL; /* now invalid (safety) */ 4235 4236 /* 4237 * Adjust keybits to represent half of the full range calculated 4238 * above (radix 63 max) for our new indirect block. 4239 */ 4240 --keybits; 4241 4242 /* 4243 * Expand keybits to hold at least ncount elements. ncount will be 4244 * a power of 2. This is to try to completely fill leaf nodes (at 4245 * least for keys which are not hashes). 4246 * 4247 * We aren't counting 'in' or 'out', we are counting 'high side' 4248 * and 'low side' based on the bit at (1LL << keybits). We want 4249 * everything to be inside in these cases so shift it all to 4250 * the low or high side depending on the new high bit. 4251 */ 4252 while (((hammer2_key_t)1 << keybits) < ncount) { 4253 ++keybits; 4254 if (key & ((hammer2_key_t)1 << keybits)) { 4255 hicount += locount; 4256 locount = 0; 4257 } else { 4258 locount += hicount; 4259 hicount = 0; 4260 } 4261 } 4262 4263 if (hicount > locount) 4264 key |= (hammer2_key_t)1 << keybits; 4265 else 4266 key &= ~(hammer2_key_t)1 << keybits; 4267 4268 *keyp = key; 4269 4270 return (keybits); 4271 } 4272 4273 #endif 4274 4275 /* 4276 * Sets CHAIN_DELETED and remove the chain's blockref from the parent if 4277 * it exists. 4278 * 4279 * Both parent and chain must be locked exclusively. 4280 * 4281 * This function will modify the parent if the blockref requires removal 4282 * from the parent's block table. 4283 * 4284 * This function is NOT recursive. Any entity already pushed into the 4285 * chain (such as an inode) may still need visibility into its contents, 4286 * as well as the ability to read and modify the contents. For example, 4287 * for an unlinked file which is still open. 4288 * 4289 * Also note that the flusher is responsible for cleaning up empty 4290 * indirect blocks. 4291 */ 4292 void 4293 hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain, 4294 hammer2_tid_t mtid, int flags) 4295 { 4296 KKASSERT(hammer2_mtx_owned(&chain->lock)); 4297 4298 /* 4299 * Nothing to do if already marked. 4300 * 4301 * We need the spinlock on the core whos RBTREE contains chain 4302 * to protect against races. 4303 */ 4304 if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) { 4305 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 && 4306 chain->parent == parent); 4307 _hammer2_chain_delete_helper(parent, chain, mtid, flags); 4308 } 4309 4310 /* 4311 * Permanent deletions mark the chain as destroyed. 4312 */ 4313 if (flags & HAMMER2_DELETE_PERMANENT) { 4314 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 4315 } else { 4316 /* XXX might not be needed */ 4317 hammer2_chain_setflush(chain); 4318 } 4319 } 4320 4321 /* 4322 * Returns the index of the nearest element in the blockref array >= elm. 4323 * Returns (count) if no element could be found. 4324 * 4325 * Sets *key_nextp to the next key for loop purposes but does not modify 4326 * it if the next key would be higher than the current value of *key_nextp. 4327 * Note that *key_nexp can overflow to 0, which should be tested by the 4328 * caller. 4329 * 4330 * (*cache_indexp) is a heuristic and can be any value without effecting 4331 * the result. 4332 * 4333 * WARNING! Must be called with parent's spinlock held. Spinlock remains 4334 * held through the operation. 4335 */ 4336 static int 4337 hammer2_base_find(hammer2_chain_t *parent, 4338 hammer2_blockref_t *base, int count, 4339 int *cache_indexp, hammer2_key_t *key_nextp, 4340 hammer2_key_t key_beg, hammer2_key_t key_end) 4341 { 4342 hammer2_blockref_t *scan; 4343 hammer2_key_t scan_end; 4344 int i; 4345 int limit; 4346 4347 /* 4348 * Require the live chain's already have their core's counted 4349 * so we can optimize operations. 4350 */ 4351 KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS); 4352 4353 /* 4354 * Degenerate case 4355 */ 4356 if (count == 0 || base == NULL) 4357 return(count); 4358 4359 /* 4360 * Sequential optimization using *cache_indexp. This is the most 4361 * likely scenario. 4362 * 4363 * We can avoid trailing empty entries on live chains, otherwise 4364 * we might have to check the whole block array. 4365 */ 4366 i = *cache_indexp; 4367 cpu_ccfence(); 4368 limit = parent->core.live_zero; 4369 if (i >= limit) 4370 i = limit - 1; 4371 if (i < 0) 4372 i = 0; 4373 KKASSERT(i < count); 4374 4375 /* 4376 * Search backwards 4377 */ 4378 scan = &base[i]; 4379 while (i > 0 && (scan->type == 0 || scan->key > key_beg)) { 4380 --scan; 4381 --i; 4382 } 4383 *cache_indexp = i; 4384 4385 /* 4386 * Search forwards, stop when we find a scan element which 4387 * encloses the key or until we know that there are no further 4388 * elements. 4389 */ 4390 while (i < count) { 4391 if (scan->type != 0) { 4392 scan_end = scan->key + 4393 ((hammer2_key_t)1 << scan->keybits) - 1; 4394 if (scan->key > key_beg || scan_end >= key_beg) 4395 break; 4396 } 4397 if (i >= limit) 4398 return (count); 4399 ++scan; 4400 ++i; 4401 } 4402 if (i != count) { 4403 *cache_indexp = i; 4404 if (i >= limit) { 4405 i = count; 4406 } else { 4407 scan_end = scan->key + 4408 ((hammer2_key_t)1 << scan->keybits); 4409 if (scan_end && (*key_nextp > scan_end || 4410 *key_nextp == 0)) { 4411 *key_nextp = scan_end; 4412 } 4413 } 4414 } 4415 return (i); 4416 } 4417 4418 /* 4419 * Do a combined search and return the next match either from the blockref 4420 * array or from the in-memory chain. Sets *bresp to the returned bref in 4421 * both cases, or sets it to NULL if the search exhausted. Only returns 4422 * a non-NULL chain if the search matched from the in-memory chain. 4423 * 4424 * When no in-memory chain has been found and a non-NULL bref is returned 4425 * in *bresp. 4426 * 4427 * 4428 * The returned chain is not locked or referenced. Use the returned bref 4429 * to determine if the search exhausted or not. Iterate if the base find 4430 * is chosen but matches a deleted chain. 4431 * 4432 * WARNING! Must be called with parent's spinlock held. Spinlock remains 4433 * held through the operation. 4434 */ 4435 static hammer2_chain_t * 4436 hammer2_combined_find(hammer2_chain_t *parent, 4437 hammer2_blockref_t *base, int count, 4438 int *cache_indexp, hammer2_key_t *key_nextp, 4439 hammer2_key_t key_beg, hammer2_key_t key_end, 4440 hammer2_blockref_t **bresp) 4441 { 4442 hammer2_blockref_t *bref; 4443 hammer2_chain_t *chain; 4444 int i; 4445 4446 /* 4447 * Lookup in block array and in rbtree. 4448 */ 4449 *key_nextp = key_end + 1; 4450 i = hammer2_base_find(parent, base, count, cache_indexp, 4451 key_nextp, key_beg, key_end); 4452 chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end); 4453 4454 /* 4455 * Neither matched 4456 */ 4457 if (i == count && chain == NULL) { 4458 *bresp = NULL; 4459 return(NULL); 4460 } 4461 4462 /* 4463 * Only chain matched. 4464 */ 4465 if (i == count) { 4466 bref = &chain->bref; 4467 goto found; 4468 } 4469 4470 /* 4471 * Only blockref matched. 4472 */ 4473 if (chain == NULL) { 4474 bref = &base[i]; 4475 goto found; 4476 } 4477 4478 /* 4479 * Both in-memory and blockref matched, select the nearer element. 4480 * 4481 * If both are flush with the left-hand side or both are the 4482 * same distance away, select the chain. In this situation the 4483 * chain must have been loaded from the matching blockmap. 4484 */ 4485 if ((chain->bref.key <= key_beg && base[i].key <= key_beg) || 4486 chain->bref.key == base[i].key) { 4487 KKASSERT(chain->bref.key == base[i].key); 4488 bref = &chain->bref; 4489 goto found; 4490 } 4491 4492 /* 4493 * Select the nearer key 4494 */ 4495 if (chain->bref.key < base[i].key) { 4496 bref = &chain->bref; 4497 } else { 4498 bref = &base[i]; 4499 chain = NULL; 4500 } 4501 4502 /* 4503 * If the bref is out of bounds we've exhausted our search. 4504 */ 4505 found: 4506 if (bref->key > key_end) { 4507 *bresp = NULL; 4508 chain = NULL; 4509 } else { 4510 *bresp = bref; 4511 } 4512 return(chain); 4513 } 4514 4515 /* 4516 * Locate the specified block array element and delete it. The element 4517 * must exist. 4518 * 4519 * The spin lock on the related chain must be held. 4520 * 4521 * NOTE: live_count was adjusted when the chain was deleted, so it does not 4522 * need to be adjusted when we commit the media change. 4523 */ 4524 void 4525 hammer2_base_delete(hammer2_chain_t *parent, 4526 hammer2_blockref_t *base, int count, 4527 int *cache_indexp, hammer2_chain_t *chain) 4528 { 4529 hammer2_blockref_t *elm = &chain->bref; 4530 hammer2_key_t key_next; 4531 int i; 4532 4533 /* 4534 * Delete element. Expect the element to exist. 4535 * 4536 * XXX see caller, flush code not yet sophisticated enough to prevent 4537 * re-flushed in some cases. 4538 */ 4539 key_next = 0; /* max range */ 4540 i = hammer2_base_find(parent, base, count, cache_indexp, 4541 &key_next, elm->key, elm->key); 4542 if (i == count || base[i].type == 0 || 4543 base[i].key != elm->key || 4544 ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 && 4545 base[i].keybits != elm->keybits)) { 4546 hammer2_spin_unex(&parent->core.spin); 4547 panic("delete base %p element not found at %d/%d elm %p\n", 4548 base, i, count, elm); 4549 return; 4550 } 4551 4552 /* 4553 * Update stats and zero the entry 4554 */ 4555 parent->bref.data_count -= base[i].data_count; 4556 parent->bref.data_count -= (hammer2_off_t)1 << 4557 (int)(base[i].data_off & HAMMER2_OFF_MASK_RADIX); 4558 parent->bref.inode_count -= base[i].inode_count; 4559 if (base[i].type == HAMMER2_BREF_TYPE_INODE) 4560 parent->bref.inode_count -= 1; 4561 4562 bzero(&base[i], sizeof(*base)); 4563 4564 /* 4565 * We can only optimize parent->core.live_zero for live chains. 4566 */ 4567 if (parent->core.live_zero == i + 1) { 4568 while (--i >= 0 && base[i].type == 0) 4569 ; 4570 parent->core.live_zero = i + 1; 4571 } 4572 4573 /* 4574 * Clear appropriate blockmap flags in chain. 4575 */ 4576 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED | 4577 HAMMER2_CHAIN_BMAPUPD); 4578 } 4579 4580 /* 4581 * Insert the specified element. The block array must not already have the 4582 * element and must have space available for the insertion. 4583 * 4584 * The spin lock on the related chain must be held. 4585 * 4586 * NOTE: live_count was adjusted when the chain was deleted, so it does not 4587 * need to be adjusted when we commit the media change. 4588 */ 4589 void 4590 hammer2_base_insert(hammer2_chain_t *parent, 4591 hammer2_blockref_t *base, int count, 4592 int *cache_indexp, hammer2_chain_t *chain) 4593 { 4594 hammer2_blockref_t *elm = &chain->bref; 4595 hammer2_key_t key_next; 4596 hammer2_key_t xkey; 4597 int i; 4598 int j; 4599 int k; 4600 int l; 4601 int u = 1; 4602 4603 /* 4604 * Insert new element. Expect the element to not already exist 4605 * unless we are replacing it. 4606 * 4607 * XXX see caller, flush code not yet sophisticated enough to prevent 4608 * re-flushed in some cases. 4609 */ 4610 key_next = 0; /* max range */ 4611 i = hammer2_base_find(parent, base, count, cache_indexp, 4612 &key_next, elm->key, elm->key); 4613 4614 /* 4615 * Shortcut fill optimization, typical ordered insertion(s) may not 4616 * require a search. 4617 */ 4618 KKASSERT(i >= 0 && i <= count); 4619 4620 /* 4621 * Set appropriate blockmap flags in chain. 4622 */ 4623 atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED); 4624 4625 /* 4626 * Update stats and zero the entry 4627 */ 4628 parent->bref.data_count += elm->data_count; 4629 parent->bref.data_count += (hammer2_off_t)1 << 4630 (int)(elm->data_off & HAMMER2_OFF_MASK_RADIX); 4631 parent->bref.inode_count += elm->inode_count; 4632 if (elm->type == HAMMER2_BREF_TYPE_INODE) 4633 parent->bref.inode_count += 1; 4634 4635 4636 /* 4637 * We can only optimize parent->core.live_zero for live chains. 4638 */ 4639 if (i == count && parent->core.live_zero < count) { 4640 i = parent->core.live_zero++; 4641 base[i] = *elm; 4642 return; 4643 } 4644 4645 xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1; 4646 if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) { 4647 hammer2_spin_unex(&parent->core.spin); 4648 panic("insert base %p overlapping elements at %d elm %p\n", 4649 base, i, elm); 4650 } 4651 4652 /* 4653 * Try to find an empty slot before or after. 4654 */ 4655 j = i; 4656 k = i; 4657 while (j > 0 || k < count) { 4658 --j; 4659 if (j >= 0 && base[j].type == 0) { 4660 if (j == i - 1) { 4661 base[j] = *elm; 4662 } else { 4663 bcopy(&base[j+1], &base[j], 4664 (i - j - 1) * sizeof(*base)); 4665 base[i - 1] = *elm; 4666 } 4667 goto validate; 4668 } 4669 ++k; 4670 if (k < count && base[k].type == 0) { 4671 bcopy(&base[i], &base[i+1], 4672 (k - i) * sizeof(hammer2_blockref_t)); 4673 base[i] = *elm; 4674 4675 /* 4676 * We can only update parent->core.live_zero for live 4677 * chains. 4678 */ 4679 if (parent->core.live_zero <= k) 4680 parent->core.live_zero = k + 1; 4681 u = 2; 4682 goto validate; 4683 } 4684 } 4685 panic("hammer2_base_insert: no room!"); 4686 4687 /* 4688 * Debugging 4689 */ 4690 validate: 4691 key_next = 0; 4692 for (l = 0; l < count; ++l) { 4693 if (base[l].type) { 4694 key_next = base[l].key + 4695 ((hammer2_key_t)1 << base[l].keybits) - 1; 4696 break; 4697 } 4698 } 4699 while (++l < count) { 4700 if (base[l].type) { 4701 if (base[l].key <= key_next) 4702 panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l); 4703 key_next = base[l].key + 4704 ((hammer2_key_t)1 << base[l].keybits) - 1; 4705 4706 } 4707 } 4708 4709 } 4710 4711 #if 0 4712 4713 /* 4714 * Sort the blockref array for the chain. Used by the flush code to 4715 * sort the blockref[] array. 4716 * 4717 * The chain must be exclusively locked AND spin-locked. 4718 */ 4719 typedef hammer2_blockref_t *hammer2_blockref_p; 4720 4721 static 4722 int 4723 hammer2_base_sort_callback(const void *v1, const void *v2) 4724 { 4725 hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1; 4726 hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2; 4727 4728 /* 4729 * Make sure empty elements are placed at the end of the array 4730 */ 4731 if (bref1->type == 0) { 4732 if (bref2->type == 0) 4733 return(0); 4734 return(1); 4735 } else if (bref2->type == 0) { 4736 return(-1); 4737 } 4738 4739 /* 4740 * Sort by key 4741 */ 4742 if (bref1->key < bref2->key) 4743 return(-1); 4744 if (bref1->key > bref2->key) 4745 return(1); 4746 return(0); 4747 } 4748 4749 void 4750 hammer2_base_sort(hammer2_chain_t *chain) 4751 { 4752 hammer2_blockref_t *base; 4753 int count; 4754 4755 switch(chain->bref.type) { 4756 case HAMMER2_BREF_TYPE_INODE: 4757 /* 4758 * Special shortcut for embedded data returns the inode 4759 * itself. Callers must detect this condition and access 4760 * the embedded data (the strategy code does this for us). 4761 * 4762 * This is only applicable to regular files and softlinks. 4763 */ 4764 if (chain->data->ipdata.meta.op_flags & 4765 HAMMER2_OPFLAG_DIRECTDATA) { 4766 return; 4767 } 4768 base = &chain->data->ipdata.u.blockset.blockref[0]; 4769 count = HAMMER2_SET_COUNT; 4770 break; 4771 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 4772 case HAMMER2_BREF_TYPE_INDIRECT: 4773 /* 4774 * Optimize indirect blocks in the INITIAL state to avoid 4775 * I/O. 4776 */ 4777 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0); 4778 base = &chain->data->npdata[0]; 4779 count = chain->bytes / sizeof(hammer2_blockref_t); 4780 break; 4781 case HAMMER2_BREF_TYPE_VOLUME: 4782 base = &chain->data->voldata.sroot_blockset.blockref[0]; 4783 count = HAMMER2_SET_COUNT; 4784 break; 4785 case HAMMER2_BREF_TYPE_FREEMAP: 4786 base = &chain->data->blkset.blockref[0]; 4787 count = HAMMER2_SET_COUNT; 4788 break; 4789 default: 4790 kprintf("hammer2_chain_lookup: unrecognized " 4791 "blockref(A) type: %d", 4792 chain->bref.type); 4793 while (1) 4794 tsleep(&base, 0, "dead", 0); 4795 panic("hammer2_chain_lookup: unrecognized " 4796 "blockref(A) type: %d", 4797 chain->bref.type); 4798 base = NULL; /* safety */ 4799 count = 0; /* safety */ 4800 } 4801 kqsort(base, count, sizeof(*base), hammer2_base_sort_callback); 4802 } 4803 4804 #endif 4805 4806 /* 4807 * Chain memory management 4808 */ 4809 void 4810 hammer2_chain_wait(hammer2_chain_t *chain) 4811 { 4812 tsleep(chain, 0, "chnflw", 1); 4813 } 4814 4815 const hammer2_media_data_t * 4816 hammer2_chain_rdata(hammer2_chain_t *chain) 4817 { 4818 KKASSERT(chain->data != NULL); 4819 return (chain->data); 4820 } 4821 4822 hammer2_media_data_t * 4823 hammer2_chain_wdata(hammer2_chain_t *chain) 4824 { 4825 KKASSERT(chain->data != NULL); 4826 return (chain->data); 4827 } 4828 4829 /* 4830 * Set the check data for a chain. This can be a heavy-weight operation 4831 * and typically only runs on-flush. For file data check data is calculated 4832 * when the logical buffers are flushed. 4833 */ 4834 void 4835 hammer2_chain_setcheck(hammer2_chain_t *chain, void *bdata) 4836 { 4837 chain->bref.flags &= ~HAMMER2_BREF_FLAG_ZERO; 4838 4839 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 4840 case HAMMER2_CHECK_NONE: 4841 break; 4842 case HAMMER2_CHECK_DISABLED: 4843 break; 4844 case HAMMER2_CHECK_ISCSI32: 4845 chain->bref.check.iscsi32.value = 4846 hammer2_icrc32(bdata, chain->bytes); 4847 break; 4848 case HAMMER2_CHECK_XXHASH64: 4849 chain->bref.check.xxhash64.value = 4850 XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 4851 break; 4852 case HAMMER2_CHECK_SHA192: 4853 { 4854 SHA256_CTX hash_ctx; 4855 union { 4856 uint8_t digest[SHA256_DIGEST_LENGTH]; 4857 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 4858 } u; 4859 4860 SHA256_Init(&hash_ctx); 4861 SHA256_Update(&hash_ctx, bdata, chain->bytes); 4862 SHA256_Final(u.digest, &hash_ctx); 4863 u.digest64[2] ^= u.digest64[3]; 4864 bcopy(u.digest, 4865 chain->bref.check.sha192.data, 4866 sizeof(chain->bref.check.sha192.data)); 4867 } 4868 break; 4869 case HAMMER2_CHECK_FREEMAP: 4870 chain->bref.check.freemap.icrc32 = 4871 hammer2_icrc32(bdata, chain->bytes); 4872 break; 4873 default: 4874 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 4875 chain->bref.methods); 4876 break; 4877 } 4878 } 4879 4880 int 4881 hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) 4882 { 4883 uint32_t check32; 4884 uint64_t check64; 4885 int r; 4886 4887 if (chain->bref.flags & HAMMER2_BREF_FLAG_ZERO) 4888 return 1; 4889 4890 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 4891 case HAMMER2_CHECK_NONE: 4892 r = 1; 4893 break; 4894 case HAMMER2_CHECK_DISABLED: 4895 r = 1; 4896 break; 4897 case HAMMER2_CHECK_ISCSI32: 4898 check32 = hammer2_icrc32(bdata, chain->bytes); 4899 r = (chain->bref.check.iscsi32.value == check32); 4900 if (r == 0) { 4901 kprintf("chain %016jx.%02x meth=%02x CHECK FAIL " 4902 "(flags=%08x, bref/data %08x/%08x)\n", 4903 chain->bref.data_off, 4904 chain->bref.type, 4905 chain->bref.methods, 4906 chain->flags, 4907 chain->bref.check.iscsi32.value, 4908 check32); 4909 } 4910 hammer2_check_icrc32 += chain->bytes; 4911 break; 4912 case HAMMER2_CHECK_XXHASH64: 4913 check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); 4914 r = (chain->bref.check.xxhash64.value == check64); 4915 if (r == 0) { 4916 kprintf("chain %016jx.%02x key=%016jx " 4917 "meth=%02x CHECK FAIL " 4918 "(flags=%08x, bref/data %016jx/%016jx)\n", 4919 chain->bref.data_off, 4920 chain->bref.type, 4921 chain->bref.key, 4922 chain->bref.methods, 4923 chain->flags, 4924 chain->bref.check.xxhash64.value, 4925 check64); 4926 } 4927 hammer2_check_xxhash64 += chain->bytes; 4928 break; 4929 case HAMMER2_CHECK_SHA192: 4930 { 4931 SHA256_CTX hash_ctx; 4932 union { 4933 uint8_t digest[SHA256_DIGEST_LENGTH]; 4934 uint64_t digest64[SHA256_DIGEST_LENGTH/8]; 4935 } u; 4936 4937 SHA256_Init(&hash_ctx); 4938 SHA256_Update(&hash_ctx, bdata, chain->bytes); 4939 SHA256_Final(u.digest, &hash_ctx); 4940 u.digest64[2] ^= u.digest64[3]; 4941 if (bcmp(u.digest, 4942 chain->bref.check.sha192.data, 4943 sizeof(chain->bref.check.sha192.data)) == 0) { 4944 r = 1; 4945 } else { 4946 r = 0; 4947 kprintf("chain %016jx.%02x meth=%02x " 4948 "CHECK FAIL\n", 4949 chain->bref.data_off, 4950 chain->bref.type, 4951 chain->bref.methods); 4952 } 4953 } 4954 break; 4955 case HAMMER2_CHECK_FREEMAP: 4956 r = (chain->bref.check.freemap.icrc32 == 4957 hammer2_icrc32(bdata, chain->bytes)); 4958 if (r == 0) { 4959 kprintf("chain %016jx.%02x meth=%02x " 4960 "CHECK FAIL\n", 4961 chain->bref.data_off, 4962 chain->bref.type, 4963 chain->bref.methods); 4964 kprintf("freemap.icrc %08x icrc32 %08x (%d)\n", 4965 chain->bref.check.freemap.icrc32, 4966 hammer2_icrc32(bdata, chain->bytes), 4967 chain->bytes); 4968 if (chain->dio) 4969 kprintf("dio %p buf %016jx,%d bdata %p/%p\n", 4970 chain->dio, chain->dio->bp->b_loffset, 4971 chain->dio->bp->b_bufsize, bdata, 4972 chain->dio->bp->b_data); 4973 } 4974 4975 break; 4976 default: 4977 kprintf("hammer2_chain_setcheck: unknown check type %02x\n", 4978 chain->bref.methods); 4979 r = 1; 4980 break; 4981 } 4982 return r; 4983 } 4984 4985 /* 4986 * The caller presents a shared-locked (parent, chain) where the chain 4987 * is of type HAMMER2_OBJTYPE_HARDLINK. 4988 * 4989 * The flags passed in are LOOKUP flags, not RESOLVE flags. Only 4990 * HAMMER2_LOOKUP_SHARED is supported. 4991 * 4992 * We locate the actual inode chain & parent. 4993 * 4994 * If we are unable to locate the hardlink, EIO is returned and 4995 * (*chainp) is unlocked and dropped. 4996 */ 4997 int 4998 hammer2_chain_hardlink_find(hammer2_chain_t **parentp, hammer2_chain_t **chainp, 4999 int clindex, int flags) 5000 { 5001 hammer2_chain_t *parent; 5002 hammer2_chain_t *rchain; 5003 hammer2_pfs_t *pmp; 5004 hammer2_key_t key_dummy; 5005 hammer2_key_t lhc; 5006 int cache_index = -1; 5007 int resolve_flags; 5008 5009 resolve_flags = (flags & HAMMER2_LOOKUP_SHARED) ? 5010 HAMMER2_RESOLVE_SHARED : 0; 5011 5012 /* 5013 * Obtain the key for the hardlink from *chainp. 5014 */ 5015 rchain = *chainp; 5016 pmp = rchain->pmp; 5017 lhc = rchain->data->ipdata.meta.inum; 5018 hammer2_chain_unlock(rchain); 5019 hammer2_chain_drop(rchain); 5020 rchain = NULL; 5021 5022 /* 5023 * Hardlinks hang off of iroot 5024 */ 5025 if (*parentp) { 5026 hammer2_chain_unlock(*parentp); 5027 hammer2_chain_drop(*parentp); 5028 } 5029 parent = hammer2_inode_chain(pmp->iroot, clindex, resolve_flags); 5030 rchain = hammer2_chain_lookup(&parent, &key_dummy, 5031 lhc, lhc, 5032 &cache_index, flags); 5033 *parentp = parent; 5034 *chainp = rchain; 5035 return (rchain ? 0 : EINVAL); 5036 } 5037 5038 /* 5039 * Used by the bulkscan code to snapshot the synchronized storage for 5040 * a volume, allowing it to be scanned concurrently against normal 5041 * operation. 5042 */ 5043 hammer2_chain_t * 5044 hammer2_chain_bulksnap(hammer2_chain_t *chain) 5045 { 5046 hammer2_chain_t *copy; 5047 5048 copy = hammer2_chain_alloc(chain->hmp, chain->pmp, &chain->bref); 5049 switch(chain->bref.type) { 5050 case HAMMER2_BREF_TYPE_VOLUME: 5051 copy->data = kmalloc(sizeof(copy->data->voldata), 5052 chain->hmp->mchain, 5053 M_WAITOK | M_ZERO); 5054 hammer2_spin_ex(&chain->core.spin); 5055 copy->data->voldata = chain->data->voldata; 5056 hammer2_spin_unex(&chain->core.spin); 5057 break; 5058 case HAMMER2_BREF_TYPE_FREEMAP: 5059 copy->data = kmalloc(sizeof(hammer2_blockset_t), 5060 chain->hmp->mchain, 5061 M_WAITOK | M_ZERO); 5062 hammer2_spin_ex(&chain->core.spin); 5063 copy->data->blkset = chain->data->blkset; 5064 hammer2_spin_unex(&chain->core.spin); 5065 break; 5066 default: 5067 break; 5068 } 5069 return copy; 5070 } 5071 5072 void 5073 hammer2_chain_bulkdrop(hammer2_chain_t *copy) 5074 { 5075 switch(copy->bref.type) { 5076 case HAMMER2_BREF_TYPE_VOLUME: 5077 case HAMMER2_BREF_TYPE_FREEMAP: 5078 KKASSERT(copy->data); 5079 kfree(copy->data, copy->hmp->mchain); 5080 copy->data = NULL; 5081 atomic_add_long(&hammer2_chain_allocs, -1); 5082 break; 5083 default: 5084 break; 5085 } 5086 hammer2_chain_drop(copy); 5087 } 5088 5089 /* 5090 * Create a snapshot of the specified {parent, ochain} with the specified 5091 * label. The originating hammer2_inode must be exclusively locked for 5092 * safety. 5093 * 5094 * The ioctl code has already synced the filesystem. 5095 */ 5096 int 5097 hammer2_chain_snapshot(hammer2_chain_t *chain, hammer2_ioc_pfs_t *pmp, 5098 hammer2_tid_t mtid) 5099 { 5100 hammer2_dev_t *hmp; 5101 const hammer2_inode_data_t *ripdata; 5102 hammer2_inode_data_t *wipdata; 5103 hammer2_chain_t *nchain; 5104 hammer2_inode_t *nip; 5105 size_t name_len; 5106 hammer2_key_t lhc; 5107 struct vattr vat; 5108 #if 0 5109 uuid_t opfs_clid; 5110 #endif 5111 int error; 5112 5113 kprintf("snapshot %s\n", pmp->name); 5114 5115 name_len = strlen(pmp->name); 5116 lhc = hammer2_dirhash(pmp->name, name_len); 5117 5118 /* 5119 * Get the clid 5120 */ 5121 ripdata = &chain->data->ipdata; 5122 #if 0 5123 opfs_clid = ripdata->meta.pfs_clid; 5124 #endif 5125 hmp = chain->hmp; 5126 5127 /* 5128 * Create the snapshot directory under the super-root 5129 * 5130 * Set PFS type, generate a unique filesystem id, and generate 5131 * a cluster id. Use the same clid when snapshotting a PFS root, 5132 * which theoretically allows the snapshot to be used as part of 5133 * the same cluster (perhaps as a cache). 5134 * 5135 * Copy the (flushed) blockref array. Theoretically we could use 5136 * chain_duplicate() but it becomes difficult to disentangle 5137 * the shared core so for now just brute-force it. 5138 */ 5139 VATTR_NULL(&vat); 5140 vat.va_type = VDIR; 5141 vat.va_mode = 0755; 5142 nip = hammer2_inode_create(hmp->spmp->iroot, hmp->spmp->iroot, 5143 &vat, proc0.p_ucred, 5144 pmp->name, name_len, 0, 5145 1, 0, 0, 5146 HAMMER2_INSERT_PFSROOT, &error); 5147 5148 if (nip) { 5149 hammer2_inode_modify(nip); 5150 nchain = hammer2_inode_chain(nip, 0, HAMMER2_RESOLVE_ALWAYS); 5151 hammer2_chain_modify(nchain, mtid, 0, 0); 5152 wipdata = &nchain->data->ipdata; 5153 5154 nip->meta.pfs_type = HAMMER2_PFSTYPE_MASTER; 5155 nip->meta.pfs_subtype = HAMMER2_PFSSUBTYPE_SNAPSHOT; 5156 nip->meta.op_flags |= HAMMER2_OPFLAG_PFSROOT; 5157 kern_uuidgen(&nip->meta.pfs_fsid, 1); 5158 5159 /* 5160 * Give the snapshot its own private cluster id. As a 5161 * snapshot no further synchronization with the original 5162 * cluster will be done. 5163 */ 5164 #if 0 5165 if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) 5166 nip->meta.pfs_clid = opfs_clid; 5167 else 5168 kern_uuidgen(&nip->meta.pfs_clid, 1); 5169 #endif 5170 kern_uuidgen(&nip->meta.pfs_clid, 1); 5171 nchain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT; 5172 5173 /* XXX hack blockset copy */ 5174 /* XXX doesn't work with real cluster */ 5175 wipdata->meta = nip->meta; 5176 wipdata->u.blockset = ripdata->u.blockset; 5177 hammer2_flush(nchain, 1); 5178 hammer2_chain_unlock(nchain); 5179 hammer2_chain_drop(nchain); 5180 hammer2_inode_unlock(nip); 5181 } 5182 return (error); 5183 } 5184