1 /* 2 * Copyright (c) 2011-2013 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain and hammer2_chain_core structures. 
38 * 39 * Chains represent the filesystem media topology in-memory. Any given 40 * chain can represent an inode, indirect block, data, or other types 41 * of blocks. 42 * 43 * This module provides APIs for direct and indirect block searches, 44 * iterations, recursions, creation, deletion, replication, and snapshot 45 * views (used by the flush and snapshot code). 46 * 47 * Generally speaking any modification made to a chain must propagate all 48 * the way back to the volume header, issuing copy-on-write updates to the 49 * blockref tables all the way up. Any chain except the volume header itself 50 * can be flushed to disk at any time, in any order. None of it matters 51 * until we get to the point where we want to synchronize the volume header 52 * (see the flush code). 53 * 54 * The chain structure supports snapshot views in time, which are primarily 55 * used until the related data and meta-data is flushed to allow the 56 * filesystem to make snapshots without requiring it to first flush, 57 * and to allow the filesystem flush and modify the filesystem concurrently 58 * with minimal or no stalls. 59 */ 60 #include <sys/cdefs.h> 61 #include <sys/param.h> 62 #include <sys/systm.h> 63 #include <sys/types.h> 64 #include <sys/lock.h> 65 #include <sys/kern_syscall.h> 66 #include <sys/uuid.h> 67 68 #include "hammer2.h" 69 70 static int hammer2_indirect_optimize; /* XXX SYSCTL */ 71 72 static hammer2_chain_t *hammer2_chain_create_indirect( 73 hammer2_trans_t *trans, hammer2_chain_t *parent, 74 hammer2_key_t key, int keybits, int for_type, int *errorp); 75 static void adjreadcounter(hammer2_blockref_t *bref, size_t bytes); 76 77 /* 78 * We use a red-black tree to guarantee safe lookups under shared locks. 79 * 80 * Chains can be overloaded onto the same index, creating a different 81 * view of a blockref table based on a transaction id. The RBTREE 82 * deconflicts the view by sub-sorting on delete_tid. 
 *
 * NOTE: Any 'current' chain which is not yet deleted will have a
 *	 delete_tid of HAMMER2_MAX_TID (0xFFF....FFFLLU).
 */
RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);

/*
 * RB-tree comparison function for chains.  Primary sort is on the
 * blockref table index, secondary sort is on delete_tid so multiple
 * (deleted) views of the same index can coexist in the tree.
 */
int
hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
{
	if (chain1->index < chain2->index)
		return(-1);
	if (chain1->index > chain2->index)
		return(1);
	if (chain1->delete_tid < chain2->delete_tid)
		return(-1);
	if (chain1->delete_tid > chain2->delete_tid)
		return(1);
	return(0);
}

/*
 * Return non-zero if the chain's blockref type is eligible for clustered
 * read-ahead I/O.  Always returns 0 when hammer2_cluster_enable is off.
 */
static __inline
int
hammer2_isclusterable(hammer2_chain_t *chain)
{
	if (hammer2_cluster_enable) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
		    chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			return(1);
		}
	}
	return(0);
}

/*
 * Recursively set the SUBMODIFIED flag up to the root starting at chain's
 * parent.  SUBMODIFIED is not set in chain itself.
 *
 * This function only operates on current-time transactions and is not
 * used during flushes.  Instead, the flush code manages the flag itself.
 */
void
hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	hammer2_chain_core_t *above;

	/* flush transactions maintain SUBMODIFIED themselves */
	if (trans->flags & HAMMER2_TRANS_ISFLUSH)
		return;
	while ((above = chain->above) != NULL) {
		spin_lock(&above->cst.spin);
		/*
		 * Skip refactored parents to locate the live parent chain
		 * representing the current transaction view.
		 */
		chain = above->first_parent;
		while (hammer2_chain_refactor_test(chain, 1))
			chain = chain->next_parent;
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
		spin_unlock(&above->cst.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->core is NOT allocated and the media data and bp pointers are left
 * NULL.
 * The caller must call chain_core_alloc() to allocate or associate
 * a core with the chain.
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_trans_t *trans,
		    hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	/* allocation size is encoded as a radix in the low bits of data_off */
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);

	/*
	 * Construct the appropriate system structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/* these are embedded in the mount, never allocated here */
		chain = NULL;
		panic("hammer2_chain_alloc volume type illegal for op");
	default:
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	chain->hmp = hmp;
	chain->bref = *bref;
	chain->index = -1;		/* not yet assigned */
	chain->bytes = bytes;
	chain->refs = 1;
	chain->flags = HAMMER2_CHAIN_ALLOCATED;
	chain->delete_tid = HAMMER2_MAX_TID;	/* i.e. not deleted */
	if (trans)
		chain->modify_tid = trans->sync_tid;

	return (chain);
}

/*
 * Associate an existing core with the chain or allocate a new core.
 *
 * The core is not locked.  No additional refs on the chain are made.
 */
void
hammer2_chain_core_alloc(hammer2_chain_t *chain, hammer2_chain_core_t *core)
{
	hammer2_chain_t **scanp;

	KKASSERT(chain->core == NULL);
	KKASSERT(chain->next_parent == NULL);

	if (core == NULL) {
		/*
		 * Allocate a fresh core exclusively owned by this chain.
		 */
		core = kmalloc(sizeof(*core), chain->hmp->mchain,
			       M_WAITOK | M_ZERO);
		RB_INIT(&core->rbtree);
		core->sharecnt = 1;
		chain->core = core;
		ccms_cst_init(&core->cst, chain);
		core->first_parent = chain;
	} else {
		/*
		 * Share an existing core.  Append the chain to the core's
		 * parent list under the core spinlock.
		 */
		atomic_add_int(&core->sharecnt, 1);
		chain->core = core;
		spin_lock(&core->cst.spin);
		if (core->first_parent == NULL) {
			core->first_parent = chain;
		} else {
			scanp = &core->first_parent;
			while (*scanp)
				scanp = &(*scanp)->next_parent;
			*scanp = chain;
			hammer2_chain_ref(chain);	/* next_parent link */
		}
		spin_unlock(&core->cst.spin);
	}
}

/*
 * Add a reference to a chain element, preventing its destruction.
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->refs, 1);
}

/*
 * Drop the caller's reference to the chain.  When the ref count drops to
 * zero this function will disassociate the chain from its parent and
 * deallocate it, then recursively drop the parent using the implied ref
 * from the chain's chain->parent.
 *
 * WARNING! Just because we are able to deallocate a chain doesn't mean
 *	    that chain->core->rbtree is empty.  There can still be a sharecnt
 *	    on chain->core and RBTREE entries that refer to different parents.
 */
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);

void
hammer2_chain_drop(hammer2_chain_t *chain)
{
	u_int refs;
	u_int need = 0;

#if 1
	/*
	 * MOVED and MODIFIED each hold an implied ref, so the caller's
	 * ref cannot be the last one while either flag is set.
	 */
	if (chain->flags & HAMMER2_CHAIN_MOVED)
		++need;
	if (chain->flags & HAMMER2_CHAIN_MODIFIED)
		++need;
	KKASSERT(chain->refs > need);
#endif

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();		/* force refs to be re-read */
		KKASSERT(refs > 0);

		if (refs == 1) {
			/* may return a parent/sibling for recursive drop */
			chain = hammer2_chain_lastdrop(chain);
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}
	}
}

/*
 * Safe handling of the 1->0 transition on chain.  Returns a chain for
 * recursive drop or NULL, possibly returning the same chain if the atomic
 * op fails.
 *
 * The cst spinlock is allowed to nest child-to-parent
 * (not parent-to-child).
 */
static
hammer2_chain_t *
hammer2_chain_lastdrop(hammer2_chain_t *chain)
{
	hammer2_mount_t *hmp;
	hammer2_chain_core_t *above;
	hammer2_chain_core_t *core;
	hammer2_chain_t *rdrop1;
	hammer2_chain_t *rdrop2;

	/*
	 * Spinlock the core and check to see if it is empty.  If it is
	 * not empty we leave chain intact with refs == 0.
	 */
	if ((core = chain->core) != NULL) {
		spin_lock(&core->cst.spin);
		if (RB_ROOT(&core->rbtree)) {
			if (atomic_cmpset_int(&chain->refs, 1, 0)) {
				/* 1->0 transition successful */
				spin_unlock(&core->cst.spin);
				return(NULL);
			} else {
				/* 1->0 transition failed, retry */
				spin_unlock(&core->cst.spin);
				return(chain);
			}
		}
	}

	hmp = chain->hmp;
	rdrop1 = NULL;
	rdrop2 = NULL;

	/*
	 * Spinlock the parent and try to drop the last ref.  On success
	 * remove chain from its parent.
	 */
	if ((above = chain->above) != NULL) {
		spin_lock(&above->cst.spin);
		if (!atomic_cmpset_int(&chain->refs, 1, 0)) {
			/* 1->0 transition failed */
			spin_unlock(&above->cst.spin);
			if (core)
				spin_unlock(&core->cst.spin);
			return(chain);
			/* stop */
		}

		/*
		 * 1->0 transition successful
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		RB_REMOVE(hammer2_chain_tree, &above->rbtree, chain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		chain->above = NULL;

		/*
		 * Calculate a chain to return for a recursive drop.
		 *
		 * XXX this needs help, we have a potential deep-recursion
		 * problem which we try to address but sometimes we wind up
		 * with two elements that have to be dropped.
		 *
		 * If the chain has an associated core with refs at 0
		 * the chain must be the first in the core's linked list
		 * by definition, and we will recursively drop the ref
		 * implied by the chain->next_parent field.
		 *
		 * Otherwise if the rbtree containing chain is empty we try
		 * to recursively drop our parent (only the first one could
		 * possibly have refs == 0 since the rest are linked via
		 * next_parent).
		 *
		 * Otherwise we try to recursively drop a sibling.
		 */
		if (chain->next_parent) {
			KKASSERT(core != NULL);
			rdrop1 = chain->next_parent;
		}
		if (RB_EMPTY(&above->rbtree)) {
			rdrop2 = above->first_parent;
			/* only take the parent if we can revive its 0 ref */
			if (rdrop2 == NULL || rdrop2->refs ||
			    atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0) {
				rdrop2 = NULL;
			}
		} else {
			rdrop2 = RB_ROOT(&above->rbtree);
			if (atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0)
				rdrop2 = NULL;
		}
		spin_unlock(&above->cst.spin);
		above = NULL;	/* safety */
	} else {
		if (chain->next_parent) {
			KKASSERT(core != NULL);
			rdrop1 = chain->next_parent;
		}
	}

	/*
	 * We still have the core spinlock (if core is non-NULL).  The
	 * above spinlock is gone.
	 */
	if (core) {
		KKASSERT(core->first_parent == chain);
		if (chain->next_parent) {
			/* parent should already be set */
			KKASSERT(rdrop1 == chain->next_parent);
		}
		core->first_parent = chain->next_parent;
		chain->next_parent = NULL;
		chain->core = NULL;

		if (atomic_fetchadd_int(&core->sharecnt, -1) == 1) {
			/*
			 * On the 1->0 transition of core we can destroy
			 * it.
			 */
			spin_unlock(&core->cst.spin);
			KKASSERT(core->cst.count == 0);
			KKASSERT(core->cst.upgrade == 0);
			kfree(core, hmp->mchain);
		} else {
			spin_unlock(&core->cst.spin);
		}
		core = NULL;	/* safety */
	}

	/*
	 * All spin locks are gone, finish freeing stuff.
	 */
	KKASSERT((chain->flags & (HAMMER2_CHAIN_MOVED |
				  HAMMER2_CHAIN_MODIFIED)) == 0);

	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/* data is embedded in the mount, never freed here */
		chain->data = NULL;
		break;
	case HAMMER2_BREF_TYPE_INODE:
		if (chain->data) {
			kfree(chain->data, hmp->mchain);
			chain->data = NULL;
		}
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		if (chain->data) {
			kfree(chain->data, hmp->mchain);
			chain->data = NULL;
		}
		break;
	default:
		KKASSERT(chain->data == NULL);
		break;
	}

	KKASSERT(chain->bp == NULL);
	chain->hmp = NULL;

	if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
		chain->flags &= ~HAMMER2_CHAIN_ALLOCATED;
		kfree(chain, hmp->mchain);
	}
	if (rdrop1 && rdrop2) {
		/* can only return one; drop the other inline */
		hammer2_chain_drop(rdrop1);
		return(rdrop2);
	} else if (rdrop1)
		return(rdrop1);
	else
		return(rdrop2);
}

/*
 * Ref and lock a chain element, acquiring its data with I/O if necessary,
 * and specify how you would like the data to be resolved.
 *
 * Returns 0 on success or an error code if the data could not be acquired.
 * The chain element is locked on return regardless of whether an error
 * occurred or not.
 *
 * The lock is allowed to recurse, multiple locking ops will aggregate
 * the requested resolve types.  Once data is assigned it will not be
 * removed until the last unlock.
 *
 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
 *			   the INITIAL-create state (indirect blocks only).
 *
 *			   Do not resolve data elements for DATA chains.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
 *
 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
 *			   it will be locked exclusive.
 *
 * NOTE: Embedded elements (volume header, inodes) are always resolved
 *	 regardless.
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * WARNING! If data must be fetched a shared lock will temporarily be
 *	    upgraded to exclusive.  However, a deadlock can occur if
 *	    the caller owns more than one shared lock.
 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	hammer2_mount_t *hmp;
	hammer2_chain_core_t *core;
	hammer2_blockref_t *bref;
	hammer2_off_t pbase;
	hammer2_off_t pmask;
	hammer2_off_t peof;
	ccms_state_t ostate;
	size_t boff;
	size_t psize;
	int error;
	char *bdata;

	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 */
	if ((how & HAMMER2_RESOLVE_NOREF) == 0)
		hammer2_chain_ref(chain);
	atomic_add_int(&chain->lockcnt, 1);

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Get the appropriate lock.
	 */
	core = chain->core;
	if (how & HAMMER2_RESOLVE_SHARED)
		ccms_thread_lock(&core->cst, CCMS_STATE_SHARED);
	else
		ccms_thread_lock(&core->cst, CCMS_STATE_EXCLUSIVE);

	/*
	 * If we already have a valid data pointer no further action is
	 * necessary.
	 */
	if (chain->data)
		return (0);

	/*
	 * Do we have to resolve the data?
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return(0);
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return(0);
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return(0);
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return(0);
#endif
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return(0);
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
		break;
	}

	/*
	 * Upgrade to an exclusive lock so we can safely manipulate the
	 * buffer cache.  If another thread got to it before us we
	 * can just return.
	 */
	ostate = ccms_thread_lock_upgrade(&core->cst);
	if (chain->data) {
		ccms_thread_lock_downgrade(&core->cst, ostate);
		return (0);
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	psize = hammer2_devblksize(chain->bytes);
	pmask = (hammer2_off_t)psize - 1;
	pbase = bref->data_off & ~pmask;		/* physical block base */
	boff = bref->data_off & (HAMMER2_OFF_MASK & pmask); /* intra-block off */
	KKASSERT(pbase != 0);
	peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == psize) {
		chain->bp = getblk(hmp->devvp, pbase, psize, 0, 0);
		error = 0;
	} else if (hammer2_isclusterable(chain)) {
		error = cluster_read(hmp->devvp, peof, pbase, psize,
				     psize, HAMMER2_PBUFSIZE*4,
				     &chain->bp);
		adjreadcounter(&chain->bref, chain->bytes);
	} else {
		error = bread(hmp->devvp, pbase, psize, &chain->bp);
		adjreadcounter(&chain->bref, chain->bytes);
	}

	if (error) {
		kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
			(intmax_t)pbase, error);
		bqrelse(chain->bp);
		chain->bp = NULL;
		ccms_thread_lock_downgrade(&core->cst, ostate);
		return (error);
	}

	/*
	 * Zero the data area if the chain is in the INITIAL-create state.
	 * Mark the buffer for bdwrite().  This clears the INITIAL state
	 * but does not mark the chain modified.
	 */
	bdata = (char *)chain->bp->b_data + boff;
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		bzero(bdata, chain->bytes);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_lock: called on unresolved volume header");
#if 0
		/* NOT YET */
		KKASSERT(pbase == 0);
		KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
		bcopy(bdata, &hmp->voldata, chain->bytes);
		chain->data = (void *)&hmp->voldata;
		bqrelse(chain->bp);
		chain->bp = NULL;
#endif
		break;
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Copy data from bp to embedded buffer, do not retain the
		 * device buffer.
		 */
		KKASSERT(chain->bytes == sizeof(chain->data->ipdata));
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
		chain->data = kmalloc(sizeof(chain->data->ipdata),
				      hmp->mchain, M_WAITOK | M_ZERO);
		bcopy(bdata, &chain->data->ipdata, chain->bytes);
		bqrelse(chain->bp);
		chain->bp = NULL;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/* same embedded-copy treatment as inodes */
		KKASSERT(chain->bytes == sizeof(chain->data->bmdata));
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
		chain->data = kmalloc(sizeof(chain->data->bmdata),
				      hmp->mchain, M_WAITOK | M_ZERO);
		bcopy(bdata, &chain->data->bmdata, chain->bytes);
		bqrelse(chain->bp);
		chain->bp = NULL;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave bp intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Make sure the bp is not specifically owned by this thread before
	 * restoring to a possibly shared lock, so another hammer2 thread
	 * can release it.
	 */
	if (chain->bp)
		BUF_KERNPROC(chain->bp);
	ccms_thread_lock_downgrade(&core->cst, ostate);
	return (0);
}

/*
 * Asynchronously read the device buffer (dbp) and execute the specified
 * callback.
The caller should pass-in a locked chain (shared lock is ok). 712 * The function is responsible for unlocking the chain and for disposing 713 * of dbp. 714 * 715 * NOTE! A NULL dbp (but non-NULL data) will be passed to the function 716 * if the dbp is integrated into the chain, because we do not want 717 * the caller to dispose of dbp in that situation. 718 */ 719 static void hammer2_chain_load_async_callback(struct bio *bio); 720 721 void 722 hammer2_chain_load_async(hammer2_chain_t *chain, 723 void (*func)(hammer2_chain_t *, struct buf *, char *, void *), 724 void *arg) 725 { 726 hammer2_cbinfo_t *cbinfo; 727 hammer2_mount_t *hmp; 728 hammer2_blockref_t *bref; 729 hammer2_off_t pbase; 730 hammer2_off_t pmask; 731 hammer2_off_t peof; 732 struct buf *dbp; 733 size_t boff; 734 size_t psize; 735 char *bdata; 736 737 if (chain->data) { 738 func(chain, NULL, (char *)chain->data, arg); 739 return; 740 } 741 742 /* 743 * We must resolve to a device buffer, either by issuing I/O or 744 * by creating a zero-fill element. We do not mark the buffer 745 * dirty when creating a zero-fill element (the hammer2_chain_modify() 746 * API must still be used to do that). 747 * 748 * The device buffer is variable-sized in powers of 2 down 749 * to HAMMER2_MIN_ALLOC (typically 1K). A 64K physical storage 750 * chunk always contains buffers of the same size. (XXX) 751 * 752 * The minimum physical IO size may be larger than the variable 753 * block size. 754 */ 755 bref = &chain->bref; 756 757 psize = hammer2_devblksize(chain->bytes); 758 pmask = (hammer2_off_t)psize - 1; 759 pbase = bref->data_off & ~pmask; 760 boff = bref->data_off & (HAMMER2_OFF_MASK & pmask); 761 KKASSERT(pbase != 0); 762 peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64; 763 764 hmp = chain->hmp; 765 766 /* 767 * The getblk() optimization can only be used on newly created 768 * elements if the physical block size matches the request. 
769 */ 770 if ((chain->flags & HAMMER2_CHAIN_INITIAL) && 771 chain->bytes == psize) { 772 dbp = getblk(hmp->devvp, pbase, psize, 0, 0); 773 /*atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);*/ 774 bdata = (char *)dbp->b_data + boff; 775 bzero(bdata, chain->bytes); 776 /*atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);*/ 777 func(chain, dbp, bdata, arg); 778 return; 779 } 780 781 adjreadcounter(&chain->bref, chain->bytes); 782 cbinfo = kmalloc(sizeof(*cbinfo), hmp->mchain, M_INTWAIT | M_ZERO); 783 cbinfo->chain = chain; 784 cbinfo->func = func; 785 cbinfo->arg = arg; 786 cbinfo->boff = boff; 787 788 cluster_readcb(hmp->devvp, peof, pbase, psize, 789 HAMMER2_PBUFSIZE*4, HAMMER2_PBUFSIZE*4, 790 hammer2_chain_load_async_callback, cbinfo); 791 } 792 793 static void 794 hammer2_chain_load_async_callback(struct bio *bio) 795 { 796 hammer2_cbinfo_t *cbinfo; 797 hammer2_mount_t *hmp; 798 struct buf *dbp; 799 char *data; 800 801 /* 802 * Nobody is waiting for bio/dbp to complete, we are 803 * responsible for handling the biowait() equivalent 804 * on dbp which means clearing BIO_DONE and BIO_SYNC 805 * and calling bpdone() if it hasn't already been called 806 * to restore any covered holes in the buffer's backing 807 * store. 808 */ 809 dbp = bio->bio_buf; 810 if ((bio->bio_flags & BIO_DONE) == 0) 811 bpdone(dbp, 0); 812 bio->bio_flags &= ~(BIO_DONE | BIO_SYNC); 813 814 /* 815 * Extract the auxillary info and issue the callback. 816 * Finish up with the dbp after it returns. 817 */ 818 cbinfo = bio->bio_caller_info1.ptr; 819 /*ccms_thread_lock_setown(cbinfo->chain->core);*/ 820 data = dbp->b_data + cbinfo->boff; 821 hmp = cbinfo->chain->hmp; 822 823 cbinfo = bio->bio_caller_info1.ptr; 824 cbinfo->func(cbinfo->chain, dbp, data, cbinfo->arg); 825 /* cbinfo->chain is stale now */ 826 bqrelse(dbp); 827 kfree(cbinfo, hmp->mchain); 828 } 829 830 /* 831 * Unlock and deref a chain element. 
 *
 * On the last lock release any non-embedded data (chain->bp) will be
 * retired.
 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_chain_core_t *core = chain->core;
	ccms_state_t ostate;
	long *counterp;
	u_int lockcnt;

	/*
	 * The core->cst lock can be shared across several chains so we
	 * need to track the per-chain lockcnt separately.
	 *
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();		/* force lockcnt re-read each pass */
		if (lockcnt > 1) {
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				ccms_thread_unlock(&core->cst);
				hammer2_chain_drop(chain);
				return;
			}
		} else {
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		}
		/* retry */
	}

	/*
	 * On the 1->0 transition we upgrade the core lock (if necessary)
	 * to exclusive for terminal processing.  If after upgrading we find
	 * that lockcnt is non-zero, another thread is racing us and will
	 * handle the unload for us later on, so just cleanup and return
	 * leaving the data/bp intact
	 *
	 * Otherwise if lockcnt is still 0 it is possible for it to become
	 * non-zero and race, but since we hold the core->cst lock
	 * exclusively all that will happen is that the chain will be
	 * reloaded after we unload it.
	 */
	ostate = ccms_thread_lock_upgrade(&core->cst);
	if (chain->lockcnt) {
		ccms_thread_unlock_upgraded(&core->cst, ostate);
		hammer2_chain_drop(chain);
		return;
	}

	/*
	 * Shortcut the case if the data is embedded or not resolved.
	 *
	 * Do NOT NULL out chain->data (e.g. inode data), it might be
	 * dirty.
	 *
	 * The DIRTYBP flag is non-applicable in this situation and can
	 * be cleared to keep the flags state clean.
	 */
	if (chain->bp == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		ccms_thread_unlock_upgraded(&core->cst, ostate);
		hammer2_chain_drop(chain);
		return;
	}

	/*
	 * Statistics (tally written bytes by buffer disposition and
	 * blockref type).
	 */
	if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
		;
	} else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_ioa_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_ioa_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_ioa_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_ioa_fmap_write;
			break;
		default:
			counterp = &hammer2_ioa_volu_write;
			break;
		}
		*counterp += chain->bytes;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_iod_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_iod_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_iod_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_iod_fmap_write;
			break;
		default:
			counterp = &hammer2_iod_volu_write;
			break;
		}
		*counterp += chain->bytes;
	}

	/*
	 * Clean out the bp.
	 *
	 * If a device buffer was used for data be sure to destroy the
	 * buffer when we are done to avoid aliases (XXX what about the
	 * underlying VM pages?).
	 *
	 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
	 *	 is possible.
	 */
#if 0
	/*
	 * XXX our primary cache is now the block device, not
	 * the logical file. don't release the buffer.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
		chain->bp->b_flags |= B_RELBUF;
#endif

	/*
	 * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
	 * or not.  The flag will get re-set when chain_modify() is called,
	 * even if MODIFIED is already set, allowing the OS to retire the
	 * buffer independent of a hammer2 flush.
	 */
	chain->data = NULL;
	if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			cluster_awrite(chain->bp);
		} else {
			chain->bp->b_flags |= B_CLUSTEROK;
			bdwrite(chain->bp);
		}
	} else {
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			brelse(chain->bp);
		} else {
			/* bp might still be dirty */
			bqrelse(chain->bp);
		}
	}
	chain->bp = NULL;
	ccms_thread_unlock_upgraded(&core->cst, ostate);
	hammer2_chain_drop(chain);
}

/*
 * Resize the chain's physical storage allocation in-place.  This may
 * replace the passed-in chain with a new chain.
 *
 * Chains can be resized smaller without reallocating the storage.
 * Resizing larger will reallocate the storage.
 *
 * Must be passed an exclusively locked parent and chain, returns a new
 * exclusively locked chain at the same index and unlocks the old chain.
 * Flushes the buffer if necessary.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode
 * data buffer.  That is, the passed-in bp is a logical buffer, whereas
 * any chain-oriented bp would be a device buffer.
 *
 * XXX flags currently ignored, uses chain->bp to detect data/no-data.
 * XXX return error if cannot resize.
 */
void
hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
		     struct buf *bp,
		     hammer2_chain_t *parent, hammer2_chain_t **chainp,
		     int nradix, int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	hammer2_off_t pbase;
	size_t obytes;
	size_t nbytes;
	size_t bbytes;
	int boff;

	chain = *chainp;
	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volume root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);

	/*
	 * Nothing to do if the element is already the proper size.
	 * nradix is log2 of the requested new size in bytes.
	 */
	obytes = chain->bytes;
	nbytes = 1U << nradix;
	if (obytes == nbytes)
		return;

	/*
	 * Delete the old chain and duplicate it at the same (parent, index),
	 * returning a new chain.  This allows the old chain to still be
	 * used by the flush code.  Duplication occurs in-place.
	 *
	 * The parent does not have to be locked for the delete/duplicate
	 * call, but is in this particular code path.
	 *
	 * NOTE: If we are not crossing a synchronization point the
	 *	 duplication code will simply reuse the existing chain
	 *	 structure.
	 */
	hammer2_chain_delete_duplicate(trans, &chain, 0);

	/*
	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
	 * modified flags share the same ref.  (duplicated chains do not
	 * start out MODIFIED unless possibly if the duplication code
	 * decided to reuse the existing chain as-is).
	 *
	 * If the chain is already marked MODIFIED then we can safely
	 * return the previous allocation to the pool without having to
	 * worry about snapshots.  XXX check flush synchronization.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_chain_ref(chain);
	}

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 */
	hammer2_freemap_alloc(trans, chain->hmp, &chain->bref, nbytes);
	chain->bytes = nbytes;
	/*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */

	/*
	 * The device buffer may be larger than the allocation size.
	 * pbase/boff locate the allocation within the (possibly larger)
	 * device buffer.
	 */
	if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
		bbytes = HAMMER2_MINIOSIZE;
	pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
	boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

	/*
	 * For now just support it on DATA chains (and not on indirect
	 * blocks).
	 */
	KKASSERT(chain->bp == NULL);

	/*
	 * Make sure the chain is marked MOVED and SUBMOD is set in the
	 * parent(s) so the adjustments are picked up by flush.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	hammer2_chain_setsubmod(trans, chain);
	*chainp = chain;
}

/*
 * Set a chain modified, making it read-write and duplicating it if necessary.
 * This function will assign a new physical block to the chain if necessary.
 *
 * Duplication of already-modified chains is possible when the modification
 * crosses a flush synchronization boundary.
 *
 * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
 *		     level or the COW operation will not work.
 *
 * Data blocks - The chain is usually locked RESOLVE_NEVER so as not to
 *		 run the data through the device buffers.
 *
 * This function may return a different chain than was passed, in which case
 * the old chain will be unlocked and the new chain will be locked.
 *
 * ip->chain may be adjusted by hammer2_chain_modify_ip().
 */

/*
 * Inode-flavored wrapper: flags the inode MODIFIED, runs the normal
 * chain modification, repoints ip->chain if the chain was replaced, and
 * returns the (now writable) embedded inode data.
 */
hammer2_inode_data_t *
hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
			hammer2_chain_t **chainp, int flags)
{
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	hammer2_chain_modify(trans, chainp, flags);
	if (ip->chain != *chainp)
		hammer2_inode_repoint(ip, NULL, *chainp);
	return(&ip->chain->data->ipdata);
}

void
hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
		     int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	hammer2_off_t pbase;
	hammer2_off_t pmask;
	hammer2_off_t peof;
	hammer2_tid_t flush_tid;
	struct buf *nbp;
	int error;
	int wasinitial;
	size_t psize;
	size_t boff;
	void *bdata;

	/*
	 * Data must be resolved if already assigned unless explicitly
	 * flagged otherwise.
	 */
	chain = *chainp;
	hmp = chain->hmp;
	if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
		hammer2_chain_unlock(chain);
	}

	/*
	 * data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * If the chain is already marked MODIFIED we can usually just
	 * return.  However, if a modified chain is modified again in
	 * a synchronization-point-crossing manner we have to issue a
	 * delete/duplicate on the chain to avoid flush interference.
	 */
	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * Which flush_tid do we need to check?  If the chain is
		 * related to the freemap we have to use the freemap flush
		 * tid (free_flush_tid), otherwise we use the normal
		 * filesystem flush tid (topo_flush_tid).  The two flush
		 * domains are almost completely independent of each other.
		 *
		 * NOTE(review): the freemap branch currently still reads
		 * topo_flush_tid and bypasses the cross-sync test entirely
		 * via skipxx (both flagged XXX below) -- intentional
		 * placeholder, do not "fix" without understanding the
		 * freemap flush domain.
		 */
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
		    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
			flush_tid = hmp->topo_flush_tid;	/* XXX */
			goto skipxx;	/* XXX */
		} else {
			flush_tid = hmp->topo_flush_tid;
		}

		/*
		 * Main tests
		 */
		if (chain->modify_tid <= flush_tid &&
		    trans->sync_tid > flush_tid) {
			/*
			 * Modifications cross synchronization point,
			 * requires delete-duplicate.
			 */
			KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0);
			hammer2_chain_delete_duplicate(trans, chainp, 0);
			chain = *chainp;
			/* fall through using duplicate */
		}
skipxx: /* XXX */
		/*
		 * Quick return path, set DIRTYBP to ensure that
		 * the later retirement of bp will write it out.
		 *
		 * quick return path also needs the modify_tid
		 * logic.
		 */
		if (chain->bp)
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
			chain->bref.modify_tid = trans->sync_tid;
		chain->modify_tid = trans->sync_tid;
		return;
	}

	/*
	 * modify_tid is only updated for primary modifications, not for
	 * propagated brefs.  mirror_tid will be updated regardless during
	 * the flush, no need to set it here.
	 */
	if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
		chain->bref.modify_tid = trans->sync_tid;

	/*
	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
	 * modified flags share the same ref.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_chain_ref(chain);
	}

	/*
	 * Adjust chain->modify_tid so the flusher knows when the
	 * modification occurred.
	 */
	chain->modify_tid = trans->sync_tid;

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.
	 *
	 * We normally always allocate new storage here.  If storage exists
	 * and MODIFY_NOREALLOC is passed in, we do not allocate new storage.
	 */
	if (chain != &hmp->vchain &&
	    chain != &hmp->fchain &&
	    ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
	     (flags & HAMMER2_MODIFY_NOREALLOC) == 0)
	) {
		hammer2_freemap_alloc(trans, chain->hmp,
				      &chain->bref, chain->bytes);
		/* XXX failed allocation */
	}

	/*
	 * Do not COW if OPTDATA is set.  INITIAL flag remains unchanged.
	 * (OPTDATA does not prevent [re]allocation of storage, only the
	 * related copy-on-write op).
	 */
	if (flags & HAMMER2_MODIFY_OPTDATA)
		goto skip2;

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

#if 0
	/*
	 * We currently should never instantiate a device buffer for a
	 * file data chain.  (We definitely can for a freemap chain).
	 *
	 * XXX we can now do this
	 */
	KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
#endif

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->bp == NULL);
		break;
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		psize = hammer2_devblksize(chain->bytes);
		pmask = (hammer2_off_t)psize - 1;
		pbase = chain->bref.data_off & ~pmask;
		boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);
		KKASSERT(pbase != 0);
		peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;

		/*
		 * The getblk() optimization can only be used if the
		 * chain element size matches the physical block size.
		 */
		if (chain->bp && chain->bp->b_loffset == pbase) {
			/* already have the right device buffer */
			nbp = chain->bp;
			error = 0;
		} else if (chain->bytes == psize) {
			/* exact-fit buffer, no read required */
			nbp = getblk(hmp->devvp, pbase, psize, 0, 0);
			error = 0;
		} else if (hammer2_isclusterable(chain)) {
			error = cluster_read(hmp->devvp, peof, pbase, psize,
					     psize, HAMMER2_PBUFSIZE*4,
					     &nbp);
			adjreadcounter(&chain->bref, chain->bytes);
		} else {
			error = bread(hmp->devvp, pbase, psize, &nbp);
			adjreadcounter(&chain->bref, chain->bytes);
		}
		KKASSERT(error == 0);
		bdata = (char *)nbp->b_data + boff;

		/*
		 * Copy or zero-fill on write depending on whether
		 * chain->data exists or not.  Retire the existing bp
		 * based on the DIRTYBP flag.  Set the DIRTYBP flag to
		 * indicate that retirement of nbp should use bdwrite().
		 */
		if (chain->data) {
			KKASSERT(chain->bp != NULL);
			if (chain->data != bdata) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial) {
			bzero(bdata, chain->bytes);
		} else {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}
		if (chain->bp != nbp) {
			if (chain->bp) {
				if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
					chain->bp->b_flags |= B_CLUSTEROK;
					bdwrite(chain->bp);
				} else {
					chain->bp->b_flags |= B_RELBUF;
					brelse(chain->bp);
				}
			}
			chain->bp = nbp;
			BUF_KERNPROC(chain->bp);
		}
		chain->data = bdata;
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	hammer2_chain_setsubmod(trans, chain);
}

/*
 * Mark the volume as having been modified.  This short-cut version
 * does not have to lock the volume's chain, which allows the ioctl
 * code to make adjustments to connections without deadlocking.  XXX
 *
 * No ref is made on vchain when flagging it MODIFIED.
 */
void
hammer2_modify_volume(hammer2_mount_t *hmp)
{
	hammer2_voldata_lock(hmp);
	hammer2_voldata_unlock(hmp, 1);
}

/*
 * Locate an in-memory chain.  The parent must be locked.  The in-memory
 * chain is returned with a reference and without a lock, or NULL
 * if not found.
 *
 * This function returns the chain at the specified index with the highest
 * delete_tid.  The caller must check whether the chain is flagged
 * CHAIN_DELETED or not.
However, because chain iterations can be removed 1434 * from memory we must ALSO check that DELETED chains are not flushed. A 1435 * DELETED chain which has been flushed must be ignored (the caller must 1436 * check the parent's blockref array). 1437 * 1438 * NOTE: If no chain is found the caller usually must check the on-media 1439 * array to determine if a blockref exists at the index. 1440 */ 1441 struct hammer2_chain_find_info { 1442 hammer2_chain_t *best; 1443 hammer2_tid_t delete_tid; 1444 int index; 1445 }; 1446 1447 static 1448 int 1449 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data) 1450 { 1451 struct hammer2_chain_find_info *info = data; 1452 1453 if (child->index < info->index) 1454 return(-1); 1455 if (child->index > info->index) 1456 return(1); 1457 return(0); 1458 } 1459 1460 static 1461 int 1462 hammer2_chain_find_callback(hammer2_chain_t *child, void *data) 1463 { 1464 struct hammer2_chain_find_info *info = data; 1465 1466 if (info->delete_tid < child->delete_tid) { 1467 info->delete_tid = child->delete_tid; 1468 info->best = child; 1469 } 1470 return(0); 1471 } 1472 1473 static 1474 hammer2_chain_t * 1475 hammer2_chain_find_locked(hammer2_chain_t *parent, int index) 1476 { 1477 struct hammer2_chain_find_info info; 1478 hammer2_chain_t *child; 1479 1480 info.index = index; 1481 info.delete_tid = 0; 1482 info.best = NULL; 1483 1484 RB_SCAN(hammer2_chain_tree, &parent->core->rbtree, 1485 hammer2_chain_find_cmp, hammer2_chain_find_callback, 1486 &info); 1487 child = info.best; 1488 1489 return (child); 1490 } 1491 1492 hammer2_chain_t * 1493 hammer2_chain_find(hammer2_chain_t *parent, int index) 1494 { 1495 hammer2_chain_t *child; 1496 1497 spin_lock(&parent->core->cst.spin); 1498 child = hammer2_chain_find_locked(parent, index); 1499 if (child) 1500 hammer2_chain_ref(child); 1501 spin_unlock(&parent->core->cst.spin); 1502 1503 return (child); 1504 } 1505 1506 /* 1507 * Return a locked chain structure with all associated data acquired. 
 * (if LOOKUP_NOLOCK is requested the returned chain is only referenced).
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 *
 * The returned child is locked as requested.  If NOLOCK, the returned
 * child is still at least referenced.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, int index, int flags)
{
	hammer2_blockref_t *bref;
	hammer2_mount_t *hmp = parent->hmp;
	hammer2_chain_core_t *above = parent->core;
	hammer2_chain_t *chain;
	hammer2_chain_t dummy;
	int how;

	/*
	 * Figure out how to lock.  MAYBE can be used to optimize
	 * the initial-create state for indirect blocks.
	 */
	if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
		how = HAMMER2_RESOLVE_NEVER;
	else
		how = HAMMER2_RESOLVE_MAYBE;
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how |= HAMMER2_RESOLVE_SHARED;

retry:
	/*
	 * First see if we have a (possibly modified) chain element cached
	 * for this (parent, index).  Acquire the data if necessary.
	 *
	 * If chain->data is non-NULL the chain should already be marked
	 * modified.
	 */
	dummy.flags = 0;
	dummy.index = index;
	dummy.delete_tid = HAMMER2_MAX_TID;
	spin_lock(&above->cst.spin);
	chain = RB_FIND(hammer2_chain_tree, &above->rbtree, &dummy);
	if (chain) {
		hammer2_chain_ref(chain);
		spin_unlock(&above->cst.spin);
		if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
			hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF);
		return(chain);
	}
	spin_unlock(&above->cst.spin);

	/*
	 * The parent chain must not be in the INITIAL state.
	 */
	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
		panic("hammer2_chain_get: Missing bref(1)");
		/* NOT REACHED */
	}

	/*
	 * No RBTREE entry found, lookup the bref and issue I/O (switch on
	 * the parent's bref to determine where and how big the array is).
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
		bref = &parent->data->ipdata.u.blockset.blockref[index];
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		KKASSERT(parent->data != NULL);
		KKASSERT(index >= 0 &&
			 index < parent->bytes / sizeof(hammer2_blockref_t));
		bref = &parent->data->npdata[index];
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
		bref = &hmp->voldata.sroot_blockset.blockref[index];
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
		bref = &hmp->voldata.freemap_blockset.blockref[index];
		break;
	default:
		bref = NULL;
		panic("hammer2_chain_get: unrecognized blockref type: %d",
		      parent->bref.type);
	}
	if (bref->type == 0) {
		panic("hammer2_chain_get: Missing bref(2)");
		/* NOT REACHED */
	}

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 *
	 * The locking operation we do later will issue I/O to read it.
	 */
	chain = hammer2_chain_alloc(hmp, NULL, bref);
	hammer2_chain_core_alloc(chain, NULL);	/* ref'd chain returned */

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.  On collision we drop our new chain
	 * and retry, expecting to find the competing entry cached.
	 */
	KKASSERT(parent->refs > 0);
	spin_lock(&above->cst.spin);
	chain->above = above;
	chain->index = index;
	if (RB_INSERT(hammer2_chain_tree, &above->rbtree, chain)) {
		chain->above = NULL;
		chain->index = -1;
		spin_unlock(&above->cst.spin);
		hammer2_chain_drop(chain);
		goto retry;
	}
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
	spin_unlock(&above->cst.spin);

	/*
	 * Our new chain is referenced but NOT locked.  Lock the chain
	 * below.  The locking operation also resolves its data.
	 *
	 * If NOLOCK is set the release will release the one-and-only lock.
	 */
	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
		hammer2_chain_lock(chain, how);	/* recursive lock */
		hammer2_chain_drop(chain);	/* excess ref */
	}
	return (chain);
}

/*
 * Lookup initialization/completion API
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent)
		hammer2_chain_unlock(parent);
}

/*
 * Replace (*parentp) with a non-refactoring parent of the old parent's
 * core, unlocking the old parent and locking the new one with (how).
 */
static
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t **parentp, int how)
{
	hammer2_chain_t *oparent;
	hammer2_chain_t *nparent;
	hammer2_chain_core_t *above;

	oparent = *parentp;
	above = oparent->above;

	spin_lock(&above->cst.spin);
	nparent = above->first_parent;
	while (hammer2_chain_refactor_test(nparent, 1))
		nparent = nparent->next_parent;
	hammer2_chain_ref(nparent);	/* protect nparent, use in lock */
	spin_unlock(&above->cst.spin);

	hammer2_chain_unlock(oparent);
	hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF);
	*parentp = nparent;

	return (nparent);
}

/*
 * Locate any key between key_beg and key_end inclusive.  (*parentp)
 * typically points to an inode but can also point to a related indirect
 * block and this function will recurse upwards and find the inode again.
 *
 * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
 *	     WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
 *	     WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN
 *	     AND ALL IN-RANGE KEYS WILL EVENTUALLY BE RETURNED (NOT
 *	     NECESSARILY IN ORDER).
 *
 * (*parentp) must be exclusively locked and referenced and can be an inode
 * or an existing indirect block within the inode.
 *
 * On return (*parentp) will be modified to point at the deepest parent chain
 * element encountered during the search, as a helper for an insertion or
 * deletion.  The new (*parentp) will be locked and referenced and the old
 * will be unlocked and dereferenced (no change if they are both the same).
 *
 * The matching chain will be returned exclusively locked.  If NOLOCK is
 * requested the chain will be returned only referenced.
 *
 * NULL is returned if no match was found, but (*parentp) will still
 * potentially be adjusted.
 *
 * This function will also recurse up the chain if the key is not within the
 * current parent's range.  (*parentp) can never be set to NULL.  An iteration
 * can simply allow (*parentp) to float inside the loop.
 *
 * NOTE!  chain->data is not always resolved.  By default it will not be
 *	  resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
 *	  HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
 *	  BREF_TYPE_DATA as the device buffer can alias the logical file
 *	  buffer).
 */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_chain_t *tmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int i;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;

	if (flags & HAMMER2_LOOKUP_ALWAYS)
		how_maybe = how_always;

	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 */
	parent = *parentp;
	hmp = parent->hmp;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			break;
		parent = hammer2_chain_getparent(parentp, how_maybe);
	}

again:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_ref(parent);
			else
				hammer2_chain_lock(parent, how_always);
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Handle MATCHIND on the parent: if the indirect block
		 * exactly covers the requested range return it directly.
		 */
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_lock(chain, how_maybe);
				goto done;
			}
		}
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &hmp->voldata.freemap_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * If the element and key overlap we use the element.  In-memory
	 * chains (hammer2_chain_find) shadow the media blockrefs.
	 *
	 * NOTE!  Deleted elements are effectively invisible.  Deletions
	 *	  proactively clear the parent bref to the deleted child
	 *	  so we do not try to shadow here to avoid parent updates
	 *	  (which would be difficult since multiple deleted elements
	 *	  might represent different flush synchronization points).
	 */
	bref = NULL;
	scan_beg = 0;	/* avoid compiler warning */
	scan_end = 0;	/* avoid compiler warning */

	for (i = 0; i < count; ++i) {
		tmp = hammer2_chain_find(parent, i);
		if (tmp) {
			if (tmp->flags & HAMMER2_CHAIN_DELETED) {
				hammer2_chain_drop(tmp);
				continue;
			}
			bref = &tmp->bref;
			KKASSERT(bref->type != 0);
		} else if (base == NULL || base[i].type == 0) {
			continue;
		} else {
			bref = &base[i];
		}
		scan_beg = bref->key;
		scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
		if (tmp)
			hammer2_chain_drop(tmp);
		if (key_beg <= scan_end && key_end >= scan_beg)
			break;
	}
	if (i == count) {
		if (key_beg == key_end)
			return (NULL);
		return (hammer2_chain_next(parentp, NULL,
					   key_beg, key_end, flags));
	}

	/*
	 * Acquire the new chain element.  If the chain element is an
	 * indirect block we must search recursively.
	 *
	 * It is possible for the tmp chain above to be removed from
	 * the RBTREE but the parent lock ensures it would not have been
	 * destroyed from the media, so the chain_get() code will simply
	 * reload it from the media in that case.
	 */
	chain = hammer2_chain_get(parent, i, flags);
	if (chain == NULL)
		return (NULL);

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_unlock(parent);
		*parentp = parent = chain;
		if (flags & HAMMER2_LOOKUP_NOLOCK) {
			hammer2_chain_lock(chain,
					   how_maybe |
					   HAMMER2_RESOLVE_NOREF);
		} else if ((flags & HAMMER2_LOOKUP_NODATA) &&
			   chain->data == NULL) {
			hammer2_chain_ref(chain);
			hammer2_chain_unlock(chain);
			hammer2_chain_lock(chain,
					   how_maybe |
					   HAMMER2_RESOLVE_NOREF);
		}
		goto again;
	}
done:
	/*
	 * All done, return the chain
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after its index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *tmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int i;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int count;

	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;
	hmp = parent->hmp;

again:
	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/*
		 * Continue iteration within current parent.
If not NULL 1970 * the passed-in chain may or may not be locked, based on 1971 * the LOOKUP_NOLOCK flag (passed in as returned from lookup 1972 * or a prior next). 1973 */ 1974 i = chain->index + 1; 1975 if (flags & HAMMER2_LOOKUP_NOLOCK) 1976 hammer2_chain_drop(chain); 1977 else 1978 hammer2_chain_unlock(chain); 1979 1980 /* 1981 * Any scan where the lookup returned degenerate data embedded 1982 * in the inode has an invalid index and must terminate. 1983 */ 1984 if (chain == parent) 1985 return(NULL); 1986 chain = NULL; 1987 } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 1988 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 1989 /* 1990 * We reached the end of the iteration. 1991 */ 1992 return (NULL); 1993 } else { 1994 /* 1995 * Continue iteration with next parent unless the current 1996 * parent covers the range. 1997 */ 1998 scan_beg = parent->bref.key; 1999 scan_end = scan_beg + 2000 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 2001 if (key_beg >= scan_beg && key_end <= scan_end) 2002 return (NULL); 2003 2004 i = parent->index + 1; 2005 parent = hammer2_chain_getparent(parentp, how_maybe); 2006 } 2007 2008 again2: 2009 /* 2010 * Locate the blockref array. Currently we do a fully associative 2011 * search through the array. 
2012 */ 2013 switch(parent->bref.type) { 2014 case HAMMER2_BREF_TYPE_INODE: 2015 base = &parent->data->ipdata.u.blockset.blockref[0]; 2016 count = HAMMER2_SET_COUNT; 2017 break; 2018 case HAMMER2_BREF_TYPE_INDIRECT: 2019 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2020 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2021 base = NULL; 2022 } else { 2023 KKASSERT(parent->data != NULL); 2024 base = &parent->data->npdata[0]; 2025 } 2026 count = parent->bytes / sizeof(hammer2_blockref_t); 2027 break; 2028 case HAMMER2_BREF_TYPE_VOLUME: 2029 base = &hmp->voldata.sroot_blockset.blockref[0]; 2030 count = HAMMER2_SET_COUNT; 2031 break; 2032 case HAMMER2_BREF_TYPE_FREEMAP: 2033 base = &hmp->voldata.freemap_blockset.blockref[0]; 2034 count = HAMMER2_SET_COUNT; 2035 break; 2036 default: 2037 panic("hammer2_chain_next: unrecognized blockref type: %d", 2038 parent->bref.type); 2039 base = NULL; /* safety */ 2040 count = 0; /* safety */ 2041 break; 2042 } 2043 KKASSERT(i <= count); 2044 2045 /* 2046 * Look for the key. If we are unable to find a match and an exact 2047 * match was requested we return NULL. If a range was requested we 2048 * run hammer2_chain_next() to iterate. 2049 * 2050 * NOTE! Deleted elements are effectively invisible. Deletions 2051 * proactively clear the parent bref to the deleted child 2052 * so we do not try to shadow here to avoid parent updates 2053 * (which would be difficult since multiple deleted elements 2054 * might represent different flush synchronization points). 
2055 */ 2056 bref = NULL; 2057 scan_beg = 0; /* avoid compiler warning */ 2058 scan_end = 0; /* avoid compiler warning */ 2059 2060 while (i < count) { 2061 tmp = hammer2_chain_find(parent, i); 2062 if (tmp) { 2063 if (tmp->flags & HAMMER2_CHAIN_DELETED) { 2064 hammer2_chain_drop(tmp); 2065 ++i; 2066 continue; 2067 } 2068 bref = &tmp->bref; 2069 } else if (base == NULL || base[i].type == 0) { 2070 ++i; 2071 continue; 2072 } else { 2073 bref = &base[i]; 2074 } 2075 scan_beg = bref->key; 2076 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1; 2077 if (tmp) 2078 hammer2_chain_drop(tmp); 2079 if (key_beg <= scan_end && key_end >= scan_beg) 2080 break; 2081 ++i; 2082 } 2083 2084 /* 2085 * If we couldn't find a match recurse up a parent to continue the 2086 * search. 2087 */ 2088 if (i == count) 2089 goto again; 2090 2091 /* 2092 * Acquire the new chain element. If the chain element is an 2093 * indirect block we must search recursively. 2094 */ 2095 chain = hammer2_chain_get(parent, i, flags); 2096 if (chain == NULL) 2097 return (NULL); 2098 2099 /* 2100 * If the chain element is an indirect block it becomes the new 2101 * parent and we loop on it. 2102 * 2103 * The parent always has to be locked with at least RESOLVE_MAYBE 2104 * so we can access its data. It might need a fixup if the caller 2105 * passed incompatible flags. Be careful not to cause a deadlock 2106 * as a data-load requires an exclusive lock. 2107 * 2108 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key 2109 * range is within the requested key range we return the indirect 2110 * block and do NOT loop. This is usually only used to acquire 2111 * freemap nodes. 
2112 */ 2113 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2114 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2115 if ((flags & HAMMER2_LOOKUP_MATCHIND) == 0 || 2116 key_beg > scan_beg || key_end < scan_end) { 2117 hammer2_chain_unlock(parent); 2118 *parentp = parent = chain; 2119 chain = NULL; 2120 if (flags & HAMMER2_LOOKUP_NOLOCK) { 2121 hammer2_chain_lock(parent, 2122 how_maybe | 2123 HAMMER2_RESOLVE_NOREF); 2124 } else if ((flags & HAMMER2_LOOKUP_NODATA) && 2125 parent->data == NULL) { 2126 hammer2_chain_ref(parent); 2127 hammer2_chain_unlock(parent); 2128 hammer2_chain_lock(parent, 2129 how_maybe | 2130 HAMMER2_RESOLVE_NOREF); 2131 } 2132 i = 0; 2133 goto again2; 2134 } 2135 } 2136 2137 /* 2138 * All done, return chain 2139 */ 2140 return (chain); 2141 } 2142 2143 /* 2144 * Create and return a new hammer2 system memory structure of the specified 2145 * key, type and size and insert it under (*parentp). This is a full 2146 * insertion, based on the supplied key/keybits, and may involve creating 2147 * indirect blocks and moving other chains around via delete/duplicate. 2148 * 2149 * (*parentp) must be exclusive locked and may be replaced on return 2150 * depending on how much work the function had to do. 2151 * 2152 * (*chainp) usually starts out NULL and returns the newly created chain, 2153 * but if the caller desires the caller may allocate a disconnected chain 2154 * and pass it in instead. (It is also possible for the caller to use 2155 * chain_duplicate() to create a disconnected chain, manipulate it, then 2156 * pass it into this function to insert it). 2157 * 2158 * This function should NOT be used to insert INDIRECT blocks. It is 2159 * typically used to create/insert inodes and data blocks. 2160 * 2161 * Caller must pass-in an exclusively locked parent the new chain is to 2162 * be inserted under, and optionally pass-in a disconnected, exclusively 2163 * locked chain to insert (else we create a new chain). 
The function will 2164 * adjust (*parentp) as necessary, create or connect the chain, and 2165 * return an exclusively locked chain in *chainp. 2166 */ 2167 int 2168 hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp, 2169 hammer2_chain_t **chainp, 2170 hammer2_key_t key, int keybits, int type, size_t bytes) 2171 { 2172 hammer2_mount_t *hmp; 2173 hammer2_chain_t *chain; 2174 hammer2_chain_t *child; 2175 hammer2_chain_t *parent = *parentp; 2176 hammer2_chain_core_t *above; 2177 hammer2_blockref_t dummy; 2178 hammer2_blockref_t *base; 2179 int allocated = 0; 2180 int error = 0; 2181 int count; 2182 int i; 2183 2184 above = parent->core; 2185 KKASSERT(ccms_thread_lock_owned(&above->cst)); 2186 hmp = parent->hmp; 2187 chain = *chainp; 2188 2189 if (chain == NULL) { 2190 /* 2191 * First allocate media space and construct the dummy bref, 2192 * then allocate the in-memory chain structure. Set the 2193 * INITIAL flag for fresh chains. 2194 */ 2195 bzero(&dummy, sizeof(dummy)); 2196 dummy.type = type; 2197 dummy.key = key; 2198 dummy.keybits = keybits; 2199 dummy.data_off = hammer2_getradix(bytes); 2200 dummy.methods = parent->bref.methods; 2201 chain = hammer2_chain_alloc(hmp, trans, &dummy); 2202 hammer2_chain_core_alloc(chain, NULL); 2203 2204 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 2205 2206 /* 2207 * Lock the chain manually, chain_lock will load the chain 2208 * which we do NOT want to do. (note: chain->refs is set 2209 * to 1 by chain_alloc() for us, but lockcnt is not). 2210 */ 2211 chain->lockcnt = 1; 2212 ccms_thread_lock(&chain->core->cst, CCMS_STATE_EXCLUSIVE); 2213 allocated = 1; 2214 2215 /* 2216 * We do NOT set INITIAL here (yet). INITIAL is only 2217 * used for indirect blocks. 2218 * 2219 * Recalculate bytes to reflect the actual media block 2220 * allocation. 
2221 */ 2222 bytes = (hammer2_off_t)1 << 2223 (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 2224 chain->bytes = bytes; 2225 2226 switch(type) { 2227 case HAMMER2_BREF_TYPE_VOLUME: 2228 case HAMMER2_BREF_TYPE_FREEMAP: 2229 panic("hammer2_chain_create: called with volume type"); 2230 break; 2231 case HAMMER2_BREF_TYPE_INODE: 2232 KKASSERT(bytes == HAMMER2_INODE_BYTES); 2233 atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED); 2234 chain->data = kmalloc(sizeof(chain->data->ipdata), 2235 hmp->mchain, M_WAITOK | M_ZERO); 2236 break; 2237 case HAMMER2_BREF_TYPE_INDIRECT: 2238 panic("hammer2_chain_create: cannot be used to" 2239 "create indirect block"); 2240 break; 2241 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2242 panic("hammer2_chain_create: cannot be used to" 2243 "create freemap root or node"); 2244 break; 2245 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2246 KKASSERT(bytes == sizeof(chain->data->bmdata)); 2247 atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED); 2248 chain->data = kmalloc(sizeof(chain->data->bmdata), 2249 hmp->mchain, M_WAITOK | M_ZERO); 2250 break; 2251 case HAMMER2_BREF_TYPE_DATA: 2252 default: 2253 /* leave chain->data NULL */ 2254 KKASSERT(chain->data == NULL); 2255 break; 2256 } 2257 } else { 2258 /* 2259 * Potentially update the existing chain's key/keybits. 2260 * 2261 * Do NOT mess with the current state of the INITIAL flag. 
2262 */ 2263 chain->bref.key = key; 2264 chain->bref.keybits = keybits; 2265 KKASSERT(chain->above == NULL); 2266 } 2267 2268 again: 2269 above = parent->core; 2270 2271 /* 2272 * Locate a free blockref in the parent's array 2273 */ 2274 switch(parent->bref.type) { 2275 case HAMMER2_BREF_TYPE_INODE: 2276 KKASSERT((parent->data->ipdata.op_flags & 2277 HAMMER2_OPFLAG_DIRECTDATA) == 0); 2278 KKASSERT(parent->data != NULL); 2279 base = &parent->data->ipdata.u.blockset.blockref[0]; 2280 count = HAMMER2_SET_COUNT; 2281 break; 2282 case HAMMER2_BREF_TYPE_INDIRECT: 2283 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2284 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2285 base = NULL; 2286 } else { 2287 KKASSERT(parent->data != NULL); 2288 base = &parent->data->npdata[0]; 2289 } 2290 count = parent->bytes / sizeof(hammer2_blockref_t); 2291 break; 2292 case HAMMER2_BREF_TYPE_VOLUME: 2293 KKASSERT(parent->data != NULL); 2294 base = &hmp->voldata.sroot_blockset.blockref[0]; 2295 count = HAMMER2_SET_COUNT; 2296 break; 2297 case HAMMER2_BREF_TYPE_FREEMAP: 2298 KKASSERT(parent->data != NULL); 2299 base = &hmp->voldata.freemap_blockset.blockref[0]; 2300 count = HAMMER2_SET_COUNT; 2301 break; 2302 default: 2303 panic("hammer2_chain_create: unrecognized blockref type: %d", 2304 parent->bref.type); 2305 count = 0; 2306 break; 2307 } 2308 2309 /* 2310 * Scan for an unallocated bref, also skipping any slots occupied 2311 * by in-memory chain elements that may not yet have been updated 2312 * in the parent's bref array. 2313 * 2314 * We don't have to hold the spinlock to save an empty slot as 2315 * new slots can only transition from empty if the parent is 2316 * locked exclusively. 
2317 */ 2318 spin_lock(&above->cst.spin); 2319 for (i = 0; i < count; ++i) { 2320 child = hammer2_chain_find_locked(parent, i); 2321 if (child) { 2322 if (child->flags & HAMMER2_CHAIN_DELETED) 2323 break; 2324 continue; 2325 } 2326 if (base == NULL) 2327 break; 2328 if (base[i].type == 0) 2329 break; 2330 } 2331 spin_unlock(&above->cst.spin); 2332 2333 /* 2334 * If no free blockref could be found we must create an indirect 2335 * block and move a number of blockrefs into it. With the parent 2336 * locked we can safely lock each child in order to move it without 2337 * causing a deadlock. 2338 * 2339 * This may return the new indirect block or the old parent depending 2340 * on where the key falls. NULL is returned on error. 2341 */ 2342 if (i == count) { 2343 hammer2_chain_t *nparent; 2344 2345 nparent = hammer2_chain_create_indirect(trans, parent, 2346 key, keybits, 2347 type, &error); 2348 if (nparent == NULL) { 2349 if (allocated) 2350 hammer2_chain_drop(chain); 2351 chain = NULL; 2352 goto done; 2353 } 2354 if (parent != nparent) { 2355 hammer2_chain_unlock(parent); 2356 parent = *parentp = nparent; 2357 } 2358 goto again; 2359 } 2360 2361 /* 2362 * Link the chain into its parent. Later on we will have to set 2363 * the MOVED bit in situations where we don't mark the new chain 2364 * as being modified. 2365 */ 2366 if (chain->above != NULL) 2367 panic("hammer2: hammer2_chain_create: chain already connected"); 2368 KKASSERT(chain->above == NULL); 2369 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0); 2370 2371 chain->above = above; 2372 chain->index = i; 2373 spin_lock(&above->cst.spin); 2374 if (RB_INSERT(hammer2_chain_tree, &above->rbtree, chain)) 2375 panic("hammer2_chain_create: collision"); 2376 atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE); 2377 spin_unlock(&above->cst.spin); 2378 2379 if (allocated) { 2380 /* 2381 * Mark the newly created chain modified. 
2382 * 2383 * Device buffers are not instantiated for DATA elements 2384 * as these are handled by logical buffers. 2385 * 2386 * Indirect and freemap node indirect blocks are handled 2387 * by hammer2_chain_create_indirect() and not by this 2388 * function. 2389 * 2390 * Data for all other bref types is expected to be 2391 * instantiated (INODE, LEAF). 2392 */ 2393 switch(chain->bref.type) { 2394 case HAMMER2_BREF_TYPE_DATA: 2395 hammer2_chain_modify(trans, &chain, 2396 HAMMER2_MODIFY_OPTDATA | 2397 HAMMER2_MODIFY_ASSERTNOCOPY); 2398 break; 2399 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2400 case HAMMER2_BREF_TYPE_INODE: 2401 hammer2_chain_modify(trans, &chain, 2402 HAMMER2_MODIFY_ASSERTNOCOPY); 2403 break; 2404 default: 2405 /* 2406 * Remaining types are not supported by this function. 2407 * In particular, INDIRECT and LEAF_NODE types are 2408 * handled by create_indirect(). 2409 */ 2410 panic("hammer2_chain_create: bad type: %d", 2411 chain->bref.type); 2412 /* NOT REACHED */ 2413 break; 2414 } 2415 } else { 2416 /* 2417 * When reconnecting a chain we must set MOVED and setsubmod 2418 * so the flush recognizes that it must update the bref in 2419 * the parent. 2420 */ 2421 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 2422 hammer2_chain_ref(chain); 2423 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); 2424 } 2425 hammer2_chain_setsubmod(trans, chain); 2426 } 2427 2428 done: 2429 *chainp = chain; 2430 2431 return (error); 2432 } 2433 2434 /* 2435 * Replace (*chainp) with a duplicate. The original *chainp is unlocked 2436 * and the replacement will be returned locked. Both the original and the 2437 * new chain will share the same RBTREE (have the same chain->core), with 2438 * the new chain becoming the 'current' chain (meaning it is the first in 2439 * the linked list at core->chain_first). 2440 * 2441 * If (parent, i) then the new duplicated chain is inserted under the parent 2442 * at the specified index (the parent must not have a ref at that index). 
 *
 * If (NULL, -1) then the new duplicated chain is not inserted anywhere,
 * similar to if it had just been chain_alloc()'d (suitable for passing into
 * hammer2_chain_create() after this function returns).
 *
 * NOTE! Duplication is used in order to retain the original topology to
 *	 support flush synchronization points.  Both the original and the
 *	 new chain will have the same transaction id and thus the operation
 *	 appears atomic w/regards to media flushes.
 */
static void hammer2_chain_dup_fixup(hammer2_chain_t *ochain,
				    hammer2_chain_t *nchain);

void
hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent, int i,
			hammer2_chain_t **chainp, hammer2_blockref_t *bref)
{
	hammer2_mount_t *hmp;
	hammer2_blockref_t *base;
	hammer2_chain_t *ochain;	/* original chain */
	hammer2_chain_t *nchain;	/* new duplicate */
	hammer2_chain_t *scan;
	hammer2_chain_core_t *above;
	size_t bytes;
	int count;
	int oflags;
	void *odata;

	/*
	 * First create a duplicate of the chain structure, associating
	 * it with the same core, making it the same size, pointing it
	 * to the same bref (the same media block).
	 */
	ochain = *chainp;
	hmp = ochain->hmp;
	if (bref == NULL)
		bref = &ochain->bref;
	nchain = hammer2_chain_alloc(hmp, trans, bref);
	hammer2_chain_core_alloc(nchain, ochain->core);
	bytes = (hammer2_off_t)1 <<
		(int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	nchain->bytes = bytes;
	nchain->modify_tid = ochain->modify_tid;

	/*
	 * RESOLVE_NEVER: do not load media data; embedded inode/freemap
	 * data is copied manually by dup_fixup to avoid buffer cache
	 * deadlocks (see hammer2_chain_dup_fixup).
	 */
	hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
	hammer2_chain_dup_fixup(ochain, nchain);

	/*
	 * If parent is not NULL, insert into the parent at the requested
	 * index.  The newly duplicated chain must be marked MOVED and
	 * SUBMODIFIED set in its parent(s).
	 *
	 * Having both chains locked is extremely important for atomicity.
	 */
	if (parent) {
		/*
		 * Locate a free blockref in the parent's array
		 */
		above = parent->core;
		KKASSERT(ccms_thread_lock_owned(&above->cst));

		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			KKASSERT((parent->data->ipdata.op_flags &
				  HAMMER2_OPFLAG_DIRECTDATA) == 0);
			KKASSERT(parent->data != NULL);
			base = &parent->data->ipdata.u.blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->flags & HAMMER2_CHAIN_INITIAL) {
				base = NULL;
			} else {
				KKASSERT(parent->data != NULL);
				base = &parent->data->npdata[0];
			}
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			KKASSERT(parent->data != NULL);
			base = &hmp->voldata.sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			KKASSERT(parent->data != NULL);
			base = &hmp->voldata.freemap_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			panic("hammer2_chain_create: unrecognized "
			      "blockref type: %d",
			      parent->bref.type);
			count = 0;
			break;
		}
		KKASSERT(i >= 0 && i < count);

		KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0);
		KKASSERT(parent->refs > 0);

		/*
		 * The slot must be empty or occupied only by a DELETED
		 * chain, per the function's contract.
		 */
		spin_lock(&above->cst.spin);
		nchain->above = above;
		nchain->index = i;
		scan = hammer2_chain_find_locked(parent, i);
		KKASSERT(base == NULL || base[i].type == 0 ||
			 scan == NULL ||
			 (scan->flags & HAMMER2_CHAIN_DELETED));
		if (RB_INSERT(hammer2_chain_tree, &above->rbtree,
			      nchain)) {
			panic("hammer2_chain_duplicate: collision");
		}
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONRBTREE);
		spin_unlock(&above->cst.spin);

		if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
			hammer2_chain_ref(nchain);
			atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
		}
		hammer2_chain_setsubmod(trans, nchain);
	}

	/*
	 * We have to unlock ochain to flush any dirty data, asserting the
	 * case (data == NULL) to catch any extra locks that might have been
	 * present, then transfer state to nchain.
	 */
	oflags = ochain->flags;
	odata = ochain->data;
	hammer2_chain_unlock(ochain);
	KKASSERT((ochain->flags & HAMMER2_CHAIN_EMBEDDED) ||
		 ochain->data == NULL);

	if (oflags & HAMMER2_CHAIN_INITIAL)
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);

	/*
	 * WARNING!  We should never resolve DATA to device buffers
	 *	     (XXX allow it if the caller did?), and since
	 *	     we currently do not have the logical buffer cache
	 *	     buffer in-hand to fix its cached physical offset
	 *	     we also force the modify code to not COW it. XXX
	 */
	if (oflags & HAMMER2_CHAIN_MODIFIED) {
		/* propagate the MODIFIED state to the duplicate */
		if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			hammer2_chain_modify(trans, &nchain,
					     HAMMER2_MODIFY_OPTDATA |
					     HAMMER2_MODIFY_NOREALLOC |
					     HAMMER2_MODIFY_ASSERTNOCOPY);
		} else if (oflags & HAMMER2_CHAIN_INITIAL) {
			hammer2_chain_modify(trans, &nchain,
					     HAMMER2_MODIFY_OPTDATA |
					     HAMMER2_MODIFY_ASSERTNOCOPY);
		} else {
			hammer2_chain_modify(trans, &nchain,
					     HAMMER2_MODIFY_ASSERTNOCOPY);
		}
		hammer2_chain_drop(nchain);
	} else {
		/* not modified; just consume the extra allocation ref */
		if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			hammer2_chain_drop(nchain);
		} else if (oflags & HAMMER2_CHAIN_INITIAL) {
			hammer2_chain_drop(nchain);
		} else {
			hammer2_chain_lock(nchain, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_NOREF);
			hammer2_chain_unlock(nchain);
		}
	}
	atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SUBMODIFIED);
	*chainp = nchain;
}

#if 0
		/*
		 * When the chain is in the INITIAL state we must still
		 * ensure that a block has been assigned so
MOVED processing
		 * works as expected.
		 */
		KKASSERT (nchain->bref.type != HAMMER2_BREF_TYPE_DATA);
		hammer2_chain_modify(trans, &nchain,
				     HAMMER2_MODIFY_OPTDATA |
				     HAMMER2_MODIFY_ASSERTNOCOPY);


	hammer2_chain_lock(nchain, HAMMER2_RESOLVE_MAYBE |
				   HAMMER2_RESOLVE_NOREF); /* eat excess ref */
	hammer2_chain_unlock(nchain);
#endif

/*
 * Special in-place delete-duplicate sequence which does not require a
 * locked parent.  (*chainp) is marked DELETED and atomically replaced
 * with a duplicate.  Atomicity is at the very-fine spin-lock level in
 * order to ensure that lookups do not race us.
 */
void
hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
			       int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *ochain;	/* original chain (to be DELETED) */
	hammer2_chain_t *nchain;	/* replacement duplicate */
	hammer2_chain_core_t *above;
	size_t bytes;
	int oflags;
	void *odata;

	/*
	 * First create a duplicate of the chain structure
	 */
	ochain = *chainp;
	hmp = ochain->hmp;
	nchain = hammer2_chain_alloc(hmp, trans, &ochain->bref);    /* 1 ref */
	if (flags & HAMMER2_DELDUP_RECORE)
		hammer2_chain_core_alloc(nchain, NULL);
	else
		hammer2_chain_core_alloc(nchain, ochain->core);
	above = ochain->above;

	bytes = (hammer2_off_t)1 <<
		(int)(ochain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
	nchain->bytes = bytes;
	nchain->modify_tid = ochain->modify_tid;

	/*
	 * Lock nchain and insert into ochain's core hierarchy, marking
	 * ochain DELETED at the same time.  Having both chains locked
	 * is extremely important for atomicity.
	 */
	hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
	hammer2_chain_dup_fixup(ochain, nchain);
	/* extra ref still present from original allocation */

	nchain->index = ochain->index;

	/*
	 * All state transitions below happen under one spinlock hold so
	 * concurrent lookups see either the old chain or the new one,
	 * never an intermediate state.
	 */
	spin_lock(&above->cst.spin);
	atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONRBTREE);
	ochain->delete_tid = trans->sync_tid;
	nchain->above = above;
	atomic_set_int(&ochain->flags, HAMMER2_CHAIN_DELETED);
	if ((ochain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(ochain);
		atomic_set_int(&ochain->flags, HAMMER2_CHAIN_MOVED);
	}
	if (RB_INSERT(hammer2_chain_tree, &above->rbtree, nchain)) {
		panic("hammer2_chain_delete_duplicate: collision");
	}
	spin_unlock(&above->cst.spin);

	/*
	 * We have to unlock ochain to flush any dirty data, asserting the
	 * case (data == NULL) to catch any extra locks that might have been
	 * present, then transfer state to nchain.
	 */
	oflags = ochain->flags;
	odata = ochain->data;
	hammer2_chain_unlock(ochain);	/* replacing ochain */
	KKASSERT(ochain->bref.type == HAMMER2_BREF_TYPE_INODE ||
		 ochain->data == NULL);

	if (oflags & HAMMER2_CHAIN_INITIAL)
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);

	/*
	 * WARNING!  We should never resolve DATA to device buffers
	 *	     (XXX allow it if the caller did?), and since
	 *	     we currently do not have the logical buffer cache
	 *	     buffer in-hand to fix its cached physical offset
	 *	     we also force the modify code to not COW it. XXX
	 */
	if (oflags & HAMMER2_CHAIN_MODIFIED) {
		/* propagate the MODIFIED state to the replacement */
		if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			hammer2_chain_modify(trans, &nchain,
					     HAMMER2_MODIFY_OPTDATA |
					     HAMMER2_MODIFY_NOREALLOC |
					     HAMMER2_MODIFY_ASSERTNOCOPY);
		} else if (oflags & HAMMER2_CHAIN_INITIAL) {
			hammer2_chain_modify(trans, &nchain,
					     HAMMER2_MODIFY_OPTDATA |
					     HAMMER2_MODIFY_ASSERTNOCOPY);
		} else {
			hammer2_chain_modify(trans, &nchain,
					     HAMMER2_MODIFY_ASSERTNOCOPY);
		}
		hammer2_chain_drop(nchain);
	} else {
		/* not modified; just consume the extra allocation ref */
		if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			hammer2_chain_drop(nchain);
		} else if (oflags & HAMMER2_CHAIN_INITIAL) {
			hammer2_chain_drop(nchain);
		} else {
			hammer2_chain_lock(nchain, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_NOREF);
			hammer2_chain_unlock(nchain);
		}
	}

	/*
	 * Unconditionally set the MOVED and SUBMODIFIED bit to force
	 * update of parent bref and indirect blockrefs during flush.
	 */
	if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
		hammer2_chain_ref(nchain);
	}
	atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SUBMODIFIED);
	hammer2_chain_setsubmod(trans, nchain);
	*chainp = nchain;
}

/*
 * Helper function to fixup inodes.  The caller procedure stack may hold
 * multiple locks on ochain if it represents an inode, preventing our
 * unlock from retiring its state to the buffer cache.
 *
 * In this situation any attempt to access the buffer cache could result
 * either in stale data or a deadlock.  Work around the problem by copying
 * the embedded data directly.
 */
static
void
hammer2_chain_dup_fixup(hammer2_chain_t *ochain, hammer2_chain_t *nchain)
{
	/* nothing to copy if the original has no resolved data */
	if (ochain->data == NULL)
		return;
	switch(ochain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		KKASSERT(nchain->data == NULL);
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_EMBEDDED);
		nchain->data = kmalloc(sizeof(nchain->data->ipdata),
				       ochain->hmp->mchain, M_WAITOK | M_ZERO);
		/* struct copy of the embedded inode data */
		nchain->data->ipdata = ochain->data->ipdata;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		KKASSERT(nchain->data == NULL);
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_EMBEDDED);
		nchain->data = kmalloc(sizeof(nchain->data->bmdata),
				       ochain->hmp->mchain, M_WAITOK | M_ZERO);
		bcopy(ochain->data->bmdata,
		      nchain->data->bmdata,
		      sizeof(nchain->data->bmdata));
		break;
	default:
		/* other bref types have no embedded data to copy */
		break;
	}
}

/*
 * Create a snapshot of the specified {parent, chain} with the specified
 * label.
 *
 * (a) We create a duplicate connected to the super-root as the specified
 *     label.
 *
 * (b) We issue a restricted flush using the current transaction on the
 *     duplicate.
 *
 * (c) We disconnect and reallocate the duplicate's core.
 */
int
hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_inode_t *ip,
		       hammer2_ioc_pfs_t *pfs)
{
	hammer2_cluster_t *cluster;
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	hammer2_chain_t *nchain;
	hammer2_chain_t *parent;
	hammer2_inode_data_t *ipdata;
	size_t name_len;
	hammer2_key_t lhc;
	int error;

	name_len = strlen(pfs->name);
	lhc = hammer2_dirhash(pfs->name, name_len);
	cluster = ip->pmp->mount_cluster;
	hmp = ip->chain->hmp;
	KKASSERT(hmp == cluster->hmp);	/* XXX */

	/*
	 * Create disconnected duplicate
	 */
	KKASSERT((trans->flags & HAMMER2_TRANS_RESTRICTED) == 0);
	nchain = ip->chain;
	hammer2_chain_lock(nchain, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_duplicate(trans, NULL, -1, &nchain, NULL);
	atomic_set_int(&nchain->flags, HAMMER2_CHAIN_RECYCLE |
				       HAMMER2_CHAIN_SNAPSHOT);

	/*
	 * Create named entry in the super-root.  Iterate the dirhash
	 * collision space looking for a free key.
	 *
	 * NOTE(review): when the collision space is exhausted error is set
	 * to ENOSPC but execution still falls through to chain_create
	 * below — confirm this is intentional.
	 */
	parent = hammer2_chain_lookup_init(hmp->schain, 0);
	error = 0;
	while (error == 0) {
		chain = hammer2_chain_lookup(&parent, lhc, lhc, 0);
		if (chain == NULL)
			break;
		if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
			error = ENOSPC;
		hammer2_chain_unlock(chain);
		chain = NULL;
		++lhc;
	}
	hammer2_chain_create(trans, &parent, &nchain, lhc, 0,
			     HAMMER2_BREF_TYPE_INODE,
			     HAMMER2_INODE_BYTES);
	hammer2_chain_modify(trans, &nchain, HAMMER2_MODIFY_ASSERTNOCOPY);
	hammer2_chain_lookup_done(parent);
	parent = NULL;	/* safety */

	/*
	 * Name fixup
	 */
	ipdata = &nchain->data->ipdata;
	ipdata->name_key = lhc;
	ipdata->name_len = name_len;
	ksnprintf(ipdata->filename, sizeof(ipdata->filename), "%s", pfs->name);

	/*
	 * Set PFS type, generate a unique filesystem id, and generate
	 * a cluster id.  Use the same clid when snapshotting a PFS root,
	 * which theoretically allows the snapshot to be used as part of
	 * the same cluster (perhaps as a cache).
	 */
	ipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT;
	kern_uuidgen(&ipdata->pfs_fsid, 1);
	if (ip->chain == cluster->rchain)
		ipdata->pfs_clid = ip->chain->data->ipdata.pfs_clid;
	else
		kern_uuidgen(&ipdata->pfs_clid, 1);

	/*
	 * Issue a restricted flush of the snapshot.  This is a synchronous
	 * operation.
	 *
	 * NOTE(review): the kprintf()s and the 4-second tsleep() below look
	 * like debug scaffolding — confirm before shipping.
	 */
	trans->flags |= HAMMER2_TRANS_RESTRICTED;
	kprintf("SNAPSHOTA\n");
	tsleep(trans, 0, "snapslp", hz*4);
	kprintf("SNAPSHOTB\n");
	hammer2_chain_flush(trans, nchain);
	trans->flags &= ~HAMMER2_TRANS_RESTRICTED;

#if 0
	/*
	 * Remove the link b/c nchain is a snapshot and snapshots don't
	 * follow CHAIN_DELETED semantics ?
	 */
	chain = ip->chain;


	KKASSERT(chain->duplink == nchain);
	KKASSERT(chain->core == nchain->core);
	KKASSERT(nchain->refs >= 2);
	chain->duplink = nchain->duplink;
	atomic_clear_int(&nchain->flags, HAMMER2_CHAIN_DUPTARGET);
	hammer2_chain_drop(nchain);
#endif

	kprintf("snapshot %s nchain->refs %d nchain->flags %08x\n",
		pfs->name, nchain->refs, nchain->flags);
	hammer2_chain_unlock(nchain);

	return (error);
}

/*
 * Create an indirect block that covers one or more of the elements in the
 * current parent.  Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * If an error occurs, NULL is returned and *errorp is set to the error.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow four rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.  This
 *     is accomplished by always recursing through matching keyspaces in
 *     the hammer2_chain_lookup() and hammer2_chain_next() API.
 *
 * The current implementation calculates the current worst-case keyspace by
 * iterating the current parent and then divides it into two halves, choosing
 * whichever half has the most elements (not necessarily the half containing
 * the requested key).
 *
 * We can also opt to use the half with the least number of elements.  This
 * causes lower-numbered keys (aka logical file offsets) to recurse through
 * fewer indirect blocks and higher-numbered keys to recurse through more.
 * This also has the risk of not moving enough elements to the new indirect
 * block and being forced to create several indirect blocks before the element
 * can be inserted.
 *
 * Must be called with an exclusively locked parent.
2950 */ 2951 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent, 2952 hammer2_key_t *keyp, int keybits, 2953 hammer2_blockref_t *base, int count); 2954 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent, 2955 hammer2_key_t *keyp, int keybits, 2956 hammer2_blockref_t *base, int count); 2957 static 2958 hammer2_chain_t * 2959 hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent, 2960 hammer2_key_t create_key, int create_bits, 2961 int for_type, int *errorp) 2962 { 2963 hammer2_mount_t *hmp; 2964 hammer2_chain_core_t *above; 2965 hammer2_chain_core_t *icore; 2966 hammer2_blockref_t *base; 2967 hammer2_blockref_t *bref; 2968 hammer2_chain_t *chain; 2969 hammer2_chain_t *child; 2970 hammer2_chain_t *ichain; 2971 hammer2_chain_t dummy; 2972 hammer2_key_t key = create_key; 2973 int keybits = create_bits; 2974 int count; 2975 int nbytes; 2976 int i; 2977 2978 /* 2979 * Calculate the base blockref pointer or NULL if the chain 2980 * is known to be empty. We need to calculate the array count 2981 * for RB lookups either way. 
2982 */ 2983 hmp = parent->hmp; 2984 *errorp = 0; 2985 KKASSERT(ccms_thread_lock_owned(&parent->core->cst)); 2986 above = parent->core; 2987 2988 /*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/ 2989 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2990 base = NULL; 2991 2992 switch(parent->bref.type) { 2993 case HAMMER2_BREF_TYPE_INODE: 2994 count = HAMMER2_SET_COUNT; 2995 break; 2996 case HAMMER2_BREF_TYPE_INDIRECT: 2997 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2998 count = parent->bytes / sizeof(hammer2_blockref_t); 2999 break; 3000 case HAMMER2_BREF_TYPE_VOLUME: 3001 count = HAMMER2_SET_COUNT; 3002 break; 3003 case HAMMER2_BREF_TYPE_FREEMAP: 3004 count = HAMMER2_SET_COUNT; 3005 break; 3006 default: 3007 panic("hammer2_chain_create_indirect: " 3008 "unrecognized blockref type: %d", 3009 parent->bref.type); 3010 count = 0; 3011 break; 3012 } 3013 } else { 3014 switch(parent->bref.type) { 3015 case HAMMER2_BREF_TYPE_INODE: 3016 base = &parent->data->ipdata.u.blockset.blockref[0]; 3017 count = HAMMER2_SET_COUNT; 3018 break; 3019 case HAMMER2_BREF_TYPE_INDIRECT: 3020 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3021 base = &parent->data->npdata[0]; 3022 count = parent->bytes / sizeof(hammer2_blockref_t); 3023 break; 3024 case HAMMER2_BREF_TYPE_VOLUME: 3025 base = &hmp->voldata.sroot_blockset.blockref[0]; 3026 count = HAMMER2_SET_COUNT; 3027 break; 3028 case HAMMER2_BREF_TYPE_FREEMAP: 3029 base = &hmp->voldata.freemap_blockset.blockref[0]; 3030 count = HAMMER2_SET_COUNT; 3031 break; 3032 default: 3033 panic("hammer2_chain_create_indirect: " 3034 "unrecognized blockref type: %d", 3035 parent->bref.type); 3036 count = 0; 3037 break; 3038 } 3039 } 3040 3041 /* 3042 * dummy used in later chain allocation (no longer used for lookups). 
3043 */ 3044 bzero(&dummy, sizeof(dummy)); 3045 dummy.delete_tid = HAMMER2_MAX_TID; 3046 3047 /* 3048 * When creating an indirect block for a freemap node or leaf 3049 * the key/keybits must be fitted to static radix levels because 3050 * particular radix levels use particular reserved blocks in the 3051 * related zone. 3052 * 3053 * This routine calculates the key/radix of the indirect block 3054 * we need to create, and whether it is on the high-side or the 3055 * low-side. 3056 */ 3057 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 3058 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 3059 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits, 3060 base, count); 3061 } else { 3062 keybits = hammer2_chain_indkey_normal(parent, &key, keybits, 3063 base, count); 3064 } 3065 3066 /* 3067 * Normalize the key for the radix being represented, keeping the 3068 * high bits and throwing away the low bits. 3069 */ 3070 key &= ~(((hammer2_key_t)1 << keybits) - 1); 3071 3072 /* 3073 * How big should our new indirect block be? It has to be at least 3074 * as large as its parent. 
3075 */ 3076 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) 3077 nbytes = HAMMER2_IND_BYTES_MIN; 3078 else 3079 nbytes = HAMMER2_IND_BYTES_MAX; 3080 if (nbytes < count * sizeof(hammer2_blockref_t)) 3081 nbytes = count * sizeof(hammer2_blockref_t); 3082 3083 /* 3084 * Ok, create our new indirect block 3085 */ 3086 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 3087 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 3088 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE; 3089 } else { 3090 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT; 3091 } 3092 dummy.bref.key = key; 3093 dummy.bref.keybits = keybits; 3094 dummy.bref.data_off = hammer2_getradix(nbytes); 3095 dummy.bref.methods = parent->bref.methods; 3096 3097 ichain = hammer2_chain_alloc(hmp, trans, &dummy.bref); 3098 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL); 3099 hammer2_chain_core_alloc(ichain, NULL); 3100 icore = ichain->core; 3101 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE); 3102 hammer2_chain_drop(ichain); /* excess ref from alloc */ 3103 3104 /* 3105 * We have to mark it modified to allocate its block, but use 3106 * OPTDATA to allow it to remain in the INITIAL state. Otherwise 3107 * it won't be acted upon by the flush code. 3108 * 3109 * XXX leave the node unmodified, depend on the SUBMODIFIED 3110 * flush to assign and modify parent blocks. 3111 */ 3112 hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA); 3113 3114 /* 3115 * Iterate the original parent and move the matching brefs into 3116 * the new indirect block. 3117 * 3118 * At the same time locate an empty slot (or what will become an 3119 * empty slot) and assign the new indirect block to that slot. 3120 * 3121 * XXX handle flushes. 3122 */ 3123 spin_lock(&above->cst.spin); 3124 for (i = 0; i < count; ++i) { 3125 /* 3126 * For keying purposes access the bref from the media or 3127 * from our in-memory cache. 
In cases where the in-memory 3128 * cache overrides the media the keyrefs will be the same 3129 * anyway so we can avoid checking the cache when the media 3130 * has a key. 3131 */ 3132 child = hammer2_chain_find_locked(parent, i); 3133 if (child) { 3134 if (child->flags & HAMMER2_CHAIN_DELETED) { 3135 if (ichain->index < 0) 3136 ichain->index = i; 3137 continue; 3138 } 3139 bref = &child->bref; 3140 } else if (base && base[i].type) { 3141 bref = &base[i]; 3142 } else { 3143 if (ichain->index < 0) 3144 ichain->index = i; 3145 continue; 3146 } 3147 3148 /* 3149 * Skip keys that are not within the key/radix of the new 3150 * indirect block. They stay in the parent. 3151 */ 3152 if ((~(((hammer2_key_t)1 << keybits) - 1) & 3153 (key ^ bref->key)) != 0) { 3154 continue; 3155 } 3156 3157 /* 3158 * This element is being moved from the parent, its slot 3159 * is available for our new indirect block. 3160 */ 3161 if (ichain->index < 0) 3162 ichain->index = i; 3163 3164 /* 3165 * Load the new indirect block by acquiring or allocating 3166 * the related chain entries, then move them to the new 3167 * parent (ichain) by deleting them from their old location 3168 * and inserting a duplicate of the chain and any modified 3169 * sub-chain in the new location. 3170 * 3171 * We must set MOVED in the chain being duplicated and 3172 * SUBMODIFIED in the parent(s) so the flush code knows 3173 * what is going on. The latter is done after the loop. 3174 * 3175 * WARNING! above->cst.spin must be held when parent is 3176 * modified, even though we own the full blown lock, 3177 * to deal with setsubmod and rename races. 3178 * (XXX remove this req). 
3179 */ 3180 spin_unlock(&above->cst.spin); 3181 chain = hammer2_chain_get(parent, i, HAMMER2_LOOKUP_NODATA); 3182 hammer2_chain_delete(trans, chain); 3183 hammer2_chain_duplicate(trans, ichain, i, &chain, NULL); 3184 hammer2_chain_unlock(chain); 3185 KKASSERT(parent->refs > 0); 3186 chain = NULL; 3187 spin_lock(&above->cst.spin); 3188 } 3189 spin_unlock(&above->cst.spin); 3190 3191 /* 3192 * Insert the new indirect block into the parent now that we've 3193 * cleared out some entries in the parent. We calculated a good 3194 * insertion index in the loop above (ichain->index). 3195 * 3196 * We don't have to set MOVED here because we mark ichain modified 3197 * down below (so the normal modified -> flush -> set-moved sequence 3198 * applies). 3199 * 3200 * The insertion shouldn't race as this is a completely new block 3201 * and the parent is locked. 3202 */ 3203 if (ichain->index < 0) 3204 kprintf("indirect parent %p count %d key %016jx/%d\n", 3205 parent, count, (intmax_t)key, keybits); 3206 KKASSERT(ichain->index >= 0); 3207 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 3208 spin_lock(&above->cst.spin); 3209 if (RB_INSERT(hammer2_chain_tree, &above->rbtree, ichain)) 3210 panic("hammer2_chain_create_indirect: ichain insertion"); 3211 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_ONRBTREE); 3212 ichain->above = above; 3213 spin_unlock(&above->cst.spin); 3214 3215 /* 3216 * Mark the new indirect block modified after insertion, which 3217 * will propagate up through parent all the way to the root and 3218 * also allocate the physical block in ichain for our caller, 3219 * and assign ichain->data to a pre-zero'd space (because there 3220 * is not prior data to copy into it). 3221 * 3222 * We have to set SUBMODIFIED in ichain's flags manually so the 3223 * flusher knows it has to recurse through it to get to all of 3224 * our moved blocks, then call setsubmod() to set the bit 3225 * recursively. 
3226 */ 3227 /*hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);*/ 3228 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED); 3229 hammer2_chain_setsubmod(trans, ichain); 3230 3231 /* 3232 * Figure out what to return. 3233 */ 3234 if (~(((hammer2_key_t)1 << keybits) - 1) & 3235 (create_key ^ key)) { 3236 /* 3237 * Key being created is outside the key range, 3238 * return the original parent. 3239 */ 3240 hammer2_chain_unlock(ichain); 3241 } else { 3242 /* 3243 * Otherwise its in the range, return the new parent. 3244 * (leave both the new and old parent locked). 3245 */ 3246 parent = ichain; 3247 } 3248 3249 return(parent); 3250 } 3251 3252 /* 3253 * Calculate the keybits and highside/lowside of the freemap node the 3254 * caller is creating. 3255 * 3256 * This routine will specify the next higher-level freemap key/radix 3257 * representing the lowest-ordered set. By doing so, eventually all 3258 * low-ordered sets will be moved one level down. 3259 * 3260 * We have to be careful here because the freemap reserves a limited 3261 * number of blocks for a limited number of levels. So we can't just 3262 * push indiscriminately. 3263 */ 3264 int 3265 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp, 3266 int keybits, hammer2_blockref_t *base, int count) 3267 { 3268 hammer2_chain_core_t *above; 3269 hammer2_chain_t *child; 3270 hammer2_blockref_t *bref; 3271 hammer2_key_t key; 3272 int locount; 3273 int hicount; 3274 int i; 3275 3276 key = *keyp; 3277 above = parent->core; 3278 locount = 0; 3279 hicount = 0; 3280 keybits = 64; 3281 3282 /* 3283 * Calculate the range of keys in the array being careful to skip 3284 * slots which are overridden with a deletion. 
3285 */ 3286 spin_lock(&above->cst.spin); 3287 for (i = 0; i < count; ++i) { 3288 child = hammer2_chain_find_locked(parent, i); 3289 if (child) { 3290 if (child->flags & HAMMER2_CHAIN_DELETED) 3291 continue; 3292 bref = &child->bref; 3293 } else if (base && base[i].type) { 3294 bref = &base[i]; 3295 } else { 3296 continue; 3297 } 3298 3299 if (keybits > bref->keybits) { 3300 key = bref->key; 3301 keybits = bref->keybits; 3302 } else if (keybits == bref->keybits && bref->key < key) { 3303 key = bref->key; 3304 } 3305 } 3306 spin_unlock(&above->cst.spin); 3307 3308 /* 3309 * Return the keybits for a higher-level FREEMAP_NODE covering 3310 * this node. 3311 */ 3312 switch(keybits) { 3313 case HAMMER2_FREEMAP_LEVEL0_RADIX: 3314 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX; 3315 break; 3316 case HAMMER2_FREEMAP_LEVEL1_RADIX: 3317 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX; 3318 break; 3319 case HAMMER2_FREEMAP_LEVEL2_RADIX: 3320 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX; 3321 break; 3322 case HAMMER2_FREEMAP_LEVEL3_RADIX: 3323 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX; 3324 break; 3325 case HAMMER2_FREEMAP_LEVEL4_RADIX: 3326 panic("hammer2_chain_indkey_freemap: level too high"); 3327 break; 3328 default: 3329 panic("hammer2_chain_indkey_freemap: bad radix"); 3330 break; 3331 } 3332 *keyp = key; 3333 3334 return (keybits); 3335 } 3336 3337 /* 3338 * Calculate the keybits and highside/lowside of the indirect block the 3339 * caller is creating. 
 */
static int
hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
			    int keybits, hammer2_blockref_t *base, int count)
{
	hammer2_chain_core_t *above;
	hammer2_chain_t *child;
	hammer2_blockref_t *bref;
	hammer2_key_t key;
	int nkeybits;
	int locount;		/* # of scanned keys in the lower half */
	int hicount;		/* # of scanned keys in the upper half */
	int i;

	key = *keyp;
	above = parent->core;
	locount = 0;
	hicount = 0;

	/*
	 * Calculate the range of keys in the array being careful to skip
	 * slots which are overridden with a deletion.  Once the scan
	 * completes we will cut the key range in half and shift half the
	 * range into the new indirect block.
	 */
	spin_lock(&above->cst.spin);
	for (i = 0; i < count; ++i) {
		/*
		 * Prefer the in-memory chain's bref; fall back to the
		 * media blockref array, else the slot is empty.
		 */
		child = hammer2_chain_find_locked(parent, i);
		if (child) {
			if (child->flags & HAMMER2_CHAIN_DELETED)
				continue;
			bref = &child->bref;
		} else if (base && base[i].type) {
			bref = &base[i];
		} else {
			continue;
		}

		/*
		 * Expand our calculated key range (key, keybits) to fit
		 * the scanned key.  nkeybits represents the full range
		 * that we will later cut in half (two halves @ nkeybits - 1).
		 */
		nkeybits = keybits;
		if (nkeybits < bref->keybits) {
			/* a radix > 64 indicates a corrupt bref */
			if (bref->keybits > 64) {
				kprintf("bad bref index %d chain %p bref %p\n",
					i, child, bref);
				Debugger("fubar");
			}
			nkeybits = bref->keybits;
		}
		/* widen the radix until bref->key falls inside the range */
		while (nkeybits < 64 &&
		       (~(((hammer2_key_t)1 << nkeybits) - 1) &
		        (key ^ bref->key)) != 0) {
			++nkeybits;
		}

		/*
		 * If the new key range is larger we have to determine
		 * which side of the new key range the existing keys fall
		 * under by checking the high bit, then collapsing the
		 * locount into the hicount or vice versa.
		 */
		if (keybits != nkeybits) {
			if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
				hicount += locount;
				locount = 0;
			} else {
				locount += hicount;
				hicount = 0;
			}
			keybits = nkeybits;
		}

		/*
		 * The newly scanned key will be in the lower half or the
		 * higher half of the (new) key range.
		 */
		if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
			++hicount;
		else
			++locount;
	}
	spin_unlock(&above->cst.spin);
	bref = NULL;	/* now invalid (safety) */

	/*
	 * Adjust keybits to represent half of the full range calculated
	 * above (radix 63 max)
	 */
	--keybits;

	/*
	 * Select whichever half contains the most elements.  Theoretically
	 * we can select either side as long as it contains at least one
	 * element (in order to ensure that a free slot is present to hold
	 * the indirect block).
	 */
	if (hammer2_indirect_optimize) {
		/*
		 * Insert node for least number of keys, this will arrange
		 * the first few blocks of a large file or the first few
		 * inodes in a directory with fewer indirect blocks when
		 * created linearly.
		 */
		if (hicount < locount && hicount != 0)
			key |= (hammer2_key_t)1 << keybits;
		else
			key &= ~(hammer2_key_t)1 << keybits;
	} else {
		/*
		 * Insert node for most number of keys, best for heavily
		 * fragmented files.
		 */
		if (hicount > locount)
			key |= (hammer2_key_t)1 << keybits;
		else
			key &= ~(hammer2_key_t)1 << keybits;
	}
	*keyp = key;

	return (keybits);
}

/*
 * Sets CHAIN_DELETED and CHAIN_MOVED in the chain being deleted and
 * sets chain->delete_tid.
 *
 * This function does NOT generate a modification to the parent.  It
 * would be nearly impossible to figure out which parent to modify anyway.
 * Such modifications are handled by the flush code and are properly merged
 * using the flush synchronization point.
 *
 * The find/get code will properly overload the RBTREE check on top of
 * the bref check to detect deleted entries.
 *
 * This function is NOT recursive.  Any entity already pushed into the
 * chain (such as an inode) may still need visibility into its contents,
 * as well as the ability to read and modify the contents.  For example,
 * for an unlinked file which is still open.
 *
 * NOTE: This function does NOT set chain->modify_tid, allowing future
 *	 code to distinguish between live and deleted chains by testing
 *	 sync_tid.
 *
 * NOTE: Deletions normally do not occur in the middle of a duplication
 *	 chain but we use a trick for hardlink migration that refactors
 *	 the originating inode without deleting it, so we make no assumptions
 *	 here.
 */
void
hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	/* caller must hold the chain exclusively locked */
	KKASSERT(ccms_thread_lock_owned(&chain->core->cst));

	/*
	 * Nothing to do if already marked.
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED)
		return;

	/*
	 * We must set MOVED along with DELETED for the flush code to
	 * recognize the operation and properly disconnect the chain
	 * in-memory.
	 *
	 * The setting of DELETED causes finds, lookups, and _next iterations
	 * to no longer recognize the chain.  RB_SCAN()s will still have
	 * visibility (needed for flush serialization points).
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	spin_lock(&chain->above->cst.spin);
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		/* MOVED carries its own chain reference */
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	chain->delete_tid = trans->sync_tid;
	spin_unlock(&chain->above->cst.spin);
	hammer2_chain_setsubmod(trans, chain);
}

/*
 * Sleep briefly (one tick) on the chain.  Used as a simple polling wait
 * for flush activity; no explicit wakeup() is required since the tsleep
 * always times out.
 */
void
hammer2_chain_wait(hammer2_chain_t *chain)
{
	tsleep(chain, 0, "chnflw", 1);
}

/*
 * Add the number of bytes read to the global read statistics counter
 * appropriate for the blockref type (file data, meta-data, indirect,
 * freemap, or volume).
 */
static
void
adjreadcounter(hammer2_blockref_t *bref, size_t bytes)
{
	long *counterp;

	switch(bref->type) {
	case HAMMER2_BREF_TYPE_DATA:
		counterp = &hammer2_iod_file_read;
		break;
	case HAMMER2_BREF_TYPE_INODE:
		counterp = &hammer2_iod_meta_read;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		counterp = &hammer2_iod_indr_read;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		counterp = &hammer2_iod_fmap_read;
		break;
	default:
		counterp = &hammer2_iod_volu_read;
		break;
	}
	/* NOTE(review): not atomic; presumably statistical accuracy suffices */
	*counterp += bytes;
}
