/*
 * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This subsystem implements most of the core support functions for
 * the hammer2_chain structure.
 *
 * Chains are the in-memory version of media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * A chain is topologically stable once it has been inserted into the
 * in-memory topology.  Modifications which copy, move, or resize the chain
 * are handled via the DELETE-DUPLICATE mechanic where the original chain
 * stays intact but is marked deleted and a new chain is allocated which
 * shares the old chain's children.
 *
 * This sharing is handled via the hammer2_chain_core structure.
 *
 * The DELETE-DUPLICATE mechanism allows the same topological level to contain
 * many overloadings.  However, our RBTREE mechanics require that there be
 * no overlaps so we accomplish the overloading by moving conflicting chains
 * with smaller or equal radii into a sub-RBTREE under the chain being
 * overloaded.
 *
 * DELETE-DUPLICATE is also used when a modification to a chain crosses a
 * flush synchronization boundary, allowing the flush code to continue flushing
 * the older version of the topology and not be disrupted by new frontend
 * operations.
 *
 * LIVE VS FLUSH VIEW
 *
 * All lookup and iterate operations and most modifications are done on the
 * live view.  During flushes lookups are not normally done and modifications
 * may be run on the flush view.  However, flushes often need to allocate
 * blocks and the freemap_alloc/free code issues lookups.  This code is
 * special cased to use the live view when called from a flush.
 *
 * General chain lookup/iteration functions are NOT aware of the flush view,
 * they only know about live views.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/kern_syscall.h>
#include <sys/uuid.h>

#include "hammer2.h"

static int hammer2_indirect_optimize;	/* XXX SYSCTL */

static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_trans_t *trans, hammer2_chain_t *parent,
		hammer2_key_t key, int keybits, int for_type, int *errorp);
static void hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop);
static void adjreadcounter(hammer2_blockref_t *bref, size_t bytes);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		int *cache_indexp, hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **bresp);
static void hammer2_chain_assert_not_present(hammer2_chain_core_t *above,
		hammer2_chain_t *chain);

hammer2_chain_t *XXChain;
int XXChainWhy;
/*
 * Basic RBTree for chains.  Chains cannot overlap within any given
 * core->rbtree without recursing through chain->rbtree.  We effectively
 * guarantee this by checking the full range rather than just the first
 * key element.  By matching on the full range callers can detect when
 * recursion through chain->rbtree is needed.
 *
 * NOTE: This also means that a delete-duplicate on the same key will
 *	 overload by placing the deleted element in the new element's
 *	 chain->rbtree (when doing a direct replacement).
 */
RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);

int
hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
{
	hammer2_key_t c1_beg;
	hammer2_key_t c1_end;
	hammer2_key_t c2_beg;
	hammer2_key_t c2_end;

	c1_beg = chain1->bref.key;
	c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
	c2_beg = chain2->bref.key;
	c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1;

	if (c1_end < c2_beg)	/* fully to the left */
		return(-1);
	if (c1_beg > c2_end)	/* fully to the right */
		return(1);
	return(0);		/* overlap (must not cross edge boundary) */
}

static __inline
int
hammer2_isclusterable(hammer2_chain_t *chain)
{
	if (hammer2_cluster_enable) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
		    chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
		    chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			return(1);
		}
	}
	return(0);
}

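/*
 * Illustrative sketch (editor's addition, not built): computing the key
 * range covered by a bref.  For example, key=0x10000 with keybits=16
 * covers [0x10000, 0x1FFFF]; any other chain whose range intersects that
 * span compares equal in hammer2_chain_cmp() and must therefore be nested
 * in a sub-RBTREE rather than inserted alongside.  Helper name is
 * hypothetical.
 */
#if 0
static void
hammer2_sketch_key_range(const hammer2_blockref_t *bref,
			 hammer2_key_t *begp, hammer2_key_t *endp)
{
	*begp = bref->key;
	*endp = bref->key + ((hammer2_key_t)1 << bref->keybits) - 1;
}
#endif
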
/*
 * Recursively set the update_hi flag up to the root starting at chain's
 * parent->core.  update_hi is not set in chain's core.
 *
 * This controls top-down visibility for flushes.  The child has just one
 * 'above' core, but the core itself can be multi-homed with parents iterated
 * via core->ownerq.
 *
 * This function is not used during a flush (except when the flush is
 * allocating which requires the live tree).  The flush keeps track of its
 * recursion itself.
 *
 * XXX needs to be optimized to use roll-up TIDs.  update_hi is only really
 * compared against bref.mirror_tid which itself is only updated by a flush.
 */
void
hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	hammer2_chain_core_t *above;

	while ((above = chain->above) != NULL) {
		spin_lock(&above->cst.spin);
		/* XXX optimize */
		if (above->update_hi < trans->sync_tid)
			above->update_hi = trans->sync_tid;
		chain = TAILQ_LAST(&above->ownerq, h2_core_list);
#if 0
		TAILQ_FOREACH_REVERSE(chain, &above->ownerq,
				      h2_core_list, core_entry) {
			if (trans->sync_tid >= chain->modify_tid &&
			    trans->sync_tid <= chain->delete_tid) {
				break;
			}
		}
#endif
		spin_unlock(&above->cst.spin);
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  chain->refs is set to 1 and the passed bref is copied to
 * chain->bref.  chain->bytes is derived from the bref.
 *
 * chain->core is NOT allocated and the media data and bp pointers are left
 * NULL.  The caller must call chain_core_alloc() to allocate or associate
 * a core with the chain.
 *
 * NOTE: Returns a referenced but unlocked (because there is no core) chain.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
		    hammer2_trans_t *trans, hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);

	/*
	 * Construct the appropriate system structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * Chains are really only associated with the hmp but we
		 * maintain a pmp association for per-mount memory tracking
		 * purposes.  The pmp can be NULL.
		 */
		chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO);
		if (pmp)
			chain->pmp = pmp;
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		chain = NULL;
		panic("hammer2_chain_alloc volume type illegal for op");
	default:
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
	}

	chain->hmp = hmp;
	chain->bref = *bref;
	chain->bytes = bytes;
	chain->refs = 1;
	chain->flags = HAMMER2_CHAIN_ALLOCATED;
	chain->delete_tid = HAMMER2_MAX_TID;

	/*
	 * Set modify_tid if a transaction is creating the chain.  When
	 * loading a chain from backing store trans is passed as NULL and
	 * modify_tid is left set to 0.
	 */
	if (trans)
		chain->modify_tid = trans->sync_tid;

	return (chain);
}

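/*
 * Illustrative sketch (editor's addition, not built): the usual pairing of
 * hammer2_chain_alloc() with hammer2_chain_core_alloc() when instantiating
 * a chain from an on-media blockref, as hammer2_chain_get() does further
 * below.  Passing a NULL trans leaves modify_tid 0, which forces any later
 * modification to go through a delete-duplicate.  Variable names are
 * hypothetical.
 */
#if 0
	hammer2_chain_t *nchain;

	nchain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref);
	hammer2_chain_core_alloc(NULL, nchain, NULL);	/* fresh core */
	/* nchain is referenced but not locked at this point */
#endif
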
/*
 * Associate an existing core with the chain or allocate a new core.
 *
 * The core is not locked.  No additional refs on the chain are made.
 * (trans) must not be NULL if (core) is not NULL.
 *
 * When chains are delete-duplicated during flushes we insert nchain on
 * the ownerq after ochain instead of at the end in order to give the
 * drop code visibility in the correct order, otherwise drops can be missed.
 */
void
hammer2_chain_core_alloc(hammer2_trans_t *trans,
			 hammer2_chain_t *nchain, hammer2_chain_t *ochain)
{
	hammer2_chain_core_t *core;

	KKASSERT(nchain->core == NULL);

	if (ochain == NULL) {
		/*
		 * Fresh core under nchain (no multi-homing of ochain's
		 * sub-tree).
		 */
		core = kmalloc(sizeof(*core), nchain->hmp->mchain,
			       M_WAITOK | M_ZERO);
		TAILQ_INIT(&core->layerq);
		TAILQ_INIT(&core->ownerq);
		core->sharecnt = 1;
		core->good = 0x1234;
		if (trans)
			core->update_hi = trans->sync_tid;
		else
			core->update_hi = nchain->bref.mirror_tid;
		nchain->core = core;
		ccms_cst_init(&core->cst, nchain);
		TAILQ_INSERT_TAIL(&core->ownerq, nchain, core_entry);
	} else {
		/*
		 * Propagate the PFSROOT flag which we set on all subdirs
		 * under the super-root.
		 */
		atomic_set_int(&nchain->flags,
			       ochain->flags & HAMMER2_CHAIN_PFSROOT);

		/*
		 * Duplicating ochain -> nchain.  Set the DUPLICATED flag on
		 * ochain if nchain is not a snapshot.
		 *
		 * It is possible for the DUPLICATED flag to already be
		 * set when called via a flush operation because flush
		 * operations may have to work on elements with delete_tid's
		 * beyond the flush sync_tid.  In this situation we must
		 * ensure that nchain is placed just after ochain in the
		 * ownerq and that the DUPLICATED flag is set on nchain so
		 * 'live' operations skip past it to the correct chain.
		 *
		 * The flusher understands the blockref synchronization state
		 * for any stale chains by observing bref.mirror_tid, which
		 * delete-duplicate replicates.
		 *
		 * WARNING! However, the case is disallowed when the flusher
		 *	    is allocating freemap space because this entails
		 *	    more than just adjusting a block table.
		 */
		if (ochain->flags & HAMMER2_CHAIN_DUPLICATED) {
			KKASSERT((trans->flags &
				  (HAMMER2_TRANS_ISFLUSH |
				   HAMMER2_TRANS_ISALLOCATING)) ==
				 HAMMER2_TRANS_ISFLUSH);
			atomic_set_int(&nchain->flags,
				       HAMMER2_CHAIN_DUPLICATED);
		}
		if ((nchain->flags & HAMMER2_CHAIN_SNAPSHOT) == 0) {
			atomic_set_int(&ochain->flags,
				       HAMMER2_CHAIN_DUPLICATED);
		}
		core = ochain->core;
		atomic_add_int(&core->sharecnt, 1);

		spin_lock(&core->cst.spin);
		nchain->core = core;

#if 0
		if (core->update_hi < trans->sync_tid)
			core->update_hi = trans->sync_tid;
#endif

		/*
		 * Maintain ordering for refactor test so we don't skip over
		 * a snapshot.  Also, during flushes, delete-duplications
		 * for block-table updates can occur on blocks already
		 * deleted (delete-duplicated by a later transaction).  We
		 * must insert nchain after ochain but before the later
		 * transaction's copy.
		 */
		TAILQ_INSERT_AFTER(&core->ownerq, ochain, nchain, core_entry);

		spin_unlock(&core->cst.spin);
	}
}

/*
 * Add a reference to a chain element, preventing its destruction.
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->refs, 1);
}

/*
 * Insert the chain in the core rbtree at the first layer
 * which accepts it (for now we don't sort layers by the transaction tid).
 */
#define HAMMER2_CHAIN_INSERT_SPIN	0x0001
#define HAMMER2_CHAIN_INSERT_LIVE	0x0002
#define HAMMER2_CHAIN_INSERT_RACE	0x0004

static
int
hammer2_chain_insert(hammer2_chain_core_t *above, hammer2_chain_layer_t *layer,
		     hammer2_chain_t *chain, int flags, int generation)
{
	hammer2_chain_t *xchain;
	hammer2_chain_layer_t *nlayer;
	int error = 0;

	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		spin_lock(&above->cst.spin);

	/*
	 * Special case, place the chain in the next more-recent layer than
	 * the specified layer, inserting a layer in between if necessary.
	 */
	if (layer) {
		KKASSERT((flags & HAMMER2_CHAIN_INSERT_RACE) == 0);
		nlayer = TAILQ_PREV(layer, h2_layer_list, entry);
		if (nlayer && RB_INSERT(hammer2_chain_tree,
					&nlayer->rbtree, chain) == NULL) {
			layer = nlayer;
			goto done;
		}

		spin_unlock(&above->cst.spin);
		KKASSERT((flags & HAMMER2_CHAIN_INSERT_LIVE) == 0);
		nlayer = kmalloc(sizeof(*nlayer), chain->hmp->mchain,
				 M_WAITOK | M_ZERO);
		RB_INIT(&nlayer->rbtree);
		nlayer->good = 0xABCD;
		spin_lock(&above->cst.spin);

		if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0)
			hammer2_chain_assert_not_present(above, chain);

		TAILQ_INSERT_BEFORE(layer, nlayer, entry);
		RB_INSERT(hammer2_chain_tree, &nlayer->rbtree, chain);
		layer = nlayer;
		goto done;
	}

	/*
	 * Interlocked by spinlock, check for race
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
	    above->generation != generation) {
		error = EAGAIN;
		goto failed;
	}

	/*
	 * Try to insert, allocate a new layer if a nominal collision
	 * occurs (a collision is different from an SMP race).
	 */
	layer = TAILQ_FIRST(&above->layerq);
	xchain = NULL;

	if (layer == NULL ||
	    (xchain = RB_INSERT(hammer2_chain_tree,
				&layer->rbtree, chain)) != NULL) {

		/*
		 * Allocate a new layer to resolve the issue.
		 */
		spin_unlock(&above->cst.spin);
		layer = kmalloc(sizeof(*layer), chain->hmp->mchain,
				M_WAITOK | M_ZERO);
		RB_INIT(&layer->rbtree);
		layer->good = 0xABCD;
		spin_lock(&above->cst.spin);

		if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
		    above->generation != generation) {
			spin_unlock(&above->cst.spin);
			kfree(layer, chain->hmp->mchain);
			spin_lock(&above->cst.spin);
			error = EAGAIN;
			goto failed;
		}
		hammer2_chain_assert_not_present(above, chain);

		TAILQ_INSERT_HEAD(&above->layerq, layer, entry);
		RB_INSERT(hammer2_chain_tree, &layer->rbtree, chain);
	}
done:
	chain->above = above;
	chain->inlayer = layer;
	++above->chain_count;
	++above->generation;
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);

	/*
	 * We have to keep track of the effective live-view blockref count
	 * so the create code knows when to push an indirect block.
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_LIVE) &&
	    (chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		atomic_add_int(&above->live_count, 1);
	}
failed:
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		spin_unlock(&above->cst.spin);
	return error;
}

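/*
 * Illustrative sketch (editor's addition, not built): one way a caller can
 * cope with HAMMER2_CHAIN_INSERT_RACE.  The parent may only be held shared,
 * so the core's generation is sampled first and the whole operation is
 * redone if hammer2_chain_insert() reports EAGAIN.  The loop shape and
 * variable names are hypothetical; hammer2_chain_get() below handles the
 * race by returning NULL instead.
 */
#if 0
	for (;;) {
		generation = above->generation;
		/* ... locate or allocate the chain to insert ... */
		error = hammer2_chain_insert(above, NULL, chain,
					     HAMMER2_CHAIN_INSERT_SPIN |
					     HAMMER2_CHAIN_INSERT_RACE,
					     generation);
		if (error != EAGAIN)
			break;
		/* lost a race against another inserter, start over */
	}
#endif
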
/*
 * Drop the caller's reference to the chain.  When the ref count drops to
 * zero this function will try to disassociate the chain from its parent and
 * deallocate it, then recursively drop the parent using the implied ref
 * from the chain's chain->parent.
 */
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
					       struct h2_core_list *delayq);

void
hammer2_chain_drop(hammer2_chain_t *chain)
{
	struct h2_core_list delayq;
	hammer2_chain_t *scan;
	u_int refs;
	u_int need = 0;

	if (hammer2_debug & 0x200000)
		Debugger("drop");

	if (chain->flags & HAMMER2_CHAIN_MOVED)
		++need;
	if (chain->flags & HAMMER2_CHAIN_MODIFIED)
		++need;
	KKASSERT(chain->refs > need);

	TAILQ_INIT(&delayq);

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);

		if (refs == 1) {
			chain = hammer2_chain_lastdrop(chain, &delayq);
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}

		/*
		 * When we've exhausted lastdrop chaining pull off of delayq.
		 * chains on delayq are dead but are used to placehold other
		 * chains which we added a ref to for the purpose of dropping.
		 */
		if (chain == NULL) {
			hammer2_mount_t *hmp;

			if ((scan = TAILQ_FIRST(&delayq)) != NULL) {
				chain = (void *)scan->data;
				TAILQ_REMOVE(&delayq, scan, core_entry);
				scan->flags &= ~HAMMER2_CHAIN_ALLOCATED;
				hmp = scan->hmp;
				scan->hmp = NULL;
				kfree(scan, hmp->mchain);
			}
		}
	}
}

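/*
 * Illustrative sketch (editor's addition, not built): the MOVED and
 * MODIFIED flags each imply one reference on the chain, which is why the
 * drop path above asserts refs > need.  Setting one of these flags is
 * paired with hammer2_chain_ref(), and retiring it is (elsewhere in the
 * filesystem) paired with a drop, roughly:
 */
#if 0
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	/* ... later, when the flag is retired ... */
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	hammer2_chain_drop(chain);
#endif
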
/*
 * Safe handling of the 1->0 transition on chain.  Returns a chain for
 * recursive drop or NULL, possibly returning the same chain if the atomic
 * op fails.
 *
 * When two chains need to be recursively dropped we use the chain
 * we would otherwise free to placehold the additional chain.  It's a bit
 * convoluted but we can't just recurse without potentially blowing out
 * the kernel stack.
 *
 * The chain cannot be freed if it has a non-empty core (children) or
 * it is not at the head of ownerq.
 *
 * The cst spinlock is allowed to nest child-to-parent (not parent-to-child).
 */
static
hammer2_chain_t *
hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
{
	hammer2_pfsmount_t *pmp;
	hammer2_mount_t *hmp;
	hammer2_chain_core_t *above;
	hammer2_chain_core_t *core;
	hammer2_chain_layer_t *layer;
	hammer2_chain_t *rdrop1;
	hammer2_chain_t *rdrop2;

	/*
	 * Spinlock the core and check to see if it is empty.  If it is
	 * not empty we leave chain intact with refs == 0.  The elements
	 * in core->rbtree are associated with other chains contemporary
	 * with ours but not with our chain directly.
	 */
	if ((core = chain->core) != NULL) {
		spin_lock(&core->cst.spin);

		/*
		 * We can't free non-stale chains with children until we are
		 * able to free the children because there might be a flush
		 * dependency.  Flushes of stale children (which should also
		 * have their deleted flag set) short-cut recursive flush
		 * dependencies and can be freed here.  Any flushes which run
		 * through stale children due to the flush synchronization
		 * point should have the MOVED bit set in the chain and not
		 * reach lastdrop at this time.
		 *
		 * NOTE: We return (chain) on failure to retry.
		 */
		if (core->chain_count &&
		    (chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
			if (atomic_cmpset_int(&chain->refs, 1, 0))
				chain = NULL;	/* success */
			spin_unlock(&core->cst.spin);
			return(chain);
		}
		/* no chains left under us */

		/*
		 * Various parts of the code might be holding a ref on a
		 * stale chain as a placemarker which must be iterated to
		 * locate a later non-stale (live) chain.  We must be sure
		 * NOT to free the later non-stale chain (which might have
		 * no refs).  Otherwise mass confusion may result.
		 *
		 * The DUPLICATED flag tells us whether the chain is stale
		 * or not, so the rule is that any chain whose DUPLICATED flag
		 * is NOT set must also be at the head of the ownerq.
		 *
		 * Note that the DELETED flag is not involved.  That is, a
		 * live chain can represent a deletion that has not yet been
		 * flushed (or still has refs).
		 */
#if 0
		if (TAILQ_NEXT(chain, core_entry) == NULL &&
		    TAILQ_FIRST(&core->ownerq) != chain) {
#endif
		if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 &&
		    TAILQ_FIRST(&core->ownerq) != chain) {
			if (atomic_cmpset_int(&chain->refs, 1, 0))
				chain = NULL;	/* success */
			spin_unlock(&core->cst.spin);
			return(chain);
		}
	}

	/*
	 * chain->core has no children left so no accessors can get to our
	 * chain from there.  Now we have to lock the above core to interlock
	 * remaining possible accessors that might bump chain's refs before
	 * we can safely drop chain's refs with intent to free the chain.
	 */
	hmp = chain->hmp;
	pmp = chain->pmp;	/* can be NULL */
	rdrop1 = NULL;
	rdrop2 = NULL;
	layer = NULL;

	/*
	 * Spinlock the parent and try to drop the last ref on chain.
	 * On success remove chain from its parent, otherwise return NULL.
	 *
	 * (normal core locks are top-down recursive but we define core
	 * spinlocks as bottom-up recursive, so this is safe).
	 */
	if ((above = chain->above) != NULL) {
		spin_lock(&above->cst.spin);
		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
			/* 1->0 transition failed */
			spin_unlock(&above->cst.spin);
			if (core)
				spin_unlock(&core->cst.spin);
			return(chain);	/* retry */
		}

		/*
		 * 1->0 transition successful, remove chain from its
		 * above core.  Track layer for removal/freeing.
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		layer = chain->inlayer;
		RB_REMOVE(hammer2_chain_tree, &layer->rbtree, chain);
		--above->chain_count;
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		chain->above = NULL;
		chain->inlayer = NULL;

		if (RB_EMPTY(&layer->rbtree) && layer->refs == 0) {
			TAILQ_REMOVE(&above->layerq, layer, entry);
		} else {
			layer = NULL;
		}

		/*
		 * If our chain was the last chain in the parent's core the
		 * core is now empty and its parents might now be droppable.
		 * Try to drop the first multi-homed parent by gaining a
		 * ref on it here and then dropping it below.
		 */
		if (above->chain_count == 0) {
			rdrop1 = TAILQ_FIRST(&above->ownerq);
			if (rdrop1 &&
			    atomic_cmpset_int(&rdrop1->refs, 0, 1) == 0) {
				rdrop1 = NULL;
			}
		}
		spin_unlock(&above->cst.spin);
		above = NULL;	/* safety */
	}

	/*
	 * Successful 1->0 transition and the chain can be destroyed now.
	 *
	 * We still have the core spinlock (if core is non-NULL), and core's
	 * chain_count is 0.  The above spinlock is gone.
	 *
	 * Remove chain from ownerq.  Once core has no more owners (and no
	 * children which is already the case) we can destroy core.
	 *
	 * If core has more owners we may be able to continue a bottom-up
	 * drop with our next sibling.
	 */
	if (core) {
		chain->core = NULL;

		TAILQ_REMOVE(&core->ownerq, chain, core_entry);
		rdrop2 = TAILQ_FIRST(&core->ownerq);
		if (rdrop2 && atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0)
			rdrop2 = NULL;
		spin_unlock(&core->cst.spin);

		/*
		 * We can do the final 1->0 transition with an atomic op
		 * after releasing core's spinlock.
		 */
		if (atomic_fetchadd_int(&core->sharecnt, -1) == 1) {
			/*
			 * On the 1->0 transition of core we can destroy
			 * it.  Any remaining layers should no longer be
			 * referenced or visible to other threads.
			 */
			KKASSERT(TAILQ_EMPTY(&core->ownerq));
			if (layer) {
				layer->good = 0xEF00;
				kfree(layer, hmp->mchain);
			}
			while ((layer = TAILQ_FIRST(&core->layerq)) != NULL) {
				KKASSERT(layer->refs == 0 &&
					 RB_EMPTY(&layer->rbtree));
				TAILQ_REMOVE(&core->layerq, layer, entry);
				layer->good = 0xEF01;
				kfree(layer, hmp->mchain);
			}
			/* layer now NULL */
			KKASSERT(core->cst.count == 0);
			KKASSERT(core->cst.upgrade == 0);
			core->good = 0x5678;
			kfree(core, hmp->mchain);
		}
		core = NULL;	/* safety */
	}

	/*
	 * All spin locks are gone, finish freeing stuff.
	 */
	KKASSERT((chain->flags & (HAMMER2_CHAIN_MOVED |
				  HAMMER2_CHAIN_MODIFIED)) == 0);
	hammer2_chain_drop_data(chain, 1);

	KKASSERT(chain->dio == NULL);

	/*
	 * Free saved empty layer and return chained drop.
	 */
	if (layer) {
		layer->good = 0xEF02;
		kfree(layer, hmp->mchain);
	}

	/*
	 * Once chain resources are gone we can use the now dead chain
	 * structure to placehold what might otherwise require a recursive
	 * drop, because we have potentially two things to drop and can only
	 * return one directly.
	 */
	if (rdrop1 && rdrop2) {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ALLOCATED);
		chain->data = (void *)rdrop1;
		TAILQ_INSERT_TAIL(delayq, chain, core_entry);
		rdrop1 = NULL;
	} else if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
		chain->flags &= ~HAMMER2_CHAIN_ALLOCATED;
		chain->hmp = NULL;
		kfree(chain, hmp->mchain);
	}

	/*
	 * Either or both can be NULL.  We already handled the case where
	 * both might not have been NULL.
	 */
	if (rdrop1)
		return(rdrop1);
	else
		return(rdrop2);
}

/*
 * On either last lock release or last drop
 */
static void
hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop)
{
	hammer2_mount_t *hmp = chain->hmp;

	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		if (lastdrop)
			chain->data = NULL;
		break;
	case HAMMER2_BREF_TYPE_INODE:
		if (chain->data) {
			kfree(chain->data, hmp->mchain);
			chain->data = NULL;
		}
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		if (chain->data) {
			kfree(chain->data, hmp->mchain);
			chain->data = NULL;
		}
		break;
	default:
		KKASSERT(chain->data == NULL);
		break;
	}
}

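/*
 * Illustrative sketch (editor's addition, not built): how lastdrop's delayq
 * placeholder works.  When both a parent (rdrop1) and a sibling (rdrop2)
 * become droppable, the already-dead chain structure is recycled to carry
 * the extra pointer so the loop in hammer2_chain_drop() stays iterative
 * instead of recursing.  Condensed from the code above.
 */
#if 0
	/* producer side, in lastdrop: park rdrop1 on the dead chain */
	chain->data = (void *)rdrop1;
	TAILQ_INSERT_TAIL(delayq, chain, core_entry);

	/* consumer side, in hammer2_chain_drop(): unpark and free the husk */
	scan = TAILQ_FIRST(&delayq);
	chain = (void *)scan->data;
	TAILQ_REMOVE(&delayq, scan, core_entry);
	kfree(scan, scan->hmp->mchain);
#endif
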
/*
 * Ref and lock a chain element, acquiring its data with I/O if necessary,
 * and specify how you would like the data to be resolved.
 *
 * Returns 0 on success or an error code if the data could not be acquired.
 * The chain element is locked on return regardless of whether an error
 * occurred or not.
 *
 * The lock is allowed to recurse, multiple locking ops will aggregate
 * the requested resolve types.  Once data is assigned it will not be
 * removed until the last unlock.
 *
 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
 *			   the INITIAL-create state (indirect blocks only).
 *
 *			   Do not resolve data elements for DATA chains.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
 *
 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
 *			   it will be locked exclusive.
 *
 * NOTE: Embedded elements (volume header, inodes) are always resolved
 *	 regardless.
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * WARNING! If data must be fetched a shared lock will temporarily be
 *	    upgraded to exclusive.  However, a deadlock can occur if
 *	    the caller owns more than one shared lock.
 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	hammer2_mount_t *hmp;
	hammer2_chain_core_t *core;
	hammer2_blockref_t *bref;
	ccms_state_t ostate;
	char *bdata;
	int error;

	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 */
	if ((how & HAMMER2_RESOLVE_NOREF) == 0)
		hammer2_chain_ref(chain);
	atomic_add_int(&chain->lockcnt, 1);

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Get the appropriate lock.
	 */
	core = chain->core;
	if (how & HAMMER2_RESOLVE_SHARED)
		ccms_thread_lock(&core->cst, CCMS_STATE_SHARED);
	else
		ccms_thread_lock(&core->cst, CCMS_STATE_EXCLUSIVE);

	/*
	 * If we already have a valid data pointer no further action is
	 * necessary.
	 */
	if (chain->data)
		return (0);

	/*
	 * Do we have to resolve the data?
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return(0);
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return(0);
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return(0);
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return(0);
#endif
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return(0);
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
		break;
	}

	/*
	 * Upgrade to an exclusive lock so we can safely manipulate the
	 * buffer cache.  If another thread got to it before us we
	 * can just return.
	 */
	ostate = ccms_thread_lock_upgrade(&core->cst);
	if (chain->data) {
		ccms_thread_lock_downgrade(&core->cst, ostate);
		return (0);
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		error = hammer2_io_bread(hmp, bref->data_off, chain->bytes,
					 &chain->dio);
		adjreadcounter(&chain->bref, chain->bytes);
	}

	if (error) {
		kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		ccms_thread_lock_downgrade(&core->cst, ostate);
		return (error);
	}

	/*
	 * We can clear the INITIAL state now, we've resolved the buffer
	 * to zeros and marked it dirty with hammer2_io_new().
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_lock: called on unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Copy data from dio to embedded buffer, do not retain the
		 * device buffer.
		 */
		KKASSERT(chain->bytes == sizeof(chain->data->ipdata));
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
		chain->data = kmalloc(sizeof(chain->data->ipdata),
				      hmp->mchain, M_WAITOK | M_ZERO);
		bcopy(bdata, &chain->data->ipdata, chain->bytes);
		hammer2_io_bqrelse(&chain->dio);
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		KKASSERT(chain->bytes == sizeof(chain->data->bmdata));
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED);
		chain->data = kmalloc(sizeof(chain->data->bmdata),
				      hmp->mchain, M_WAITOK | M_ZERO);
		bcopy(bdata, &chain->data->bmdata, chain->bytes);
		hammer2_io_bqrelse(&chain->dio);
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave bp intact.
		 */
		chain->data = (void *)bdata;
		break;
	}
	ccms_thread_lock_downgrade(&core->cst, ostate);
	return (0);
}

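/*
 * Illustrative sketch (editor's addition, not built): a typical lock/unlock
 * bracket around chain data access.  Metadata is usually locked
 * RESOLVE_MAYBE or RESOLVE_ALWAYS, while file DATA chains are locked
 * RESOLVE_NEVER so no device buffer is instantiated that could alias the
 * logical buffer.  The surrounding code is hypothetical.
 */
#if 0
	hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE); /* refs and locks */
	/* ... inspect chain->data / chain->bref ... */
	hammer2_chain_unlock(chain);	/* unlocks and drops the lock's ref */
#endif
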
/*
 * This basically calls hammer2_io_breadcb() but does some pre-processing
 * of the chain first to handle certain cases.
 */
void
hammer2_chain_load_async(hammer2_chain_t *chain,
			 void (*callback)(hammer2_io_t *dio,
					  hammer2_chain_t *chain,
					  void *arg_p, off_t arg_o),
			 void *arg_p, off_t arg_o)
{
	hammer2_mount_t *hmp;
	struct hammer2_io *dio;
	hammer2_blockref_t *bref;
	int error;

	if (chain->data) {
		callback(NULL, chain, arg_p, arg_o);
		return;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;
	hmp = chain->hmp;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == hammer2_devblksize(chain->bytes)) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio);
		KKASSERT(error == 0);
		callback(dio, chain, arg_p, arg_o);
		return;
	}

	/*
	 * Otherwise issue a read
	 */
	adjreadcounter(&chain->bref, chain->bytes);
	hammer2_io_breadcb(hmp, bref->data_off, chain->bytes,
			   callback, chain, arg_p, arg_o);
}

/*
 * Unlock and deref a chain element.
 *
 * On the last lock release any non-embedded data (chain->dio) will be
 * retired.
 */
void
hammer2_chain_unlock(hammer2_chain_t *chain)
{
	hammer2_chain_core_t *core = chain->core;
	ccms_state_t ostate;
	long *counterp;
	u_int lockcnt;

	/*
	 * The core->cst lock can be shared across several chains so we
	 * need to track the per-chain lockcnt separately.
	 *
	 * If multiple locks are present (or being attempted) on this
	 * particular chain we can just unlock, drop refs, and return.
	 *
	 * Otherwise fall-through on the 1->0 transition.
	 */
	for (;;) {
		lockcnt = chain->lockcnt;
		KKASSERT(lockcnt > 0);
		cpu_ccfence();
		if (lockcnt > 1) {
			if (atomic_cmpset_int(&chain->lockcnt,
					      lockcnt, lockcnt - 1)) {
				ccms_thread_unlock(&core->cst);
				hammer2_chain_drop(chain);
				return;
			}
		} else {
			if (atomic_cmpset_int(&chain->lockcnt, 1, 0))
				break;
		}
		/* retry */
	}

	/*
	 * On the 1->0 transition we upgrade the core lock (if necessary)
	 * to exclusive for terminal processing.  If after upgrading we find
	 * that lockcnt is non-zero, another thread is racing us and will
	 * handle the unload for us later on, so just cleanup and return
	 * leaving the data/io intact.
	 *
	 * Otherwise if lockcnt is still 0 it is possible for it to become
	 * non-zero and race, but since we hold the core->cst lock
	 * exclusively all that will happen is that the chain will be
	 * reloaded after we unload it.
	 */
	ostate = ccms_thread_lock_upgrade(&core->cst);
	if (chain->lockcnt) {
		ccms_thread_unlock_upgraded(&core->cst, ostate);
		hammer2_chain_drop(chain);
		return;
	}

	/*
	 * Shortcut the case if the data is embedded or not resolved.
	 *
	 * Do NOT NULL out chain->data (e.g. inode data), it might be
	 * dirty.
	 */
	if (chain->dio == NULL) {
		if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
			hammer2_chain_drop_data(chain, 0);
		ccms_thread_unlock_upgraded(&core->cst, ostate);
		hammer2_chain_drop(chain);
		return;
	}

	/*
	 * Statistics
	 */
	if (hammer2_io_isdirty(chain->dio) == 0) {
		;
	} else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_ioa_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_ioa_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_ioa_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_ioa_fmap_write;
			break;
		default:
			counterp = &hammer2_ioa_volu_write;
			break;
		}
		*counterp += chain->bytes;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_iod_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_iod_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_iod_indr_write;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			counterp = &hammer2_iod_fmap_write;
			break;
		default:
			counterp = &hammer2_iod_volu_write;
			break;
		}
		*counterp += chain->bytes;
	}

	/*
	 * Clean out the dio.
	 *
	 * If a device buffer was used for data be sure to destroy the
	 * buffer when we are done to avoid aliases (XXX what about the
	 * underlying VM pages?).
	 *
	 * NOTE: Freemap leaves use reserved blocks and thus no aliasing
	 *	 is possible.
	 *
	 * NOTE: The isdirty check tracks whether we have to bdwrite() the
	 *	 buffer or not.  The buffer might already be dirty.  The
	 *	 flag is re-set when chain_modify() is called, even if
	 *	 MODIFIED is already set, allowing the OS to retire the
	 *	 buffer independent of a hammer2 flush.
	 */
	chain->data = NULL;
	if ((chain->flags & HAMMER2_CHAIN_IOFLUSH) &&
	    hammer2_io_isdirty(chain->dio)) {
		hammer2_io_bawrite(&chain->dio);
	} else {
		hammer2_io_bqrelse(&chain->dio);
	}
	ccms_thread_unlock_upgraded(&core->cst, ostate);
	hammer2_chain_drop(chain);
}

/*
 * This counts the number of live blockrefs in a block array and
 * also calculates the point at which all remaining blockrefs are empty.
 * This routine can only be called on a live chain (DUPLICATED flag not set).
 *
 * NOTE: Flag is not set until after the count is complete, allowing
 *	 callers to test the flag without holding the spinlock.
 *
 * NOTE: If base is NULL the related chain is still in the INITIAL
 *	 state and there are no blockrefs to count.
 *
 * NOTE: live_count may already have some counts accumulated due to
 *	 creation and deletion and could even be initially negative.
 */
void
hammer2_chain_countbrefs(hammer2_chain_t *chain,
			 hammer2_blockref_t *base, int count)
{
	hammer2_chain_core_t *core = chain->core;

	KKASSERT((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0);

	spin_lock(&core->cst.spin);
	if ((core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) {
		if (base) {
			while (--count >= 0) {
				if (base[count].type)
					break;
			}
			core->live_zero = count + 1;
			while (count >= 0) {
				if (base[count].type)
					atomic_add_int(&core->live_count, 1);
				--count;
			}
		} else {
			core->live_zero = 0;
		}
		/* else do not modify live_count */
		atomic_set_int(&core->flags, HAMMER2_CORE_COUNTEDBREFS);
	}
	spin_unlock(&core->cst.spin);
}

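/*
 * Illustrative sketch (editor's addition, not built): what the scan above
 * computes for a small blockref array.  With count = 4 and only base[1]
 * having a non-zero type, live_zero becomes 2 (the index just past the last
 * live blockref) and live_count is bumped once.  Worked example only; the
 * values are hypothetical.
 */
#if 0
	hammer2_blockref_t base[4];	/* base[1].type != 0, others zero */

	/* after hammer2_chain_countbrefs(chain, base, 4):		*/
	/*	core->live_zero  == 2					*/
	/*	core->live_count += 1					*/
	/*	core->flags	 |= HAMMER2_CORE_COUNTEDBREFS		*/
#endif
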
/*
 * Resize the chain's physical storage allocation in-place.  This may
 * replace the passed-in chain with a new chain.
 *
 * Chains can be resized smaller without reallocating the storage.
 * Resizing larger will reallocate the storage.
 *
 * Must be passed an exclusively locked parent and chain, returns a new
 * exclusively locked chain at the same index and unlocks the old chain.
 * Flushes the buffer if necessary.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode
 * data buffer.  That is, the passed-in bp is a logical buffer, whereas
 * any chain-oriented bp would be a device buffer.
 *
 * XXX return error if cannot resize.
 */
void
hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
		     hammer2_chain_t *parent, hammer2_chain_t **chainp,
		     int nradix, int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	size_t obytes;
	size_t nbytes;

	chain = *chainp;
	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);

	/*
	 * Nothing to do if the element is already the proper size
	 */
	obytes = chain->bytes;
	nbytes = 1U << nradix;
	if (obytes == nbytes)
		return;

	/*
	 * Delete the old chain and duplicate it at the same (parent, index),
	 * returning a new chain.  This allows the old chain to still be
	 * used by the flush code.  The new chain will be returned in a
	 * modified state.
	 *
	 * The parent does not have to be locked for the delete/duplicate call,
	 * but is in this particular code path.
	 *
	 * NOTE: If we are not crossing a synchronization point the
	 *	 duplication code will simply reuse the existing chain
	 *	 structure.
	 */
	hammer2_chain_delete_duplicate(trans, &chain, 0);

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 */
	hammer2_freemap_alloc(trans, chain, nbytes);
	chain->bytes = nbytes;
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FORCECOW);
	/*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */

	/*
	 * For now just support it on DATA chains (and not on indirect
	 * blocks).
	 */
	KKASSERT(chain->dio == NULL);

#if 0
	/*
	 * Make sure the chain is marked MOVED and propagate the update
	 * to the root for flush.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	hammer2_chain_setsubmod(trans, chain);
#endif
	*chainp = chain;
}

/*
 * Set a chain modified, making it read-write and duplicating it if necessary.
 * This function will assign a new physical block to the chain if necessary.
 *
 * Duplication of already-modified chains is possible when the modification
 * crosses a flush synchronization boundary.
 *
 * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
 *		     level or the COW operation will not work.
 *
 * Data blocks	   - The chain is usually locked RESOLVE_NEVER so as not to
 *		     run the data through the device buffers.
 *
 * This function may return a different chain than was passed, in which case
 * the old chain will be unlocked and the new chain will be locked.
 *
 * ip->chain may be adjusted by hammer2_chain_modify_ip().
 */
hammer2_inode_data_t *
hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
			hammer2_chain_t **chainp, int flags)
{
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	hammer2_chain_modify(trans, chainp, flags);
	if (ip->chain != *chainp)
		hammer2_inode_repoint(ip, NULL, *chainp);
	if (ip->vp)
		vsetisdirty(ip->vp);
	return(&ip->chain->data->ipdata);
}

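/*
 * Illustrative sketch (editor's addition, not built): hammer2_chain_resize()
 * above takes a radix, not a byte count.  A caller holding a desired size in
 * bytes derives nradix so that (1U << nradix) == nbytes, e.g. 16384 bytes
 * maps to radix 14.  The loop and variable names are hypothetical.
 */
#if 0
	int nradix = 0;

	while (((size_t)1 << nradix) < nbytes)
		++nradix;
	hammer2_chain_resize(trans, ip, parent, &chain, nradix, 0);
#endif
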
void
hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
		     int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	hammer2_io_t *dio;
	int error;
	int wasinitial;
	char *bdata;

	chain = *chainp;
	hmp = chain->hmp;

	KKASSERT(chain->bref.mirror_tid != trans->sync_tid ||
		 (chain->flags & HAMMER2_CHAIN_MODIFIED));
#if 0
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		kprintf("trans %04jx/%08x MODIFY1 %p.%d [%08x] %016jx/%d %016jx C/D %016jx/%016jx\n",
			trans->sync_tid, trans->flags,
			chain, chain->bref.type, chain->flags,
			chain->bref.key, chain->bref.keybits,
			chain->bref.data_off,
			chain->modify_tid, chain->delete_tid);
	}
#endif
#if 0
	kprintf("MODIFY %p.%d flags %08x mod=%016jx del=%016jx\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid);
#endif
	/*
	 * Data must be resolved if already assigned unless explicitly
	 * flagged otherwise.
	 */
	if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
		hammer2_chain_unlock(chain);
	}

	/*
	 * data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * Determine if a delete-duplicate is needed.
	 *
	 * (a) Modify_tid is part of a prior flush
	 * (b) Transaction is concurrent with a flush (has higher tid)
	 * (c) and chain is not in the initial state (freshly created)
	 * (d) and caller didn't request an in-place modification.
	 *
	 * The freemap and volume header special chains are never D-Dd.
	 */
	if (chain->modify_tid != trans->sync_tid &&	/* cross boundary */
	    (flags & HAMMER2_MODIFY_INPLACE) == 0) {	/* from d-d */
		if (chain != &hmp->fchain && chain != &hmp->vchain) {
			KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0);
			hammer2_chain_delete_duplicate(trans, chainp, 0);
			chain = *chainp;
#if 0
			if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP ||
			    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
			    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
				kprintf("trans %04jx/%08x MODIFY2 %p.%d [%08x] %016jx/%d %016jx\n",
					trans->sync_tid, trans->flags,
					chain, chain->bref.type, chain->flags,
					chain->bref.key, chain->bref.keybits,
					chain->bref.data_off);
				return;
			}
#endif
		}

		/*
		 * Fall through if fchain or vchain, clearing the CHAIN_FLUSHED
		 * flag.  Basically other chains are delete-duplicated and so
		 * the duplicated chains of course will not have the FLUSHED
		 * flag set, but fchain and vchain are special-cased and the
		 * flag must be cleared when changing modify_tid.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FLUSHED);
	}

	/*
	 * Otherwise do initial-chain handling
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_chain_ref(chain);
		hammer2_chain_memory_inc(chain->pmp);
	}
#if 0
	/* shouldn't be needed */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
#endif

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.
	 *
	 * We normally always allocate new storage here.  If storage exists
	 * and MODIFY_NOREALLOC is passed in, we do not allocate new storage.
	 */
	if (chain != &hmp->vchain && chain != &hmp->fchain) {
		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
		    ((flags & HAMMER2_MODIFY_NOREALLOC) == 0 &&
		     chain->modify_tid != trans->sync_tid)
		) {
			hammer2_freemap_alloc(trans, chain, chain->bytes);
			/* XXX failed allocation */
		} else if (chain->flags & HAMMER2_CHAIN_FORCECOW) {
			hammer2_freemap_alloc(trans, chain, chain->bytes);
			/* XXX failed allocation */
		}
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FORCECOW);
	}

	/*
	 * Update modify_tid.  XXX special-case vchain/fchain because they
	 * are always modified in-place.  Otherwise the chain being modified
	 * must not be part of a future transaction.
	 */
	if (chain == &hmp->vchain || chain == &hmp->fchain) {
		if (chain->modify_tid <= trans->sync_tid)
			chain->modify_tid = trans->sync_tid;
	} else {
		KKASSERT(chain->modify_tid <= trans->sync_tid);
		chain->modify_tid = trans->sync_tid;
	}

	if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
		chain->bref.modify_tid = trans->sync_tid;

	/*
	 * Do not COW if OPTDATA is set.  INITIAL flag remains unchanged.
	 * (OPTDATA does not prevent [re]allocation of storage, only the
	 * related copy-on-write op).
	 */
	if (flags & HAMMER2_MODIFY_OPTDATA)
		goto skip2;

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->dio == NULL);
		break;
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		if (wasinitial) {
			error = hammer2_io_new(hmp, chain->bref.data_off,
					       chain->bytes, &dio);
		} else {
			error = hammer2_io_bread(hmp, chain->bref.data_off,
						 chain->bytes, &dio);
		}
		adjreadcounter(&chain->bref, chain->bytes);
		KKASSERT(error == 0);

		bdata = hammer2_io_data(dio, chain->bref.data_off);

		/*
		 * Copy or zero-fill on write depending on whether
		 * chain->data exists or not and set the dirty state for
		 * the new buffer.  Retire the existing buffer.
		 */
		if (chain->data) {
			KKASSERT(chain->dio != NULL);
			if (chain->data != (void *)bdata) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial == 0) {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}
		hammer2_io_brelse(&chain->dio);
		chain->data = (void *)bdata;
		chain->dio = dio;
		hammer2_io_setdirty(dio);	/* modified by bcopy above */
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
#if 0
	kprintf("RET2 %p.%d flags %08x mod=%016jx del=%016jx\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid);
#endif
	hammer2_chain_setsubmod(trans, chain);
}

/*
 * Mark the volume as having been modified.  This short-cut version
 * does not have to lock the volume's chain, which allows the ioctl
 * code to make adjustments to connections without deadlocking.  XXX
 *
 * No ref is made on vchain when flagging it MODIFIED.
 */
void
hammer2_modify_volume(hammer2_mount_t *hmp)
{
	hammer2_voldata_lock(hmp);
	hammer2_voldata_unlock(hmp, 1);
}

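/*
 * Illustrative sketch (editor's addition, not built): the (*key_nextp)
 * iteration contract described in the comment below.  A scan resumes at
 * the returned key_next until it passes key_end, and the returned element's
 * effective range is (key_beg to *key_nextp - 1).  The loop shape is
 * hypothetical; the real iteration is performed by the lookup/next
 * functions, and the caller must hold the core spinlock as noted below.
 */
#if 0
	key = key_beg;
	while (key <= key_end) {
		key_next = HAMMER2_MAX_KEY;
		chain = hammer2_chain_find(parent, &key_next, key, key_end);
		if (chain == NULL)
			break;
		/* ... chain covers at most [key, key_next - 1] ... */
		if (key_next <= key)	/* overflowed past key_end (EOF) */
			break;
		key = key_next;
	}
#endif
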
/*
 * This function returns the chain at the nearest key within the specified
 * range with the highest delete_tid.  The core spinlock must be held on
 * call and the returned chain will be referenced but not locked.
 *
 * The returned chain may or may not be in a deleted state.  Note that
 * live chains have a delete_tid = MAX_TID.
 *
 * This function will recurse through chain->rbtree as necessary and will
 * return a *key_nextp suitable for iteration.  *key_nextp is only set if
 * the iteration value is less than the current value of *key_nextp.
 *
 * The caller should use (*key_nextp) to calculate the actual range of
 * the returned element, which will be (key_beg to *key_nextp - 1), because
 * there might be another element which is superior to the returned element
 * and overlaps it.
 *
 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
 * chains continue to be returned.  On EOF (*key_nextp) may overflow since
 * it will wind up being (key_end + 1).
 */
struct hammer2_chain_find_info {
	hammer2_chain_t		*best;
	hammer2_key_t		key_beg;
	hammer2_key_t		key_end;
	hammer2_key_t		key_next;
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

/*
 * DEBUGGING - Assert that the chain will not collide.
 */
static
void
hammer2_chain_assert_not_present(hammer2_chain_core_t *core,
				 hammer2_chain_t *chain)
{
	struct hammer2_chain_find_info info;
	hammer2_chain_layer_t *layer;

	if (chain->flags & HAMMER2_CHAIN_DELETED)
		return;

	info.best = NULL;
	info.key_beg = chain->bref.key;
	info.key_end = chain->bref.key +
		       ((hammer2_key_t)1 << chain->bref.keybits) - 1;
	info.key_next = HAMMER2_MAX_KEY;

	TAILQ_FOREACH(layer, &core->layerq, entry) {
		KKASSERT(layer->good == 0xABCD);
		RB_SCAN(hammer2_chain_tree, &layer->rbtree,
			hammer2_chain_find_cmp, hammer2_chain_find_callback,
			&info);
	}
	if (info.best && (info.best->flags & HAMMER2_CHAIN_DELETED) == 0)
		panic("hammer2_chain_assert_not_present: %p/%p\n",
		      chain, info.best);
}

static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;
	hammer2_chain_layer_t *layer;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	KKASSERT(parent->core->good == 0x1234);
	TAILQ_FOREACH(layer, &parent->core->layerq, entry) {
		KKASSERT(layer->good == 0xABCD);
		RB_SCAN(hammer2_chain_tree, &layer->rbtree,
			hammer2_chain_find_cmp, hammer2_chain_find_callback,
			&info);
	}
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Do not discard DUPLICATED chains, it is possible that
	 *	    we are catching an insertion half-way done.  If a
	 *	    duplicated chain turns out to be the best choice the
	 *	    caller will re-check its flags after locking it.
	 *
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *	    the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * If our current best is flush with key_beg and child is
		 * also flush with key_beg choose based on delete_tid.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		if (child->delete_tid > best->delete_tid)
			info->best = child;
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Truncate key_next to the old best key iff it had a better
		 * delete_tid.
		 */
		info->best = child;
		if (best->delete_tid >= child->delete_tid &&
		    (info->key_next > best->bref.key || info->key_next == 0))
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then choose
		 * based on delete_tid.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		if (child->delete_tid > best->delete_tid)
			info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base iff the child has a higher delete_tid.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (child->delete_tid >= best->delete_tid &&
		    (info->key_next > child->bref.key || info->key_next == 0))
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  modify_tid will be
 * left 0 which forces any modifications to issue a delete-duplicate.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.  Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, hammer2_blockref_t *bref,
		  int generation)
{
	hammer2_mount_t *hmp = parent->hmp;
	hammer2_chain_core_t *above = parent->core;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
1855 */ 1856 chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref); 1857 hammer2_chain_core_alloc(NULL, chain, NULL); 1858 /* ref'd chain returned */ 1859 chain->modify_tid = chain->bref.mirror_tid; 1860 1861 /* 1862 * Link the chain into its parent. A spinlock is required to safely 1863 * access the RBTREE, and it is possible to collide with another 1864 * hammer2_chain_get() operation because the caller might only hold 1865 * a shared lock on the parent. 1866 */ 1867 KKASSERT(parent->refs > 0); 1868 error = hammer2_chain_insert(above, NULL, chain, 1869 HAMMER2_CHAIN_INSERT_SPIN | 1870 HAMMER2_CHAIN_INSERT_RACE, 1871 generation); 1872 if (error) { 1873 KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 1874 kprintf("chain %p get race\n", chain); 1875 hammer2_chain_drop(chain); 1876 chain = NULL; 1877 } else { 1878 KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE); 1879 } 1880 1881 /* 1882 * Return our new chain referenced but not locked, or NULL if 1883 * a race occurred. 1884 */ 1885 return (chain); 1886 } 1887 1888 /* 1889 * Lookup initialization/completion API 1890 */ 1891 hammer2_chain_t * 1892 hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags) 1893 { 1894 if (flags & HAMMER2_LOOKUP_SHARED) { 1895 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS | 1896 HAMMER2_RESOLVE_SHARED); 1897 } else { 1898 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 1899 } 1900 return (parent); 1901 } 1902 1903 void 1904 hammer2_chain_lookup_done(hammer2_chain_t *parent) 1905 { 1906 if (parent) 1907 hammer2_chain_unlock(parent); 1908 } 1909 1910 static 1911 hammer2_chain_t * 1912 hammer2_chain_getparent(hammer2_chain_t **parentp, int how) 1913 { 1914 hammer2_chain_t *oparent; 1915 hammer2_chain_t *bparent; 1916 hammer2_chain_t *nparent; 1917 hammer2_chain_core_t *above; 1918 1919 oparent = *parentp; 1920 above = oparent->above; 1921 1922 spin_lock(&above->cst.spin); 1923 bparent = TAILQ_FIRST(&above->ownerq); 1924 hammer2_chain_ref(bparent); 1925 1926 /* 1927 * Be careful of order, oparent must be unlocked before nparent 1928 * is locked below to avoid a deadlock. We might as well delay its 1929 * unlocking until we conveniently no longer have the spinlock (instead 1930 * of cycling the spinlock). 1931 * 1932 * Theoretically our ref on bparent should prevent elements of the 1933 * following chain from going away and prevent above from going away, 1934 * but we still need the spinlock to safely scan the list. 1935 */ 1936 for (;;) { 1937 nparent = bparent; 1938 while (nparent->flags & HAMMER2_CHAIN_DUPLICATED) 1939 nparent = TAILQ_NEXT(nparent, core_entry); 1940 hammer2_chain_ref(nparent); 1941 spin_unlock(&above->cst.spin); 1942 1943 if (oparent) { 1944 hammer2_chain_unlock(oparent); 1945 oparent = NULL; 1946 } 1947 hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF); 1948 hammer2_chain_drop(bparent); 1949 1950 /* 1951 * We might have raced a delete-duplicate. 1952 */ 1953 if ((nparent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) 1954 break; 1955 bparent = nparent; 1956 hammer2_chain_ref(bparent); 1957 hammer2_chain_unlock(nparent); 1958 spin_lock(&above->cst.spin); 1959 /* retry */ 1960 } 1961 *parentp = nparent; 1962 1963 return (nparent); 1964 } 1965 1966 /* 1967 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive. 1968 * (*parentp) typically points to an inode but can also point to a related 1969 * indirect block and this function will recurse upwards and find the inode 1970 * again. 
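 *
 * A typical caller-side iteration looks roughly like the sketch below
 * (illustrative only; "ip", "lbase", "lend", "key_next", and "cache_index"
 * are hypothetical caller variables):
 *
 *	parent = hammer2_chain_lookup_init(ip->chain, 0);
 *	chain = hammer2_chain_lookup(&parent, &key_next, lbase, lend,
 *				     &cache_index, 0);
 *	while (chain) {
 *		(process chain, which covers at most lbase..key_next - 1)
 *		chain = hammer2_chain_next(&parent, chain, &key_next,
 *					   key_next, lend,
 *					   &cache_index, 0);
 *	}
 *	hammer2_chain_lookup_done(parent);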
1971 * 1972 * (*parentp) must be exclusively locked and referenced and can be an inode 1973 * or an existing indirect block within the inode. 1974 * 1975 * On return (*parentp) will be modified to point at the deepest parent chain 1976 * element encountered during the search, as a helper for an insertion or 1977 * deletion. The new (*parentp) will be locked and referenced and the old 1978 * will be unlocked and dereferenced (no change if they are both the same). 1979 * 1980 * The matching chain will be returned exclusively locked. If NOLOCK is 1981 * requested the chain will be returned only referenced. 1982 * 1983 * NULL is returned if no match was found, but (*parentp) will still 1984 * potentially be adjusted. 1985 * 1986 * On return (*key_nextp) will point to an iterative value for key_beg. 1987 * (If NULL is returned (*key_nextp) is set to key_end). 1988 * 1989 * This function will also recurse up the chain if the key is not within the 1990 * current parent's range. (*parentp) can never be set to NULL. An iteration 1991 * can simply allow (*parentp) to float inside the loop. 1992 * 1993 * NOTE! chain->data is not always resolved. By default it will not be 1994 * resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF. Use 1995 * HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/ 1996 * BREF_TYPE_DATA as the device buffer can alias the logical file 1997 * buffer). 1998 */ 1999 hammer2_chain_t * 2000 hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp, 2001 hammer2_key_t key_beg, hammer2_key_t key_end, 2002 int *cache_indexp, int flags) 2003 { 2004 hammer2_mount_t *hmp; 2005 hammer2_chain_t *parent; 2006 hammer2_chain_t *chain; 2007 hammer2_blockref_t *base; 2008 hammer2_blockref_t *bref; 2009 hammer2_blockref_t bcopy; 2010 hammer2_key_t scan_beg; 2011 hammer2_key_t scan_end; 2012 hammer2_chain_core_t *above; 2013 int count = 0; 2014 int how_always = HAMMER2_RESOLVE_ALWAYS; 2015 int how_maybe = HAMMER2_RESOLVE_MAYBE; 2016 int how; 2017 int generation; 2018 int maxloops = 300000; 2019 int wasdup; 2020 hammer2_chain_t * volatile xxchain = NULL; 2021 volatile int xxchainwhy; 2022 2023 if (flags & HAMMER2_LOOKUP_ALWAYS) { 2024 how_maybe = how_always; 2025 how = HAMMER2_RESOLVE_ALWAYS; 2026 } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) { 2027 how = HAMMER2_RESOLVE_NEVER; 2028 } else { 2029 how = HAMMER2_RESOLVE_MAYBE; 2030 } 2031 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) { 2032 how_maybe |= HAMMER2_RESOLVE_SHARED; 2033 how_always |= HAMMER2_RESOLVE_SHARED; 2034 how |= HAMMER2_RESOLVE_SHARED; 2035 } 2036 2037 /* 2038 * Recurse (*parentp) upward if necessary until the parent completely 2039 * encloses the key range or we hit the inode. 2040 * 2041 * This function handles races against the flusher doing a delete- 2042 * duplicate above us and re-homes the parent to the duplicate in 2043 * that case, otherwise we'd wind up recursing down a stale chain. 2044 */ 2045 parent = *parentp; 2046 hmp = parent->hmp; 2047 2048 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2049 parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2050 scan_beg = parent->bref.key; 2051 scan_end = scan_beg + 2052 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 2053 if (key_beg >= scan_beg && key_end <= scan_end) 2054 break; 2055 parent = hammer2_chain_getparent(parentp, how_maybe); 2056 } 2057 2058 again: 2059 if (--maxloops == 0) 2060 panic("hammer2_chain_lookup: maxloops"); 2061 /* 2062 * Locate the blockref array. 
Currently we do a fully associative 2063 * search through the array. 2064 */ 2065 switch(parent->bref.type) { 2066 case HAMMER2_BREF_TYPE_INODE: 2067 /* 2068 * Special shortcut for embedded data returns the inode 2069 * itself. Callers must detect this condition and access 2070 * the embedded data (the strategy code does this for us). 2071 * 2072 * This is only applicable to regular files and softlinks. 2073 */ 2074 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) { 2075 if (flags & HAMMER2_LOOKUP_NOLOCK) 2076 hammer2_chain_ref(parent); 2077 else 2078 hammer2_chain_lock(parent, how_always); 2079 *key_nextp = key_end + 1; 2080 return (parent); 2081 } 2082 base = &parent->data->ipdata.u.blockset.blockref[0]; 2083 count = HAMMER2_SET_COUNT; 2084 break; 2085 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2086 case HAMMER2_BREF_TYPE_INDIRECT: 2087 /* 2088 * Handle MATCHIND on the parent 2089 */ 2090 if (flags & HAMMER2_LOOKUP_MATCHIND) { 2091 scan_beg = parent->bref.key; 2092 scan_end = scan_beg + 2093 ((hammer2_key_t)1 << parent->bref.keybits) - 1; 2094 if (key_beg == scan_beg && key_end == scan_end) { 2095 chain = parent; 2096 hammer2_chain_lock(chain, how_maybe); 2097 *key_nextp = scan_end + 1; 2098 goto done; 2099 } 2100 } 2101 /* 2102 * Optimize indirect blocks in the INITIAL state to avoid 2103 * I/O. 2104 */ 2105 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2106 base = NULL; 2107 } else { 2108 if (parent->data == NULL) 2109 panic("parent->data is NULL"); 2110 base = &parent->data->npdata[0]; 2111 } 2112 count = parent->bytes / sizeof(hammer2_blockref_t); 2113 break; 2114 case HAMMER2_BREF_TYPE_VOLUME: 2115 base = &hmp->voldata.sroot_blockset.blockref[0]; 2116 count = HAMMER2_SET_COUNT; 2117 break; 2118 case HAMMER2_BREF_TYPE_FREEMAP: 2119 base = &hmp->voldata.freemap_blockset.blockref[0]; 2120 count = HAMMER2_SET_COUNT; 2121 break; 2122 default: 2123 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 2124 parent->bref.type); 2125 base = NULL; /* safety */ 2126 count = 0; /* safety */ 2127 } 2128 2129 /* 2130 * Merged scan to find next candidate. 2131 * 2132 * hammer2_base_*() functions require the above->live_* fields 2133 * to be synchronized. 2134 * 2135 * We need to hold the spinlock to access the block array and RB tree 2136 * and to interlock chain creation. 2137 */ 2138 above = parent->core; 2139 if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) 2140 hammer2_chain_countbrefs(parent, base, count); 2141 2142 /* 2143 * Combined search 2144 */ 2145 spin_lock(&above->cst.spin); 2146 chain = hammer2_combined_find(parent, base, count, 2147 cache_indexp, key_nextp, 2148 key_beg, key_end, &bref); 2149 generation = above->generation; 2150 2151 /* 2152 * Exhausted parent chain, iterate. 2153 */ 2154 if (bref == NULL) { 2155 spin_unlock(&above->cst.spin); 2156 if (key_beg == key_end) /* short cut single-key case */ 2157 return (NULL); 2158 return (hammer2_chain_next(parentp, NULL, key_nextp, 2159 key_beg, key_end, 2160 cache_indexp, flags)); 2161 } 2162 2163 /* 2164 * Selected from blockref or in-memory chain. 
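 *
 * If only the on-media blockref was found the chain must be instantiated
 * with hammer2_chain_get() after releasing the spinlock; the saved bcopy
 * and the core's generation count are then used to detect insertion
 * races, retrying the lookup on a mismatch.  If an in-memory chain was
 * found it is simply referenced (and its DUPLICATED state sampled)
 * before the spinlock is released.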
2165 */ 2166 if (chain == NULL) { 2167 bcopy = *bref; 2168 spin_unlock(&above->cst.spin); 2169 chain = hammer2_chain_get(parent, &bcopy, generation); 2170 if (chain == NULL) { 2171 kprintf("retry lookup parent %p keys %016jx:%016jx\n", 2172 parent, key_beg, key_end); 2173 goto again; 2174 } 2175 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 2176 xxchain = chain; 2177 xxchainwhy = 1; 2178 hammer2_chain_drop(chain); 2179 goto again; 2180 } 2181 wasdup = 0; 2182 } else { 2183 hammer2_chain_ref(chain); 2184 wasdup = ((chain->flags & HAMMER2_CHAIN_DUPLICATED) != 0); 2185 spin_unlock(&above->cst.spin); 2186 } 2187 2188 /* 2189 * chain is referenced but not locked. We must lock the chain 2190 * to obtain definitive DUPLICATED/DELETED state 2191 */ 2192 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2193 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2194 hammer2_chain_lock(chain, how_maybe | HAMMER2_RESOLVE_NOREF); 2195 } else { 2196 hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF); 2197 } 2198 2199 /* 2200 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 2201 * 2202 * NOTE: Chain's key range is not relevant as there might be 2203 * one-offs within the range that are not deleted. 2204 * 2205 * NOTE: Lookups can race delete-duplicate because 2206 * delete-duplicate does not lock the parent's core 2207 * (they just use the spinlock on the core). We must 2208 * check for races by comparing the DUPLICATED flag before 2209 * releasing the spinlock with the flag after locking the 2210 * chain. 2211 */ 2212 if (chain->flags & HAMMER2_CHAIN_DELETED) { 2213 hammer2_chain_unlock(chain); 2214 if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 || wasdup) { 2215 key_beg = *key_nextp; 2216 if (key_beg == 0 || key_beg > key_end) 2217 return(NULL); 2218 } 2219 xxchain = chain; 2220 xxchainwhy = 2; 2221 goto again; 2222 } 2223 2224 /* 2225 * If the chain element is an indirect block it becomes the new 2226 * parent and we loop on it. We must maintain our top-down locks 2227 * to prevent the flusher from interfering (i.e. doing a 2228 * delete-duplicate and leaving us recursing down a deleted chain). 2229 * 2230 * The parent always has to be locked with at least RESOLVE_MAYBE 2231 * so we can access its data. It might need a fixup if the caller 2232 * passed incompatible flags. Be careful not to cause a deadlock 2233 * as a data-load requires an exclusive lock. 2234 * 2235 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key 2236 * range is within the requested key range we return the indirect 2237 * block and do NOT loop. This is usually only used to acquire 2238 * freemap nodes. 2239 */ 2240 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 2241 chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2242 hammer2_chain_unlock(parent); 2243 *parentp = parent = chain; 2244 xxchain = chain; 2245 xxchainwhy = 3; 2246 goto again; 2247 } 2248 done: 2249 /* 2250 * All done, return the chain 2251 */ 2252 XXChain = chain; 2253 XXChainWhy = xxchainwhy; 2254 return (chain); 2255 } 2256 2257 /* 2258 * After having issued a lookup we can iterate all matching keys. 2259 * 2260 * If chain is non-NULL we continue the iteration from just after it's index. 2261 * 2262 * If chain is NULL we assume the parent was exhausted and continue the 2263 * iteration at the next parent. 2264 * 2265 * parent must be locked on entry and remains locked throughout. chain's 2266 * lock status must match flags. Chain is always at least referenced. 2267 * 2268 * WARNING! 
The MATCHIND flag does not apply to this function. 2269 */ 2270 hammer2_chain_t * 2271 hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain, 2272 hammer2_key_t *key_nextp, 2273 hammer2_key_t key_beg, hammer2_key_t key_end, 2274 int *cache_indexp, int flags) 2275 { 2276 hammer2_chain_t *parent; 2277 int how_maybe; 2278 2279 /* 2280 * Calculate locking flags for upward recursion. 2281 */ 2282 how_maybe = HAMMER2_RESOLVE_MAYBE; 2283 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) 2284 how_maybe |= HAMMER2_RESOLVE_SHARED; 2285 2286 parent = *parentp; 2287 2288 /* 2289 * Calculate the next index and recalculate the parent if necessary. 2290 */ 2291 if (chain) { 2292 key_beg = chain->bref.key + 2293 ((hammer2_key_t)1 << chain->bref.keybits); 2294 if (flags & HAMMER2_LOOKUP_NOLOCK) 2295 hammer2_chain_drop(chain); 2296 else 2297 hammer2_chain_unlock(chain); 2298 2299 /* 2300 * Any scan where the lookup returned degenerate data embedded 2301 * in the inode has an invalid index and must terminate. 2302 */ 2303 if (chain == parent) 2304 return(NULL); 2305 if (key_beg == 0 || key_beg > key_end) 2306 return(NULL); 2307 chain = NULL; 2308 } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT && 2309 parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) { 2310 /* 2311 * We reached the end of the iteration. 2312 */ 2313 return (NULL); 2314 } else { 2315 /* 2316 * Continue iteration with next parent unless the current 2317 * parent covers the range. 2318 */ 2319 key_beg = parent->bref.key + 2320 ((hammer2_key_t)1 << parent->bref.keybits); 2321 if (key_beg == 0 || key_beg > key_end) 2322 return (NULL); 2323 parent = hammer2_chain_getparent(parentp, how_maybe); 2324 } 2325 2326 /* 2327 * And execute 2328 */ 2329 return (hammer2_chain_lookup(parentp, key_nextp, 2330 key_beg, key_end, 2331 cache_indexp, flags)); 2332 } 2333 2334 /* 2335 * Raw scan functions are similar to lookup/next but do not seek the parent 2336 * chain and do not skip stale chains. These functions are primarily used 2337 * by the recovery code. 2338 * 2339 * Parent and chain are locked, parent's data must be resolved. To acquire 2340 * the first sub-chain under parent pass chain == NULL. 2341 */ 2342 hammer2_chain_t * 2343 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t *chain, 2344 int *cache_indexp, int flags) 2345 { 2346 hammer2_mount_t *hmp; 2347 hammer2_blockref_t *base; 2348 hammer2_blockref_t *bref; 2349 hammer2_blockref_t bcopy; 2350 hammer2_chain_core_t *above; 2351 hammer2_key_t key; 2352 hammer2_key_t next_key; 2353 int count = 0; 2354 int how_always = HAMMER2_RESOLVE_ALWAYS; 2355 int how_maybe = HAMMER2_RESOLVE_MAYBE; 2356 int how; 2357 int generation; 2358 int maxloops = 300000; 2359 int wasdup; 2360 2361 hmp = parent->hmp; 2362 2363 /* 2364 * Scan flags borrowed from lookup 2365 */ 2366 if (flags & HAMMER2_LOOKUP_ALWAYS) { 2367 how_maybe = how_always; 2368 how = HAMMER2_RESOLVE_ALWAYS; 2369 } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) { 2370 how = HAMMER2_RESOLVE_NEVER; 2371 } else { 2372 how = HAMMER2_RESOLVE_MAYBE; 2373 } 2374 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) { 2375 how_maybe |= HAMMER2_RESOLVE_SHARED; 2376 how_always |= HAMMER2_RESOLVE_SHARED; 2377 how |= HAMMER2_RESOLVE_SHARED; 2378 } 2379 2380 /* 2381 * Calculate key to locate first/next element, unlocking the previous 2382 * element as we go. Be careful, the key calculation can overflow. 
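 * For example, a chain whose keyspace ends at HAMMER2_MAX_KEY causes the
 * key calculation below to wrap to 0, which is why a key of 0 (other than
 * the initial pass) terminates the scan.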
2383 */ 2384 if (chain) { 2385 key = chain->bref.key + 2386 ((hammer2_key_t)1 << chain->bref.keybits); 2387 hammer2_chain_unlock(chain); 2388 chain = NULL; 2389 if (key == 0) 2390 goto done; 2391 } else { 2392 key = 0; 2393 } 2394 2395 again: 2396 if (--maxloops == 0) 2397 panic("hammer2_chain_scan: maxloops"); 2398 /* 2399 * Locate the blockref array. Currently we do a fully associative 2400 * search through the array. 2401 */ 2402 switch(parent->bref.type) { 2403 case HAMMER2_BREF_TYPE_INODE: 2404 /* 2405 * An inode with embedded data has no sub-chains. 2406 */ 2407 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) 2408 goto done; 2409 base = &parent->data->ipdata.u.blockset.blockref[0]; 2410 count = HAMMER2_SET_COUNT; 2411 break; 2412 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2413 case HAMMER2_BREF_TYPE_INDIRECT: 2414 /* 2415 * Optimize indirect blocks in the INITIAL state to avoid 2416 * I/O. 2417 */ 2418 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2419 base = NULL; 2420 } else { 2421 if (parent->data == NULL) 2422 panic("parent->data is NULL"); 2423 base = &parent->data->npdata[0]; 2424 } 2425 count = parent->bytes / sizeof(hammer2_blockref_t); 2426 break; 2427 case HAMMER2_BREF_TYPE_VOLUME: 2428 base = &hmp->voldata.sroot_blockset.blockref[0]; 2429 count = HAMMER2_SET_COUNT; 2430 break; 2431 case HAMMER2_BREF_TYPE_FREEMAP: 2432 base = &hmp->voldata.freemap_blockset.blockref[0]; 2433 count = HAMMER2_SET_COUNT; 2434 break; 2435 default: 2436 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 2437 parent->bref.type); 2438 base = NULL; /* safety */ 2439 count = 0; /* safety */ 2440 } 2441 2442 /* 2443 * Merged scan to find next candidate. 2444 * 2445 * hammer2_base_*() functions require the above->live_* fields 2446 * to be synchronized. 2447 * 2448 * We need to hold the spinlock to access the block array and RB tree 2449 * and to interlock chain creation. 2450 */ 2451 if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) 2452 hammer2_chain_countbrefs(parent, base, count); 2453 2454 above = parent->core; 2455 next_key = 0; 2456 spin_lock(&above->cst.spin); 2457 chain = hammer2_combined_find(parent, base, count, 2458 cache_indexp, &next_key, 2459 key, HAMMER2_MAX_KEY, &bref); 2460 generation = above->generation; 2461 2462 /* 2463 * Exhausted parent chain, we're done. 2464 */ 2465 if (bref == NULL) { 2466 spin_unlock(&above->cst.spin); 2467 KKASSERT(chain == NULL); 2468 goto done; 2469 } 2470 2471 /* 2472 * Selected from blockref or in-memory chain. 2473 */ 2474 if (chain == NULL) { 2475 bcopy = *bref; 2476 spin_unlock(&above->cst.spin); 2477 chain = hammer2_chain_get(parent, &bcopy, generation); 2478 if (chain == NULL) { 2479 kprintf("retry scan parent %p keys %016jx\n", 2480 parent, key); 2481 goto again; 2482 } 2483 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 2484 hammer2_chain_drop(chain); 2485 chain = NULL; 2486 goto again; 2487 } 2488 wasdup = 0; 2489 } else { 2490 hammer2_chain_ref(chain); 2491 wasdup = ((chain->flags & HAMMER2_CHAIN_DUPLICATED) != 0); 2492 spin_unlock(&above->cst.spin); 2493 } 2494 2495 /* 2496 * chain is referenced but not locked. We must lock the chain 2497 * to obtain definitive DUPLICATED/DELETED state 2498 */ 2499 hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF); 2500 2501 /* 2502 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 2503 * 2504 * NOTE: chain's key range is not relevant as there might be 2505 * one-offs within the range that are not deleted. 
2506 * 2507 * NOTE: XXX this could create problems with scans used in 2508 * situations other than mount-time recovery. 2509 * 2510 * NOTE: Lookups can race delete-duplicate because 2511 * delete-duplicate does not lock the parent's core 2512 * (they just use the spinlock on the core). We must 2513 * check for races by comparing the DUPLICATED flag before 2514 * releasing the spinlock with the flag after locking the 2515 * chain. 2516 */ 2517 if (chain->flags & HAMMER2_CHAIN_DELETED) { 2518 hammer2_chain_unlock(chain); 2519 if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 || wasdup) { 2520 key = next_key; 2521 if (key == 0) 2522 goto done; 2523 } 2524 chain = NULL; 2525 2526 goto again; 2527 } 2528 2529 done: 2530 /* 2531 * All done, return the chain or NULL 2532 */ 2533 return (chain); 2534 } 2535 2536 /* 2537 * Create and return a new hammer2 system memory structure of the specified 2538 * key, type and size and insert it under (*parentp). This is a full 2539 * insertion, based on the supplied key/keybits, and may involve creating 2540 * indirect blocks and moving other chains around via delete/duplicate. 2541 * 2542 * (*parentp) must be exclusively locked and may be replaced on return 2543 * depending on how much work the function had to do. 2544 * 2545 * (*chainp) usually starts out NULL and returns the newly created chain, 2546 * but if the caller desires the caller may allocate a disconnected chain 2547 * and pass it in instead. (It is also possible for the caller to use 2548 * chain_duplicate() to create a disconnected chain, manipulate it, then 2549 * pass it into this function to insert it). 2550 * 2551 * This function should NOT be used to insert INDIRECT blocks. It is 2552 * typically used to create/insert inodes and data blocks. 2553 * 2554 * Caller must pass in an exclusively locked parent the new chain is to 2555 * be inserted under, and optionally pass in a disconnected, exclusively 2556 * locked chain to insert (else we create a new chain). The function will 2557 * adjust (*parentp) as necessary, create or connect the chain, and 2558 * return an exclusively locked chain in *chainp. 2559 */ 2560 int 2561 hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp, 2562 hammer2_chain_t **chainp, 2563 hammer2_key_t key, int keybits, int type, size_t bytes) 2564 { 2565 hammer2_mount_t *hmp; 2566 hammer2_chain_t *chain; 2567 hammer2_chain_t *parent = *parentp; 2568 hammer2_chain_core_t *above; 2569 hammer2_blockref_t *base; 2570 hammer2_blockref_t dummy; 2571 int allocated = 0; 2572 int error = 0; 2573 int count; 2574 int maxloops = 300000; 2575 2576 above = parent->core; 2577 KKASSERT(ccms_thread_lock_owned(&above->cst)); 2578 hmp = parent->hmp; 2579 chain = *chainp; 2580 2581 if (chain == NULL) { 2582 /* 2583 * First allocate media space and construct the dummy bref, 2584 * then allocate the in-memory chain structure. Set the 2585 * INITIAL flag for fresh chains which do not have embedded 2586 * data. 2587 */ 2588 bzero(&dummy, sizeof(dummy)); 2589 dummy.type = type; 2590 dummy.key = key; 2591 dummy.keybits = keybits; 2592 dummy.data_off = hammer2_getradix(bytes); 2593 dummy.methods = parent->bref.methods; 2594 chain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy); 2595 hammer2_chain_core_alloc(trans, chain, NULL); 2596 2597 /* 2598 * Lock the chain manually, chain_lock will load the chain 2599 * which we do NOT want to do. (note: chain->refs is set 2600 * to 1 by chain_alloc() for us, but lockcnt is not).
2601 */ 2602 chain->lockcnt = 1; 2603 ccms_thread_lock(&chain->core->cst, CCMS_STATE_EXCLUSIVE); 2604 allocated = 1; 2605 2606 /* 2607 * We do NOT set INITIAL here (yet). INITIAL is only 2608 * used for indirect blocks. 2609 * 2610 * Recalculate bytes to reflect the actual media block 2611 * allocation. 2612 */ 2613 bytes = (hammer2_off_t)1 << 2614 (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 2615 chain->bytes = bytes; 2616 2617 switch(type) { 2618 case HAMMER2_BREF_TYPE_VOLUME: 2619 case HAMMER2_BREF_TYPE_FREEMAP: 2620 panic("hammer2_chain_create: called with volume type"); 2621 break; 2622 case HAMMER2_BREF_TYPE_INODE: 2623 KKASSERT(bytes == HAMMER2_INODE_BYTES); 2624 atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED); 2625 chain->data = kmalloc(sizeof(chain->data->ipdata), 2626 hmp->mchain, M_WAITOK | M_ZERO); 2627 break; 2628 case HAMMER2_BREF_TYPE_INDIRECT: 2629 panic("hammer2_chain_create: cannot be used to " 2630 "create indirect block"); 2631 break; 2632 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2633 panic("hammer2_chain_create: cannot be used to " 2634 "create freemap root or node"); 2635 break; 2636 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2637 KKASSERT(bytes == sizeof(chain->data->bmdata)); 2638 atomic_set_int(&chain->flags, HAMMER2_CHAIN_EMBEDDED); 2639 chain->data = kmalloc(sizeof(chain->data->bmdata), 2640 hmp->mchain, M_WAITOK | M_ZERO); 2641 break; 2642 case HAMMER2_BREF_TYPE_DATA: 2643 default: 2644 /* 2645 * leave chain->data NULL, set INITIAL 2646 */ 2647 KKASSERT(chain->data == NULL); 2648 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 2649 break; 2650 } 2651 } else { 2652 /* 2653 * We are reattaching a chain that has been duplicated and 2654 * left disconnected under a DIFFERENT parent with potentially 2655 * different key/keybits. 2656 * 2657 * The chain must be modified in the current transaction 2658 * (the duplication code should have done that for us), 2659 * and its modify_tid should be greater than the parent's 2660 * bref.mirror_tid. This should cause it to be created under 2661 * the new parent. 2662 * 2663 * If deleted in the same transaction, the create/delete TIDs 2664 * will be the same and effectively the chain will not have 2665 * existed at all from the point of view of the parent. 2666 * 2667 * Do NOT mess with the current state of the INITIAL flag. 2668 */ 2669 KKASSERT(chain->modify_tid > parent->bref.mirror_tid); 2670 KKASSERT(chain->modify_tid == trans->sync_tid); 2671 chain->bref.key = key; 2672 chain->bref.keybits = keybits; 2673 /* chain->modify_tid = chain->bref.mirror_tid; */ 2674 KKASSERT(chain->above == NULL); 2675 } 2676 2677 /* 2678 * Calculate how many entries we have in the blockref array and 2679 * determine if an indirect block is required.
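 * For example, an inode parent embeds a fixed HAMMER2_SET_COUNT-entry
 * blockset while an indirect block holds (parent->bytes /
 * sizeof(hammer2_blockref_t)) entries; once above->live_count reaches
 * that count a new indirect block must be interposed before the new
 * chain can be linked in.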
2680 */ 2681 again: 2682 if (--maxloops == 0) 2683 panic("hammer2_chain_create: maxloops"); 2684 above = parent->core; 2685 2686 switch(parent->bref.type) { 2687 case HAMMER2_BREF_TYPE_INODE: 2688 KKASSERT((parent->data->ipdata.op_flags & 2689 HAMMER2_OPFLAG_DIRECTDATA) == 0); 2690 KKASSERT(parent->data != NULL); 2691 base = &parent->data->ipdata.u.blockset.blockref[0]; 2692 count = HAMMER2_SET_COUNT; 2693 break; 2694 case HAMMER2_BREF_TYPE_INDIRECT: 2695 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2696 if (parent->flags & HAMMER2_CHAIN_INITIAL) 2697 base = NULL; 2698 else 2699 base = &parent->data->npdata[0]; 2700 count = parent->bytes / sizeof(hammer2_blockref_t); 2701 break; 2702 case HAMMER2_BREF_TYPE_VOLUME: 2703 KKASSERT(parent->data != NULL); 2704 base = &hmp->voldata.sroot_blockset.blockref[0]; 2705 count = HAMMER2_SET_COUNT; 2706 break; 2707 case HAMMER2_BREF_TYPE_FREEMAP: 2708 KKASSERT(parent->data != NULL); 2709 base = &hmp->voldata.freemap_blockset.blockref[0]; 2710 count = HAMMER2_SET_COUNT; 2711 break; 2712 default: 2713 panic("hammer2_chain_create: unrecognized blockref type: %d", 2714 parent->bref.type); 2715 base = NULL; 2716 count = 0; 2717 break; 2718 } 2719 2720 /* 2721 * Make sure we've counted the brefs 2722 */ 2723 if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) 2724 hammer2_chain_countbrefs(parent, base, count); 2725 2726 KKASSERT(above->live_count >= 0 && above->live_count <= count); 2727 2728 /* 2729 * If no free blockref could be found we must create an indirect 2730 * block and move a number of blockrefs into it. With the parent 2731 * locked we can safely lock each child in order to delete+duplicate 2732 * it without causing a deadlock. 2733 * 2734 * This may return the new indirect block or the old parent depending 2735 * on where the key falls. NULL is returned on error. 2736 */ 2737 if (above->live_count == count) { 2738 hammer2_chain_t *nparent; 2739 2740 nparent = hammer2_chain_create_indirect(trans, parent, 2741 key, keybits, 2742 type, &error); 2743 if (nparent == NULL) { 2744 if (allocated) 2745 hammer2_chain_drop(chain); 2746 chain = NULL; 2747 goto done; 2748 } 2749 if (parent != nparent) { 2750 hammer2_chain_unlock(parent); 2751 parent = *parentp = nparent; 2752 } 2753 goto again; 2754 } 2755 2756 /* 2757 * Link the chain into its parent. Later on we will have to set 2758 * the MOVED bit in situations where we don't mark the new chain 2759 * as being modified. 2760 */ 2761 if (chain->above != NULL) 2762 panic("hammer2: hammer2_chain_create: chain already connected"); 2763 KKASSERT(chain->above == NULL); 2764 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0); 2765 hammer2_chain_insert(above, NULL, chain, 2766 HAMMER2_CHAIN_INSERT_SPIN | 2767 HAMMER2_CHAIN_INSERT_LIVE, 2768 0); 2769 2770 if (allocated) { 2771 /* 2772 * Mark the newly created chain modified. 2773 * 2774 * Device buffers are not instantiated for DATA elements 2775 * as these are handled by logical buffers. 2776 * 2777 * Indirect and freemap node indirect blocks are handled 2778 * by hammer2_chain_create_indirect() and not by this 2779 * function. 2780 * 2781 * Data for all other bref types is expected to be 2782 * instantiated (INODE, LEAF). 
2783 */ 2784 switch(chain->bref.type) { 2785 case HAMMER2_BREF_TYPE_DATA: 2786 hammer2_chain_modify(trans, &chain, 2787 HAMMER2_MODIFY_OPTDATA | 2788 HAMMER2_MODIFY_ASSERTNOCOPY); 2789 break; 2790 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2791 case HAMMER2_BREF_TYPE_INODE: 2792 hammer2_chain_modify(trans, &chain, 2793 HAMMER2_MODIFY_ASSERTNOCOPY); 2794 break; 2795 default: 2796 /* 2797 * Remaining types are not supported by this function. 2798 * In particular, INDIRECT and LEAF_NODE types are 2799 * handled by create_indirect(). 2800 */ 2801 panic("hammer2_chain_create: bad type: %d", 2802 chain->bref.type); 2803 /* NOT REACHED */ 2804 break; 2805 } 2806 } else { 2807 /* 2808 * When reconnecting a chain we must set MOVED and setsubmod 2809 * so the flush recognizes that it must update the bref in 2810 * the parent. 2811 */ 2812 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 2813 hammer2_chain_ref(chain); 2814 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); 2815 } 2816 } 2817 hammer2_chain_setsubmod(trans, chain); 2818 2819 done: 2820 *chainp = chain; 2821 2822 return (error); 2823 } 2824 2825 /* 2826 * Replace (*chainp) with a duplicate in-memory chain structure which shares 2827 * the same core and media state as the orignal. The original *chainp is 2828 * unlocked and the replacement will be returned locked. 2829 * 2830 * The old chain must be in a DELETED state unless snapshot is non-zero. 2831 * 2832 * The new chain will be live (i.e. not deleted), and modified. 2833 * 2834 * If (parent) is non-NULL then the new duplicated chain is inserted under 2835 * the parent. 2836 * 2837 * If (parent) is NULL then the newly duplicated chain is not inserted 2838 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 2839 * passing into hammer2_chain_create() after this function returns). 2840 * 2841 * WARNING! This function cannot take snapshots all by itself. The caller 2842 * needs to do other massaging for snapshots. 2843 * 2844 * WARNING! This function calls create which means it can insert indirect 2845 * blocks. Callers may have to refactor locked chains held across 2846 * the call (other than the ones passed into the call). 2847 */ 2848 static void hammer2_chain_dup_fixup(hammer2_chain_t *ochain, 2849 hammer2_chain_t *nchain); 2850 2851 void 2852 hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp, 2853 hammer2_chain_t **chainp, hammer2_blockref_t *bref, 2854 int snapshot, int duplicate_reason) 2855 { 2856 hammer2_mount_t *hmp; 2857 hammer2_chain_t *parent; 2858 hammer2_chain_t *ochain; 2859 hammer2_chain_t *nchain; 2860 hammer2_chain_core_t *above; 2861 size_t bytes; 2862 2863 /* 2864 * We want nchain to be our go-to live chain, but ochain may be in 2865 * a MODIFIED state within the current flush synchronization segment. 2866 * Force any further modifications of ochain to do another COW 2867 * operation even if modify_tid indicates that one is not needed. 2868 * 2869 * We don't want to set FORCECOW on nchain simply as an optimization, 2870 * as many duplication calls simply move chains into ichains and 2871 * then delete the original. 2872 * 2873 * WARNING! We should never resolve DATA to device buffers 2874 * (XXX allow it if the caller did?), and since 2875 * we currently do not have the logical buffer cache 2876 * buffer in-hand to fix its cached physical offset 2877 * we also force the modify code to not COW it. 
XXX 2878 */ 2879 ochain = *chainp; 2880 hmp = ochain->hmp; 2881 KKASSERT(snapshot == 1 || (ochain->flags & HAMMER2_CHAIN_DELETED)); 2882 2883 atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW); 2884 2885 /* 2886 * Now create a duplicate of the chain structure, associating 2887 * it with the same core, making it the same size, pointing it 2888 * to the same bref (the same media block). 2889 * 2890 * Give the duplicate the same modify_tid that we previously 2891 * ensured was sufficiently advanced to trigger a block table 2892 * insertion on flush. 2893 * 2894 * NOTE: bref.mirror_tid duplicated by virtue of bref copy in 2895 * hammer2_chain_alloc() 2896 */ 2897 if (bref == NULL) 2898 bref = &ochain->bref; 2899 if (snapshot) { 2900 nchain = hammer2_chain_alloc(hmp, NULL, trans, bref); 2901 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SNAPSHOT); 2902 } else { 2903 nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, bref); 2904 } 2905 hammer2_chain_core_alloc(trans, nchain, ochain); 2906 bytes = (hammer2_off_t)1 << 2907 (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 2908 nchain->bytes = bytes; 2909 nchain->modify_tid = ochain->modify_tid; 2910 nchain->inode_reason = ochain->inode_reason + 0x100000; 2911 if (ochain->flags & HAMMER2_CHAIN_INITIAL) 2912 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL); 2913 2914 /* 2915 * Fixup (copy) any embedded data. Non-embedded data relies on the 2916 * media block. We must unlock ochain before we can access nchain's 2917 * media block because they might share the same bp and deadlock if 2918 * we don't. 2919 */ 2920 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER | 2921 HAMMER2_RESOLVE_NOREF); 2922 hammer2_chain_dup_fixup(ochain, nchain); 2923 /* nchain has 1 ref */ 2924 hammer2_chain_unlock(ochain); 2925 KKASSERT((ochain->flags & HAMMER2_CHAIN_EMBEDDED) || 2926 ochain->data == NULL); 2927 2928 /* 2929 * Place nchain in the modified state, instantiate media data 2930 * if necessary. Because modify_tid is already completely 2931 * synchronized this should not result in a delete-duplicate. 2932 * 2933 * We want nchain at the target to look like a new insertion. 2934 * Forcing the modification to be INPLACE accomplishes this 2935 * because we get the same nchain with an updated modify_tid. 2936 */ 2937 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) { 2938 hammer2_chain_modify(trans, &nchain, 2939 HAMMER2_MODIFY_OPTDATA | 2940 HAMMER2_MODIFY_NOREALLOC | 2941 HAMMER2_MODIFY_INPLACE); 2942 } else if (nchain->flags & HAMMER2_CHAIN_INITIAL) { 2943 hammer2_chain_modify(trans, &nchain, 2944 HAMMER2_MODIFY_OPTDATA | 2945 HAMMER2_MODIFY_INPLACE); 2946 } else { 2947 hammer2_chain_modify(trans, &nchain, 2948 HAMMER2_MODIFY_INPLACE); 2949 } 2950 2951 /* 2952 * If parent is not NULL the duplicated chain will be entered under 2953 * the parent and the MOVED bit set. 2954 * 2955 * Having both chains locked is extremely important for atomicy. 
2956 */ 2957 if (parentp && (parent = *parentp) != NULL) { 2958 above = parent->core; 2959 KKASSERT(ccms_thread_lock_owned(&above->cst)); 2960 KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0); 2961 KKASSERT(parent->refs > 0); 2962 2963 hammer2_chain_create(trans, parentp, &nchain, 2964 nchain->bref.key, nchain->bref.keybits, 2965 nchain->bref.type, nchain->bytes); 2966 parent = NULL; 2967 2968 if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) { 2969 hammer2_chain_ref(nchain); 2970 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED); 2971 } 2972 hammer2_chain_setsubmod(trans, nchain); 2973 } 2974 2975 #if 0 2976 /* 2977 * Unconditionally set MOVED to force the parent blockrefs to 2978 * update, and adjust update_hi below nchain so nchain's 2979 * blockrefs are updated with the new attachment. 2980 */ 2981 if (nchain->core->update_hi < trans->sync_tid) { 2982 spin_lock(&nchain->core->cst.spin); 2983 if (nchain->core->update_hi < trans->sync_tid) 2984 nchain->core->update_hi = trans->sync_tid; 2985 spin_unlock(&nchain->core->cst.spin); 2986 } 2987 #endif 2988 2989 *chainp = nchain; 2990 } 2991 2992 /* 2993 * Special in-place delete-duplicate sequence which does not require a 2994 * locked parent. (*chainp) is marked DELETED and atomically replaced 2995 * with a duplicate. Atomicy is at the very-fine spin-lock level in 2996 * order to ensure that lookups do not race us. 2997 * 2998 * If the old chain is already marked deleted the new chain will also be 2999 * marked deleted. This case can occur when an inode is removed from the 3000 * filesystem but programs still have an open descriptor to it, and during 3001 * flushes when the flush needs to operate on a chain that is deleted in 3002 * the live view but still alive in the flush view. 3003 * 3004 * The new chain will be marked modified for the current transaction. 3005 */ 3006 void 3007 hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp, 3008 int flags) 3009 { 3010 hammer2_mount_t *hmp; 3011 hammer2_chain_t *ochain; 3012 hammer2_chain_t *nchain; 3013 hammer2_chain_core_t *above; 3014 size_t bytes; 3015 3016 if (hammer2_debug & 0x20000) 3017 Debugger("dd"); 3018 3019 /* 3020 * Note that we do not have to call setsubmod on ochain, calling it 3021 * on nchain is sufficient. 3022 */ 3023 ochain = *chainp; 3024 hmp = ochain->hmp; 3025 3026 if (ochain->bref.type == HAMMER2_BREF_TYPE_INODE) { 3027 KKASSERT(ochain->data); 3028 } 3029 3030 /* 3031 * First create a duplicate of the chain structure. 3032 * (nchain is allocated with one ref). 3033 * 3034 * In the case where nchain inherits ochains core, nchain is 3035 * effectively locked due to ochain being locked (and sharing the 3036 * core), until we can give nchain its own official ock. 3037 */ 3038 nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, &ochain->bref); 3039 if (flags & HAMMER2_DELDUP_RECORE) 3040 hammer2_chain_core_alloc(trans, nchain, NULL); 3041 else 3042 hammer2_chain_core_alloc(trans, nchain, ochain); 3043 above = ochain->above; 3044 3045 bytes = (hammer2_off_t)1 << 3046 (int)(ochain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 3047 nchain->bytes = bytes; 3048 3049 /* 3050 * Duplicate inherits ochain's live state including its modification 3051 * state. This function disposes of the original. Because we are 3052 * doing this in-place under the same parent the block array 3053 * inserted/deleted state does not change. 
3054 * 3055 * The caller isn't expected to make further modifications of ochain 3056 * but set the FORCECOW bit anyway, just in case it does. If ochain 3057 * was previously marked FORCECOW we also flag nchain FORCECOW 3058 * (used during hardlink splits). FORCECOW forces a reallocation 3059 * of the block when we modify the chain a little later, it does 3060 * not force another delete-duplicate. 3061 * 3062 * NOTE: bref.mirror_tid duplicated by virtue of bref copy in 3063 * hammer2_chain_alloc() 3064 */ 3065 nchain->data_count += ochain->data_count; 3066 nchain->inode_count += ochain->inode_count; 3067 atomic_set_int(&nchain->flags, 3068 ochain->flags & (HAMMER2_CHAIN_INITIAL | 3069 HAMMER2_CHAIN_FORCECOW)); 3070 atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW); 3071 nchain->inode_reason = ochain->inode_reason + 0x1000; 3072 3073 /* 3074 * Lock nchain so both chains are now locked (extremely important 3075 * for atomicy). Mark ochain deleted and reinsert into the topology 3076 * and insert nchain all in one go. 3077 * 3078 * If the ochain is already deleted it is left alone and nchain 3079 * is inserted into the topology as a deleted chain. This is 3080 * important because it allows ongoing operations to be executed 3081 * on a deleted inode which still has open descriptors. 3082 * 3083 * The deleted case can also occur when a flush delete-duplicates 3084 * a node which is being concurrently modified by ongoing operations 3085 * in a later transaction. This creates a problem because the flush 3086 * is intended to update blockrefs which then propagate, allowing 3087 * the original covering in-memory chains to be freed up. In this 3088 * situation the flush code does NOT free the original covering 3089 * chains and will re-apply them to successive copies. 3090 */ 3091 hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER); 3092 hammer2_chain_dup_fixup(ochain, nchain); 3093 /* extra ref still present from original allocation */ 3094 3095 KKASSERT(ochain->flags & HAMMER2_CHAIN_ONRBTREE); 3096 spin_lock(&above->cst.spin); 3097 KKASSERT(ochain->flags & HAMMER2_CHAIN_ONRBTREE); 3098 3099 /* 3100 * Ultimately nchain->modify_tid will be set to trans->sync_tid, 3101 * but we can't do that here because we want to call 3102 * hammer2_chain_modify() to reallocate the block (if necessary). 3103 */ 3104 nchain->modify_tid = ochain->modify_tid; 3105 3106 if (ochain->flags & HAMMER2_CHAIN_DELETED) { 3107 /* 3108 * ochain was deleted 3109 */ 3110 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DELETED); 3111 if (ochain->delete_tid > trans->sync_tid) { 3112 /* 3113 * delete-duplicate a chain deleted in a later 3114 * transaction. Only allowed on chains created 3115 * before or during the current transaction (flush 3116 * code should filter out chains created after the 3117 * current transaction). 3118 * 3119 * To make this work is a bit of a hack. We convert 3120 * ochain's delete_tid to the current sync_tid and 3121 * create a nchain which sets up ochains original 3122 * delete_tid. 3123 * 3124 * This effectively forces ochain to flush as a 3125 * deletion and nchain as a creation. Thus MOVED 3126 * must be set in ochain (it should already be 3127 * set since it's original delete_tid could not 3128 * have been flushed yet). Since ochain's delete_tid 3129 * has been moved down to sync_tid, a re-flush at 3130 * sync_tid won't try to delete-duplicate ochain 3131 * again. 
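 *
 * Hypothetical example: if ochain was deleted at TID 30 while the
 * current flush's sync_tid is 20, nchain inherits delete_tid 30 and
 * ochain's delete_tid is pulled down to 20, so the flush sees ochain
 * end at 20 and nchain take over from 20 to 30.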
3132 */ 3133 KKASSERT(ochain->modify_tid <= trans->sync_tid); 3134 nchain->delete_tid = ochain->delete_tid; 3135 ochain->delete_tid = trans->sync_tid; 3136 KKASSERT(ochain->flags & HAMMER2_CHAIN_MOVED); 3137 } else if (ochain->delete_tid == trans->sync_tid) { 3138 /* 3139 * ochain was deleted in the current transaction 3140 */ 3141 nchain->delete_tid = trans->sync_tid; 3142 } else { 3143 /* 3144 * ochain was deleted in a prior transaction. 3145 * create and delete nchain in the current 3146 * transaction. 3147 * 3148 * (delete_tid might represent a deleted inode 3149 * which still has an open descriptor). 3150 */ 3151 nchain->delete_tid = trans->sync_tid; 3152 } 3153 hammer2_chain_insert(above, ochain->inlayer, nchain, 0, 0); 3154 } else { 3155 /* 3156 * ochain was not deleted, delete it in the current 3157 * transaction. 3158 */ 3159 KKASSERT(trans->sync_tid >= ochain->modify_tid); 3160 ochain->delete_tid = trans->sync_tid; 3161 atomic_set_int(&ochain->flags, HAMMER2_CHAIN_DELETED); 3162 atomic_add_int(&above->live_count, -1); 3163 hammer2_chain_insert(above, NULL, nchain, 3164 HAMMER2_CHAIN_INSERT_LIVE, 0); 3165 } 3166 3167 if ((ochain->flags & HAMMER2_CHAIN_MOVED) == 0) { 3168 hammer2_chain_ref(ochain); 3169 atomic_set_int(&ochain->flags, HAMMER2_CHAIN_MOVED); 3170 } 3171 spin_unlock(&above->cst.spin); 3172 3173 /* 3174 * ochain must be unlocked because ochain and nchain might share 3175 * a buffer cache buffer, so we need to release it so nchain can 3176 * potentially obtain it. 3177 */ 3178 hammer2_chain_unlock(ochain); 3179 3180 /* 3181 * Finishing fixing up nchain. A new block will be allocated if 3182 * crossing a synchronization point (meta-data only). 3183 * 3184 * Calling hammer2_chain_modify() will update modify_tid to 3185 * (typically) trans->sync_tid. 3186 */ 3187 if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) { 3188 hammer2_chain_modify(trans, &nchain, 3189 HAMMER2_MODIFY_OPTDATA | 3190 HAMMER2_MODIFY_NOREALLOC | 3191 HAMMER2_MODIFY_INPLACE); 3192 } else if (nchain->flags & HAMMER2_CHAIN_INITIAL) { 3193 hammer2_chain_modify(trans, &nchain, 3194 HAMMER2_MODIFY_OPTDATA | 3195 HAMMER2_MODIFY_INPLACE); 3196 } else { 3197 hammer2_chain_modify(trans, &nchain, 3198 HAMMER2_MODIFY_INPLACE); 3199 } 3200 hammer2_chain_drop(nchain); 3201 3202 /* 3203 * Unconditionally set MOVED to force the parent blockrefs to 3204 * update as the chain_modify() above won't necessarily do it. 3205 * 3206 * Adjust update_hi below nchain so nchain's blockrefs are updated 3207 * with the new attachment. 3208 */ 3209 if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) { 3210 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED); 3211 hammer2_chain_ref(nchain); 3212 } 3213 #if 0 3214 if (nchain->core->update_hi < trans->sync_tid) { 3215 spin_lock(&nchain->core->cst.spin); 3216 if (nchain->core->update_hi < trans->sync_tid) 3217 nchain->core->update_hi = trans->sync_tid; 3218 spin_unlock(&nchain->core->cst.spin); 3219 } 3220 #endif 3221 hammer2_chain_setsubmod(trans, nchain); 3222 *chainp = nchain; 3223 } 3224 3225 /* 3226 * Helper function to fixup inodes. The caller procedure stack may hold 3227 * multiple locks on ochain if it represents an inode, preventing our 3228 * unlock from retiring its state to the buffer cache. 3229 * 3230 * In this situation any attempt to access the buffer cache could result 3231 * either in stale data or a deadlock. Work around the problem by copying 3232 * the embedded data directly. 
3233 */ 3234 static 3235 void 3236 hammer2_chain_dup_fixup(hammer2_chain_t *ochain, hammer2_chain_t *nchain) 3237 { 3238 if (ochain->data == NULL) 3239 return; 3240 switch(ochain->bref.type) { 3241 case HAMMER2_BREF_TYPE_INODE: 3242 KKASSERT(nchain->data == NULL); 3243 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_EMBEDDED); 3244 nchain->data = kmalloc(sizeof(nchain->data->ipdata), 3245 ochain->hmp->mchain, M_WAITOK | M_ZERO); 3246 nchain->data->ipdata = ochain->data->ipdata; 3247 break; 3248 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 3249 KKASSERT(nchain->data == NULL); 3250 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_EMBEDDED); 3251 nchain->data = kmalloc(sizeof(nchain->data->bmdata), 3252 ochain->hmp->mchain, M_WAITOK | M_ZERO); 3253 bcopy(ochain->data->bmdata, 3254 nchain->data->bmdata, 3255 sizeof(nchain->data->bmdata)); 3256 break; 3257 default: 3258 break; 3259 } 3260 } 3261 3262 /* 3263 * Create a snapshot of the specified {parent, ochain} with the specified 3264 * label. The originating hammer2_inode must be exclusively locked for 3265 * safety. 3266 * 3267 * The ioctl code has already synced the filesystem. 3268 */ 3269 int 3270 hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_chain_t **ochainp, 3271 hammer2_ioc_pfs_t *pfs) 3272 { 3273 hammer2_mount_t *hmp; 3274 hammer2_chain_t *ochain = *ochainp; 3275 hammer2_chain_t *nchain; 3276 hammer2_inode_data_t *ipdata; 3277 hammer2_inode_t *nip; 3278 size_t name_len; 3279 hammer2_key_t lhc; 3280 struct vattr vat; 3281 uuid_t opfs_clid; 3282 int error; 3283 3284 kprintf("snapshot %s ochain->refs %d ochain->flags %08x\n", 3285 pfs->name, ochain->refs, ochain->flags); 3286 3287 name_len = strlen(pfs->name); 3288 lhc = hammer2_dirhash(pfs->name, name_len); 3289 3290 hmp = ochain->hmp; 3291 opfs_clid = ochain->data->ipdata.pfs_clid; 3292 3293 *ochainp = ochain; 3294 3295 /* 3296 * Create the snapshot directory under the super-root 3297 * 3298 * Set PFS type, generate a unique filesystem id, and generate 3299 * a cluster id. Use the same clid when snapshotting a PFS root, 3300 * which theoretically allows the snapshot to be used as part of 3301 * the same cluster (perhaps as a cache). 3302 * 3303 * Copy the (flushed) ochain's blockref array. Theoretically we 3304 * could use chain_duplicate() but it becomes difficult to disentangle 3305 * the shared core so for now just brute-force it. 3306 */ 3307 VATTR_NULL(&vat); 3308 vat.va_type = VDIR; 3309 vat.va_mode = 0755; 3310 nchain = NULL; 3311 nip = hammer2_inode_create(trans, hmp->sroot, &vat, proc0.p_ucred, 3312 pfs->name, name_len, &nchain, &error); 3313 3314 if (nip) { 3315 ipdata = hammer2_chain_modify_ip(trans, nip, &nchain, 0); 3316 ipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT; 3317 kern_uuidgen(&ipdata->pfs_fsid, 1); 3318 if (ochain->flags & HAMMER2_CHAIN_PFSROOT) 3319 ipdata->pfs_clid = opfs_clid; 3320 else 3321 kern_uuidgen(&ipdata->pfs_clid, 1); 3322 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_PFSROOT); 3323 ipdata->u.blockset = ochain->data->ipdata.u.blockset; 3324 3325 hammer2_inode_unlock_ex(nip, nchain); 3326 } 3327 return (error); 3328 } 3329 3330 /* 3331 * Create an indirect block that covers one or more of the elements in the 3332 * current parent. Either returns the existing parent with no locking or 3333 * ref changes or returns the new indirect block locked and referenced 3334 * and leaving the original parent lock/ref intact as well. 3335 * 3336 * If an error occurs, NULL is returned and *errorp is set to the error. 
3337 * 3338 * The returned chain depends on where the specified key falls. 3339 * 3340 * The key/keybits for the indirect block only needs to follow four rules: 3341 * 3342 * (1) That all elements underneath it fit within its key space and 3343 * 3344 * (2) That all elements outside it are outside its key space. 3345 * 3346 * (3) When creating the new indirect block any elements in the current 3347 * parent that fit within the new indirect block's keyspace must be 3348 * moved into the new indirect block. 3349 * 3350 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider 3351 * keyspace than the current parent, but lookup/iteration rules will 3352 * ensure (and must ensure) that rule (2) is adhered to for all parents 3353 * leading up to the nearest inode or the root volume header. This 3354 * is accomplished by always recursing through matching keyspaces in 3355 * the hammer2_chain_lookup() and hammer2_chain_next() API. 3356 * 3357 * The current implementation calculates the worst-case keyspace by 3358 * iterating the current parent and then divides it into two halves, choosing 3359 * whichever half has the most elements (not necessarily the half containing 3360 * the requested key). 3361 * 3362 * We can also opt to use the half with the least number of elements. This 3363 * causes lower-numbered keys (aka logical file offsets) to recurse through 3364 * fewer indirect blocks and higher-numbered keys to recurse through more. 3365 * This also has the risk of not moving enough elements to the new indirect 3366 * block and being forced to create several indirect blocks before the element 3367 * can be inserted. 3368 * 3369 * Must be called with an exclusively locked parent. 3370 */ 3371 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent, 3372 hammer2_key_t *keyp, int keybits, 3373 hammer2_blockref_t *base, int count); 3374 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent, 3375 hammer2_key_t *keyp, int keybits, 3376 hammer2_blockref_t *base, int count); 3377 static 3378 hammer2_chain_t * 3379 hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent, 3380 hammer2_key_t create_key, int create_bits, 3381 int for_type, int *errorp) 3382 { 3383 hammer2_mount_t *hmp; 3384 hammer2_chain_core_t *above; 3385 hammer2_chain_core_t *icore; 3386 hammer2_blockref_t *base; 3387 hammer2_blockref_t *bref; 3388 hammer2_blockref_t bcopy; 3389 hammer2_chain_t *chain; 3390 hammer2_chain_t *ichain; 3391 hammer2_chain_t dummy; 3392 hammer2_key_t key = create_key; 3393 hammer2_key_t key_beg; 3394 hammer2_key_t key_end; 3395 hammer2_key_t key_next; 3396 int keybits = create_bits; 3397 int count; 3398 int nbytes; 3399 int cache_index; 3400 int loops; 3401 int reason; 3402 int generation; 3403 int maxloops = 300000; 3404 int retry_same; 3405 int wasdup; 3406 hammer2_chain_t * volatile xxchain = NULL; 3407 3408 /* 3409 * Calculate the base blockref pointer or NULL if the chain 3410 * is known to be empty. We need to calculate the array count 3411 * for RB lookups either way.
3412 */ 3413 hmp = parent->hmp; 3414 *errorp = 0; 3415 KKASSERT(ccms_thread_lock_owned(&parent->core->cst)); 3416 above = parent->core; 3417 3418 /*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/ 3419 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 3420 base = NULL; 3421 3422 switch(parent->bref.type) { 3423 case HAMMER2_BREF_TYPE_INODE: 3424 count = HAMMER2_SET_COUNT; 3425 break; 3426 case HAMMER2_BREF_TYPE_INDIRECT: 3427 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3428 count = parent->bytes / sizeof(hammer2_blockref_t); 3429 break; 3430 case HAMMER2_BREF_TYPE_VOLUME: 3431 count = HAMMER2_SET_COUNT; 3432 break; 3433 case HAMMER2_BREF_TYPE_FREEMAP: 3434 count = HAMMER2_SET_COUNT; 3435 break; 3436 default: 3437 panic("hammer2_chain_create_indirect: " 3438 "unrecognized blockref type: %d", 3439 parent->bref.type); 3440 count = 0; 3441 break; 3442 } 3443 } else { 3444 switch(parent->bref.type) { 3445 case HAMMER2_BREF_TYPE_INODE: 3446 base = &parent->data->ipdata.u.blockset.blockref[0]; 3447 count = HAMMER2_SET_COUNT; 3448 break; 3449 case HAMMER2_BREF_TYPE_INDIRECT: 3450 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3451 base = &parent->data->npdata[0]; 3452 count = parent->bytes / sizeof(hammer2_blockref_t); 3453 break; 3454 case HAMMER2_BREF_TYPE_VOLUME: 3455 base = &hmp->voldata.sroot_blockset.blockref[0]; 3456 count = HAMMER2_SET_COUNT; 3457 break; 3458 case HAMMER2_BREF_TYPE_FREEMAP: 3459 base = &hmp->voldata.freemap_blockset.blockref[0]; 3460 count = HAMMER2_SET_COUNT; 3461 break; 3462 default: 3463 panic("hammer2_chain_create_indirect: " 3464 "unrecognized blockref type: %d", 3465 parent->bref.type); 3466 count = 0; 3467 break; 3468 } 3469 } 3470 3471 /* 3472 * dummy used in later chain allocation (no longer used for lookups). 3473 */ 3474 bzero(&dummy, sizeof(dummy)); 3475 dummy.delete_tid = HAMMER2_MAX_TID; 3476 3477 /* 3478 * When creating an indirect block for a freemap node or leaf 3479 * the key/keybits must be fitted to static radix levels because 3480 * particular radix levels use particular reserved blocks in the 3481 * related zone. 3482 * 3483 * This routine calculates the key/radix of the indirect block 3484 * we need to create, and whether it is on the high-side or the 3485 * low-side. 3486 */ 3487 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 3488 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 3489 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits, 3490 base, count); 3491 } else { 3492 keybits = hammer2_chain_indkey_normal(parent, &key, keybits, 3493 base, count); 3494 } 3495 3496 /* 3497 * Normalize the key for the radix being represented, keeping the 3498 * high bits and throwing away the low bits. 3499 */ 3500 key &= ~(((hammer2_key_t)1 << keybits) - 1); 3501 3502 /* 3503 * How big should our new indirect block be? It has to be at least 3504 * as large as its parent. 
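 * (An inode parent only embeds a small blockset, so HAMMER2_IND_BYTES_MIN
 * suffices there; other parents get HAMMER2_IND_BYTES_MAX. In either case
 * nbytes is bumped below so the new indirect block can hold at least as
 * many blockrefs as the parent it is relieving.)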
3505 */ 3506 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) 3507 nbytes = HAMMER2_IND_BYTES_MIN; 3508 else 3509 nbytes = HAMMER2_IND_BYTES_MAX; 3510 if (nbytes < count * sizeof(hammer2_blockref_t)) 3511 nbytes = count * sizeof(hammer2_blockref_t); 3512 3513 /* 3514 * Ok, create our new indirect block 3515 */ 3516 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 3517 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 3518 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE; 3519 } else { 3520 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT; 3521 } 3522 dummy.bref.key = key; 3523 dummy.bref.keybits = keybits; 3524 dummy.bref.data_off = hammer2_getradix(nbytes); 3525 dummy.bref.methods = parent->bref.methods; 3526 3527 ichain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy.bref); 3528 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL); 3529 hammer2_chain_core_alloc(trans, ichain, NULL); 3530 icore = ichain->core; 3531 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE); 3532 hammer2_chain_drop(ichain); /* excess ref from alloc */ 3533 3534 /* 3535 * We have to mark it modified to allocate its block, but use 3536 * OPTDATA to allow it to remain in the INITIAL state. Otherwise 3537 * it won't be acted upon by the flush code. 3538 * 3539 * XXX leave the node unmodified, depend on the update_hi 3540 * flush to assign and modify parent blocks. 3541 */ 3542 hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA); 3543 3544 /* 3545 * Iterate the original parent and move the matching brefs into 3546 * the new indirect block. 3547 * 3548 * XXX handle flushes. 3549 */ 3550 key_beg = 0; 3551 key_end = HAMMER2_MAX_KEY; 3552 cache_index = 0; 3553 spin_lock(&above->cst.spin); 3554 loops = 0; 3555 reason = 0; 3556 retry_same = 0; 3557 3558 for (;;) { 3559 if (++loops > 100000) { 3560 spin_unlock(&above->cst.spin); 3561 panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n", 3562 reason, parent, base, count, key_next); 3563 } 3564 3565 /* 3566 * NOTE: spinlock stays intact, returned chain (if not NULL) 3567 * is not referenced or locked which means that we 3568 * cannot safely check its flagged / deletion status 3569 * until we lock it. 3570 */ 3571 chain = hammer2_combined_find(parent, base, count, 3572 &cache_index, &key_next, 3573 key_beg, key_end, 3574 &bref); 3575 generation = above->generation; 3576 if (bref == NULL) 3577 break; 3578 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3579 3580 /* 3581 * Skip keys that are not within the key/radix of the new 3582 * indirect block. They stay in the parent. 3583 */ 3584 if ((~(((hammer2_key_t)1 << keybits) - 1) & 3585 (key ^ bref->key)) != 0) { 3586 goto next_key_spinlocked; 3587 } 3588 3589 /* 3590 * Load the new indirect block by acquiring the related 3591 * chains (potentially from media as it might not be 3592 * in-memory). Then move it to the new parent (ichain) 3593 * via DELETE-DUPLICATE. 3594 * 3595 * chain is referenced but not locked. We must lock the 3596 * chain to obtain definitive DUPLICATED/DELETED state 3597 */ 3598 if (chain) { 3599 /* 3600 * Use chain already present in the RBTREE 3601 */ 3602 hammer2_chain_ref(chain); 3603 wasdup = ((chain->flags & 3604 HAMMER2_CHAIN_DUPLICATED) != 0); 3605 spin_unlock(&above->cst.spin); 3606 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER | 3607 HAMMER2_RESOLVE_NOREF); 3608 } else { 3609 /* 3610 * Get chain for blockref element. _get returns NULL 3611 * on insertion race. 
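			 *
			 * The bref is copied before the spinlock is released
			 * and re-validated with bcmp() afterwards; if the get
			 * fails or the blockref array changed underneath us
			 * the scan simply restarts at the same key.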
3612 */ 3613 bcopy = *bref; 3614 spin_unlock(&above->cst.spin); 3615 chain = hammer2_chain_get(parent, &bcopy, generation); 3616 if (chain == NULL) { 3617 reason = 1; 3618 spin_lock(&above->cst.spin); 3619 continue; 3620 } 3621 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 3622 reason = 2; 3623 hammer2_chain_drop(chain); 3624 spin_lock(&above->cst.spin); 3625 continue; 3626 } 3627 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER | 3628 HAMMER2_RESOLVE_NOREF); 3629 wasdup = 0; 3630 } 3631 3632 /* 3633 * This is always live so if the chain has been delete- 3634 * duplicated we raced someone and we have to retry. 3635 * 3636 * NOTE: Lookups can race delete-duplicate because 3637 * delete-duplicate does not lock the parent's core 3638 * (they just use the spinlock on the core). We must 3639 * check for races by comparing the DUPLICATED flag before 3640 * releasing the spinlock with the flag after locking the 3641 * chain. 3642 * 3643 * (note reversed logic for this one) 3644 */ 3645 if (chain->flags & HAMMER2_CHAIN_DELETED) { 3646 hammer2_chain_unlock(chain); 3647 if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) && 3648 wasdup == 0) { 3649 retry_same = 1; 3650 xxchain = chain; 3651 } 3652 goto next_key; 3653 } 3654 3655 /* 3656 * Shift the chain to the indirect block. 3657 * 3658 * WARNING! Can cause held-over chains to require a refactor. 3659 * Fortunately we have none (our locked chains are 3660 * passed into and modified by the call). 3661 */ 3662 hammer2_chain_delete(trans, chain, 0); 3663 hammer2_chain_duplicate(trans, &ichain, &chain, NULL, 0, 1); 3664 hammer2_chain_unlock(chain); 3665 KKASSERT(parent->refs > 0); 3666 chain = NULL; 3667 next_key: 3668 spin_lock(&above->cst.spin); 3669 next_key_spinlocked: 3670 if (--maxloops == 0) 3671 panic("hammer2_chain_create_indirect: maxloops"); 3672 reason = 4; 3673 if (retry_same == 0) { 3674 if (key_next == 0 || key_next > key_end) 3675 break; 3676 key_beg = key_next; 3677 } 3678 /* loop */ 3679 } 3680 spin_unlock(&above->cst.spin); 3681 3682 /* 3683 * Insert the new indirect block into the parent now that we've 3684 * cleared out some entries in the parent. We calculated a good 3685 * insertion index in the loop above (ichain->index). 3686 * 3687 * We don't have to set MOVED here because we mark ichain modified 3688 * down below (so the normal modified -> flush -> set-moved sequence 3689 * applies). 3690 * 3691 * The insertion shouldn't race as this is a completely new block 3692 * and the parent is locked. 3693 */ 3694 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 3695 hammer2_chain_insert(above, NULL, ichain, 3696 HAMMER2_CHAIN_INSERT_SPIN | 3697 HAMMER2_CHAIN_INSERT_LIVE, 3698 0); 3699 3700 /* 3701 * Mark the new indirect block modified after insertion, which 3702 * will propagate up through parent all the way to the root and 3703 * also allocate the physical block in ichain for our caller, 3704 * and assign ichain->data to a pre-zero'd space (because there 3705 * is not prior data to copy into it). 3706 * 3707 * We have to set update_hi in ichain's flags manually so the 3708 * flusher knows it has to recurse through it to get to all of 3709 * our moved blocks, then call setsubmod() to set the bit 3710 * recursively. 
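	 *
	 * update_hi actually lives in ichain->core, so the code below tests
	 * it unlocked first and only takes the core spinlock to re-test and
	 * update it when it appears stale, avoiding the lock in the common
	 * already-current case.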
3711 */ 3712 /*hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);*/ 3713 if (ichain->core->update_hi < trans->sync_tid) { 3714 spin_lock(&ichain->core->cst.spin); 3715 if (ichain->core->update_hi < trans->sync_tid) 3716 ichain->core->update_hi = trans->sync_tid; 3717 spin_unlock(&ichain->core->cst.spin); 3718 } 3719 hammer2_chain_setsubmod(trans, ichain); 3720 3721 /* 3722 * Figure out what to return. 3723 */ 3724 if (~(((hammer2_key_t)1 << keybits) - 1) & 3725 (create_key ^ key)) { 3726 /* 3727 * Key being created is outside the key range, 3728 * return the original parent. 3729 */ 3730 hammer2_chain_unlock(ichain); 3731 } else { 3732 /* 3733 * Otherwise its in the range, return the new parent. 3734 * (leave both the new and old parent locked). 3735 */ 3736 parent = ichain; 3737 } 3738 XXChain = xxchain; 3739 3740 return(parent); 3741 } 3742 3743 /* 3744 * Calculate the keybits and highside/lowside of the freemap node the 3745 * caller is creating. 3746 * 3747 * This routine will specify the next higher-level freemap key/radix 3748 * representing the lowest-ordered set. By doing so, eventually all 3749 * low-ordered sets will be moved one level down. 3750 * 3751 * We have to be careful here because the freemap reserves a limited 3752 * number of blocks for a limited number of levels. So we can't just 3753 * push indiscriminately. 3754 */ 3755 int 3756 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp, 3757 int keybits, hammer2_blockref_t *base, int count) 3758 { 3759 hammer2_chain_core_t *above; 3760 hammer2_chain_t *chain; 3761 hammer2_blockref_t *bref; 3762 hammer2_key_t key; 3763 hammer2_key_t key_beg; 3764 hammer2_key_t key_end; 3765 hammer2_key_t key_next; 3766 int cache_index; 3767 int locount; 3768 int hicount; 3769 int maxloops = 300000; 3770 3771 key = *keyp; 3772 above = parent->core; 3773 locount = 0; 3774 hicount = 0; 3775 keybits = 64; 3776 3777 /* 3778 * Calculate the range of keys in the array being careful to skip 3779 * slots which are overridden with a deletion. 3780 */ 3781 key_beg = 0; 3782 key_end = HAMMER2_MAX_KEY; 3783 cache_index = 0; 3784 spin_lock(&above->cst.spin); 3785 3786 for (;;) { 3787 if (--maxloops == 0) { 3788 panic("indkey_freemap shit %p %p:%d\n", 3789 parent, base, count); 3790 } 3791 chain = hammer2_combined_find(parent, base, count, 3792 &cache_index, &key_next, 3793 key_beg, key_end, &bref); 3794 3795 /* 3796 * Exhausted search 3797 */ 3798 if (bref == NULL) 3799 break; 3800 3801 /* 3802 * NOTE: No need to check DUPLICATED here because we do 3803 * not release the spinlock. 3804 */ 3805 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3806 if (key_next == 0 || key_next > key_end) 3807 break; 3808 key_beg = key_next; 3809 continue; 3810 } 3811 3812 /* 3813 * Use the full live (not deleted) element for the scan 3814 * iteration. HAMMER2 does not allow partial replacements. 3815 * 3816 * XXX should be built into hammer2_combined_find(). 3817 */ 3818 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3819 3820 if (keybits > bref->keybits) { 3821 key = bref->key; 3822 keybits = bref->keybits; 3823 } else if (keybits == bref->keybits && bref->key < key) { 3824 key = bref->key; 3825 } 3826 if (key_next == 0) 3827 break; 3828 key_beg = key_next; 3829 } 3830 spin_unlock(&above->cst.spin); 3831 3832 /* 3833 * Return the keybits for a higher-level FREEMAP_NODE covering 3834 * this node. 
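	 *
	 * The freemap topology has a fixed depth, so each radix maps to
	 * exactly one parent radix (LEVEL0 -> LEVEL1 -> ... -> LEVEL4).
	 * A node already at LEVEL4 cannot be pushed any higher, hence the
	 * panic below.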
3835 */ 3836 switch(keybits) { 3837 case HAMMER2_FREEMAP_LEVEL0_RADIX: 3838 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX; 3839 break; 3840 case HAMMER2_FREEMAP_LEVEL1_RADIX: 3841 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX; 3842 break; 3843 case HAMMER2_FREEMAP_LEVEL2_RADIX: 3844 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX; 3845 break; 3846 case HAMMER2_FREEMAP_LEVEL3_RADIX: 3847 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX; 3848 break; 3849 case HAMMER2_FREEMAP_LEVEL4_RADIX: 3850 panic("hammer2_chain_indkey_freemap: level too high"); 3851 break; 3852 default: 3853 panic("hammer2_chain_indkey_freemap: bad radix"); 3854 break; 3855 } 3856 *keyp = key; 3857 3858 return (keybits); 3859 } 3860 3861 /* 3862 * Calculate the keybits and highside/lowside of the indirect block the 3863 * caller is creating. 3864 */ 3865 static int 3866 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp, 3867 int keybits, hammer2_blockref_t *base, int count) 3868 { 3869 hammer2_chain_core_t *above; 3870 hammer2_blockref_t *bref; 3871 hammer2_chain_t *chain; 3872 hammer2_key_t key_beg; 3873 hammer2_key_t key_end; 3874 hammer2_key_t key_next; 3875 hammer2_key_t key; 3876 int nkeybits; 3877 int locount; 3878 int hicount; 3879 int cache_index; 3880 int maxloops = 300000; 3881 3882 key = *keyp; 3883 above = parent->core; 3884 locount = 0; 3885 hicount = 0; 3886 3887 /* 3888 * Calculate the range of keys in the array being careful to skip 3889 * slots which are overridden with a deletion. Once the scan 3890 * completes we will cut the key range in half and shift half the 3891 * range into the new indirect block. 3892 */ 3893 key_beg = 0; 3894 key_end = HAMMER2_MAX_KEY; 3895 cache_index = 0; 3896 spin_lock(&above->cst.spin); 3897 3898 for (;;) { 3899 if (--maxloops == 0) { 3900 panic("indkey_freemap shit %p %p:%d\n", 3901 parent, base, count); 3902 } 3903 chain = hammer2_combined_find(parent, base, count, 3904 &cache_index, &key_next, 3905 key_beg, key_end, &bref); 3906 3907 /* 3908 * Exhausted search 3909 */ 3910 if (bref == NULL) 3911 break; 3912 3913 /* 3914 * NOTE: No need to check DUPLICATED here because we do 3915 * not release the spinlock. 3916 */ 3917 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3918 if (key_next == 0 || key_next > key_end) 3919 break; 3920 key_beg = key_next; 3921 continue; 3922 } 3923 3924 /* 3925 * Use the full live (not deleted) element for the scan 3926 * iteration. HAMMER2 does not allow partial replacements. 3927 * 3928 * XXX should be built into hammer2_combined_find(). 3929 */ 3930 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3931 3932 /* 3933 * Expand our calculated key range (key, keybits) to fit 3934 * the scanned key. nkeybits represents the full range 3935 * that we will later cut in half (two halves @ nkeybits - 1). 3936 */ 3937 nkeybits = keybits; 3938 if (nkeybits < bref->keybits) { 3939 if (bref->keybits > 64) { 3940 kprintf("bad bref chain %p bref %p\n", 3941 chain, bref); 3942 Debugger("fubar"); 3943 } 3944 nkeybits = bref->keybits; 3945 } 3946 while (nkeybits < 64 && 3947 (~(((hammer2_key_t)1 << nkeybits) - 1) & 3948 (key ^ bref->key)) != 0) { 3949 ++nkeybits; 3950 } 3951 3952 /* 3953 * If the new key range is larger we have to determine 3954 * which side of the new key range the existing keys fall 3955 * under by checking the high bit, then collapsing the 3956 * locount into the hicount or vise-versa. 
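		 *
		 * For example, if the range grows from 2^10 to 2^12 keys,
		 * every element counted so far shares key's bit (nkeybits-1)
		 * and therefore sits entirely in one half of the new range,
		 * so the old counts are folded into that half before the
		 * newly scanned bref is classified.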
3957 */ 3958 if (keybits != nkeybits) { 3959 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 3960 hicount += locount; 3961 locount = 0; 3962 } else { 3963 locount += hicount; 3964 hicount = 0; 3965 } 3966 keybits = nkeybits; 3967 } 3968 3969 /* 3970 * The newly scanned key will be in the lower half or the 3971 * upper half of the (new) key range. 3972 */ 3973 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 3974 ++hicount; 3975 else 3976 ++locount; 3977 3978 if (key_next == 0) 3979 break; 3980 key_beg = key_next; 3981 } 3982 spin_unlock(&above->cst.spin); 3983 bref = NULL; /* now invalid (safety) */ 3984 3985 /* 3986 * Adjust keybits to represent half of the full range calculated 3987 * above (radix 63 max) 3988 */ 3989 --keybits; 3990 3991 /* 3992 * Select whichever half contains the most elements. Theoretically 3993 * we can select either side as long as it contains at least one 3994 * element (in order to ensure that a free slot is present to hold 3995 * the indirect block). 3996 */ 3997 if (hammer2_indirect_optimize) { 3998 /* 3999 * Insert node for least number of keys, this will arrange 4000 * the first few blocks of a large file or the first few 4001 * inodes in a directory with fewer indirect blocks when 4002 * created linearly. 4003 */ 4004 if (hicount < locount && hicount != 0) 4005 key |= (hammer2_key_t)1 << keybits; 4006 else 4007 key &= ~(hammer2_key_t)1 << keybits; 4008 } else { 4009 /* 4010 * Insert node for most number of keys, best for heavily 4011 * fragmented files. 4012 */ 4013 if (hicount > locount) 4014 key |= (hammer2_key_t)1 << keybits; 4015 else 4016 key &= ~(hammer2_key_t)1 << keybits; 4017 } 4018 *keyp = key; 4019 4020 return (keybits); 4021 } 4022 4023 /* 4024 * Sets CHAIN_DELETED and CHAIN_MOVED in the chain being deleted and 4025 * set chain->delete_tid. The chain is not actually marked possibly-free 4026 * in the freemap until the deletion is completely flushed out (because 4027 * a flush which doesn't cover the entire deletion is flushing the deleted 4028 * chain as if it were live). 4029 * 4030 * This function does NOT generate a modification to the parent. It 4031 * would be nearly impossible to figure out which parent to modify anyway. 4032 * Such modifications are handled top-down by the flush code and are 4033 * properly merged using the flush synchronization point. 4034 * 4035 * The find/get code will properly overload the RBTREE check on top of 4036 * the bref check to detect deleted entries. 4037 * 4038 * This function is NOT recursive. Any entity already pushed into the 4039 * chain (such as an inode) may still need visibility into its contents, 4040 * as well as the ability to read and modify the contents. For example, 4041 * for an unlinked file which is still open. 4042 * 4043 * NOTE: This function does NOT set chain->modify_tid, allowing future 4044 * code to distinguish between live and deleted chains by testing 4045 * trans->sync_tid vs chain->modify_tid and chain->delete_tid. 4046 * 4047 * NOTE: Deletions normally do not occur in the middle of a duplication 4048 * chain but we use a trick for hardlink migration that refactors 4049 * the originating inode without deleting it, so we make no assumptions 4050 * here. 4051 */ 4052 void 4053 hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags) 4054 { 4055 KKASSERT(ccms_thread_lock_owned(&chain->core->cst)); 4056 4057 /* 4058 * Nothing to do if already marked. 
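	 * Returning early also keeps the operation idempotent; a second
	 * call for an already-deleted chain would otherwise decrement
	 * live_count a second time and overwrite delete_tid below.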
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED)
		return;

	/*
	 * The setting of DELETED causes finds, lookups, and _next iterations
	 * to no longer recognize the chain.  RB_SCAN()s will still have
	 * visibility (needed for flush serialization points).
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	spin_lock(&chain->above->cst.spin);

	KKASSERT(trans->sync_tid >= chain->modify_tid);
	chain->delete_tid = trans->sync_tid;
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	atomic_add_int(&chain->above->live_count, -1);
	++chain->above->generation;

	/*
	 * We must set MOVED along with DELETED for the flush code to
	 * recognize the operation and properly disconnect the chain
	 * in-memory.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	spin_unlock(&chain->above->cst.spin);

	hammer2_chain_setsubmod(trans, chain);
}

/*
 * Called with the core spinlock held to check for freeable layers.
 * Used by the flush code.  Layers can wind up not being freed due
 * to the temporary layer->refs count.  This function frees up any
 * layers that were missed.
 */
void
hammer2_chain_layer_check_locked(hammer2_mount_t *hmp,
				 hammer2_chain_core_t *core)
{
	hammer2_chain_layer_t *layer;
	hammer2_chain_layer_t *tmp;

	tmp = TAILQ_FIRST(&core->layerq);
	while ((layer = tmp) != NULL) {
		tmp = TAILQ_NEXT(tmp, entry);
		if (layer->refs == 0 && RB_EMPTY(&layer->rbtree)) {
			TAILQ_REMOVE(&core->layerq, layer, entry);
			if (tmp)
				++tmp->refs;
			spin_unlock(&core->cst.spin);
			kfree(layer, hmp->mchain);
			spin_lock(&core->cst.spin);
			if (tmp)
				--tmp->refs;
		}
	}
}

/*
 * Returns the index of the nearest element in the blockref array >= key_beg.
 * Returns (count) if no element could be found.
 *
 * Sets *key_nextp to the next key for loop purposes but does not modify
 * it if the next key would be higher than the current value of *key_nextp.
 * Note that *key_nextp can overflow to 0, which should be tested by the
 * caller.
 *
 * (*cache_indexp) is a heuristic and can be any value without affecting
 * the result.
 *
 * The spin lock on the related chain must be held.
 */
int
hammer2_base_find(hammer2_chain_t *chain,
		  hammer2_blockref_t *base, int count,
		  int *cache_indexp, hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_chain_core_t *core = chain->core;
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require that live chains (not DUPLICATED) already have their
	 * brefs counted (HAMMER2_CORE_COUNTEDBREFS) so we can optimize
	 * operations.
	 */
	KKASSERT((chain->flags & HAMMER2_CHAIN_DUPLICATED) ||
		 core->flags & HAMMER2_CORE_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using *cache_indexp.  This is the most
	 * likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
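	 *
	 * For a live chain core->live_zero is one past the last in-use
	 * slot, so the scan can be clamped to [0, live_zero).  Chains
	 * marked DUPLICATED do not maintain live_zero and must allow the
	 * full array to be scanned.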
	 */
	i = *cache_indexp;
	cpu_ccfence();
	if (chain->flags & HAMMER2_CHAIN_DUPLICATED)
		limit = count;
	else
		limit = core->live_zero;
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == 0 || scan->key > key_beg)) {
		--scan;
		--i;
	}
	*cache_indexp = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != 0) {
			if (scan->key > key_beg)
				break;
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		*cache_indexp = i;
		if (i >= limit) {
			i = count;
		} else {
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *bresp to the returned bref in
 * both cases, or sets it to NULL if the search is exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * Must be called with above's spinlock held.  Spinlock remains held
 * through the operation.
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine whether the search is exhausted or not.
 */
static hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      int *cache_indexp, hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **bresp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, cache_indexp,
			      key_nextp, key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*bresp = NULL;
		return(chain);	/* NULL */
	}

	/*
	 * Only chain matched
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref match.  Select the nearer element.
	 * If both are flush with the left-hand side they are considered
	 * to be the same distance.
	 *
	 * When both are the same distance away select the chain if it is
	 * live or if its delete_tid is greater than the parent's
	 * synchronized bref.mirror_tid (a single test suffices for both
	 * conditions), otherwise select the element.
	 *
	 * (It is possible for an old deletion to linger after a rename-over
	 * and flush, which would make the media copy the correct choice).
	 */

	/*
	 * Either both are flush with the left-hand side or they are the
	 * same distance away.  Select the chain if it is not deleted
	 * or it has a higher delete_tid, else select the media.
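	 *
	 * For example, a chain whose delete_tid is newer than the media
	 * bref's mirror_tid represents a deletion that has not been
	 * synchronized yet, so the in-memory chain is preferred and masks
	 * the stale media entry.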
4296 */ 4297 if ((chain->bref.key <= key_beg && base[i].key <= key_beg) || 4298 chain->bref.key == base[i].key) { 4299 if (chain->delete_tid > base[i].mirror_tid) { 4300 bref = &chain->bref; 4301 } else { 4302 KKASSERT(chain->flags & HAMMER2_CHAIN_DELETED); 4303 bref = &base[i]; 4304 chain = NULL; 4305 } 4306 goto found; 4307 } 4308 4309 /* 4310 * Select the nearer key. 4311 */ 4312 if (chain->bref.key < base[i].key) { 4313 bref = &chain->bref; 4314 } else { 4315 bref = &base[i]; 4316 chain = NULL; 4317 } 4318 4319 /* 4320 * If the bref is out of bounds we've exhausted our search. 4321 */ 4322 found: 4323 if (bref->key > key_end) { 4324 *bresp = NULL; 4325 chain = NULL; 4326 } else { 4327 *bresp = bref; 4328 } 4329 return(chain); 4330 } 4331 4332 /* 4333 * Locate the specified block array element and delete it. The element 4334 * must exist. 4335 * 4336 * The spin lock on the related chain must be held. 4337 * 4338 * NOTE: live_count was adjusted when the chain was deleted, so it does not 4339 * need to be adjusted when we commit the media change. 4340 */ 4341 void 4342 hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *parent, 4343 hammer2_blockref_t *base, int count, 4344 int *cache_indexp, hammer2_chain_t *child) 4345 { 4346 hammer2_blockref_t *elm = &child->bref; 4347 hammer2_chain_core_t *core = parent->core; 4348 hammer2_key_t key_next; 4349 int i; 4350 4351 /* 4352 * Delete element. Expect the element to exist. 4353 * 4354 * XXX see caller, flush code not yet sophisticated enough to prevent 4355 * re-flushed in some cases. 4356 */ 4357 key_next = 0; /* max range */ 4358 i = hammer2_base_find(parent, base, count, cache_indexp, 4359 &key_next, elm->key, elm->key); 4360 if (i == count || base[i].type == 0 || 4361 base[i].key != elm->key || base[i].keybits != elm->keybits) { 4362 panic("delete base %p element not found at %d/%d elm %p\n", 4363 base, i, count, elm); 4364 return; 4365 } 4366 bzero(&base[i], sizeof(*base)); 4367 base[i].mirror_tid = (intptr_t)parent; /* debug */ 4368 base[i].modify_tid = (intptr_t)child; /* debug */ 4369 base[i].check.debug.sync_tid = trans->sync_tid; /* debug */ 4370 4371 /* 4372 * We can only optimize core->live_zero for live chains. 4373 */ 4374 if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) { 4375 if (core->live_zero == i + 1) { 4376 while (--i >= 0 && base[i].type == 0) 4377 ; 4378 core->live_zero = i + 1; 4379 } 4380 } 4381 } 4382 4383 /* 4384 * Insert the specified element. The block array must not already have the 4385 * element and must have space available for the insertion. 4386 * 4387 * The spin lock on the related chain must be held. 4388 * 4389 * NOTE: live_count was adjusted when the chain was deleted, so it does not 4390 * need to be adjusted when we commit the media change. 4391 */ 4392 void 4393 hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent, 4394 hammer2_blockref_t *base, int count, 4395 int *cache_indexp, hammer2_chain_t *child) 4396 { 4397 hammer2_blockref_t *elm = &child->bref; 4398 hammer2_chain_core_t *core = parent->core; 4399 hammer2_key_t key_next; 4400 hammer2_key_t xkey; 4401 int i; 4402 int j; 4403 int k; 4404 int l; 4405 int u = 1; 4406 4407 /* 4408 * Insert new element. Expect the element to not already exist 4409 * unless we are replacing it. 4410 * 4411 * XXX see caller, flush code not yet sophisticated enough to prevent 4412 * re-flushed in some cases. 
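	 *
	 * The fast path below appends at core->live_zero when the tail of
	 * the array still has room, which covers typical in-order
	 * insertions; otherwise the entries between the insertion point
	 * and the nearest empty slot are shifted over to open a hole at
	 * the computed index.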
4413 */ 4414 key_next = 0; /* max range */ 4415 i = hammer2_base_find(parent, base, count, cache_indexp, 4416 &key_next, elm->key, elm->key); 4417 4418 /* 4419 * Shortcut fill optimization, typical ordered insertion(s) may not 4420 * require a search. 4421 */ 4422 KKASSERT(i >= 0 && i <= count); 4423 4424 /* 4425 * We can only optimize core->live_zero for live chains. 4426 */ 4427 if (i == count && core->live_zero < count) { 4428 if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) { 4429 i = core->live_zero++; 4430 base[i] = *elm; 4431 return; 4432 } 4433 } 4434 4435 xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1; 4436 if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) { 4437 panic("insert base %p overlapping elements at %d elm %p\n", 4438 base, i, elm); 4439 } 4440 4441 /* 4442 * Try to find an empty slot before or after. 4443 */ 4444 j = i; 4445 k = i; 4446 while (j > 0 || k < count) { 4447 --j; 4448 if (j >= 0 && base[j].type == 0) { 4449 if (j == i - 1) { 4450 base[j] = *elm; 4451 } else { 4452 bcopy(&base[j+1], &base[j], 4453 (i - j - 1) * sizeof(*base)); 4454 base[i - 1] = *elm; 4455 } 4456 goto validate; 4457 } 4458 ++k; 4459 if (k < count && base[k].type == 0) { 4460 bcopy(&base[i], &base[i+1], 4461 (k - i) * sizeof(hammer2_blockref_t)); 4462 base[i] = *elm; 4463 4464 /* 4465 * We can only update core->live_zero for live 4466 * chains. 4467 */ 4468 if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) { 4469 if (core->live_zero <= k) 4470 core->live_zero = k + 1; 4471 } 4472 u = 2; 4473 goto validate; 4474 } 4475 } 4476 panic("hammer2_base_insert: no room!"); 4477 4478 /* 4479 * Debugging 4480 */ 4481 validate: 4482 key_next = 0; 4483 for (l = 0; l < count; ++l) { 4484 if (base[l].type) { 4485 key_next = base[l].key + 4486 ((hammer2_key_t)1 << base[l].keybits) - 1; 4487 break; 4488 } 4489 } 4490 while (++l < count) { 4491 if (base[l].type) { 4492 if (base[l].key <= key_next) 4493 panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l); 4494 key_next = base[l].key + 4495 ((hammer2_key_t)1 << base[l].keybits) - 1; 4496 4497 } 4498 } 4499 4500 } 4501 4502 #if 0 4503 4504 /* 4505 * Sort the blockref array for the chain. Used by the flush code to 4506 * sort the blockref[] array. 4507 * 4508 * The chain must be exclusively locked AND spin-locked. 4509 */ 4510 typedef hammer2_blockref_t *hammer2_blockref_p; 4511 4512 static 4513 int 4514 hammer2_base_sort_callback(const void *v1, const void *v2) 4515 { 4516 hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1; 4517 hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2; 4518 4519 /* 4520 * Make sure empty elements are placed at the end of the array 4521 */ 4522 if (bref1->type == 0) { 4523 if (bref2->type == 0) 4524 return(0); 4525 return(1); 4526 } else if (bref2->type == 0) { 4527 return(-1); 4528 } 4529 4530 /* 4531 * Sort by key 4532 */ 4533 if (bref1->key < bref2->key) 4534 return(-1); 4535 if (bref1->key > bref2->key) 4536 return(1); 4537 return(0); 4538 } 4539 4540 void 4541 hammer2_base_sort(hammer2_chain_t *chain) 4542 { 4543 hammer2_blockref_t *base; 4544 int count; 4545 4546 switch(chain->bref.type) { 4547 case HAMMER2_BREF_TYPE_INODE: 4548 /* 4549 * Special shortcut for embedded data returns the inode 4550 * itself. Callers must detect this condition and access 4551 * the embedded data (the strategy code does this for us). 4552 * 4553 * This is only applicable to regular files and softlinks. 
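		 *
		 * An inode flagged HAMMER2_OPFLAG_DIRECTDATA embeds file
		 * data where the blockset would normally live, so there is
		 * no blockref array to sort and we return without doing
		 * anything.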
4554 */ 4555 if (chain->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) 4556 return; 4557 base = &chain->data->ipdata.u.blockset.blockref[0]; 4558 count = HAMMER2_SET_COUNT; 4559 break; 4560 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 4561 case HAMMER2_BREF_TYPE_INDIRECT: 4562 /* 4563 * Optimize indirect blocks in the INITIAL state to avoid 4564 * I/O. 4565 */ 4566 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0); 4567 base = &chain->data->npdata[0]; 4568 count = chain->bytes / sizeof(hammer2_blockref_t); 4569 break; 4570 case HAMMER2_BREF_TYPE_VOLUME: 4571 base = &chain->hmp->voldata.sroot_blockset.blockref[0]; 4572 count = HAMMER2_SET_COUNT; 4573 break; 4574 case HAMMER2_BREF_TYPE_FREEMAP: 4575 base = &chain->hmp->voldata.freemap_blockset.blockref[0]; 4576 count = HAMMER2_SET_COUNT; 4577 break; 4578 default: 4579 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 4580 chain->bref.type); 4581 base = NULL; /* safety */ 4582 count = 0; /* safety */ 4583 } 4584 kqsort(base, count, sizeof(*base), hammer2_base_sort_callback); 4585 } 4586 4587 #endif 4588 4589 /* 4590 * Chain memory management 4591 */ 4592 void 4593 hammer2_chain_wait(hammer2_chain_t *chain) 4594 { 4595 tsleep(chain, 0, "chnflw", 1); 4596 } 4597 4598 /* 4599 * Manage excessive memory resource use for chain and related 4600 * structures. 4601 */ 4602 void 4603 hammer2_chain_memory_wait(hammer2_pfsmount_t *pmp) 4604 { 4605 long waiting; 4606 long count; 4607 long limit; 4608 static int zzticks; 4609 4610 /* 4611 * Atomic check condition and wait. Also do an early speedup of 4612 * the syncer to try to avoid hitting the wait. 4613 */ 4614 for (;;) { 4615 waiting = pmp->inmem_dirty_chains; 4616 cpu_ccfence(); 4617 count = waiting & HAMMER2_DIRTYCHAIN_MASK; 4618 4619 limit = pmp->mp->mnt_nvnodelistsize / 10; 4620 if (limit < hammer2_limit_dirty_chains) 4621 limit = hammer2_limit_dirty_chains; 4622 if (limit < 1000) 4623 limit = 1000; 4624 4625 if ((int)(ticks - zzticks) > hz) { 4626 zzticks = ticks; 4627 kprintf("count %ld %ld\n", count, limit); 4628 } 4629 4630 /* 4631 * Block if there are too many dirty chains present, wait 4632 * for the flush to clean some out. 4633 */ 4634 if (count > limit) { 4635 tsleep_interlock(&pmp->inmem_dirty_chains, 0); 4636 if (atomic_cmpset_long(&pmp->inmem_dirty_chains, 4637 waiting, 4638 waiting | HAMMER2_DIRTYCHAIN_WAITING)) { 4639 speedup_syncer(pmp->mp); 4640 tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED, 4641 "chnmem", hz); 4642 } 4643 continue; /* loop on success or fail */ 4644 } 4645 4646 /* 4647 * Try to start an early flush before we are forced to block. 
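		 *
		 * Kicking the syncer at roughly 70% of the limit gives the
		 * flusher a head start, so callers ideally drain dirty
		 * chains before ever reaching the blocking path above.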
4648 */ 4649 if (count > limit * 7 / 10) 4650 speedup_syncer(pmp->mp); 4651 break; 4652 } 4653 } 4654 4655 void 4656 hammer2_chain_memory_inc(hammer2_pfsmount_t *pmp) 4657 { 4658 if (pmp) 4659 atomic_add_long(&pmp->inmem_dirty_chains, 1); 4660 } 4661 4662 void 4663 hammer2_chain_memory_wakeup(hammer2_pfsmount_t *pmp) 4664 { 4665 long waiting; 4666 4667 if (pmp == NULL) 4668 return; 4669 4670 for (;;) { 4671 waiting = pmp->inmem_dirty_chains; 4672 cpu_ccfence(); 4673 if (atomic_cmpset_long(&pmp->inmem_dirty_chains, 4674 waiting, 4675 (waiting - 1) & 4676 ~HAMMER2_DIRTYCHAIN_WAITING)) { 4677 break; 4678 } 4679 } 4680 if (waiting & HAMMER2_DIRTYCHAIN_WAITING) 4681 wakeup(&pmp->inmem_dirty_chains); 4682 } 4683 4684 static 4685 void 4686 adjreadcounter(hammer2_blockref_t *bref, size_t bytes) 4687 { 4688 long *counterp; 4689 4690 switch(bref->type) { 4691 case HAMMER2_BREF_TYPE_DATA: 4692 counterp = &hammer2_iod_file_read; 4693 break; 4694 case HAMMER2_BREF_TYPE_INODE: 4695 counterp = &hammer2_iod_meta_read; 4696 break; 4697 case HAMMER2_BREF_TYPE_INDIRECT: 4698 counterp = &hammer2_iod_indr_read; 4699 break; 4700 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 4701 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 4702 counterp = &hammer2_iod_fmap_read; 4703 break; 4704 default: 4705 counterp = &hammer2_iod_volu_read; 4706 break; 4707 } 4708 *counterp += bytes; 4709 } 4710