1 /* 2 * Copyright (c) 2011-2013 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem implements most of the core support functions for 37 * the hammer2_chain structure. 
 *
 * Chains are the in-memory version of media objects (volume header, inodes,
 * indirect blocks, data blocks, etc).  Chains represent a portion of the
 * HAMMER2 topology.
 *
 * A chain is topologically stable once it has been inserted into the
 * in-memory topology.  Modifications which copy, move, or resize the chain
 * are handled via the DELETE-DUPLICATE mechanic where the original chain
 * stays intact but is marked deleted and a new chain is allocated which
 * shares the old chain's children.
 *
 * This sharing is handled via the hammer2_chain_core structure.
 *
 * The DELETE-DUPLICATE mechanism allows the same topological level to contain
 * many overloadings.  However, our RBTREE mechanics require that there be
 * no overlaps so we accomplish the overloading by moving conflicting chains
 * with smaller or equal radii into a sub-RBTREE under the chain being
 * overloaded.
 *
 * DELETE-DUPLICATE is also used when a modification to a chain crosses a
 * flush synchronization boundary, allowing the flush code to continue flushing
 * the older version of the topology and not be disrupted by new frontend
 * operations.
 *
 * LIVE VS FLUSH VIEW
 *
 * All lookup and iterate operations and most modifications are done on the
 * live view.  During flushes lookups are not normally done and modifications
 * may be run on the flush view.  However, flushes often need to allocate
 * blocks and the freemap_alloc/free code issues lookups.  This code is
 * special cased to use the live view when called from a flush.
 *
 * General chain lookup/iteration functions are NOT aware of the flush view,
 * they only know about live views.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/kern_syscall.h>
#include <sys/uuid.h>

#include "hammer2.h"

static int hammer2_indirect_optimize;	/* XXX SYSCTL */

/*
 * Forward declarations for file-local helpers defined later in this file.
 */
static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_trans_t *trans, hammer2_chain_t *parent,
		hammer2_key_t key, int keybits, int for_type, int *errorp);
static void hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop);
static void adjreadcounter(hammer2_blockref_t *bref, size_t bytes);
static hammer2_chain_t *hammer2_combined_find(
		hammer2_chain_t *parent,
		hammer2_blockref_t *base, int count,
		int *cache_indexp, hammer2_key_t *key_nextp,
		hammer2_key_t key_beg, hammer2_key_t key_end,
		hammer2_blockref_t **bresp);
static void hammer2_chain_assert_not_present(hammer2_chain_core_t *above,
		hammer2_chain_t *chain);

/*
 * Basic RBTree for chains.  Chains cannot overlap within any given
 * core->rbtree without recursing through chain->rbtree.  We effectively
 * guarantee this by checking the full range rather than just the first
 * key element.  By matching on the full range callers can detect when
 * recursion through chain->rbtree is needed.
 *
 * NOTE: This also means that a delete-duplicate on the same key will
 *	 overload by placing the deleted element in the new element's
 *	 chain->rbtree (when doing a direct replacement).
109 */ 110 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); 111 112 int 113 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) 114 { 115 hammer2_key_t c1_beg; 116 hammer2_key_t c1_end; 117 hammer2_key_t c2_beg; 118 hammer2_key_t c2_end; 119 120 c1_beg = chain1->bref.key; 121 c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1; 122 c2_beg = chain2->bref.key; 123 c2_end = c2_beg + ((hammer2_key_t)1 << chain2->bref.keybits) - 1; 124 125 if (c1_end < c2_beg) /* fully to the left */ 126 return(-1); 127 if (c1_beg > c2_end) /* fully to the right */ 128 return(1); 129 return(0); /* overlap (must not cross edge boundary) */ 130 } 131 132 static __inline 133 int 134 hammer2_isclusterable(hammer2_chain_t *chain) 135 { 136 if (hammer2_cluster_enable) { 137 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || 138 chain->bref.type == HAMMER2_BREF_TYPE_INODE || 139 chain->bref.type == HAMMER2_BREF_TYPE_DATA) { 140 return(1); 141 } 142 } 143 return(0); 144 } 145 146 /* 147 * Recursively set the update_hi flag up to the root starting at chain's 148 * parent->core. update_hi is not set in chain's core. 149 * 150 * This controls top-down visibility for flushes. The child has just one 151 * 'above' core, but the core itself can be multi-homed with parents iterated 152 * via core->ownerq. 153 * 154 * This function is not used during a flush (except when the flush is 155 * allocating which requires the live tree). The flush keeps track of its 156 * recursion itself. 157 * 158 * XXX needs to be optimized to use roll-up TIDs. update_hi is only really 159 * compared against bref.mirror_tid which itself is only updated by a flush. 
160 */ 161 void 162 hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain) 163 { 164 hammer2_chain_core_t *above; 165 166 while ((above = chain->above) != NULL) { 167 spin_lock(&above->cst.spin); 168 /* XXX optimize */ 169 if (above->update_hi < trans->sync_tid) 170 above->update_hi = trans->sync_tid; 171 chain = TAILQ_LAST(&above->ownerq, h2_core_list); 172 #if 0 173 TAILQ_FOREACH_REVERSE(chain, &above->ownerq, 174 h2_core_list, core_entry) { 175 if (trans->sync_tid >= chain->modify_tid && 176 trans->sync_tid <= chain->delete_tid) { 177 break; 178 } 179 } 180 #endif 181 spin_unlock(&above->cst.spin); 182 } 183 } 184 185 /* 186 * Allocate a new disconnected chain element representing the specified 187 * bref. chain->refs is set to 1 and the passed bref is copied to 188 * chain->bref. chain->bytes is derived from the bref. 189 * 190 * chain->core is NOT allocated and the media data and bp pointers are left 191 * NULL. The caller must call chain_core_alloc() to allocate or associate 192 * a core with the chain. 193 * 194 * NOTE: Returns a referenced but unlocked (because there is no core) chain. 195 */ 196 hammer2_chain_t * 197 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp, 198 hammer2_trans_t *trans, hammer2_blockref_t *bref) 199 { 200 hammer2_chain_t *chain; 201 u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); 202 203 /* 204 * Construct the appropriate system structure. 205 */ 206 switch(bref->type) { 207 case HAMMER2_BREF_TYPE_INODE: 208 case HAMMER2_BREF_TYPE_INDIRECT: 209 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 210 case HAMMER2_BREF_TYPE_DATA: 211 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 212 /* 213 * Chain's are really only associated with the hmp but we 214 * maintain a pmp association for per-mount memory tracking 215 * purposes. The pmp can be NULL. 
216 */ 217 chain = kmalloc(sizeof(*chain), hmp->mchain, M_WAITOK | M_ZERO); 218 if (pmp) 219 chain->pmp = pmp; 220 break; 221 case HAMMER2_BREF_TYPE_VOLUME: 222 case HAMMER2_BREF_TYPE_FREEMAP: 223 chain = NULL; 224 panic("hammer2_chain_alloc volume type illegal for op"); 225 default: 226 chain = NULL; 227 panic("hammer2_chain_alloc: unrecognized blockref type: %d", 228 bref->type); 229 } 230 231 chain->hmp = hmp; 232 chain->bref = *bref; 233 chain->bytes = bytes; 234 chain->refs = 1; 235 chain->flags = HAMMER2_CHAIN_ALLOCATED; 236 chain->delete_tid = HAMMER2_MAX_TID; 237 238 /* 239 * Set modify_tid if a transaction is creating the chain. When 240 * loading a chain from backing store trans is passed as NULL and 241 * modify_tid is left set to 0. 242 */ 243 if (trans) 244 chain->modify_tid = trans->sync_tid; 245 246 return (chain); 247 } 248 249 /* 250 * Associate an existing core with the chain or allocate a new core. 251 * 252 * The core is not locked. No additional refs on the chain are made. 253 * (trans) must not be NULL if (core) is not NULL. 254 * 255 * When chains are delete-duplicated during flushes we insert nchain on 256 * the ownerq after ochain instead of at the end in order to give the 257 * drop code visibility in the correct order, otherwise drops can be missed. 258 */ 259 void 260 hammer2_chain_core_alloc(hammer2_trans_t *trans, 261 hammer2_chain_t *nchain, hammer2_chain_t *ochain) 262 { 263 hammer2_chain_core_t *core; 264 265 KKASSERT(nchain->core == NULL); 266 267 if (ochain == NULL) { 268 /* 269 * Fresh core under nchain (no multi-homing of ochain's 270 * sub-tree). 
 */
		core = kmalloc(sizeof(*core), nchain->hmp->mchain,
			       M_WAITOK | M_ZERO);
		TAILQ_INIT(&core->layerq);
		TAILQ_INIT(&core->ownerq);
		core->sharecnt = 1;
		core->good = 0x1234;	/* magic for sanity checking */
		if (trans)
			core->update_hi = trans->sync_tid;
		else
			core->update_hi = nchain->bref.mirror_tid;
		nchain->core = core;
		ccms_cst_init(&core->cst, nchain);
		TAILQ_INSERT_TAIL(&core->ownerq, nchain, core_entry);
	} else {
		/*
		 * Propagate the PFSROOT flag which we set on all subdirs
		 * under the super-root.
		 */
		atomic_set_int(&nchain->flags,
			       ochain->flags & HAMMER2_CHAIN_PFSROOT);

		/*
		 * Duplicating ochain -> nchain.  Set the DUPLICATED flag on
		 * ochain if nchain is not a snapshot.
		 *
		 * It is possible for the DUPLICATED flag to already be
		 * set when called via a flush operation because flush
		 * operations may have to work on elements with delete_tid's
		 * beyond the flush sync_tid.  In this situation we must
		 * ensure that nchain is placed just after ochain in the
		 * ownerq and that the DUPLICATED flag is set on nchain so
		 * 'live' operations skip past it to the correct chain.
		 *
		 * The flusher understands the blockref synchronization state
		 * for any stale chains by observing bref.mirror_tid, which
		 * delete-duplicate replicates.
		 *
		 * WARNING! However, the case is disallowed when the flusher
		 *	    is allocating freemap space because this entails
		 *	    more than just adjusting a block table.
		 */
		if (ochain->flags & HAMMER2_CHAIN_DUPLICATED) {
			/* only legal from a flush that is not allocating */
			KKASSERT((trans->flags &
				  (HAMMER2_TRANS_ISFLUSH |
				   HAMMER2_TRANS_ISALLOCATING)) ==
				 HAMMER2_TRANS_ISFLUSH);
			atomic_set_int(&nchain->flags,
				       HAMMER2_CHAIN_DUPLICATED);
		}
		if ((nchain->flags & HAMMER2_CHAIN_SNAPSHOT) == 0) {
			atomic_set_int(&ochain->flags,
				       HAMMER2_CHAIN_DUPLICATED);
		}
		core = ochain->core;
		atomic_add_int(&core->sharecnt, 1);

		spin_lock(&core->cst.spin);
		nchain->core = core;

#if 0
		if (core->update_hi < trans->sync_tid)
			core->update_hi = trans->sync_tid;
#endif

		/*
		 * Maintain ordering for refactor test so we don't skip over
		 * a snapshot.  Also, during flushes, delete-duplications
		 * for block-table updates can occur on blocks already
		 * deleted (delete-duplicated by a later transaction).  We
		 * must insert nchain after ochain but before the later
		 * transaction's copy.
		 */
		TAILQ_INSERT_AFTER(&core->ownerq, ochain, nchain, core_entry);

		spin_unlock(&core->cst.spin);
	}
}

/*
 * Add a reference to a chain element, preventing its destruction.
 */
void
hammer2_chain_ref(hammer2_chain_t *chain)
{
	atomic_add_int(&chain->refs, 1);
}

/*
 * Insert the chain in the core rbtree at the first layer
 * which accepts it (for now we don't sort layers by the transaction tid).
 *
 * (above) is the parent core.  (layer) may be NULL for a normal insert,
 * or non-NULL to force insertion into the layer just before the specified
 * one.  Returns 0 on success or EAGAIN when INSERT_RACE is requested and
 * the core's generation count changed underneath us.
 */
#define HAMMER2_CHAIN_INSERT_SPIN	0x0001	/* caller wants us to spin-lock above */
#define HAMMER2_CHAIN_INSERT_LIVE	0x0002	/* maintain above->live_count */
#define HAMMER2_CHAIN_INSERT_RACE	0x0004	/* fail on generation mismatch */

static
int
hammer2_chain_insert(hammer2_chain_core_t *above, hammer2_chain_layer_t *layer,
		     hammer2_chain_t *chain, int flags, int generation)
{
	hammer2_chain_t *xchain;
	hammer2_chain_layer_t *nlayer;
	int error = 0;

	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		spin_lock(&above->cst.spin);

	/*
	 * Special case, place the chain in the next most-recent layer as the
	 * specified layer, inserting a layer inbetween if necessary.
	 */
	if (layer) {
		KKASSERT((flags & HAMMER2_CHAIN_INSERT_RACE) == 0);
		nlayer = TAILQ_PREV(layer, h2_layer_list, entry);
		if (nlayer && RB_INSERT(hammer2_chain_tree,
					&nlayer->rbtree, chain) == NULL) {
			layer = nlayer;
			goto done;
		}

		/*
		 * Must drop the spinlock to allocate; re-validate nothing
		 * depends on the lock being held across the kmalloc.
		 */
		spin_unlock(&above->cst.spin);
		KKASSERT((flags & HAMMER2_CHAIN_INSERT_LIVE) == 0);
		nlayer = kmalloc(sizeof(*nlayer), chain->hmp->mchain,
				 M_WAITOK | M_ZERO);
		RB_INIT(&nlayer->rbtree);
		nlayer->good = 0xABCD;	/* magic for sanity checking */
		spin_lock(&above->cst.spin);

		if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0)
			hammer2_chain_assert_not_present(above, chain);

		TAILQ_INSERT_BEFORE(layer, nlayer, entry);
		RB_INSERT(hammer2_chain_tree, &nlayer->rbtree, chain);
		layer = nlayer;
		goto done;
	}

	/*
	 * Interlocked by spinlock, check for race
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
	    above->generation != generation) {
		error = EAGAIN;
		goto failed;
	}

	/*
	 * Try to insert, allocate a new layer if a nominal collision
	 * occurs (a collision is different from a SMP race).
	 */
	layer = TAILQ_FIRST(&above->layerq);
	xchain = NULL;

	if (layer == NULL ||
	    (xchain = RB_INSERT(hammer2_chain_tree,
				&layer->rbtree, chain)) != NULL) {

		/*
		 * Allocate a new layer to resolve the issue.
		 */
		spin_unlock(&above->cst.spin);
		layer = kmalloc(sizeof(*layer), chain->hmp->mchain,
				M_WAITOK | M_ZERO);
		RB_INIT(&layer->rbtree);
		layer->good = 0xABCD;
		spin_lock(&above->cst.spin);

		/*
		 * Re-check the race after having dropped the spinlock for
		 * the allocation.
		 */
		if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
		    above->generation != generation) {
			spin_unlock(&above->cst.spin);
			kfree(layer, chain->hmp->mchain);
			spin_lock(&above->cst.spin);
			error = EAGAIN;
			goto failed;
		}
		hammer2_chain_assert_not_present(above, chain);

		TAILQ_INSERT_HEAD(&above->layerq, layer, entry);
		RB_INSERT(hammer2_chain_tree, &layer->rbtree, chain);
	}
done:
	chain->above = above;
	chain->inlayer = layer;
	++above->chain_count;
	++above->generation;
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);

	/*
	 * We have to keep track of the effective live-view blockref count
	 * so the create code knows when to push an indirect block.
	 */
	if ((flags & HAMMER2_CHAIN_INSERT_LIVE) &&
	    (chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		atomic_add_int(&above->live_count, 1);
	}
failed:
	if (flags & HAMMER2_CHAIN_INSERT_SPIN)
		spin_unlock(&above->cst.spin);
	return error;
}

/*
 * Drop the caller's reference to the chain.  When the ref count drops to
 * zero this function will try to disassociate the chain from its parent and
 * deallocate it, then recursively drop the parent using the implied ref
 * from the chain's chain->parent.
 */
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
					       struct h2_core_list *delayq);

void
hammer2_chain_drop(hammer2_chain_t *chain)
{
	struct h2_core_list delayq;
	hammer2_chain_t *scan;
	u_int refs;
	u_int need = 0;

	if (hammer2_debug & 0x200000)
		Debugger("drop");

	/*
	 * MOVED and MODIFIED flags each imply an extra reference, so the
	 * caller's ref must leave at least that many behind.
	 */
	if (chain->flags & HAMMER2_CHAIN_MOVED)
		++need;
	if (chain->flags & HAMMER2_CHAIN_MODIFIED)
		++need;
	KKASSERT(chain->refs > need);

	TAILQ_INIT(&delayq);

	/*
	 * Lock-free drop loop.  A 1->0 transition is handed off to
	 * hammer2_chain_lastdrop() which may return a parent chain to
	 * continue dropping bottom-up without recursing.
	 */
	while (chain) {
		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);

		if (refs == 1) {
			chain = hammer2_chain_lastdrop(chain, &delayq);
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
				break;
			/* retry the same chain */
		}

		/*
		 * When we've exhausted lastdrop chaining pull off of delayq.
		 * chains on delayq are dead but are used to placehold other
		 * chains which we added a ref to for the purpose of dropping.
		 */
		if (chain == NULL) {
			hammer2_mount_t *hmp;

			if ((scan = TAILQ_FIRST(&delayq)) != NULL) {
				chain = (void *)scan->data;
				TAILQ_REMOVE(&delayq, scan, core_entry);
				scan->flags &= ~HAMMER2_CHAIN_ALLOCATED;
				hmp = scan->hmp;
				scan->hmp = NULL;
				kfree(scan, hmp->mchain);
			}
		}
	}
}

/*
 * Safe handling of the 1->0 transition on chain.  Returns a chain for
 * recursive drop or NULL, possibly returning the same chain if the atomic
 * op fails.
 *
 * When two chains need to be recursively dropped we use the chain
 * we would otherwise free to placehold the additional chain.  It's a bit
 * convoluted but we can't just recurse without potentially blowing out
 * the kernel stack.
 *
 * The chain cannot be freed if it has a non-empty core (children) or
 * it is not at the head of ownerq.
 *
 * The cst spinlock is allowed to nest child-to-parent (not parent-to-child).
 */
static
hammer2_chain_t *
hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
{
	hammer2_pfsmount_t *pmp;
	hammer2_mount_t *hmp;
	hammer2_chain_core_t *above;
	hammer2_chain_core_t *core;
	hammer2_chain_layer_t *layer;
	hammer2_chain_t *rdrop1;	/* parent candidate for chained drop */
	hammer2_chain_t *rdrop2;	/* sibling candidate for chained drop */

	/*
	 * Spinlock the core and check to see if it is empty.  If it is
	 * not empty we leave chain intact with refs == 0.  The elements
	 * in core->rbtree are associated with other chains contemporary
	 * with ours but not with our chain directly.
	 */
	if ((core = chain->core) != NULL) {
		spin_lock(&core->cst.spin);

		/*
		 * We can't free non-stale chains with children until we are
		 * able to free the children because there might be a flush
		 * dependency.  Flushes of stale children (which should also
		 * have their deleted flag set) short-cut recursive flush
		 * dependencies and can be freed here.  Any flushes which run
		 * through stale children due to the flush synchronization
		 * point should have the MOVED bit set in the chain and not
		 * reach lastdrop at this time.
		 *
		 * NOTE: We return (chain) on failure to retry.
		 */
		if (core->chain_count &&
		    (chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
			if (atomic_cmpset_int(&chain->refs, 1, 0))
				chain = NULL;	/* success */
			spin_unlock(&core->cst.spin);
			return(chain);
		}
		/* no chains left under us */

		/*
		 * Various parts of the code might be holding a ref on a
		 * stale chain as a placemarker which must be iterated to
		 * locate a later non-stale (live) chain.  We must be sure
		 * NOT to free the later non-stale chain (which might have
		 * no refs).  Otherwise mass confusion may result.
		 *
		 * The DUPLICATED flag tells us whether the chain is stale
		 * or not, so the rule is that any chain whose DUPLICATED flag
		 * is NOT set must also be at the head of the ownerq.
		 *
		 * Note that the DELETED flag is not involved.  That is, a
		 * live chain can represent a deletion that has not yet been
		 * flushed (or still has refs).
		 */
#if 0
		if (TAILQ_NEXT(chain, core_entry) == NULL &&
		    TAILQ_FIRST(&core->ownerq) != chain) {
#endif
		if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 &&
		    TAILQ_FIRST(&core->ownerq) != chain) {
			if (atomic_cmpset_int(&chain->refs, 1, 0))
				chain = NULL;	/* success */
			spin_unlock(&core->cst.spin);
			return(chain);
		}
	}

	/*
	 * chain->core has no children left so no accessors can get to our
	 * chain from there.  Now we have to lock the above core to interlock
	 * remaining possible accessors that might bump chain's refs before
	 * we can safely drop chain's refs with intent to free the chain.
	 */
	hmp = chain->hmp;
	pmp = chain->pmp;	/* can be NULL */
	rdrop1 = NULL;
	rdrop2 = NULL;
	layer = NULL;

	/*
	 * Spinlock the parent and try to drop the last ref on chain.
	 * On success remove chain from its parent, otherwise return NULL.
	 *
	 * (normal core locks are top-down recursive but we define core
	 * spinlocks as bottom-up recursive, so this is safe).
	 */
	if ((above = chain->above) != NULL) {
		spin_lock(&above->cst.spin);
		if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
			/* 1->0 transition failed */
			spin_unlock(&above->cst.spin);
			if (core)
				spin_unlock(&core->cst.spin);
			return(chain);	/* retry */
		}

		/*
		 * 1->0 transition successful, remove chain from its
		 * above core.  Track layer for removal/freeing.
		 */
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
		layer = chain->inlayer;
		RB_REMOVE(hammer2_chain_tree, &layer->rbtree, chain);
		--above->chain_count;
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
		chain->above = NULL;
		chain->inlayer = NULL;

		/* remember an emptied layer for freeing once locks are gone */
		if (RB_EMPTY(&layer->rbtree) && layer->refs == 0) {
			TAILQ_REMOVE(&above->layerq, layer, entry);
		} else {
			layer = NULL;
		}

		/*
		 * If our chain was the last chain in the parent's core the
		 * core is now empty and its parents might now be droppable.
		 * Try to drop the first multi-homed parent by gaining a
		 * ref on it here and then dropping it below.
		 */
		if (above->chain_count == 0) {
			rdrop1 = TAILQ_FIRST(&above->ownerq);
			if (rdrop1 &&
			    atomic_cmpset_int(&rdrop1->refs, 0, 1) == 0) {
				rdrop1 = NULL;
			}
		}
		spin_unlock(&above->cst.spin);
		above = NULL;	/* safety */
	}

	/*
	 * Successful 1->0 transition and the chain can be destroyed now.
	 *
	 * We still have the core spinlock (if core is non-NULL), and core's
	 * chain_count is 0.  The above spinlock is gone.
	 *
	 * Remove chain from ownerq.  Once core has no more owners (and no
	 * children which is already the case) we can destroy core.
	 *
	 * If core has more owners we may be able to continue a bottom-up
	 * drop with our next sibling.
	 */
	if (core) {
		chain->core = NULL;

		TAILQ_REMOVE(&core->ownerq, chain, core_entry);
		rdrop2 = TAILQ_FIRST(&core->ownerq);
		if (rdrop2 && atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0)
			rdrop2 = NULL;
		spin_unlock(&core->cst.spin);

		/*
		 * We can do the final 1->0 transition with an atomic op
		 * after releasing core's spinlock.
		 */
		if (atomic_fetchadd_int(&core->sharecnt, -1) == 1) {
			/*
			 * On the 1->0 transition of core we can destroy
			 * it.  Any remaining layers should no longer be
			 * referenced or visible to other threads.
			 */
			KKASSERT(TAILQ_EMPTY(&core->ownerq));
			if (layer) {
				layer->good = 0xEF00;
				kfree(layer, hmp->mchain);
			}
			while ((layer = TAILQ_FIRST(&core->layerq)) != NULL) {
				KKASSERT(layer->refs == 0 &&
					 RB_EMPTY(&layer->rbtree));
				TAILQ_REMOVE(&core->layerq, layer, entry);
				layer->good = 0xEF01;
				kfree(layer, hmp->mchain);
			}
			/* layer now NULL */
			KKASSERT(core->cst.count == 0);
			KKASSERT(core->cst.upgrade == 0);
			core->good = 0x5678;
			kfree(core, hmp->mchain);
		}
		core = NULL;	/* safety */
	}

	/*
	 * All spin locks are gone, finish freeing stuff.
	 */
	KKASSERT((chain->flags & (HAMMER2_CHAIN_MOVED |
				  HAMMER2_CHAIN_MODIFIED)) == 0);
	hammer2_chain_drop_data(chain, 1);

	KKASSERT(chain->dio == NULL);

	/*
	 * Free saved empty layer and return chained drop.
	 */
	if (layer) {
		layer->good = 0xEF02;
		kfree(layer, hmp->mchain);
	}

	/*
	 * Once chain resources are gone we can use the now dead chain
	 * structure to placehold what might otherwise require a recursive
	 * drop, because we have potentially two things to drop and can only
	 * return one directly.
	 */
	if (rdrop1 && rdrop2) {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ALLOCATED);
		chain->data = (void *)rdrop1;
		TAILQ_INSERT_TAIL(delayq, chain, core_entry);
		rdrop1 = NULL;
	} else if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
		chain->flags &= ~HAMMER2_CHAIN_ALLOCATED;
		chain->hmp = NULL;
		kfree(chain, hmp->mchain);
	}

	/*
	 * Either or both can be NULL.  We already handled the case where
	 * both might not have been NULL.
 */
	if (rdrop1)
		return(rdrop1);
	else
		return(rdrop2);
}

/*
 * On either last lock release or last drop, detach any embedded media
 * data pointer.  Only the volume header and freemap types carry an
 * embedded data pointer here; all other types must already be NULL.
 */
static void
hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop)
{
	/*hammer2_mount_t *hmp = chain->hmp;*/

	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		if (lastdrop)
			chain->data = NULL;
		break;
	default:
		KKASSERT(chain->data == NULL);
		break;
	}
}

/*
 * Ref and lock a chain element, acquiring its data with I/O if necessary,
 * and specify how you would like the data to be resolved.
 *
 * Returns 0 on success or an error code if the data could not be acquired.
 * The chain element is locked on return regardless of whether an error
 * occurred or not.
 *
 * The lock is allowed to recurse, multiple locking ops will aggregate
 * the requested resolve types.  Once data is assigned it will not be
 * removed until the last unlock.
 *
 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
 *			   the INITIAL-create state (indirect blocks only).
 *
 *			   Do not resolve data elements for DATA chains.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
 *
 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
 *			   it will be locked exclusive.
 *
 * NOTE: Embedded elements (volume header, inodes) are always resolved
 *	 regardless.
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 *
 * WARNING! If data must be fetched a shared lock will temporarily be
 *	    upgraded to exclusive.  However, a deadlock can occur if
 *	    the caller owns more than one shared lock.
 */
int
hammer2_chain_lock(hammer2_chain_t *chain, int how)
{
	hammer2_mount_t *hmp;
	hammer2_chain_core_t *core;
	hammer2_blockref_t *bref;
	ccms_state_t ostate;
	char *bdata;
	int error;

	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 */
	if ((how & HAMMER2_RESOLVE_NOREF) == 0)
		hammer2_chain_ref(chain);
	atomic_add_int(&chain->lockcnt, 1);

	hmp = chain->hmp;
	KKASSERT(hmp != NULL);

	/*
	 * Get the appropriate lock.
	 */
	core = chain->core;
	if (how & HAMMER2_RESOLVE_SHARED)
		ccms_thread_lock(&core->cst, CCMS_STATE_SHARED);
	else
		ccms_thread_lock(&core->cst, CCMS_STATE_EXCLUSIVE);

	/*
	 * If we already have a valid data pointer no further action is
	 * necessary.
	 */
	if (chain->data)
		return (0);

	/*
	 * Do we have to resolve the data?
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return(0);
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return(0);
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return(0);
#if 0
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE)
			return(0);
#endif
		if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF)
			return(0);
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
		break;
	}

	/*
	 * Upgrade to an exclusive lock so we can safely manipulate the
	 * buffer cache.  If another thread got to it before us we
	 * can just return.
	 */
	ostate = ccms_thread_lock_upgrade(&core->cst);
	if (chain->data) {
		ccms_thread_lock_downgrade(&core->cst, ostate);
		return (0);
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		error = hammer2_io_new(hmp, bref->data_off, chain->bytes,
				       &chain->dio);
	} else {
		error = hammer2_io_bread(hmp, bref->data_off, chain->bytes,
					 &chain->dio);
		adjreadcounter(&chain->bref, chain->bytes);
	}

	if (error) {
		kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
			(intmax_t)bref->data_off, error);
		hammer2_io_bqrelse(&chain->dio);
		ccms_thread_lock_downgrade(&core->cst, ostate);
		return (error);
	}

	/*
	 * We can clear the INITIAL state now, we've resolved the buffer
	 * to zeros and marked it dirty with hammer2_io_new().
	 */
	bdata = hammer2_io_data(chain->dio, chain->bref.data_off);
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_lock: called on unresolved volume header");
		break;
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	default:
		/*
		 * Point data at the device buffer and leave dio intact.
		 */
		chain->data = (void *)bdata;
		break;
	}
	ccms_thread_lock_downgrade(&core->cst, ostate);
	return (0);
}

/*
 * This basically calls hammer2_io_breadcb() but does some pre-processing
 * of the chain first to handle certain cases.  The callback receives a
 * NULL dio when the chain's data is already resolved.
 */
void
hammer2_chain_load_async(hammer2_chain_t *chain,
			 void (*callback)(hammer2_io_t *dio,
					  hammer2_chain_t *chain,
					  void *arg_p, off_t arg_o),
			 void *arg_p, off_t arg_o)
{
	hammer2_mount_t *hmp;
	struct hammer2_io *dio;
	hammer2_blockref_t *bref;
	int error;

	if (chain->data) {
		callback(NULL, chain, arg_p, arg_o);
		return;
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MIN_ALLOC (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
1023 */ 1024 bref = &chain->bref; 1025 hmp = chain->hmp; 1026 1027 /* 1028 * The getblk() optimization can only be used on newly created 1029 * elements if the physical block size matches the request. 1030 */ 1031 if ((chain->flags & HAMMER2_CHAIN_INITIAL) && 1032 chain->bytes == hammer2_devblksize(chain->bytes)) { 1033 error = hammer2_io_new(hmp, bref->data_off, chain->bytes, &dio); 1034 KKASSERT(error == 0); 1035 callback(dio, chain, arg_p, arg_o); 1036 return; 1037 } 1038 1039 /* 1040 * Otherwise issue a read 1041 */ 1042 adjreadcounter(&chain->bref, chain->bytes); 1043 hammer2_io_breadcb(hmp, bref->data_off, chain->bytes, 1044 callback, chain, arg_p, arg_o); 1045 } 1046 1047 /* 1048 * Unlock and deref a chain element. 1049 * 1050 * On the last lock release any non-embedded data (chain->dio) will be 1051 * retired. 1052 */ 1053 void 1054 hammer2_chain_unlock(hammer2_chain_t *chain) 1055 { 1056 hammer2_chain_core_t *core = chain->core; 1057 ccms_state_t ostate; 1058 long *counterp; 1059 u_int lockcnt; 1060 1061 /* 1062 * The core->cst lock can be shared across several chains so we 1063 * need to track the per-chain lockcnt separately. 1064 * 1065 * If multiple locks are present (or being attempted) on this 1066 * particular chain we can just unlock, drop refs, and return. 1067 * 1068 * Otherwise fall-through on the 1->0 transition. 1069 */ 1070 for (;;) { 1071 lockcnt = chain->lockcnt; 1072 KKASSERT(lockcnt > 0); 1073 cpu_ccfence(); 1074 if (lockcnt > 1) { 1075 if (atomic_cmpset_int(&chain->lockcnt, 1076 lockcnt, lockcnt - 1)) { 1077 ccms_thread_unlock(&core->cst); 1078 hammer2_chain_drop(chain); 1079 return; 1080 } 1081 } else { 1082 if (atomic_cmpset_int(&chain->lockcnt, 1, 0)) 1083 break; 1084 } 1085 /* retry */ 1086 } 1087 1088 /* 1089 * On the 1->0 transition we upgrade the core lock (if necessary) 1090 * to exclusive for terminal processing. 
If after upgrading we find 1091 * that lockcnt is non-zero, another thread is racing us and will 1092 * handle the unload for us later on, so just cleanup and return 1093 * leaving the data/io intact 1094 * 1095 * Otherwise if lockcnt is still 0 it is possible for it to become 1096 * non-zero and race, but since we hold the core->cst lock 1097 * exclusively all that will happen is that the chain will be 1098 * reloaded after we unload it. 1099 */ 1100 ostate = ccms_thread_lock_upgrade(&core->cst); 1101 if (chain->lockcnt) { 1102 ccms_thread_unlock_upgraded(&core->cst, ostate); 1103 hammer2_chain_drop(chain); 1104 return; 1105 } 1106 1107 /* 1108 * Shortcut the case if the data is embedded or not resolved. 1109 * 1110 * Do NOT NULL out chain->data (e.g. inode data), it might be 1111 * dirty. 1112 */ 1113 if (chain->dio == NULL) { 1114 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) 1115 hammer2_chain_drop_data(chain, 0); 1116 ccms_thread_unlock_upgraded(&core->cst, ostate); 1117 hammer2_chain_drop(chain); 1118 return; 1119 } 1120 1121 /* 1122 * Statistics 1123 */ 1124 if (hammer2_io_isdirty(chain->dio) == 0) { 1125 ; 1126 } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) { 1127 switch(chain->bref.type) { 1128 case HAMMER2_BREF_TYPE_DATA: 1129 counterp = &hammer2_ioa_file_write; 1130 break; 1131 case HAMMER2_BREF_TYPE_INODE: 1132 counterp = &hammer2_ioa_meta_write; 1133 break; 1134 case HAMMER2_BREF_TYPE_INDIRECT: 1135 counterp = &hammer2_ioa_indr_write; 1136 break; 1137 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1138 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1139 counterp = &hammer2_ioa_fmap_write; 1140 break; 1141 default: 1142 counterp = &hammer2_ioa_volu_write; 1143 break; 1144 } 1145 *counterp += chain->bytes; 1146 } else { 1147 switch(chain->bref.type) { 1148 case HAMMER2_BREF_TYPE_DATA: 1149 counterp = &hammer2_iod_file_write; 1150 break; 1151 case HAMMER2_BREF_TYPE_INODE: 1152 counterp = &hammer2_iod_meta_write; 1153 break; 1154 case HAMMER2_BREF_TYPE_INDIRECT: 1155 
counterp = &hammer2_iod_indr_write; 1156 break; 1157 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1158 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1159 counterp = &hammer2_iod_fmap_write; 1160 break; 1161 default: 1162 counterp = &hammer2_iod_volu_write; 1163 break; 1164 } 1165 *counterp += chain->bytes; 1166 } 1167 1168 /* 1169 * Clean out the dio. 1170 * 1171 * If a device buffer was used for data be sure to destroy the 1172 * buffer when we are done to avoid aliases (XXX what about the 1173 * underlying VM pages?). 1174 * 1175 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing 1176 * is possible. 1177 * 1178 * NOTE: The isdirty check tracks whether we have to bdwrite() the 1179 * buffer or not. The buffer might already be dirty. The 1180 * flag is re-set when chain_modify() is called, even if 1181 * MODIFIED is already set, allowing the OS to retire the 1182 * buffer independent of a hammer2 flush. 1183 */ 1184 chain->data = NULL; 1185 if ((chain->flags & HAMMER2_CHAIN_IOFLUSH) && 1186 hammer2_io_isdirty(chain->dio)) { 1187 hammer2_io_bawrite(&chain->dio); 1188 } else { 1189 hammer2_io_bqrelse(&chain->dio); 1190 } 1191 ccms_thread_unlock_upgraded(&core->cst, ostate); 1192 hammer2_chain_drop(chain); 1193 } 1194 1195 /* 1196 * This counts the number of live blockrefs in a block array and 1197 * also calculates the point at which all remaining blockrefs are empty. 1198 * This routine can only be called on a live chain (DUPLICATED flag not set). 1199 * 1200 * NOTE: Flag is not set until after the count is complete, allowing 1201 * callers to test the flag without holding the spinlock. 1202 * 1203 * NOTE: If base is NULL the related chain is still in the INITIAL 1204 * state and there are no blockrefs to count. 1205 * 1206 * NOTE: live_count may already have some counts accumulated due to 1207 * creation and deletion and could even be initially negative. 
 */
void
hammer2_chain_countbrefs(hammer2_chain_t *chain,
			 hammer2_blockref_t *base, int count)
{
	hammer2_chain_core_t *core = chain->core;

	KKASSERT((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0);

	spin_lock(&core->cst.spin);
	if ((core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) {
		if (base) {
			/*
			 * Scan backwards to locate the first non-empty
			 * element; live_zero becomes the index just past it
			 * (all elements at or beyond live_zero are empty).
			 */
			while (--count >= 0) {
				if (base[count].type)
					break;
			}
			core->live_zero = count + 1;
			/*
			 * Accumulate the number of live (non-zero type)
			 * blockrefs into live_count.
			 */
			while (count >= 0) {
				if (base[count].type)
					atomic_add_int(&core->live_count, 1);
				--count;
			}
		} else {
			/* INITIAL chain: no media blockrefs to count */
			core->live_zero = 0;
		}
		/* else do not modify live_count */
		atomic_set_int(&core->flags, HAMMER2_CORE_COUNTEDBREFS);
	}
	spin_unlock(&core->cst.spin);
}

/*
 * Resize the chain's physical storage allocation in-place.  This may
 * replace the passed-in chain with a new chain.
 *
 * Chains can be resized smaller without reallocating the storage.
 * Resizing larger will reallocate the storage.
 *
 * Must be passed an exclusively locked parent and chain, returns a new
 * exclusively locked chain at the same index and unlocks the old chain.
 * Flushes the buffer if necessary.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode
 * data buffer.  That is, the passed-in bp is a logical buffer, whereas
 * any chain-oriented bp would be a device buffer.
 *
 * nradix is the log2 of the requested new size (nbytes = 1 << nradix).
 * NOTE(review): assumes nradix < 32 since nbytes is computed with a
 * 32-bit shift (1U << nradix) — TODO confirm against callers.
 *
 * XXX return error if cannot resize.
 */
void
hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
		     hammer2_chain_t *parent, hammer2_chain_t **chainp,
		     int nradix, int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	size_t obytes;
	size_t nbytes;

	chain = *chainp;
	hmp = chain->hmp;

	/*
	 * Only data and indirect blocks can be resized for now.
	 * (The volu root, inodes, and freemap elements use a fixed size).
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);

	/*
	 * Nothing to do if the element is already the proper size
	 */
	obytes = chain->bytes;
	nbytes = 1U << nradix;
	if (obytes == nbytes)
		return;

	/*
	 * Delete the old chain and duplicate it at the same (parent, index),
	 * returning a new chain.  This allows the old chain to still be
	 * used by the flush code.  The new chain will be returned in a
	 * modified state.
	 *
	 * The parent does not have to be locked for the delete/duplicate call,
	 * but is in this particular code path.
	 *
	 * NOTE: If we are not crossing a synchronization point the
	 *	 duplication code will simply reuse the existing chain
	 *	 structure.
	 */
	hammer2_chain_delete_duplicate(trans, &chain, 0);

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 */
	hammer2_freemap_alloc(trans, chain, nbytes);
	chain->bytes = nbytes;
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FORCECOW);
	/*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */

	/*
	 * For now just support it on DATA chains (and not on indirect
	 * blocks).
	 */
	KKASSERT(chain->dio == NULL);

#if 0
	/*
	 * Make sure the chain is marked MOVED and propagate the update
	 * to the root for flush.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	hammer2_chain_setsubmod(trans, chain);
#endif
	*chainp = chain;
}

/*
 * Set a chain modified, making it read-write and duplicating it if necessary.
 * This function will assign a new physical block to the chain if necessary
 *
 * Duplication of already-modified chains is possible when the modification
 * crosses a flush synchronization boundary.
 *
 * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
 *		     level or the COW operation will not work.
 *
 * Data blocks	   - The chain is usually locked RESOLVE_NEVER so as not to
 *		     run the data through the device buffers.
 *
 * This function may return a different chain than was passed, in which case
 * the old chain will be unlocked and the new chain will be locked.
 *
 * ip->chain may be adjusted by hammer2_chain_modify_ip().
 */
hammer2_inode_data_t *
hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
			hammer2_chain_t **chainp, int flags)
{
	/* Flag the inode modified, then run the normal chain modification */
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	hammer2_chain_modify(trans, chainp, flags);
	/* Repoint the inode if the modify replaced the chain */
	if (ip->chain != *chainp)
		hammer2_inode_repoint(ip, NULL, *chainp);
	if (ip->vp)
		vsetisdirty(ip->vp);
	return(&ip->chain->data->ipdata);
}

void
hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
		     int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	hammer2_io_t *dio;
	int error;
	int wasinitial;
	char *bdata;

	chain = *chainp;
	hmp = chain->hmp;

	KKASSERT(chain->bref.mirror_tid != trans->sync_tid ||
		 (chain->flags & HAMMER2_CHAIN_MODIFIED));

	/*
	 * data is not optional for freemap chains (we must always be sure
	 * to copy the data on COW storage allocations).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
		KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) ||
			 (flags & HAMMER2_MODIFY_OPTDATA) == 0);
	}

	/*
	 * Determine if a delete-duplicate is needed.
	 *
	 * (a) Modify_tid is part of a prior flush
	 * (b) Transaction is concurrent with a flush (has higher tid)
	 * (c) and chain is not in the initial state (freshly created)
	 * (d) and caller didn't request an in-place modification.
	 *
	 * The freemap and volume header special chains are never D-Dd.
	 */
	if (chain->modify_tid != trans->sync_tid &&	   /* cross boundary */
	    (flags & HAMMER2_MODIFY_INPLACE) == 0) {	   /* from d-d */
		if (chain != &hmp->fchain && chain != &hmp->vchain) {
			KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0);
			hammer2_chain_delete_duplicate(trans, chainp, 0);
			chain = *chainp;
		}

		/*
		 * Fall through if fchain or vchain, clearing the CHAIN_FLUSHED
		 * flag.  Basically other chains are delete-duplicated and so
		 * the duplicated chains of course will not have the FLUSHED
		 * flag set, but fchain and vchain are special-cased and the
		 * flag must be cleared when changing modify_tid.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FLUSHED);
	}

	/*
	 * Data must be resolved if already assigned unless explicitly
	 * flagged otherwise.  The lock/unlock cycle resolves chain->data
	 * as a side effect.
	 */
	if (chain->data == NULL && (flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
	    (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
		hammer2_chain_unlock(chain);
	}

	/*
	 * Otherwise do initial-chain handling.  Set MODIFIED and hold an
	 * extra ref for the flag (dropped when the flag is cleared).
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		hammer2_chain_ref(chain);
		hammer2_chain_memory_inc(chain->pmp);
	}
#if 0
	/* shouldn't be needed */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
#endif

	/*
	 * The modification or re-modification requires an allocation and
	 * possible COW.
	 *
	 * We normally always allocate new storage here.  If storage exists
	 * and MODIFY_NOREALLOC is passed in, we do not allocate new storage.
	 */
	if (chain != &hmp->vchain && chain != &hmp->fchain) {
		if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
		     ((flags & HAMMER2_MODIFY_NOREALLOC) == 0 &&
		      chain->modify_tid != trans->sync_tid)
		) {
			hammer2_freemap_alloc(trans, chain, chain->bytes);
			/* XXX failed allocation */
		} else if (chain->flags & HAMMER2_CHAIN_FORCECOW) {
			hammer2_freemap_alloc(trans, chain, chain->bytes);
			/* XXX failed allocation */
		}
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FORCECOW);
	}

	/*
	 * Update modify_tid.  XXX special-case vchain/fchain because they
	 * are always modified in-place.  Otherwise the chain being modified
	 * must not be part of a future transaction.
	 */
	if (chain == &hmp->vchain || chain == &hmp->fchain) {
		if (chain->modify_tid <= trans->sync_tid)
			chain->modify_tid = trans->sync_tid;
	} else {
		KKASSERT(chain->modify_tid <= trans->sync_tid);
		chain->modify_tid = trans->sync_tid;
	}

	if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
		chain->bref.modify_tid = trans->sync_tid;

	/*
	 * Do not COW BREF_TYPE_DATA when OPTDATA is set.  This is because
	 * data modifications are done via the logical buffer cache so COWing
	 * it here would result in unnecessary extra copies (and possibly extra
	 * block reallocations).  The INITIAL flag remains unchanged in this
	 * situation.
	 *
	 * (This is a bit of a hack).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
	    (flags & HAMMER2_MODIFY_OPTDATA)) {
		goto skip2;
	}

	/*
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * we've processed the uninitialized storage allocation.
	 *
	 * If this flag is already clear we are likely in a copy-on-write
	 * situation but we have to be sure NOT to bzero the storage if
	 * no data is present.
	 */
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
		wasinitial = 1;
	} else {
		wasinitial = 0;
	}

	/*
	 * Instantiate data buffer and possibly execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_FREEMAP:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->dio == NULL);
		break;
	case HAMMER2_BREF_TYPE_INODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		/*
		 * Perform the copy-on-write operation
		 *
		 * zero-fill or copy-on-write depending on whether
		 * chain->data exists or not and set the dirty state for
		 * the new buffer.  hammer2_io_new() will handle the
		 * zero-fill.
		 */
		KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain);

		if (wasinitial) {
			error = hammer2_io_new(hmp, chain->bref.data_off,
					       chain->bytes, &dio);
		} else {
			error = hammer2_io_bread(hmp, chain->bref.data_off,
						 chain->bytes, &dio);
		}
		adjreadcounter(&chain->bref, chain->bytes);
		KKASSERT(error == 0);

		bdata = hammer2_io_data(dio, chain->bref.data_off);

		if (chain->data) {
			/* COW: copy old contents into the new buffer */
			KKASSERT(chain->dio != NULL);
			if (chain->data != (void *)bdata) {
				bcopy(chain->data, bdata, chain->bytes);
			}
		} else if (wasinitial == 0) {
			/*
			 * We have a problem.  We were asked to COW but
			 * we don't have any data to COW with!
			 */
			panic("hammer2_chain_modify: having a COW %p\n",
			      chain);
		}

		/*
		 * Retire the old buffer, replace with the new
		 */
		if (chain->dio)
			hammer2_io_brelse(&chain->dio);
		chain->data = (void *)bdata;
		chain->dio = dio;
		hammer2_io_setdirty(dio);	/* modified by bcopy above */
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	hammer2_chain_setsubmod(trans, chain);
}

/*
 * Mark the volume as having been modified.  This short-cut version
 * does not have to lock the volume's chain, which allows the ioctl
 * code to make adjustments to connections without deadlocking.  XXX
 *
 * No ref is made on vchain when flagging it MODIFIED.
 */
void
hammer2_modify_volume(hammer2_mount_t *hmp)
{
	/* The lock/unlock(1) cycle performs the modification marking */
	hammer2_voldata_lock(hmp);
	hammer2_voldata_unlock(hmp, 1);
}

/*
 * This function returns the chain at the nearest key within the specified
 * range with the highest delete_tid.  The core spinlock must be held on
 * call and the returned chain will be referenced but not locked.
 *
 * The returned chain may or may not be in a deleted state.  Note that
 * live chains have a delete_tid = MAX_TID.
 *
 * This function will recurse through chain->rbtree as necessary and will
 * return a *key_nextp suitable for iteration.  *key_nextp is only set if
 * the iteration value is less than the current value of *key_nextp.
 *
 * The caller should use (*key_nextp) to calculate the actual range of
 * the returned element, which will be (key_beg to *key_nextp - 1), because
 * there might be another element which is superior to the returned element
 * and overlaps it.
 *
 * (*key_nextp) can be passed as key_beg in an iteration only while non-NULL
 * chains continue to be returned.  On EOF (*key_nextp) may overflow since
 * it will wind up being (key_end + 1).
 */
struct hammer2_chain_find_info {
	hammer2_chain_t	*best;		/* current best candidate */
	hammer2_key_t	key_beg;	/* inclusive range start */
	hammer2_key_t	key_end;	/* inclusive range end */
	hammer2_key_t	key_next;	/* iteration helper, see above */
};

static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);

/*
 * DEBUGGING - Assert that the chain will not collide.
 */
static
void
hammer2_chain_assert_not_present(hammer2_chain_core_t *core,
				 hammer2_chain_t *chain)
{
	struct hammer2_chain_find_info info;
	hammer2_chain_layer_t *layer;

	if (chain->flags & HAMMER2_CHAIN_DELETED)
		return;

	/* Search the exact key range covered by the chain */
	info.best = NULL;
	info.key_beg = chain->bref.key;
	info.key_end = chain->bref.key +
		       ((hammer2_key_t)1 << chain->bref.keybits) - 1;
	info.key_next = HAMMER2_MAX_KEY;

	TAILQ_FOREACH(layer, &core->layerq, entry) {
		KKASSERT(layer->good == 0xABCD);
		RB_SCAN(hammer2_chain_tree, &layer->rbtree,
			hammer2_chain_find_cmp, hammer2_chain_find_callback,
			&info);
	}
	/* A live (non-DELETED) overlap is a topology violation */
	if (info.best && (info.best->flags & HAMMER2_CHAIN_DELETED) == 0)
		panic("hammer2_chain_assert_not_present: %p/%p\n",
		      chain, info.best);
}

/*
 * Scan all layers of the core for the best chain overlapping
 * (key_beg, key_end).  See the block comment above for the full
 * contract; returns the best chain referenced-by-caller-convention
 * (referenced but not locked) or NULL.
 */
static
hammer2_chain_t *
hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	struct hammer2_chain_find_info info;
	hammer2_chain_layer_t *layer;

	info.best = NULL;
	info.key_beg = key_beg;
	info.key_end = key_end;
	info.key_next = *key_nextp;

	KKASSERT(parent->core->good == 0x1234);
	TAILQ_FOREACH(layer, &parent->core->layerq, entry) {
		KKASSERT(layer->good == 0xABCD);
		RB_SCAN(hammer2_chain_tree, &layer->rbtree,
			hammer2_chain_find_cmp, hammer2_chain_find_callback,
			&info);
	}
	*key_nextp = info.key_next;
#if 0
	kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
		parent, key_beg, key_end, *key_nextp);
#endif

	return (info.best);
}

/*
 * RB_SCAN comparison: -1 if the child lies entirely below the search
 * range, 1 if entirely above, 0 if it overlaps (callback is invoked).
 */
static
int
hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_key_t child_beg;
	hammer2_key_t child_end;

	child_beg = child->bref.key;
	child_end = child_beg + ((hammer2_key_t)1 << child->bref.keybits) - 1;

	if (child_end < info->key_beg)
		return(-1);
	if (child_beg > info->key_end)
		return(1);
	return(0);
}

static
int
hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
{
	struct hammer2_chain_find_info *info = data;
	hammer2_chain_t *best;
	hammer2_key_t child_end;

	/*
	 * WARNING! Do not discard DUPLICATED chains, it is possible that
	 *	    we are catching an insertion half-way done.  If a
	 *	    duplicated chain turns out to be the best choice the
	 *	    caller will re-check its flags after locking it.
	 *
	 * WARNING! Layerq is scanned forwards, exact matches should keep
	 *	    the existing info->best.
	 */
	if ((best = info->best) == NULL) {
		/*
		 * No previous best.  Assign best
		 */
		info->best = child;
	} else if (best->bref.key <= info->key_beg &&
		   child->bref.key <= info->key_beg) {
		/*
		 * If our current best is flush with key_beg and child is
		 * also flush with key_beg choose based on delete_tid.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		if (child->delete_tid > best->delete_tid)
			info->best = child;
	} else if (child->bref.key < best->bref.key) {
		/*
		 * Child has a nearer key and best is not flush with key_beg.
		 * Truncate key_next to the old best key iff it had a better
		 * delete_tid.
		 */
		info->best = child;
		if (best->delete_tid >= child->delete_tid &&
		    (info->key_next > best->bref.key || info->key_next == 0))
			info->key_next = best->bref.key;
	} else if (child->bref.key == best->bref.key) {
		/*
		 * If our current best is flush with the child then choose
		 * based on delete_tid.
		 *
		 * key_next will automatically be limited to the smaller of
		 * the two end-points.
		 */
		if (child->delete_tid > best->delete_tid)
			info->best = child;
	} else {
		/*
		 * Keep the current best but truncate key_next to the child's
		 * base iff the child has a higher delete_tid.
		 *
		 * key_next will also automatically be limited to the smaller
		 * of the two end-points (probably not necessary for this case
		 * but we do it anyway).
		 */
		if (child->delete_tid >= best->delete_tid &&
		    (info->key_next > child->bref.key || info->key_next == 0))
			info->key_next = child->bref.key;
	}

	/*
	 * Always truncate key_next based on child's end-of-range.
	 */
	child_end = child->bref.key + ((hammer2_key_t)1 << child->bref.keybits);
	if (child_end && (info->key_next > child_end || info->key_next == 0))
		info->key_next = child_end;

	return(0);
}

/*
 * Retrieve the specified chain from a media blockref, creating the
 * in-memory chain structure which reflects it.  modify_tid will be
 * left 0 which forces any modifications to issue a delete-duplicate.
 *
 * To handle insertion races pass the INSERT_RACE flag along with the
 * generation number of the core.  NULL will be returned if the generation
 * number changes before we have a chance to insert the chain.  Insert
 * races can occur because the parent might be held shared.
 *
 * Caller must hold the parent locked shared or exclusive since we may
 * need the parent's bref array to find our block.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_chain_t *parent, hammer2_blockref_t *bref,
		  int generation)
{
	hammer2_mount_t *hmp = parent->hmp;
	hammer2_chain_core_t *above = parent->core;
	hammer2_chain_t *chain;
	int error;

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.  Resulting chain has one ref and is not locked.
	 */
	chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref);
	hammer2_chain_core_alloc(NULL, chain, NULL);
	/* ref'd chain returned */
	chain->modify_tid = chain->bref.mirror_tid;

	/*
	 * Link the chain into its parent.  A spinlock is required to safely
	 * access the RBTREE, and it is possible to collide with another
	 * hammer2_chain_get() operation because the caller might only hold
	 * a shared lock on the parent.
	 */
	KKASSERT(parent->refs > 0);
	error = hammer2_chain_insert(above, NULL, chain,
				     HAMMER2_CHAIN_INSERT_SPIN |
				     HAMMER2_CHAIN_INSERT_RACE,
				     generation);
	if (error) {
		/* Lost the race: discard our chain and return NULL */
		KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
		kprintf("chain %p get race\n", chain);
		hammer2_chain_drop(chain);
		chain = NULL;
	} else {
		KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	}

	/*
	 * Return our new chain referenced but not locked, or NULL if
	 * a race occurred.
	 */
	return (chain);
}

/*
 * Lookup initialization/completion API.  Locks the parent for a lookup
 * sequence, honoring the SHARED request flag.
 */
hammer2_chain_t *
hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
{
	if (flags & HAMMER2_LOOKUP_SHARED) {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
					   HAMMER2_RESOLVE_SHARED);
	} else {
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
	}
	return (parent);
}

void
hammer2_chain_lookup_done(hammer2_chain_t *parent)
{
	if (parent)
		hammer2_chain_unlock(parent);
}

/*
 * Replace (*parentp) with the next-higher non-DUPLICATED owner of its
 * core, locked with 'how'.  Used by lookup/next to recurse upwards.
 */
static
hammer2_chain_t *
hammer2_chain_getparent(hammer2_chain_t **parentp, int how)
{
	hammer2_chain_t *oparent;
	hammer2_chain_t *bparent;
	hammer2_chain_t *nparent;
	hammer2_chain_core_t *above;

	oparent = *parentp;
	above = oparent->above;

	spin_lock(&above->cst.spin);
	bparent = TAILQ_FIRST(&above->ownerq);
	hammer2_chain_ref(bparent);

	/*
	 * Be careful of order, oparent must be unlocked before nparent
	 * is locked below to avoid a deadlock.  We might as well delay its
	 * unlocking until we conveniently no longer have the spinlock (instead
	 * of cycling the spinlock).
	 *
	 * Theoretically our ref on bparent should prevent elements of the
	 * following chain from going away and prevent above from going away,
	 * but we still need the spinlock to safely scan the list.
	 */
	for (;;) {
		/* Skip past DUPLICATED owners to the live one */
		nparent = bparent;
		while (nparent->flags & HAMMER2_CHAIN_DUPLICATED)
			nparent = TAILQ_NEXT(nparent, core_entry);
		hammer2_chain_ref(nparent);
		spin_unlock(&above->cst.spin);

		if (oparent) {
			hammer2_chain_unlock(oparent);
			oparent = NULL;
		}
		hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF);
		hammer2_chain_drop(bparent);

		/*
		 * We might have raced a delete-duplicate.
		 */
		if ((nparent->flags & HAMMER2_CHAIN_DUPLICATED) == 0)
			break;
		bparent = nparent;
		hammer2_chain_ref(bparent);
		hammer2_chain_unlock(nparent);
		spin_lock(&above->cst.spin);
		/* retry */
	}
	*parentp = nparent;

	return (nparent);
}

/*
 * Locate the first chain whos key range overlaps (key_beg, key_end) inclusive.
 * (*parentp) typically points to an inode but can also point to a related
 * indirect block and this function will recurse upwards and find the inode
 * again.
 *
 * (*parentp) must be exclusively locked and referenced and can be an inode
 * or an existing indirect block within the inode.
 *
 * On return (*parentp) will be modified to point at the deepest parent chain
 * element encountered during the search, as a helper for an insertion or
 * deletion.  The new (*parentp) will be locked and referenced and the old
 * will be unlocked and dereferenced (no change if they are both the same).
 *
 * The matching chain will be returned exclusively locked.  If NOLOCK is
 * requested the chain will be returned only referenced.
 *
 * NULL is returned if no match was found, but (*parentp) will still
 * potentially be adjusted.
 *
 * On return (*key_nextp) will point to an iterative value for key_beg.
 * (If NULL is returned (*key_nextp) is set to key_end).
 *
 * This function will also recurse up the chain if the key is not within the
 * current parent's range.  (*parentp) can never be set to NULL.  An iteration
 * can simply allow (*parentp) to float inside the loop.
 *
 * NOTE!  chain->data is not always resolved.  By default it will not be
 *	  resolved for BREF_TYPE_DATA, FREEMAP_NODE, or FREEMAP_LEAF.  Use
 *	  HAMMER2_LOOKUP_ALWAYS to force resolution (but be careful w/
 *	  BREF_TYPE_DATA as the device buffer can alias the logical file
 *	  buffer).
 */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int *cache_indexp, int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_blockref_t bcopy;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	hammer2_chain_core_t *above;
	int count = 0;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int how;
	int generation;
	int maxloops = 300000;
	int wasdup;

	/*
	 * Translate the lookup flags into chain-lock resolve modes.
	 * NOLOCK implies the caller does not want data resolved either.
	 */
	if (flags & HAMMER2_LOOKUP_ALWAYS) {
		how_maybe = how_always;
		how = HAMMER2_RESOLVE_ALWAYS;
	} else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) {
		how = HAMMER2_RESOLVE_NEVER;
	} else {
		how = HAMMER2_RESOLVE_MAYBE;
	}
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
		how |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 *
	 * This function handles races against the flusher doing a delete-
	 * duplicate above us and re-homes the parent to the duplicate in
	 * that case, otherwise we'd wind up recursing down a stale chain.
	 */
	parent = *parentp;
	hmp = parent->hmp;

	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	       parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			break;
		parent = hammer2_chain_getparent(parentp, how_maybe);
	}

again:
	/* guard against infinite retry loops caused by persistent races */
	if (--maxloops == 0)
		panic("hammer2_chain_lookup: maxloops");
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_ref(parent);
			else
				hammer2_chain_lock(parent, how_always);
			*key_nextp = key_end + 1;
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Handle MATCHIND on the parent: if the indirect block's
		 * key range exactly matches the request, return the
		 * indirect block itself rather than recursing into it.
		 */
		if (flags & HAMMER2_LOOKUP_MATCHIND) {
			scan_beg = parent->bref.key;
			scan_end = scan_beg +
			       ((hammer2_key_t)1 << parent->bref.keybits) - 1;
			if (key_beg == scan_beg && key_end == scan_end) {
				chain = parent;
				hammer2_chain_lock(chain, how_maybe);
				*key_nextp = scan_end + 1;
				goto done;
			}
		}
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &hmp->voldata.freemap_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * Merged scan to find next candidate.
	 *
	 * hammer2_base_*() functions require the above->live_* fields
	 * to be synchronized.
	 *
	 * We need to hold the spinlock to access the block array and RB tree
	 * and to interlock chain creation.
	 */
	above = parent->core;
	if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
		hammer2_chain_countbrefs(parent, base, count);

	/*
	 * Combined search of the in-memory RB tree and the media blockref
	 * array.  The generation count is sampled under the spinlock so
	 * we can detect insertions that race us after we release it.
	 */
	spin_lock(&above->cst.spin);
	chain = hammer2_combined_find(parent, base, count,
				      cache_indexp, key_nextp,
				      key_beg, key_end, &bref);
	generation = above->generation;

	/*
	 * Exhausted parent chain, iterate.
	 */
	if (bref == NULL) {
		spin_unlock(&above->cst.spin);
		if (key_beg == key_end)	/* short cut single-key case */
			return (NULL);
		return (hammer2_chain_next(parentp, NULL, key_nextp,
					   key_beg, key_end,
					   cache_indexp, flags));
	}

	/*
	 * Selected from blockref or in-memory chain.  When no in-memory
	 * chain exists yet we must instantiate one from a private copy of
	 * the blockref (bcopy) and re-verify against the live bref to
	 * detect concurrent modification.
	 */
	if (chain == NULL) {
		bcopy = *bref;
		spin_unlock(&above->cst.spin);
		chain = hammer2_chain_get(parent, &bcopy, generation);
		if (chain == NULL) {
			kprintf("retry lookup parent %p keys %016jx:%016jx\n",
				parent, key_beg, key_end);
			goto again;
		}
		if (bcmp(&bcopy, bref, sizeof(bcopy))) {
			hammer2_chain_drop(chain);
			goto again;
		}
		wasdup = 0;
	} else {
		hammer2_chain_ref(chain);
		wasdup = ((chain->flags & HAMMER2_CHAIN_DUPLICATED) != 0);
		spin_unlock(&above->cst.spin);
	}

	/*
	 * chain is referenced but not locked.  We must lock the chain
	 * to obtain definitive DUPLICATED/DELETED state.
	 * RESOLVE_NOREF folds our temporary ref into the lock.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_lock(chain, how_maybe | HAMMER2_RESOLVE_NOREF);
	} else {
		hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF);
	}

	/*
	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
	 *
	 * NOTE: Chain's key range is not relevant as there might be
	 *	 one-offs within the range that are not deleted.
	 *
	 * NOTE: Lookups can race delete-duplicate because
	 *	 delete-duplicate does not lock the parent's core
	 *	 (they just use the spinlock on the core).  We must
	 *	 check for races by comparing the DUPLICATED flag before
	 *	 releasing the spinlock with the flag after locking the
	 *	 chain.
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED) {
		hammer2_chain_unlock(chain);
		if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 || wasdup) {
			/* not a race, advance past this key and retry */
			key_beg = *key_nextp;
			if (key_beg == 0 || key_beg > key_end)
				return(NULL);
		}
		goto again;
	}

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.  We must maintain our top-down locks
	 * to prevent the flusher from interfering (i.e. doing a
	 * delete-duplicate and leaving us recursing down a deleted chain).
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE
	 * so we can access its data.  It might need a fixup if the caller
	 * passed incompatible flags.  Be careful not to cause a deadlock
	 * as a data-load requires an exclusive lock.
	 *
	 * If HAMMER2_LOOKUP_MATCHIND is set and the indirect block's key
	 * range is within the requested key range we return the indirect
	 * block and do NOT loop.  This is usually only used to acquire
	 * freemap nodes.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
	    chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		hammer2_chain_unlock(parent);
		*parentp = parent = chain;
		goto again;
	}
done:
	/*
	 * All done, return the chain
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after it's index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.  Chain is always at least referenced.
 *
 * WARNING!  The MATCHIND flag does not apply to this function.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
		   hammer2_key_t *key_nextp,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int *cache_indexp, int flags)
{
	hammer2_chain_t *parent;
	int how_maybe;

	/*
	 * Calculate locking flags for upward recursion.
	 */
	how_maybe = HAMMER2_RESOLVE_MAYBE;
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;

	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/* advance key past the chain's entire key range */
		key_beg = chain->bref.key +
			  ((hammer2_key_t)1 << chain->bref.keybits);
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_drop(chain);
		else
			hammer2_chain_unlock(chain);

		/*
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		/* key_beg == 0 means the add above wrapped: iteration done */
		if (key_beg == 0 || key_beg > key_end)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT &&
		   parent->bref.type != HAMMER2_BREF_TYPE_FREEMAP_NODE) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 */
		key_beg = parent->bref.key +
			  ((hammer2_key_t)1 << parent->bref.keybits);
		if (key_beg == 0 || key_beg > key_end)
			return (NULL);
		parent = hammer2_chain_getparent(parentp, how_maybe);
	}

	/*
	 * And execute
	 */
	return (hammer2_chain_lookup(parentp, key_nextp,
				     key_beg, key_end,
				     cache_indexp, flags));
}

/*
 * The raw scan function is similar to lookup/next but does not seek to a key.
 * Blockrefs are iterated via first_chain = (parent, NULL) and
 * next_chain = (parent, chain).
 *
 * The passed-in parent must be locked and its data resolved.  The returned
 * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
 * under parent and then iterate with the passed-in chain (which this
 * function will unlock).
2283 */ 2284 hammer2_chain_t * 2285 hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t *chain, 2286 int *cache_indexp, int flags) 2287 { 2288 hammer2_mount_t *hmp; 2289 hammer2_blockref_t *base; 2290 hammer2_blockref_t *bref; 2291 hammer2_blockref_t bcopy; 2292 hammer2_chain_core_t *above; 2293 hammer2_key_t key; 2294 hammer2_key_t next_key; 2295 int count = 0; 2296 int how_always = HAMMER2_RESOLVE_ALWAYS; 2297 int how_maybe = HAMMER2_RESOLVE_MAYBE; 2298 int how; 2299 int generation; 2300 int maxloops = 300000; 2301 int wasdup; 2302 2303 hmp = parent->hmp; 2304 2305 /* 2306 * Scan flags borrowed from lookup 2307 */ 2308 if (flags & HAMMER2_LOOKUP_ALWAYS) { 2309 how_maybe = how_always; 2310 how = HAMMER2_RESOLVE_ALWAYS; 2311 } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) { 2312 how = HAMMER2_RESOLVE_NEVER; 2313 } else { 2314 how = HAMMER2_RESOLVE_MAYBE; 2315 } 2316 if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) { 2317 how_maybe |= HAMMER2_RESOLVE_SHARED; 2318 how_always |= HAMMER2_RESOLVE_SHARED; 2319 how |= HAMMER2_RESOLVE_SHARED; 2320 } 2321 2322 /* 2323 * Calculate key to locate first/next element, unlocking the previous 2324 * element as we go. Be careful, the key calculation can overflow. 2325 */ 2326 if (chain) { 2327 key = chain->bref.key + 2328 ((hammer2_key_t)1 << chain->bref.keybits); 2329 hammer2_chain_unlock(chain); 2330 chain = NULL; 2331 if (key == 0) 2332 goto done; 2333 } else { 2334 key = 0; 2335 } 2336 2337 again: 2338 if (--maxloops == 0) 2339 panic("hammer2_chain_scan: maxloops"); 2340 /* 2341 * Locate the blockref array. Currently we do a fully associative 2342 * search through the array. 2343 */ 2344 switch(parent->bref.type) { 2345 case HAMMER2_BREF_TYPE_INODE: 2346 /* 2347 * An inode with embedded data has no sub-chains. 
2348 */ 2349 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) 2350 goto done; 2351 base = &parent->data->ipdata.u.blockset.blockref[0]; 2352 count = HAMMER2_SET_COUNT; 2353 break; 2354 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2355 case HAMMER2_BREF_TYPE_INDIRECT: 2356 /* 2357 * Optimize indirect blocks in the INITIAL state to avoid 2358 * I/O. 2359 */ 2360 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 2361 base = NULL; 2362 } else { 2363 if (parent->data == NULL) 2364 panic("parent->data is NULL"); 2365 base = &parent->data->npdata[0]; 2366 } 2367 count = parent->bytes / sizeof(hammer2_blockref_t); 2368 break; 2369 case HAMMER2_BREF_TYPE_VOLUME: 2370 base = &hmp->voldata.sroot_blockset.blockref[0]; 2371 count = HAMMER2_SET_COUNT; 2372 break; 2373 case HAMMER2_BREF_TYPE_FREEMAP: 2374 base = &hmp->voldata.freemap_blockset.blockref[0]; 2375 count = HAMMER2_SET_COUNT; 2376 break; 2377 default: 2378 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 2379 parent->bref.type); 2380 base = NULL; /* safety */ 2381 count = 0; /* safety */ 2382 } 2383 2384 /* 2385 * Merged scan to find next candidate. 2386 * 2387 * hammer2_base_*() functions require the above->live_* fields 2388 * to be synchronized. 2389 * 2390 * We need to hold the spinlock to access the block array and RB tree 2391 * and to interlock chain creation. 2392 */ 2393 if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) 2394 hammer2_chain_countbrefs(parent, base, count); 2395 2396 above = parent->core; 2397 next_key = 0; 2398 spin_lock(&above->cst.spin); 2399 chain = hammer2_combined_find(parent, base, count, 2400 cache_indexp, &next_key, 2401 key, HAMMER2_MAX_KEY, &bref); 2402 generation = above->generation; 2403 2404 /* 2405 * Exhausted parent chain, we're done. 2406 */ 2407 if (bref == NULL) { 2408 spin_unlock(&above->cst.spin); 2409 KKASSERT(chain == NULL); 2410 goto done; 2411 } 2412 2413 /* 2414 * Selected from blockref or in-memory chain. 
2415 */ 2416 if (chain == NULL) { 2417 bcopy = *bref; 2418 spin_unlock(&above->cst.spin); 2419 chain = hammer2_chain_get(parent, &bcopy, generation); 2420 if (chain == NULL) { 2421 kprintf("retry scan parent %p keys %016jx\n", 2422 parent, key); 2423 goto again; 2424 } 2425 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 2426 hammer2_chain_drop(chain); 2427 chain = NULL; 2428 goto again; 2429 } 2430 wasdup = 0; 2431 } else { 2432 hammer2_chain_ref(chain); 2433 wasdup = ((chain->flags & HAMMER2_CHAIN_DUPLICATED) != 0); 2434 spin_unlock(&above->cst.spin); 2435 } 2436 2437 /* 2438 * chain is referenced but not locked. We must lock the chain 2439 * to obtain definitive DUPLICATED/DELETED state 2440 */ 2441 hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF); 2442 2443 /* 2444 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) 2445 * 2446 * NOTE: chain's key range is not relevant as there might be 2447 * one-offs within the range that are not deleted. 2448 * 2449 * NOTE: XXX this could create problems with scans used in 2450 * situations other than mount-time recovery. 2451 * 2452 * NOTE: Lookups can race delete-duplicate because 2453 * delete-duplicate does not lock the parent's core 2454 * (they just use the spinlock on the core). We must 2455 * check for races by comparing the DUPLICATED flag before 2456 * releasing the spinlock with the flag after locking the 2457 * chain. 2458 */ 2459 if (chain->flags & HAMMER2_CHAIN_DELETED) { 2460 hammer2_chain_unlock(chain); 2461 chain = NULL; 2462 2463 if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 || wasdup) { 2464 key = next_key; 2465 if (key == 0) 2466 goto done; 2467 } 2468 goto again; 2469 } 2470 2471 done: 2472 /* 2473 * All done, return the chain or NULL 2474 */ 2475 return (chain); 2476 } 2477 2478 /* 2479 * Create and return a new hammer2 system memory structure of the specified 2480 * key, type and size and insert it under (*parentp). 
This is a full 2481 * insertion, based on the supplied key/keybits, and may involve creating 2482 * indirect blocks and moving other chains around via delete/duplicate. 2483 * 2484 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION 2485 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 2486 * FULL. This typically means that the caller is creating the chain after 2487 * doing a hammer2_chain_lookup(). 2488 * 2489 * (*parentp) must be exclusive locked and may be replaced on return 2490 * depending on how much work the function had to do. 2491 * 2492 * (*chainp) usually starts out NULL and returns the newly created chain, 2493 * but if the caller desires the caller may allocate a disconnected chain 2494 * and pass it in instead. (It is also possible for the caller to use 2495 * chain_duplicate() to create a disconnected chain, manipulate it, then 2496 * pass it into this function to insert it). 2497 * 2498 * This function should NOT be used to insert INDIRECT blocks. It is 2499 * typically used to create/insert inodes and data blocks. 2500 * 2501 * Caller must pass-in an exclusively locked parent the new chain is to 2502 * be inserted under, and optionally pass-in a disconnected, exclusively 2503 * locked chain to insert (else we create a new chain). The function will 2504 * adjust (*parentp) as necessary, create or connect the chain, and 2505 * return an exclusively locked chain in *chainp. 
2506 */ 2507 int 2508 hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp, 2509 hammer2_chain_t **chainp, 2510 hammer2_key_t key, int keybits, int type, size_t bytes) 2511 { 2512 hammer2_mount_t *hmp; 2513 hammer2_chain_t *chain; 2514 hammer2_chain_t *parent = *parentp; 2515 hammer2_chain_core_t *above; 2516 hammer2_blockref_t *base; 2517 hammer2_blockref_t dummy; 2518 int allocated = 0; 2519 int error = 0; 2520 int count; 2521 int maxloops = 300000; 2522 2523 above = parent->core; 2524 KKASSERT(ccms_thread_lock_owned(&above->cst)); 2525 hmp = parent->hmp; 2526 chain = *chainp; 2527 2528 if (chain == NULL) { 2529 /* 2530 * First allocate media space and construct the dummy bref, 2531 * then allocate the in-memory chain structure. Set the 2532 * INITIAL flag for fresh chains which do not have embedded 2533 * data. 2534 */ 2535 bzero(&dummy, sizeof(dummy)); 2536 dummy.type = type; 2537 dummy.key = key; 2538 dummy.keybits = keybits; 2539 dummy.data_off = hammer2_getradix(bytes); 2540 dummy.methods = parent->bref.methods; 2541 chain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy); 2542 hammer2_chain_core_alloc(trans, chain, NULL); 2543 2544 /* 2545 * Lock the chain manually, chain_lock will load the chain 2546 * which we do NOT want to do. (note: chain->refs is set 2547 * to 1 by chain_alloc() for us, but lockcnt is not). 2548 */ 2549 chain->lockcnt = 1; 2550 ccms_thread_lock(&chain->core->cst, CCMS_STATE_EXCLUSIVE); 2551 allocated = 1; 2552 2553 /* 2554 * We do NOT set INITIAL here (yet). INITIAL is only 2555 * used for indirect blocks. 2556 * 2557 * Recalculate bytes to reflect the actual media block 2558 * allocation. 
2559 */ 2560 bytes = (hammer2_off_t)1 << 2561 (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX); 2562 chain->bytes = bytes; 2563 2564 switch(type) { 2565 case HAMMER2_BREF_TYPE_VOLUME: 2566 case HAMMER2_BREF_TYPE_FREEMAP: 2567 panic("hammer2_chain_create: called with volume type"); 2568 break; 2569 case HAMMER2_BREF_TYPE_INDIRECT: 2570 panic("hammer2_chain_create: cannot be used to" 2571 "create indirect block"); 2572 break; 2573 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2574 panic("hammer2_chain_create: cannot be used to" 2575 "create freemap root or node"); 2576 break; 2577 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2578 KKASSERT(bytes == sizeof(chain->data->bmdata)); 2579 /* fall through */ 2580 case HAMMER2_BREF_TYPE_INODE: 2581 case HAMMER2_BREF_TYPE_DATA: 2582 default: 2583 /* 2584 * leave chain->data NULL, set INITIAL 2585 */ 2586 KKASSERT(chain->data == NULL); 2587 atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL); 2588 break; 2589 } 2590 } else { 2591 /* 2592 * We are reattaching a chain that has been duplicated and 2593 * left disconnected under a DIFFERENT parent with potentially 2594 * different key/keybits. 2595 * 2596 * The chain must be modified in the current transaction 2597 * (the duplication code should have done that for us), 2598 * and it's modify_tid should be greater than the parent's 2599 * bref.mirror_tid. This should cause it to be created under 2600 * the new parent. 2601 * 2602 * If deleted in the same transaction, the create/delete TIDs 2603 * will be the same and effective the chain will not have 2604 * existed at all from the point of view of the parent. 2605 * 2606 * Do NOT mess with the current state of the INITIAL flag. 
2607 */ 2608 KKASSERT(chain->modify_tid > parent->bref.mirror_tid); 2609 KKASSERT(chain->modify_tid == trans->sync_tid); 2610 chain->bref.key = key; 2611 chain->bref.keybits = keybits; 2612 /* chain->modify_tid = chain->bref.mirror_tid; */ 2613 KKASSERT(chain->above == NULL); 2614 } 2615 2616 /* 2617 * Calculate how many entries we have in the blockref array and 2618 * determine if an indirect block is required. 2619 */ 2620 again: 2621 if (--maxloops == 0) 2622 panic("hammer2_chain_create: maxloops"); 2623 above = parent->core; 2624 2625 switch(parent->bref.type) { 2626 case HAMMER2_BREF_TYPE_INODE: 2627 KKASSERT((parent->data->ipdata.op_flags & 2628 HAMMER2_OPFLAG_DIRECTDATA) == 0); 2629 KKASSERT(parent->data != NULL); 2630 base = &parent->data->ipdata.u.blockset.blockref[0]; 2631 count = HAMMER2_SET_COUNT; 2632 break; 2633 case HAMMER2_BREF_TYPE_INDIRECT: 2634 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 2635 if (parent->flags & HAMMER2_CHAIN_INITIAL) 2636 base = NULL; 2637 else 2638 base = &parent->data->npdata[0]; 2639 count = parent->bytes / sizeof(hammer2_blockref_t); 2640 break; 2641 case HAMMER2_BREF_TYPE_VOLUME: 2642 KKASSERT(parent->data != NULL); 2643 base = &hmp->voldata.sroot_blockset.blockref[0]; 2644 count = HAMMER2_SET_COUNT; 2645 break; 2646 case HAMMER2_BREF_TYPE_FREEMAP: 2647 KKASSERT(parent->data != NULL); 2648 base = &hmp->voldata.freemap_blockset.blockref[0]; 2649 count = HAMMER2_SET_COUNT; 2650 break; 2651 default: 2652 panic("hammer2_chain_create: unrecognized blockref type: %d", 2653 parent->bref.type); 2654 base = NULL; 2655 count = 0; 2656 break; 2657 } 2658 2659 /* 2660 * Make sure we've counted the brefs 2661 */ 2662 if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) 2663 hammer2_chain_countbrefs(parent, base, count); 2664 2665 KKASSERT(above->live_count >= 0 && above->live_count <= count); 2666 2667 /* 2668 * If no free blockref could be found we must create an indirect 2669 * block and move a number of blockrefs into it. 
With the parent 2670 * locked we can safely lock each child in order to delete+duplicate 2671 * it without causing a deadlock. 2672 * 2673 * This may return the new indirect block or the old parent depending 2674 * on where the key falls. NULL is returned on error. 2675 */ 2676 if (above->live_count == count) { 2677 hammer2_chain_t *nparent; 2678 2679 nparent = hammer2_chain_create_indirect(trans, parent, 2680 key, keybits, 2681 type, &error); 2682 if (nparent == NULL) { 2683 if (allocated) 2684 hammer2_chain_drop(chain); 2685 chain = NULL; 2686 goto done; 2687 } 2688 if (parent != nparent) { 2689 hammer2_chain_unlock(parent); 2690 parent = *parentp = nparent; 2691 } 2692 goto again; 2693 } 2694 2695 /* 2696 * Link the chain into its parent. Later on we will have to set 2697 * the MOVED bit in situations where we don't mark the new chain 2698 * as being modified. 2699 */ 2700 if (chain->above != NULL) 2701 panic("hammer2: hammer2_chain_create: chain already connected"); 2702 KKASSERT(chain->above == NULL); 2703 KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0); 2704 hammer2_chain_insert(above, NULL, chain, 2705 HAMMER2_CHAIN_INSERT_SPIN | 2706 HAMMER2_CHAIN_INSERT_LIVE, 2707 0); 2708 2709 if (allocated) { 2710 /* 2711 * Mark the newly created chain modified. 2712 * 2713 * Device buffers are not instantiated for DATA elements 2714 * as these are handled by logical buffers. 2715 * 2716 * Indirect and freemap node indirect blocks are handled 2717 * by hammer2_chain_create_indirect() and not by this 2718 * function. 2719 * 2720 * Data for all other bref types is expected to be 2721 * instantiated (INODE, LEAF). 2722 */ 2723 switch(chain->bref.type) { 2724 case HAMMER2_BREF_TYPE_DATA: 2725 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 2726 case HAMMER2_BREF_TYPE_INODE: 2727 hammer2_chain_modify(trans, &chain, 2728 HAMMER2_MODIFY_OPTDATA | 2729 HAMMER2_MODIFY_ASSERTNOCOPY); 2730 break; 2731 default: 2732 /* 2733 * Remaining types are not supported by this function. 
2734 * In particular, INDIRECT and LEAF_NODE types are 2735 * handled by create_indirect(). 2736 */ 2737 panic("hammer2_chain_create: bad type: %d", 2738 chain->bref.type); 2739 /* NOT REACHED */ 2740 break; 2741 } 2742 } else { 2743 /* 2744 * When reconnecting a chain we must set MOVED and setsubmod 2745 * so the flush recognizes that it must update the bref in 2746 * the parent. 2747 */ 2748 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 2749 hammer2_chain_ref(chain); 2750 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); 2751 } 2752 } 2753 hammer2_chain_setsubmod(trans, chain); 2754 2755 done: 2756 *chainp = chain; 2757 2758 return (error); 2759 } 2760 2761 /* 2762 * Replace (*chainp) with a duplicate in-memory chain structure which shares 2763 * the same core and media state as the orignal. The original *chainp is 2764 * unlocked and the replacement will be returned locked. The duplicated 2765 * chain is inserted under (*parentp). 2766 * 2767 * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION 2768 * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING 2769 * FULL. This typically means that the caller is creating the chain after 2770 * doing a hammer2_chain_lookup(). 2771 * 2772 * The old chain must be in a DELETED state unless snapshot is non-zero. 2773 * 2774 * The new chain will be live (i.e. not deleted), and modified. 2775 * 2776 * If (parent) is non-NULL then the new duplicated chain is inserted under 2777 * the parent. 2778 * 2779 * If (parent) is NULL then the newly duplicated chain is not inserted 2780 * anywhere, similar to if it had just been chain_alloc()'d (suitable for 2781 * passing into hammer2_chain_create() after this function returns). 2782 * 2783 * WARNING! This function cannot take snapshots all by itself. The caller 2784 * needs to do other massaging for snapshots. 2785 * 2786 * WARNING! This function calls create which means it can insert indirect 2787 * blocks. 
 *	    Callers may have to refactor locked chains held across
 *	    the call (other than the ones passed into the call).
 */
void
hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
			hammer2_chain_t **chainp, hammer2_blockref_t *bref,
			int snapshot, int duplicate_reason)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *ochain;
	hammer2_chain_t *nchain;
	hammer2_chain_core_t *above;
	size_t bytes;

	/*
	 * We want nchain to be our go-to live chain, but ochain may be in
	 * a MODIFIED state within the current flush synchronization segment.
	 * Force any further modifications of ochain to do another COW
	 * operation even if modify_tid indicates that one is not needed.
	 *
	 * We don't want to set FORCECOW on nchain simply as an optimization,
	 * as many duplication calls simply move chains into ichains and
	 * then delete the original.
	 *
	 * WARNING!  We should never resolve DATA to device buffers
	 *	     (XXX allow it if the caller did?), and since
	 *	     we currently do not have the logical buffer cache
	 *	     buffer in-hand to fix its cached physical offset
	 *	     we also force the modify code to not COW it. XXX
	 */
	ochain = *chainp;
	hmp = ochain->hmp;
	/* caller must have deleted ochain unless taking a snapshot */
	KKASSERT(snapshot == 1 || (ochain->flags & HAMMER2_CHAIN_DELETED));

	atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);

	/*
	 * Now create a duplicate of the chain structure, associating
	 * it with the same core, making it the same size, pointing it
	 * to the same bref (the same media block).
	 *
	 * Give the duplicate the same modify_tid that we previously
	 * ensured was sufficiently advanced to trigger a block table
	 * insertion on flush.
	 *
	 * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
	 *	 hammer2_chain_alloc()
	 */
	if (bref == NULL)
		bref = &ochain->bref;
	if (snapshot) {
		/* snapshots are allocated without a pmp association */
		nchain = hammer2_chain_alloc(hmp, NULL, trans, bref);
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SNAPSHOT);
	} else {
		nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, bref);
	}
	hammer2_chain_core_alloc(trans, nchain, ochain);
	bytes = (hammer2_off_t)1 <<
		(int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	nchain->bytes = bytes;
	nchain->modify_tid = ochain->modify_tid;
	/* inode_reason encodes the duplication call path for debugging */
	nchain->inode_reason = ochain->inode_reason + 0x100000;
	if (ochain->flags & HAMMER2_CHAIN_INITIAL)
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
	if (ochain->flags & HAMMER2_CHAIN_UNLINKED)
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_UNLINKED);

	/*
	 * Switch from ochain to nchain
	 */
	hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER |
				   HAMMER2_RESOLVE_NOREF);
	/* nchain has 1 ref */
	hammer2_chain_unlock(ochain);

	/*
	 * Place nchain in the modified state, instantiate media data
	 * if necessary.  Because modify_tid is already completely
	 * synchronized this should not result in a delete-duplicate.
	 *
	 * We want nchain at the target to look like a new insertion.
	 * Forcing the modification to be INPLACE accomplishes this
	 * because we get the same nchain with an updated modify_tid.
	 */
	if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
		hammer2_chain_modify(trans, &nchain,
				     HAMMER2_MODIFY_OPTDATA |
				     HAMMER2_MODIFY_NOREALLOC |
				     HAMMER2_MODIFY_INPLACE);
	} else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
		hammer2_chain_modify(trans, &nchain,
				     HAMMER2_MODIFY_OPTDATA |
				     HAMMER2_MODIFY_INPLACE);
	} else {
		hammer2_chain_modify(trans, &nchain,
				     HAMMER2_MODIFY_INPLACE);
	}

	/*
	 * If parent is not NULL the duplicated chain will be entered under
	 * the parent and the MOVED bit set.
	 *
	 * Having both chains locked is extremely important for atomicy.
	 */
	if (parentp && (parent = *parentp) != NULL) {
		above = parent->core;
		KKASSERT(ccms_thread_lock_owned(&above->cst));
		KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0);
		KKASSERT(parent->refs > 0);

		hammer2_chain_create(trans, parentp, &nchain,
				     nchain->bref.key, nchain->bref.keybits,
				     nchain->bref.type, nchain->bytes);
		parent = NULL;

		if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
			hammer2_chain_ref(nchain);
			atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
		}
		hammer2_chain_setsubmod(trans, nchain);
	}

#if 0
	/*
	 * Unconditionally set MOVED to force the parent blockrefs to
	 * update, and adjust update_hi below nchain so nchain's
	 * blockrefs are updated with the new attachment.
	 */
	if (nchain->core->update_hi < trans->sync_tid) {
		spin_lock(&nchain->core->cst.spin);
		if (nchain->core->update_hi < trans->sync_tid)
			nchain->core->update_hi = trans->sync_tid;
		spin_unlock(&nchain->core->cst.spin);
	}
#endif

	*chainp = nchain;
}

/*
 * Special in-place delete-duplicate sequence which does not require a
 * locked parent.  (*chainp) is marked DELETED and atomically replaced
 * with a duplicate.
 * Atomicity is at the very-fine spin-lock level in
 * order to ensure that lookups do not race us.
 *
 * If the old chain is already marked deleted the new chain will also be
 * marked deleted.  This case can occur when an inode is removed from the
 * filesystem but programs still have an open descriptor to it, and during
 * flushes when the flush needs to operate on a chain that is deleted in
 * the live view but still alive in the flush view.
 *
 * The new chain will be marked modified for the current transaction.
 */
void
hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
			       int flags)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *ochain;	/* original chain, marked DELETED */
	hammer2_chain_t *nchain;	/* replacement, returned in *chainp */
	hammer2_chain_core_t *above;	/* core of the shared parent */
	size_t bytes;

	if (hammer2_debug & 0x20000)
		Debugger("dd");

	/*
	 * Note that we do not have to call setsubmod on ochain, calling it
	 * on nchain is sufficient.
	 */
	ochain = *chainp;
	hmp = ochain->hmp;

	/* inode chains must have their media data resolved */
	if (ochain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		KKASSERT(ochain->data);
	}

	/*
	 * First create a duplicate of the chain structure.
	 * (nchain is allocated with one ref).
	 *
	 * In the case where nchain inherits ochain's core, nchain is
	 * effectively locked due to ochain being locked (and sharing the
	 * core), until we can give nchain its own official lock.
	 * HAMMER2_DELDUP_RECORE requests a fresh core instead of sharing.
	 */
	nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, &ochain->bref);
	if (flags & HAMMER2_DELDUP_RECORE)
		hammer2_chain_core_alloc(trans, nchain, NULL);
	else
		hammer2_chain_core_alloc(trans, nchain, ochain);
	above = ochain->above;

	/* size is encoded as a radix in the low bits of data_off */
	bytes = (hammer2_off_t)1 <<
		(int)(ochain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
	nchain->bytes = bytes;

	/*
	 * Duplicate inherits ochain's live state including its modification
	 * state.  This function disposes of the original.  Because we are
	 * doing this in-place under the same parent the block array
	 * inserted/deleted state does not change.
	 *
	 * The caller isn't expected to make further modifications of ochain
	 * but set the FORCECOW bit anyway, just in case it does.  If ochain
	 * was previously marked FORCECOW we also flag nchain FORCECOW
	 * (used during hardlink splits).  FORCECOW forces a reallocation
	 * of the block when we modify the chain a little later, it does
	 * not force another delete-duplicate.
	 *
	 * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
	 *	 hammer2_chain_alloc()
	 */
	nchain->data_count += ochain->data_count;
	nchain->inode_count += ochain->inode_count;
	atomic_set_int(&nchain->flags,
		       ochain->flags & (HAMMER2_CHAIN_INITIAL |
					HAMMER2_CHAIN_FORCECOW |
					HAMMER2_CHAIN_UNLINKED));
	atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
	nchain->inode_reason = ochain->inode_reason + 0x1000;

	/*
	 * Lock nchain so both chains are now locked (extremely important
	 * for atomicity).  Mark ochain deleted and reinsert into the
	 * topology and insert nchain all in one go.
	 *
	 * If the ochain is already deleted it is left alone and nchain
	 * is inserted into the topology as a deleted chain.  This is
	 * important because it allows ongoing operations to be executed
	 * on a deleted inode which still has open descriptors.
	 *
	 * The deleted case can also occur when a flush delete-duplicates
	 * a node which is being concurrently modified by ongoing operations
	 * in a later transaction.  This creates a problem because the flush
	 * is intended to update blockrefs which then propagate, allowing
	 * the original covering in-memory chains to be freed up.  In this
	 * situation the flush code does NOT free the original covering
	 * chains and will re-apply them to successive copies.
	 */
	hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
	/* extra ref still present from original allocation */

	KKASSERT(ochain->flags & HAMMER2_CHAIN_ONRBTREE);
	spin_lock(&above->cst.spin);
	KKASSERT(ochain->flags & HAMMER2_CHAIN_ONRBTREE);

	/*
	 * Ultimately nchain->modify_tid will be set to trans->sync_tid,
	 * but we can't do that here because we want to call
	 * hammer2_chain_modify() to reallocate the block (if necessary).
	 */
	nchain->modify_tid = ochain->modify_tid;

	if (ochain->flags & HAMMER2_CHAIN_DELETED) {
		/*
		 * ochain was deleted
		 */
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DELETED);
		if (ochain->delete_tid > trans->sync_tid) {
			/*
			 * delete-duplicate a chain deleted in a later
			 * transaction.  Only allowed on chains created
			 * before or during the current transaction (flush
			 * code should filter out chains created after the
			 * current transaction).
			 *
			 * To make this work is a bit of a hack.  We convert
			 * ochain's delete_tid to the current sync_tid and
			 * create a nchain which sets up ochain's original
			 * delete_tid.
			 *
			 * This effectively forces ochain to flush as a
			 * deletion and nchain as a creation.  Thus MOVED
			 * must be set in ochain (it should already be
			 * set since its original delete_tid could not
			 * have been flushed yet).  Since ochain's delete_tid
			 * has been moved down to sync_tid, a re-flush at
			 * sync_tid won't try to delete-duplicate ochain
			 * again.
			 */
			KKASSERT(ochain->modify_tid <= trans->sync_tid);
			nchain->delete_tid = ochain->delete_tid;
			ochain->delete_tid = trans->sync_tid;
			KKASSERT(ochain->flags & HAMMER2_CHAIN_MOVED);
		} else if (ochain->delete_tid == trans->sync_tid) {
			/*
			 * ochain was deleted in the current transaction
			 */
			nchain->delete_tid = trans->sync_tid;
		} else {
			/*
			 * ochain was deleted in a prior transaction.
			 * create and delete nchain in the current
			 * transaction.
			 *
			 * (delete_tid might represent a deleted inode
			 * which still has an open descriptor).
			 */
			nchain->delete_tid = trans->sync_tid;
		}
		hammer2_chain_insert(above, ochain->inlayer, nchain, 0, 0);
	} else {
		/*
		 * ochain was not deleted, delete it in the current
		 * transaction.
		 */
		KKASSERT(trans->sync_tid >= ochain->modify_tid);
		ochain->delete_tid = trans->sync_tid;
		atomic_set_int(&ochain->flags, HAMMER2_CHAIN_DELETED);
		atomic_add_int(&above->live_count, -1);
		hammer2_chain_insert(above, NULL, nchain,
				     HAMMER2_CHAIN_INSERT_LIVE, 0);
	}

	/* MOVED holds an extra ref on the chain for the flusher */
	if ((ochain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(ochain);
		atomic_set_int(&ochain->flags, HAMMER2_CHAIN_MOVED);
	}
	spin_unlock(&above->cst.spin);

	/*
	 * ochain must be unlocked because ochain and nchain might share
	 * a buffer cache buffer, so we need to release it so nchain can
	 * potentially obtain it.
	 */
	hammer2_chain_unlock(ochain);

	/*
	 * Finish fixing up nchain.  A new block will be allocated if
	 * crossing a synchronization point (meta-data only).
	 *
	 * Calling hammer2_chain_modify() will update modify_tid to
	 * (typically) trans->sync_tid.
	 */
	if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
		hammer2_chain_modify(trans, &nchain,
				     HAMMER2_MODIFY_OPTDATA |
				     HAMMER2_MODIFY_NOREALLOC |
				     HAMMER2_MODIFY_INPLACE);
	} else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
		hammer2_chain_modify(trans, &nchain,
				     HAMMER2_MODIFY_OPTDATA |
				     HAMMER2_MODIFY_INPLACE);
	} else {
		hammer2_chain_modify(trans, &nchain,
				     HAMMER2_MODIFY_INPLACE);
	}
	hammer2_chain_drop(nchain);	/* drop the allocation ref */

	/*
	 * Unconditionally set MOVED to force the parent blockrefs to
	 * update as the chain_modify() above won't necessarily do it.
	 *
	 * Adjust update_hi below nchain so nchain's blockrefs are updated
	 * with the new attachment.
	 */
	if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
		hammer2_chain_ref(nchain);
	}
#if 0
	if (nchain->core->update_hi < trans->sync_tid) {
		spin_lock(&nchain->core->cst.spin);
		if (nchain->core->update_hi < trans->sync_tid)
			nchain->core->update_hi = trans->sync_tid;
		spin_unlock(&nchain->core->cst.spin);
	}
#endif
	hammer2_chain_setsubmod(trans, nchain);
	*chainp = nchain;
}

/*
 * Create a snapshot of the specified {parent, ochain} with the specified
 * label.  The originating hammer2_inode must be exclusively locked for
 * safety.
 *
 * The ioctl code has already synced the filesystem.
3166 */ 3167 int 3168 hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_chain_t **ochainp, 3169 hammer2_ioc_pfs_t *pfs) 3170 { 3171 hammer2_mount_t *hmp; 3172 hammer2_chain_t *ochain = *ochainp; 3173 hammer2_chain_t *nchain; 3174 hammer2_inode_data_t *ipdata; 3175 hammer2_inode_t *nip; 3176 size_t name_len; 3177 hammer2_key_t lhc; 3178 struct vattr vat; 3179 uuid_t opfs_clid; 3180 int error; 3181 3182 kprintf("snapshot %s ochain->refs %d ochain->flags %08x\n", 3183 pfs->name, ochain->refs, ochain->flags); 3184 3185 name_len = strlen(pfs->name); 3186 lhc = hammer2_dirhash(pfs->name, name_len); 3187 3188 hmp = ochain->hmp; 3189 opfs_clid = ochain->data->ipdata.pfs_clid; 3190 3191 *ochainp = ochain; 3192 3193 /* 3194 * Create the snapshot directory under the super-root 3195 * 3196 * Set PFS type, generate a unique filesystem id, and generate 3197 * a cluster id. Use the same clid when snapshotting a PFS root, 3198 * which theoretically allows the snapshot to be used as part of 3199 * the same cluster (perhaps as a cache). 3200 * 3201 * Copy the (flushed) ochain's blockref array. Theoretically we 3202 * could use chain_duplicate() but it becomes difficult to disentangle 3203 * the shared core so for now just brute-force it. 
3204 */ 3205 VATTR_NULL(&vat); 3206 vat.va_type = VDIR; 3207 vat.va_mode = 0755; 3208 nchain = NULL; 3209 nip = hammer2_inode_create(trans, hmp->sroot, &vat, proc0.p_ucred, 3210 pfs->name, name_len, &nchain, &error); 3211 3212 if (nip) { 3213 ipdata = hammer2_chain_modify_ip(trans, nip, &nchain, 0); 3214 ipdata->pfs_type = HAMMER2_PFSTYPE_SNAPSHOT; 3215 kern_uuidgen(&ipdata->pfs_fsid, 1); 3216 if (ochain->flags & HAMMER2_CHAIN_PFSROOT) 3217 ipdata->pfs_clid = opfs_clid; 3218 else 3219 kern_uuidgen(&ipdata->pfs_clid, 1); 3220 atomic_set_int(&nchain->flags, HAMMER2_CHAIN_PFSROOT); 3221 ipdata->u.blockset = ochain->data->ipdata.u.blockset; 3222 3223 hammer2_inode_unlock_ex(nip, nchain); 3224 } 3225 return (error); 3226 } 3227 3228 /* 3229 * Create an indirect block that covers one or more of the elements in the 3230 * current parent. Either returns the existing parent with no locking or 3231 * ref changes or returns the new indirect block locked and referenced 3232 * and leaving the original parent lock/ref intact as well. 3233 * 3234 * If an error occurs, NULL is returned and *errorp is set to the error. 3235 * 3236 * The returned chain depends on where the specified key falls. 3237 * 3238 * The key/keybits for the indirect mode only needs to follow three rules: 3239 * 3240 * (1) That all elements underneath it fit within its key space and 3241 * 3242 * (2) That all elements outside it are outside its key space. 3243 * 3244 * (3) When creating the new indirect block any elements in the current 3245 * parent that fit within the new indirect block's keyspace must be 3246 * moved into the new indirect block. 3247 * 3248 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider 3249 * keyspace the the current parent, but lookup/iteration rules will 3250 * ensure (and must ensure) that rule (2) for all parents leading up 3251 * to the nearest inode or the root volume header is adhered to. 
This 3252 * is accomplished by always recursing through matching keyspaces in 3253 * the hammer2_chain_lookup() and hammer2_chain_next() API. 3254 * 3255 * The current implementation calculates the current worst-case keyspace by 3256 * iterating the current parent and then divides it into two halves, choosing 3257 * whichever half has the most elements (not necessarily the half containing 3258 * the requested key). 3259 * 3260 * We can also opt to use the half with the least number of elements. This 3261 * causes lower-numbered keys (aka logical file offsets) to recurse through 3262 * fewer indirect blocks and higher-numbered keys to recurse through more. 3263 * This also has the risk of not moving enough elements to the new indirect 3264 * block and being forced to create several indirect blocks before the element 3265 * can be inserted. 3266 * 3267 * Must be called with an exclusively locked parent. 3268 */ 3269 static int hammer2_chain_indkey_freemap(hammer2_chain_t *parent, 3270 hammer2_key_t *keyp, int keybits, 3271 hammer2_blockref_t *base, int count); 3272 static int hammer2_chain_indkey_normal(hammer2_chain_t *parent, 3273 hammer2_key_t *keyp, int keybits, 3274 hammer2_blockref_t *base, int count); 3275 static 3276 hammer2_chain_t * 3277 hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent, 3278 hammer2_key_t create_key, int create_bits, 3279 int for_type, int *errorp) 3280 { 3281 hammer2_mount_t *hmp; 3282 hammer2_chain_core_t *above; 3283 hammer2_chain_core_t *icore; 3284 hammer2_blockref_t *base; 3285 hammer2_blockref_t *bref; 3286 hammer2_blockref_t bcopy; 3287 hammer2_chain_t *chain; 3288 hammer2_chain_t *ichain; 3289 hammer2_chain_t dummy; 3290 hammer2_key_t key = create_key; 3291 hammer2_key_t key_beg; 3292 hammer2_key_t key_end; 3293 hammer2_key_t key_next; 3294 int keybits = create_bits; 3295 int count; 3296 int nbytes; 3297 int cache_index; 3298 int loops; 3299 int reason; 3300 int generation; 3301 int maxloops = 300000; 
3302 int retry_same; 3303 int wasdup; 3304 3305 /* 3306 * Calculate the base blockref pointer or NULL if the chain 3307 * is known to be empty. We need to calculate the array count 3308 * for RB lookups either way. 3309 */ 3310 hmp = parent->hmp; 3311 *errorp = 0; 3312 KKASSERT(ccms_thread_lock_owned(&parent->core->cst)); 3313 above = parent->core; 3314 3315 /*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/ 3316 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 3317 base = NULL; 3318 3319 switch(parent->bref.type) { 3320 case HAMMER2_BREF_TYPE_INODE: 3321 count = HAMMER2_SET_COUNT; 3322 break; 3323 case HAMMER2_BREF_TYPE_INDIRECT: 3324 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3325 count = parent->bytes / sizeof(hammer2_blockref_t); 3326 break; 3327 case HAMMER2_BREF_TYPE_VOLUME: 3328 count = HAMMER2_SET_COUNT; 3329 break; 3330 case HAMMER2_BREF_TYPE_FREEMAP: 3331 count = HAMMER2_SET_COUNT; 3332 break; 3333 default: 3334 panic("hammer2_chain_create_indirect: " 3335 "unrecognized blockref type: %d", 3336 parent->bref.type); 3337 count = 0; 3338 break; 3339 } 3340 } else { 3341 switch(parent->bref.type) { 3342 case HAMMER2_BREF_TYPE_INODE: 3343 base = &parent->data->ipdata.u.blockset.blockref[0]; 3344 count = HAMMER2_SET_COUNT; 3345 break; 3346 case HAMMER2_BREF_TYPE_INDIRECT: 3347 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 3348 base = &parent->data->npdata[0]; 3349 count = parent->bytes / sizeof(hammer2_blockref_t); 3350 break; 3351 case HAMMER2_BREF_TYPE_VOLUME: 3352 base = &hmp->voldata.sroot_blockset.blockref[0]; 3353 count = HAMMER2_SET_COUNT; 3354 break; 3355 case HAMMER2_BREF_TYPE_FREEMAP: 3356 base = &hmp->voldata.freemap_blockset.blockref[0]; 3357 count = HAMMER2_SET_COUNT; 3358 break; 3359 default: 3360 panic("hammer2_chain_create_indirect: " 3361 "unrecognized blockref type: %d", 3362 parent->bref.type); 3363 count = 0; 3364 break; 3365 } 3366 } 3367 3368 /* 3369 * dummy used in later chain allocation (no longer used for lookups). 
3370 */ 3371 bzero(&dummy, sizeof(dummy)); 3372 dummy.delete_tid = HAMMER2_MAX_TID; 3373 3374 /* 3375 * When creating an indirect block for a freemap node or leaf 3376 * the key/keybits must be fitted to static radix levels because 3377 * particular radix levels use particular reserved blocks in the 3378 * related zone. 3379 * 3380 * This routine calculates the key/radix of the indirect block 3381 * we need to create, and whether it is on the high-side or the 3382 * low-side. 3383 */ 3384 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 3385 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 3386 keybits = hammer2_chain_indkey_freemap(parent, &key, keybits, 3387 base, count); 3388 } else { 3389 keybits = hammer2_chain_indkey_normal(parent, &key, keybits, 3390 base, count); 3391 } 3392 3393 /* 3394 * Normalize the key for the radix being represented, keeping the 3395 * high bits and throwing away the low bits. 3396 */ 3397 key &= ~(((hammer2_key_t)1 << keybits) - 1); 3398 3399 /* 3400 * How big should our new indirect block be? It has to be at least 3401 * as large as its parent. 
3402 */ 3403 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) 3404 nbytes = HAMMER2_IND_BYTES_MIN; 3405 else 3406 nbytes = HAMMER2_IND_BYTES_MAX; 3407 if (nbytes < count * sizeof(hammer2_blockref_t)) 3408 nbytes = count * sizeof(hammer2_blockref_t); 3409 3410 /* 3411 * Ok, create our new indirect block 3412 */ 3413 if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 3414 for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 3415 dummy.bref.type = HAMMER2_BREF_TYPE_FREEMAP_NODE; 3416 } else { 3417 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT; 3418 } 3419 dummy.bref.key = key; 3420 dummy.bref.keybits = keybits; 3421 dummy.bref.data_off = hammer2_getradix(nbytes); 3422 dummy.bref.methods = parent->bref.methods; 3423 3424 ichain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy.bref); 3425 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL); 3426 hammer2_chain_core_alloc(trans, ichain, NULL); 3427 icore = ichain->core; 3428 hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE); 3429 hammer2_chain_drop(ichain); /* excess ref from alloc */ 3430 3431 /* 3432 * We have to mark it modified to allocate its block, but use 3433 * OPTDATA to allow it to remain in the INITIAL state. Otherwise 3434 * it won't be acted upon by the flush code. 3435 * 3436 * XXX leave the node unmodified, depend on the update_hi 3437 * flush to assign and modify parent blocks. 3438 */ 3439 hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA); 3440 3441 /* 3442 * Iterate the original parent and move the matching brefs into 3443 * the new indirect block. 3444 * 3445 * XXX handle flushes. 
3446 */ 3447 key_beg = 0; 3448 key_end = HAMMER2_MAX_KEY; 3449 cache_index = 0; 3450 spin_lock(&above->cst.spin); 3451 loops = 0; 3452 reason = 0; 3453 retry_same = 0; 3454 3455 for (;;) { 3456 if (++loops > 100000) { 3457 spin_unlock(&above->cst.spin); 3458 panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n", 3459 reason, parent, base, count, key_next); 3460 } 3461 3462 /* 3463 * NOTE: spinlock stays intact, returned chain (if not NULL) 3464 * is not referenced or locked which means that we 3465 * cannot safely check its flagged / deletion status 3466 * until we lock it. 3467 */ 3468 chain = hammer2_combined_find(parent, base, count, 3469 &cache_index, &key_next, 3470 key_beg, key_end, 3471 &bref); 3472 generation = above->generation; 3473 if (bref == NULL) 3474 break; 3475 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3476 3477 /* 3478 * Skip keys that are not within the key/radix of the new 3479 * indirect block. They stay in the parent. 3480 */ 3481 if ((~(((hammer2_key_t)1 << keybits) - 1) & 3482 (key ^ bref->key)) != 0) { 3483 goto next_key_spinlocked; 3484 } 3485 3486 /* 3487 * Load the new indirect block by acquiring the related 3488 * chains (potentially from media as it might not be 3489 * in-memory). Then move it to the new parent (ichain) 3490 * via DELETE-DUPLICATE. 3491 * 3492 * chain is referenced but not locked. We must lock the 3493 * chain to obtain definitive DUPLICATED/DELETED state 3494 */ 3495 if (chain) { 3496 /* 3497 * Use chain already present in the RBTREE 3498 */ 3499 hammer2_chain_ref(chain); 3500 wasdup = ((chain->flags & 3501 HAMMER2_CHAIN_DUPLICATED) != 0); 3502 spin_unlock(&above->cst.spin); 3503 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER | 3504 HAMMER2_RESOLVE_NOREF); 3505 } else { 3506 /* 3507 * Get chain for blockref element. _get returns NULL 3508 * on insertion race. 
3509 */ 3510 bcopy = *bref; 3511 spin_unlock(&above->cst.spin); 3512 chain = hammer2_chain_get(parent, &bcopy, generation); 3513 if (chain == NULL) { 3514 reason = 1; 3515 spin_lock(&above->cst.spin); 3516 continue; 3517 } 3518 if (bcmp(&bcopy, bref, sizeof(bcopy))) { 3519 reason = 2; 3520 hammer2_chain_drop(chain); 3521 spin_lock(&above->cst.spin); 3522 continue; 3523 } 3524 hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER | 3525 HAMMER2_RESOLVE_NOREF); 3526 wasdup = 0; 3527 } 3528 3529 /* 3530 * This is always live so if the chain has been delete- 3531 * duplicated we raced someone and we have to retry. 3532 * 3533 * NOTE: Lookups can race delete-duplicate because 3534 * delete-duplicate does not lock the parent's core 3535 * (they just use the spinlock on the core). We must 3536 * check for races by comparing the DUPLICATED flag before 3537 * releasing the spinlock with the flag after locking the 3538 * chain. 3539 * 3540 * (note reversed logic for this one) 3541 */ 3542 if (chain->flags & HAMMER2_CHAIN_DELETED) { 3543 hammer2_chain_unlock(chain); 3544 if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) && 3545 wasdup == 0) { 3546 retry_same = 1; 3547 } 3548 goto next_key; 3549 } 3550 3551 /* 3552 * Shift the chain to the indirect block. 3553 * 3554 * WARNING! Can cause held-over chains to require a refactor. 3555 * Fortunately we have none (our locked chains are 3556 * passed into and modified by the call). 
3557 */ 3558 hammer2_chain_delete(trans, chain, 0); 3559 hammer2_chain_duplicate(trans, &ichain, &chain, NULL, 0, 1); 3560 hammer2_chain_unlock(chain); 3561 KKASSERT(parent->refs > 0); 3562 chain = NULL; 3563 next_key: 3564 spin_lock(&above->cst.spin); 3565 next_key_spinlocked: 3566 if (--maxloops == 0) 3567 panic("hammer2_chain_create_indirect: maxloops"); 3568 reason = 4; 3569 if (retry_same == 0) { 3570 if (key_next == 0 || key_next > key_end) 3571 break; 3572 key_beg = key_next; 3573 } 3574 /* loop */ 3575 } 3576 spin_unlock(&above->cst.spin); 3577 3578 /* 3579 * Insert the new indirect block into the parent now that we've 3580 * cleared out some entries in the parent. We calculated a good 3581 * insertion index in the loop above (ichain->index). 3582 * 3583 * We don't have to set MOVED here because we mark ichain modified 3584 * down below (so the normal modified -> flush -> set-moved sequence 3585 * applies). 3586 * 3587 * The insertion shouldn't race as this is a completely new block 3588 * and the parent is locked. 3589 */ 3590 KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); 3591 hammer2_chain_insert(above, NULL, ichain, 3592 HAMMER2_CHAIN_INSERT_SPIN | 3593 HAMMER2_CHAIN_INSERT_LIVE, 3594 0); 3595 3596 /* 3597 * Mark the new indirect block modified after insertion, which 3598 * will propagate up through parent all the way to the root and 3599 * also allocate the physical block in ichain for our caller, 3600 * and assign ichain->data to a pre-zero'd space (because there 3601 * is not prior data to copy into it). 3602 * 3603 * We have to set update_hi in ichain's flags manually so the 3604 * flusher knows it has to recurse through it to get to all of 3605 * our moved blocks, then call setsubmod() to set the bit 3606 * recursively. 
3607 */ 3608 /*hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);*/ 3609 if (ichain->core->update_hi < trans->sync_tid) { 3610 spin_lock(&ichain->core->cst.spin); 3611 if (ichain->core->update_hi < trans->sync_tid) 3612 ichain->core->update_hi = trans->sync_tid; 3613 spin_unlock(&ichain->core->cst.spin); 3614 } 3615 hammer2_chain_setsubmod(trans, ichain); 3616 3617 /* 3618 * Figure out what to return. 3619 */ 3620 if (~(((hammer2_key_t)1 << keybits) - 1) & 3621 (create_key ^ key)) { 3622 /* 3623 * Key being created is outside the key range, 3624 * return the original parent. 3625 */ 3626 hammer2_chain_unlock(ichain); 3627 } else { 3628 /* 3629 * Otherwise its in the range, return the new parent. 3630 * (leave both the new and old parent locked). 3631 */ 3632 parent = ichain; 3633 } 3634 3635 return(parent); 3636 } 3637 3638 /* 3639 * Calculate the keybits and highside/lowside of the freemap node the 3640 * caller is creating. 3641 * 3642 * This routine will specify the next higher-level freemap key/radix 3643 * representing the lowest-ordered set. By doing so, eventually all 3644 * low-ordered sets will be moved one level down. 3645 * 3646 * We have to be careful here because the freemap reserves a limited 3647 * number of blocks for a limited number of levels. So we can't just 3648 * push indiscriminately. 
3649 */ 3650 int 3651 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp, 3652 int keybits, hammer2_blockref_t *base, int count) 3653 { 3654 hammer2_chain_core_t *above; 3655 hammer2_chain_t *chain; 3656 hammer2_blockref_t *bref; 3657 hammer2_key_t key; 3658 hammer2_key_t key_beg; 3659 hammer2_key_t key_end; 3660 hammer2_key_t key_next; 3661 int cache_index; 3662 int locount; 3663 int hicount; 3664 int maxloops = 300000; 3665 3666 key = *keyp; 3667 above = parent->core; 3668 locount = 0; 3669 hicount = 0; 3670 keybits = 64; 3671 3672 /* 3673 * Calculate the range of keys in the array being careful to skip 3674 * slots which are overridden with a deletion. 3675 */ 3676 key_beg = 0; 3677 key_end = HAMMER2_MAX_KEY; 3678 cache_index = 0; 3679 spin_lock(&above->cst.spin); 3680 3681 for (;;) { 3682 if (--maxloops == 0) { 3683 panic("indkey_freemap shit %p %p:%d\n", 3684 parent, base, count); 3685 } 3686 chain = hammer2_combined_find(parent, base, count, 3687 &cache_index, &key_next, 3688 key_beg, key_end, &bref); 3689 3690 /* 3691 * Exhausted search 3692 */ 3693 if (bref == NULL) 3694 break; 3695 3696 /* 3697 * NOTE: No need to check DUPLICATED here because we do 3698 * not release the spinlock. 3699 */ 3700 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3701 if (key_next == 0 || key_next > key_end) 3702 break; 3703 key_beg = key_next; 3704 continue; 3705 } 3706 3707 /* 3708 * Use the full live (not deleted) element for the scan 3709 * iteration. HAMMER2 does not allow partial replacements. 3710 * 3711 * XXX should be built into hammer2_combined_find(). 
3712 */ 3713 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3714 3715 if (keybits > bref->keybits) { 3716 key = bref->key; 3717 keybits = bref->keybits; 3718 } else if (keybits == bref->keybits && bref->key < key) { 3719 key = bref->key; 3720 } 3721 if (key_next == 0) 3722 break; 3723 key_beg = key_next; 3724 } 3725 spin_unlock(&above->cst.spin); 3726 3727 /* 3728 * Return the keybits for a higher-level FREEMAP_NODE covering 3729 * this node. 3730 */ 3731 switch(keybits) { 3732 case HAMMER2_FREEMAP_LEVEL0_RADIX: 3733 keybits = HAMMER2_FREEMAP_LEVEL1_RADIX; 3734 break; 3735 case HAMMER2_FREEMAP_LEVEL1_RADIX: 3736 keybits = HAMMER2_FREEMAP_LEVEL2_RADIX; 3737 break; 3738 case HAMMER2_FREEMAP_LEVEL2_RADIX: 3739 keybits = HAMMER2_FREEMAP_LEVEL3_RADIX; 3740 break; 3741 case HAMMER2_FREEMAP_LEVEL3_RADIX: 3742 keybits = HAMMER2_FREEMAP_LEVEL4_RADIX; 3743 break; 3744 case HAMMER2_FREEMAP_LEVEL4_RADIX: 3745 panic("hammer2_chain_indkey_freemap: level too high"); 3746 break; 3747 default: 3748 panic("hammer2_chain_indkey_freemap: bad radix"); 3749 break; 3750 } 3751 *keyp = key; 3752 3753 return (keybits); 3754 } 3755 3756 /* 3757 * Calculate the keybits and highside/lowside of the indirect block the 3758 * caller is creating. 3759 */ 3760 static int 3761 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp, 3762 int keybits, hammer2_blockref_t *base, int count) 3763 { 3764 hammer2_chain_core_t *above; 3765 hammer2_blockref_t *bref; 3766 hammer2_chain_t *chain; 3767 hammer2_key_t key_beg; 3768 hammer2_key_t key_end; 3769 hammer2_key_t key_next; 3770 hammer2_key_t key; 3771 int nkeybits; 3772 int locount; 3773 int hicount; 3774 int cache_index; 3775 int maxloops = 300000; 3776 3777 key = *keyp; 3778 above = parent->core; 3779 locount = 0; 3780 hicount = 0; 3781 3782 /* 3783 * Calculate the range of keys in the array being careful to skip 3784 * slots which are overridden with a deletion. 
Once the scan 3785 * completes we will cut the key range in half and shift half the 3786 * range into the new indirect block. 3787 */ 3788 key_beg = 0; 3789 key_end = HAMMER2_MAX_KEY; 3790 cache_index = 0; 3791 spin_lock(&above->cst.spin); 3792 3793 for (;;) { 3794 if (--maxloops == 0) { 3795 panic("indkey_freemap shit %p %p:%d\n", 3796 parent, base, count); 3797 } 3798 chain = hammer2_combined_find(parent, base, count, 3799 &cache_index, &key_next, 3800 key_beg, key_end, &bref); 3801 3802 /* 3803 * Exhausted search 3804 */ 3805 if (bref == NULL) 3806 break; 3807 3808 /* 3809 * NOTE: No need to check DUPLICATED here because we do 3810 * not release the spinlock. 3811 */ 3812 if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) { 3813 if (key_next == 0 || key_next > key_end) 3814 break; 3815 key_beg = key_next; 3816 continue; 3817 } 3818 3819 /* 3820 * Use the full live (not deleted) element for the scan 3821 * iteration. HAMMER2 does not allow partial replacements. 3822 * 3823 * XXX should be built into hammer2_combined_find(). 3824 */ 3825 key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); 3826 3827 /* 3828 * Expand our calculated key range (key, keybits) to fit 3829 * the scanned key. nkeybits represents the full range 3830 * that we will later cut in half (two halves @ nkeybits - 1). 3831 */ 3832 nkeybits = keybits; 3833 if (nkeybits < bref->keybits) { 3834 if (bref->keybits > 64) { 3835 kprintf("bad bref chain %p bref %p\n", 3836 chain, bref); 3837 Debugger("fubar"); 3838 } 3839 nkeybits = bref->keybits; 3840 } 3841 while (nkeybits < 64 && 3842 (~(((hammer2_key_t)1 << nkeybits) - 1) & 3843 (key ^ bref->key)) != 0) { 3844 ++nkeybits; 3845 } 3846 3847 /* 3848 * If the new key range is larger we have to determine 3849 * which side of the new key range the existing keys fall 3850 * under by checking the high bit, then collapsing the 3851 * locount into the hicount or vise-versa. 
3852 */ 3853 if (keybits != nkeybits) { 3854 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 3855 hicount += locount; 3856 locount = 0; 3857 } else { 3858 locount += hicount; 3859 hicount = 0; 3860 } 3861 keybits = nkeybits; 3862 } 3863 3864 /* 3865 * The newly scanned key will be in the lower half or the 3866 * upper half of the (new) key range. 3867 */ 3868 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 3869 ++hicount; 3870 else 3871 ++locount; 3872 3873 if (key_next == 0) 3874 break; 3875 key_beg = key_next; 3876 } 3877 spin_unlock(&above->cst.spin); 3878 bref = NULL; /* now invalid (safety) */ 3879 3880 /* 3881 * Adjust keybits to represent half of the full range calculated 3882 * above (radix 63 max) 3883 */ 3884 --keybits; 3885 3886 /* 3887 * Select whichever half contains the most elements. Theoretically 3888 * we can select either side as long as it contains at least one 3889 * element (in order to ensure that a free slot is present to hold 3890 * the indirect block). 3891 */ 3892 if (hammer2_indirect_optimize) { 3893 /* 3894 * Insert node for least number of keys, this will arrange 3895 * the first few blocks of a large file or the first few 3896 * inodes in a directory with fewer indirect blocks when 3897 * created linearly. 3898 */ 3899 if (hicount < locount && hicount != 0) 3900 key |= (hammer2_key_t)1 << keybits; 3901 else 3902 key &= ~(hammer2_key_t)1 << keybits; 3903 } else { 3904 /* 3905 * Insert node for most number of keys, best for heavily 3906 * fragmented files. 3907 */ 3908 if (hicount > locount) 3909 key |= (hammer2_key_t)1 << keybits; 3910 else 3911 key &= ~(hammer2_key_t)1 << keybits; 3912 } 3913 *keyp = key; 3914 3915 return (keybits); 3916 } 3917 3918 /* 3919 * Sets CHAIN_DELETED and CHAIN_MOVED in the chain being deleted and 3920 * set chain->delete_tid. 
 * The chain is not actually marked possibly-free
 * in the freemap until the deletion is completely flushed out (because
 * a flush which doesn't cover the entire deletion is flushing the deleted
 * chain as if it were live).
 *
 * This function does NOT generate a modification to the parent.  It
 * would be nearly impossible to figure out which parent to modify anyway.
 * Such modifications are handled top-down by the flush code and are
 * properly merged using the flush synchronization point.
 *
 * The find/get code will properly overload the RBTREE check on top of
 * the bref check to detect deleted entries.
 *
 * This function is NOT recursive.  Any entity already pushed into the
 * chain (such as an inode) may still need visibility into its contents,
 * as well as the ability to read and modify the contents.  For example,
 * for an unlinked file which is still open.
 *
 * NOTE: This function does NOT set chain->modify_tid, allowing future
 *	 code to distinguish between live and deleted chains by testing
 *	 trans->sync_tid vs chain->modify_tid and chain->delete_tid.
 *
 * NOTE: Deletions normally do not occur in the middle of a duplication
 *	 chain but we use a trick for hardlink migration that refactors
 *	 the originating inode without deleting it, so we make no assumptions
 *	 here.
 */
void
hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
{
	/* (flags is not referenced by the current implementation) */
	KKASSERT(ccms_thread_lock_owned(&chain->core->cst));

	/*
	 * Nothing to do if already marked.
	 */
	if (chain->flags & HAMMER2_CHAIN_DELETED)
		return;

	/*
	 * The setting of DELETED causes finds, lookups, and _next iterations
	 * to no longer recognize the chain.  RB_SCAN()s will still have
	 * visibility (needed for flush serialization points).
	 *
	 * We need the spinlock on the core whose RBTREE contains chain
	 * to protect against races.
	 */
	KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
	spin_lock(&chain->above->cst.spin);

	KKASSERT(trans->sync_tid >= chain->modify_tid);
	chain->delete_tid = trans->sync_tid;
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	atomic_add_int(&chain->above->live_count, -1);
	++chain->above->generation;	/* invalidate concurrent scans */

	/*
	 * We must set MOVED along with DELETED for the flush code to
	 * recognize the operation and properly disconnect the chain
	 * in-memory.  MOVED holds an extra chain ref.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	spin_unlock(&chain->above->cst.spin);

	hammer2_chain_setsubmod(trans, chain);
}

/*
 * Called with the core spinlock held to check for freeable layers.
 * Used by the flush code.  Layers can wind up not being freed due
 * to the temporary layer->refs count.  This function frees up any
 * layers that were missed.
 */
void
hammer2_chain_layer_check_locked(hammer2_mount_t *hmp,
				 hammer2_chain_core_t *core)
{
	hammer2_chain_layer_t *layer;
	hammer2_chain_layer_t *tmp;

	tmp = TAILQ_FIRST(&core->layerq);
	while ((layer = tmp) != NULL) {
		tmp = TAILQ_NEXT(tmp, entry);
		if (layer->refs == 0 && RB_EMPTY(&layer->rbtree)) {
			TAILQ_REMOVE(&core->layerq, layer, entry);
			/*
			 * Pin the next layer with a temporary ref so it
			 * cannot be freed while we drop the spinlock to
			 * kfree() the current one.
			 */
			if (tmp)
				++tmp->refs;
			spin_unlock(&core->cst.spin);
			kfree(layer, hmp->mchain);
			spin_lock(&core->cst.spin);
			if (tmp)
				--tmp->refs;
		}
	}
}

/*
 * Returns the index of the nearest element in the blockref array >= elm.
 * Returns (count) if no element could be found.
 * Sets *key_nextp to the next key for loop purposes but does not modify
 * it if the next key would be higher than the current value of *key_nextp.
 * Note that *key_nextp can overflow to 0, which should be tested by the
 * caller.
 *
 * (*cache_indexp) is a heuristic and can be any value without affecting
 * the result.
 *
 * The spin lock on the related chain must be held.
 */
int
hammer2_base_find(hammer2_chain_t *chain,
		  hammer2_blockref_t *base, int count,
		  int *cache_indexp, hammer2_key_t *key_nextp,
		  hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_chain_core_t *core = chain->core;
	hammer2_blockref_t *scan;
	hammer2_key_t scan_end;
	int i;
	int limit;

	/*
	 * Require that live chains already have their brefs counted
	 * (HAMMER2_CORE_COUNTEDBREFS) so the core->live_zero optimization
	 * below is valid.
	 */
	KKASSERT((chain->flags & HAMMER2_CHAIN_DUPLICATED) ||
		 core->flags & HAMMER2_CORE_COUNTEDBREFS);

	/*
	 * Degenerate case
	 */
	if (count == 0 || base == NULL)
		return(count);

	/*
	 * Sequential optimization using *cache_indexp.  This is the most
	 * likely scenario.
	 *
	 * We can avoid trailing empty entries on live chains, otherwise
	 * we might have to check the whole block array.
	 */
	i = *cache_indexp;
	cpu_ccfence();
	if (chain->flags & HAMMER2_CHAIN_DUPLICATED)
		limit = count;
	else
		limit = core->live_zero;
	if (i >= limit)
		i = limit - 1;
	if (i < 0)
		i = 0;
	KKASSERT(i < count);

	/*
	 * Search backwards from the cached index to the first element
	 * whose key is <= key_beg (empty elements force further backup).
	 */
	scan = &base[i];
	while (i > 0 && (scan->type == 0 || scan->key > key_beg)) {
		--scan;
		--i;
	}
	*cache_indexp = i;

	/*
	 * Search forwards, stop when we find a scan element which
	 * encloses the key or until we know that there are no further
	 * elements.
	 */
	while (i < count) {
		if (scan->type != 0) {
			if (scan->key > key_beg)
				break;
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits) - 1;
			if (scan_end >= key_beg)
				break;
		}
		if (i >= limit)
			return (count);
		++scan;
		++i;
	}
	if (i != count) {
		*cache_indexp = i;
		if (i >= limit) {
			/* ran past the live area, treat as exhausted */
			i = count;
		} else {
			/*
			 * Ratchet *key_nextp down to just past this
			 * element's key range (may wrap to 0, caller
			 * must check).
			 */
			scan_end = scan->key +
				   ((hammer2_key_t)1 << scan->keybits);
			if (scan_end && (*key_nextp > scan_end ||
					 *key_nextp == 0)) {
				*key_nextp = scan_end;
			}
		}
	}
	return (i);
}

/*
 * Do a combined search and return the next match either from the blockref
 * array or from the in-memory chain.  Sets *bresp to the returned bref in
 * both cases, or sets it to NULL if the search exhausted.  Only returns
 * a non-NULL chain if the search matched from the in-memory chain.
 *
 * Must be called with above's spinlock held.  Spinlock remains held
 * through the operation.
 *
 * The returned chain is not locked or referenced.  Use the returned bref
 * to determine if the search exhausted or not.
 */
static hammer2_chain_t *
hammer2_combined_find(hammer2_chain_t *parent,
		      hammer2_blockref_t *base, int count,
		      int *cache_indexp, hammer2_key_t *key_nextp,
		      hammer2_key_t key_beg, hammer2_key_t key_end,
		      hammer2_blockref_t **bresp)
{
	hammer2_blockref_t *bref;
	hammer2_chain_t *chain;
	int i;

	/*
	 * Run the media block-array search and the in-memory chain search
	 * over the same key range.
	 */
	*key_nextp = key_end + 1;
	i = hammer2_base_find(parent, base, count, cache_indexp,
			      key_nextp, key_beg, key_end);
	chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);

	/*
	 * Neither matched
	 */
	if (i == count && chain == NULL) {
		*bresp = NULL;
		return(chain);	/* NULL */
	}

	/*
	 * Only chain matched
	 */
	if (i == count) {
		bref = &chain->bref;
		goto found;
	}

	/*
	 * Only blockref matched.
	 */
	if (chain == NULL) {
		bref = &base[i];
		goto found;
	}

	/*
	 * Both in-memory and blockref match.  Select the nearer element.
	 * If both are flush with the left-hand side they are considered
	 * to be the same distance.
	 *
	 * When both are the same distance away select the chain if it is
	 * live or if its delete_tid is greater than the parent's
	 * synchronized bref.mirror_tid (a single test suffices for both
	 * conditions), otherwise select the element.
	 *
	 * (It is possible for an old deletion to linger after a rename-over
	 * and flush, which would make the media copy the correct choice).
	 */

	/*
	 * Either both are flush with the left-hand side or they are the
	 * same distance away.  Select the chain if it is not deleted
	 * or it has a higher delete_tid, else select the media.
	 */
	if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
	    chain->bref.key == base[i].key) {
		if (chain->delete_tid > base[i].mirror_tid) {
			bref = &chain->bref;
		} else {
			KKASSERT(chain->flags & HAMMER2_CHAIN_DELETED);
			bref = &base[i];
			chain = NULL;
		}
		goto found;
	}

	/*
	 * Select the nearer key.
	 */
	if (chain->bref.key < base[i].key) {
		bref = &chain->bref;
	} else {
		bref = &base[i];
		chain = NULL;
	}

	/*
	 * If the bref is out of bounds we've exhausted our search.
	 */
found:
	if (bref->key > key_end) {
		*bresp = NULL;
		chain = NULL;
	} else {
		*bresp = bref;
	}
	return(chain);
}

/*
 * Locate the specified block array element and delete it.  The element
 * must exist.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    int *cache_indexp, hammer2_chain_t *child)
{
	hammer2_blockref_t *elm = &child->bref;
	hammer2_chain_core_t *core = parent->core;
	hammer2_key_t key_next;
	int i;

	/*
	 * Delete element.  Expect the element to exist.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 * re-flushed in some cases.
	 */
	key_next = 0; /* max range */
	i = hammer2_base_find(parent, base, count, cache_indexp,
			      &key_next, elm->key, elm->key);

	/*
	 * The element must match exactly (key and radix) or the block
	 * array is inconsistent with the in-memory topology.
	 */
	if (i == count || base[i].type == 0 ||
	    base[i].key != elm->key || base[i].keybits != elm->keybits) {
		panic("delete base %p element not found at %d/%d elm %p\n",
		      base, i, count, elm);
		return;
	}
	bzero(&base[i], sizeof(*base));

	/*
	 * Leave breadcrumbs in the zero'd element for post-mortem
	 * debugging (these fields are dead once type == 0).
	 */
	base[i].mirror_tid = (intptr_t)parent;		/* debug */
	base[i].modify_tid = (intptr_t)child;		/* debug */
	base[i].check.debug.sync_tid = trans->sync_tid;	/* debug */

	/*
	 * We can only optimize core->live_zero for live chains.  If we
	 * deleted the last live element, back live_zero up past any
	 * trailing empty elements.
	 */
	if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
		if (core->live_zero == i + 1) {
			while (--i >= 0 && base[i].type == 0)
				;
			core->live_zero = i + 1;
		}
	}
}

/*
 * Insert the specified element.  The block array must not already have the
 * element and must have space available for the insertion.
 *
 * The spin lock on the related chain must be held.
 *
 * NOTE: live_count was adjusted when the chain was deleted, so it does not
 *	 need to be adjusted when we commit the media change.
 */
void
hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
		    hammer2_blockref_t *base, int count,
		    int *cache_indexp, hammer2_chain_t *child)
{
	hammer2_blockref_t *elm = &child->bref;
	hammer2_chain_core_t *core = parent->core;
	hammer2_key_t key_next;
	hammer2_key_t xkey;
	int i;
	int j;
	int k;
	int l;
	int u = 1;	/* which insertion path was taken, for validate panic */

	/*
	 * Insert new element.  Expect the element to not already exist
	 * unless we are replacing it.
	 *
	 * XXX see caller, flush code not yet sophisticated enough to prevent
	 * re-flushed in some cases.
			 */
			if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
				if (core->live_zero <= k)
					core->live_zero = k + 1;
			}
			u = 2;	/* after-slot shift path */
			goto validate;
		}
	}
	panic("hammer2_base_insert: no room!");

	/*
	 * Debugging check: verify the array remains sorted with no
	 * overlapping key ranges after the insertion.
	 */
validate:
	key_next = 0;
	for (l = 0; l < count; ++l) {
		if (base[l].type) {
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;
			break;
		}
	}
	while (++l < count) {
		if (base[l].type) {
			if (base[l].key <= key_next)
				panic("base_insert %d %d,%d,%d fail %p:%d", u, i, j, k, base, l);
			key_next = base[l].key +
				   ((hammer2_key_t)1 << base[l].keybits) - 1;

		}
	}

}

#if 0

/*
 * Sort the blockref array for the chain.  Used by the flush code to
 * sort the blockref[] array.
 *
 * The chain must be exclusively locked AND spin-locked.
 */
typedef hammer2_blockref_t *hammer2_blockref_p;

/*
 * kqsort comparator: empty elements sort to the end, otherwise
 * ascending by key.
 */
static
int
hammer2_base_sort_callback(const void *v1, const void *v2)
{
	hammer2_blockref_p bref1 = *(const hammer2_blockref_p *)v1;
	hammer2_blockref_p bref2 = *(const hammer2_blockref_p *)v2;

	/*
	 * Make sure empty elements are placed at the end of the array
	 */
	if (bref1->type == 0) {
		if (bref2->type == 0)
			return(0);
		return(1);
	} else if (bref2->type == 0) {
		return(-1);
	}

	/*
	 * Sort by key
	 */
	if (bref1->key < bref2->key)
		return(-1);
	if (bref1->key > bref2->key)
		return(1);
	return(0);
}

void
hammer2_base_sort(hammer2_chain_t *chain)
{
	hammer2_blockref_t *base;
	int count;

	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
4447 * 4448 * This is only applicable to regular files and softlinks. 4449 */ 4450 if (chain->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) 4451 return; 4452 base = &chain->data->ipdata.u.blockset.blockref[0]; 4453 count = HAMMER2_SET_COUNT; 4454 break; 4455 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 4456 case HAMMER2_BREF_TYPE_INDIRECT: 4457 /* 4458 * Optimize indirect blocks in the INITIAL state to avoid 4459 * I/O. 4460 */ 4461 KKASSERT((chain->flags & HAMMER2_CHAIN_INITIAL) == 0); 4462 base = &chain->data->npdata[0]; 4463 count = chain->bytes / sizeof(hammer2_blockref_t); 4464 break; 4465 case HAMMER2_BREF_TYPE_VOLUME: 4466 base = &chain->hmp->voldata.sroot_blockset.blockref[0]; 4467 count = HAMMER2_SET_COUNT; 4468 break; 4469 case HAMMER2_BREF_TYPE_FREEMAP: 4470 base = &chain->hmp->voldata.freemap_blockset.blockref[0]; 4471 count = HAMMER2_SET_COUNT; 4472 break; 4473 default: 4474 panic("hammer2_chain_lookup: unrecognized blockref type: %d", 4475 chain->bref.type); 4476 base = NULL; /* safety */ 4477 count = 0; /* safety */ 4478 } 4479 kqsort(base, count, sizeof(*base), hammer2_base_sort_callback); 4480 } 4481 4482 #endif 4483 4484 /* 4485 * Chain memory management 4486 */ 4487 void 4488 hammer2_chain_wait(hammer2_chain_t *chain) 4489 { 4490 tsleep(chain, 0, "chnflw", 1); 4491 } 4492 4493 /* 4494 * Manage excessive memory resource use for chain and related 4495 * structures. 4496 */ 4497 void 4498 hammer2_chain_memory_wait(hammer2_pfsmount_t *pmp) 4499 { 4500 long waiting; 4501 long count; 4502 long limit; 4503 #if 0 4504 static int zzticks; 4505 #endif 4506 4507 /* 4508 * Atomic check condition and wait. Also do an early speedup of 4509 * the syncer to try to avoid hitting the wait. 
4510 */ 4511 for (;;) { 4512 waiting = pmp->inmem_dirty_chains; 4513 cpu_ccfence(); 4514 count = waiting & HAMMER2_DIRTYCHAIN_MASK; 4515 4516 limit = pmp->mp->mnt_nvnodelistsize / 10; 4517 if (limit < hammer2_limit_dirty_chains) 4518 limit = hammer2_limit_dirty_chains; 4519 if (limit < 1000) 4520 limit = 1000; 4521 4522 #if 0 4523 if ((int)(ticks - zzticks) > hz) { 4524 zzticks = ticks; 4525 kprintf("count %ld %ld\n", count, limit); 4526 } 4527 #endif 4528 4529 /* 4530 * Block if there are too many dirty chains present, wait 4531 * for the flush to clean some out. 4532 */ 4533 if (count > limit) { 4534 tsleep_interlock(&pmp->inmem_dirty_chains, 0); 4535 if (atomic_cmpset_long(&pmp->inmem_dirty_chains, 4536 waiting, 4537 waiting | HAMMER2_DIRTYCHAIN_WAITING)) { 4538 speedup_syncer(pmp->mp); 4539 tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED, 4540 "chnmem", hz); 4541 } 4542 continue; /* loop on success or fail */ 4543 } 4544 4545 /* 4546 * Try to start an early flush before we are forced to block. 
4547 */ 4548 if (count > limit * 7 / 10) 4549 speedup_syncer(pmp->mp); 4550 break; 4551 } 4552 } 4553 4554 void 4555 hammer2_chain_memory_inc(hammer2_pfsmount_t *pmp) 4556 { 4557 if (pmp) 4558 atomic_add_long(&pmp->inmem_dirty_chains, 1); 4559 } 4560 4561 void 4562 hammer2_chain_memory_wakeup(hammer2_pfsmount_t *pmp) 4563 { 4564 long waiting; 4565 4566 if (pmp == NULL) 4567 return; 4568 4569 for (;;) { 4570 waiting = pmp->inmem_dirty_chains; 4571 cpu_ccfence(); 4572 if (atomic_cmpset_long(&pmp->inmem_dirty_chains, 4573 waiting, 4574 (waiting - 1) & 4575 ~HAMMER2_DIRTYCHAIN_WAITING)) { 4576 break; 4577 } 4578 } 4579 if (waiting & HAMMER2_DIRTYCHAIN_WAITING) 4580 wakeup(&pmp->inmem_dirty_chains); 4581 } 4582 4583 static 4584 void 4585 adjreadcounter(hammer2_blockref_t *bref, size_t bytes) 4586 { 4587 long *counterp; 4588 4589 switch(bref->type) { 4590 case HAMMER2_BREF_TYPE_DATA: 4591 counterp = &hammer2_iod_file_read; 4592 break; 4593 case HAMMER2_BREF_TYPE_INODE: 4594 counterp = &hammer2_iod_meta_read; 4595 break; 4596 case HAMMER2_BREF_TYPE_INDIRECT: 4597 counterp = &hammer2_iod_indr_read; 4598 break; 4599 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 4600 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 4601 counterp = &hammer2_iod_fmap_read; 4602 break; 4603 default: 4604 counterp = &hammer2_iod_volu_read; 4605 break; 4606 } 4607 *counterp += bytes; 4608 } 4609