1 /* 2 * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 /* 37 * TRANSACTION AND FLUSH HANDLING 38 * 39 * Deceptively simple but actually fairly difficult to implement properly is 40 * how I would describe it. 
41 * 42 * The biggest problem is that each PFS may belong to a cluster so its 43 * media modify_tid and mirror_tid fields are in a completely different 44 * domain than the topology related to the super-root. Most of the code 45 * operates using modify_xid and delete_xid which are local identifiers. 46 * 47 * The second biggest problem is that we really want to allow flushes to run 48 * concurrently with new front-end operations, which means that the in-memory 49 * topology of hammer2_chain structures can represent both current state and 50 * snapshot-for-flush state. 51 */ 52 53 #include <sys/cdefs.h> 54 #include <sys/param.h> 55 #include <sys/systm.h> 56 #include <sys/types.h> 57 #include <sys/lock.h> 58 #include <sys/uuid.h> 59 60 #include "hammer2.h" 61 62 #define FLUSH_DEBUG 0 63 64 /* 65 * Recursively flush the specified chain. The chain is locked and 66 * referenced by the caller and will remain so on return. The chain 67 * will remain referenced throughout but can temporarily lose its 68 * lock during the recursion to avoid unnecessarily stalling user 69 * processes. 
 */
struct hammer2_flush_info {
	hammer2_chain_t *parent;	/* current parent whose children are
					 * being scanned by the flush */
	hammer2_trans_t *trans;		/* governing flush transaction */
	int		depth;		/* recursion depth, checked against
					 * HAMMER2_FLUSH_DEPTH_LIMIT */
	int		diddeferral;	/* non-zero if a too-deep chain was
					 * pushed onto flush_list */
	int		cache_index;	/* initialized to -1 by hammer2_flush()
					 * (semantics not visible here) */
	int		domodify;	/* 0x01 block table updates needed,
					 * 0x02 this is the last parent (child
					 * flag cleanup) -- see flush_core */
	struct h2_flush_deferral_list flush_list; /* deferred deep chains */
	hammer2_xid_t	sync_xid;	/* memory synchronization point */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

/*
 * Flush helper passes (see hammer2_flush_core):
 *
 *	pass1 - recursive child flush (can defer deep chains)
 *	pass2 - block table deletions
 *	pass3 - block table insertions
 *	pass4 - child flag/queue cleanup (deletions)
 *	pass5 - child flag/queue cleanup (insertions)
 */
static void hammer2_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t **chainp, int deleting);
static int hammer2_flush_pass1(hammer2_chain_t *child, void *data);
static int hammer2_flush_pass2(hammer2_chain_t *child, void *data);
static int hammer2_flush_pass3(hammer2_chain_t *child, void *data);
static int hammer2_flush_pass4(hammer2_chain_t *child, void *data);
static int hammer2_flush_pass5(hammer2_chain_t *child, void *data);
static void hammer2_rollup_stats(hammer2_chain_t *parent,
				hammer2_chain_t *child, int how);


/*
 * Can we ignore a chain for the purposes of flushing modifications
 * to the media?
 *
 * This code is now degenerate.  We used to have to distinguish between
 * deleted chains and deleted chains associated with inodes that were
 * still open.  This mechanic has been fixed so the function is now
 * a simple test: a chain is ignorable if it was deleted at or before
 * the flush synchronization point.
 */
static __inline
int
h2ignore_deleted(hammer2_flush_info_t *info, hammer2_chain_t *chain)
{
	return (chain->delete_xid <= info->sync_xid);
}

#if 0
/* Dead code retained for reference: per-flush inode/data statistics. */
static __inline
void
hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
		    int how)
{
	hammer2_key_t bytes;

	if (bref->type != 0) {
		bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
		if (bref->type == HAMMER2_BREF_TYPE_INODE)
			info->inode_count += how;
		if (how < 0)
			info->data_count -= bytes;
		else
			info->data_count += bytes;
	}
}
#endif

/*
 * For now use a global transaction manager.  What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's.  So we can't just stuff tmanage into hammer2_mount or
 * hammer2_pfsmount.
 */
static hammer2_trans_manage_t	tmanage;

/*
 * One-time initialization of the global transaction manager: set up the
 * serializing lock, the transaction queue, and the initial XIDs
 * (flush_xid = 1, alloc_xid = 2).
 */
void
hammer2_trans_manage_init(void)
{
	lockinit(&tmanage.translk, "h2trans", 0, 0);
	TAILQ_INIT(&tmanage.transq);
	tmanage.flush_xid = 1;
	tmanage.alloc_xid = tmanage.flush_xid + 1;
}

/*
 * Allocate a new transaction id from the global manager.  The atomic
 * fetch-add makes this safe without holding translk; the loop retries on
 * wrap so that 0 is never returned (presumably 0 is reserved as an
 * invalid XID -- NOTE(review): confirm against hammer2.h).
 */
hammer2_xid_t
hammer2_trans_newxid(hammer2_pfsmount_t *pmp __unused)
{
	hammer2_xid_t xid;

	for (;;) {
		xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1);
		if (xid)
			break;
	}
	return xid;
}

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 * media target.  The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations.  We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;

	tman = &tmanage;

	/* caller-supplied trans structure starts zeroed */
	bzero(trans, sizeof(*trans));
	trans->pmp = pmp;
	trans->flags = flags;
	trans->td = curthread;

	lockmgr(&tman->translk, LK_EXCLUSIVE);

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If multiple flushes are trying to run we have to
		 * wait until it is our turn.  All flushes are serialized.
		 *
		 * We queue ourselves and then wait to become the head
		 * of the queue, allowing all prior flushes to complete.
		 *
		 * Multiple normal transactions can share the current
		 * transaction id but a flush transaction needs its own
		 * unique TID for proper block table update accounting.
		 *
		 * NOTE(review): alloc_tid is bumped both before and after
		 * assigning flush_tid; presumably this reserves a TID on
		 * each side of the flush -- confirm against the design doc.
		 */
		++tman->flushcnt;
		++pmp->alloc_tid;
		pmp->flush_tid = pmp->alloc_tid;
		tman->flush_xid = hammer2_trans_newxid(pmp);
		trans->sync_xid = tman->flush_xid;
		++pmp->alloc_tid;
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		if (TAILQ_FIRST(&tman->transq) != trans) {
			trans->blocked = 1;
			while (trans->blocked) {
				/* woken by hammer2_trans_done() */
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	} else if (tman->flushcnt == 0) {
		/*
		 * No flushes are pending, we can go.  Use prior flush_xid + 1.
		 *
		 * WARNING!  Also see hammer2_chain_setsubmod()
		 */
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		trans->sync_xid = tman->flush_xid + 1;

		/* XXX improve/optimize inode allocation */
	} else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
		/*
		 * A buffer cache transaction is requested while a flush
		 * is in progress.  The flush's PREFLUSH flag must be set
		 * in this situation.
		 *
		 * The buffer cache flush takes on the main flush's
		 * transaction id.
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
		trans->flags |= HAMMER2_TRANS_PREFLUSH;
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;
		/* not allowed to block */
	} else {
		/*
		 * A normal transaction is requested while a flush is in
		 * progress.  We insert after the current flush and may
		 * block.
		 *
		 * WARNING!  Also see hammer2_chain_setsubmod()
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid + 1;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;

		/*
		 * XXX for now we must block new transactions, synchronous
		 * flush mode is on by default.
		 *
		 * If synchronous flush mode is enabled concurrent
		 * frontend transactions during the flush are not
		 * allowed (except we don't have a choice for buffer
		 * cache ops).
		 */
		if (hammer2_synchronous_flush > 0 ||
		    TAILQ_FIRST(&tman->transq) != head) {
			trans->blocked = 1;
			while (trans->blocked) {
				/* woken by hammer2_trans_done() */
				lksleep(&trans->sync_xid,
					&tman->translk, 0,
					"h2multf", hz);
			}
		}
	}
	if (flags & HAMMER2_TRANS_NEWINODE) {
		if (pmp->spmp_hmp) {
			/*
			 * Super-root transaction, all new inodes have an
			 * inode number of 1.  Normal pfs inode cache
			 * semantics are not used.
			 */
			trans->inode_tid = 1;
		} else {
			/*
			 * Normal transaction, allocate the next pfs-local
			 * inode number (clamped to HAMMER2_INODE_START).
			 */
			if (pmp->inode_tid < HAMMER2_INODE_START)
				pmp->inode_tid = HAMMER2_INODE_START;
			trans->inode_tid = pmp->inode_tid++;
		}
	}

	lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * This may only be called while in a flush transaction.  It's a bit of a
 * hack but after flushing a PFS we need to flush each volume root as part
 * of the same transaction.
 *
 * Re-points the transaction at the super-root pmp and reserves a fresh
 * flush_tid on it, mirroring the alloc_tid accounting done for a flush
 * in hammer2_trans_init().
 */
void
hammer2_trans_spmp(hammer2_trans_t *trans, hammer2_pfsmount_t *spmp)
{
	++spmp->alloc_tid;
	spmp->flush_tid = spmp->alloc_tid;
	++spmp->alloc_tid;
	trans->pmp = spmp;
}


/*
 * Terminate a transaction: remove it from the queue and unblock any
 * transactions that were waiting on it.
 */
void
hammer2_trans_done(hammer2_trans_t *trans)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;
	hammer2_trans_t *scan;

	tman = &tmanage;

	/*
	 * Remove.
	 */
	lockmgr(&tman->translk, LK_EXCLUSIVE);
	TAILQ_REMOVE(&tman->transq, trans, entry);
	head = TAILQ_FIRST(&tman->transq);

	/*
	 * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
	 * up through the next flush.  (If the head is a flush then we
	 * stop there, unlike the unblock code following this section).
	 */
	if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
		--tman->flushcnt;
		scan = head;
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			atomic_clear_int(&scan->flags,
					 HAMMER2_TRANS_CONCURRENT);
			scan = TAILQ_NEXT(scan, entry);
		}
	}

	/*
	 * Unblock the head of the queue and any additional transactions
	 * up to the next flush.  The head can be a flush and it will be
	 * unblocked along with the non-flush transactions following it
	 * (which are allowed to run concurrently with it).
	 *
	 * In synchronous flush mode we stop if the head transaction is
	 * a flush.
	 */
	if (head && head->blocked) {
		head->blocked = 0;
		wakeup(&head->sync_xid);

		if (hammer2_synchronous_flush > 0)
			scan = head;
		else
			scan = TAILQ_NEXT(head, entry);
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			if (scan->blocked) {
				scan->blocked = 0;
				wakeup(&scan->sync_xid);
			}
			scan = TAILQ_NEXT(scan, entry);
		}
	}
	lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications and
 * mirror_tid updates back up as needed.  Since we are recursing downward
 * we do not have to deal with the complexities of multi-homed chains (chains
 * with multiple parents).
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose modify_xid values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  If a flush
 * occurred, the chain's FLUSH_CREATE and/or FLUSH_DELETE bit will be set
 * indicating that its parent (which is not part of the flush) should be
 * updated.  The chain may be replaced by the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
{
	hammer2_chain_t *chain = *chainp;
	hammer2_chain_t *scan;
	hammer2_chain_core_t *core;
	hammer2_flush_info_t info;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flush_list);
	info.trans = trans;
	info.sync_xid = trans->sync_xid;
	info.cache_index = -1;

	/* NOTE(review): 'core' is assigned here but never read below */
	core = chain->core;

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	loops = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave the FLUSH_* bits set for these chains, which
		 * will be handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flush_list, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE: hammer2_flush() may replace scan.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_drop(scan);	/* ref from deferral */
			hammer2_flush(trans, &scan);
			hammer2_chain_unlock(scan);
		}

		/*
		 * [re]flush chain.
		 */
		info.diddeferral = 0;
		hammer2_flush_core(&info, &chain, 0);

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flush_list))
			break;

		if (++loops % 1000 == 0) {
			kprintf("hammer2_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
	hammer2_chain_drop(chain);
	*chainp = chain;
}

/*
 * This is the core of the chain flushing code.
 * The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the FLUSH_CREATE and FLUSH_DELETE bits to determine what action must
 * be taken on the parent.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not.  fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree).  This can create deferrals.
 *     A successful flush clears the MODIFIED bit in the children.
 *
 * (4) Recursively flush deleted children (dbtree).  Deletions may be
 *     considered 'live' if the delete_tid is beyond the flush_tid.  If
 *     considered 'dead' the recursion is still needed in order to clean
 *     up the chain.  This can create deferrals.
 *
 *     A successful flush clears the MODIFIED bit in the children.
 *
 * (5) Calculate block table updates on chain based on the children scans
 *     in (3) and (4) by testing the FLUSH_CREATE and FLUSH_DELETE bits,
 *     modifying chain if necessary to perform the block table updates.
 *     Deletions must be removed from dbtree when removed from the
 *     chain's block table.
 *
 *     If 'chain' itself is marked DELETED but treated as live, the block
 *     table update(s) must be propagated to all contemporary chains.  In
 *     fact, all contemporary chains must be locked and updated uninterrupted
 *     to avoid lookup races.  Once MODIFIED and FLUSH_CREATE is cleared,
 *     a chain can be unloaded from memory with the expectation that it can
 *     be reloaded later via the block table at any time.
 *
 * WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid and blockref.mirror_tid are consistent only within a
 * PFS.  This is why we cannot cache sync_tid in the transaction structure.
 * Instead we access it from the pmp.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
		   int deleting)
{
	hammer2_chain_t *chain = *chainp;
	hammer2_chain_t *saved_parent;
	hammer2_mount_t *hmp;
	hammer2_pfsmount_t *pmp;
	hammer2_chain_core_t *core;
	int diddeferral;
	int saved_domodify;

	hmp = chain->hmp;
	pmp = chain->pmp;
	core = chain->core;
	diddeferral = info->diddeferral;

	/*
	 * (1) Check if we even have any work to do.
	 *
	 * This bit of code is capable of short-cutting entire sub-trees
	 * if they have not been touched or if they have already been
	 * flushed.
	 */
	if (/*(chain->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&*/
	    (chain->update_xlo >= info->sync_xid ||	/* already synced */
	     chain->update_xlo >= chain->update_xhi)) {	/* old/unchanged */
		/* update_xlo/_xhi already filters chain out, do not update */
		/* don't update bref.mirror_tid, pass2 is not called */
		return;
	}

	/*
	 * mirror_tid should not be forward-indexed
	 */
	KKASSERT(chain->bref.mirror_tid <= pmp->flush_tid);

	/*
	 * Ignore chains modified beyond the current flush point.  These
	 * will be treated as if they did not exist.  Subchains with lower
	 * modify_xid's will still be accessible via other parents.
	 *
	 * Do not update bref.mirror_tid here, it will interfere with
	 * synchronization.  e.g. inode flush tid 1, concurrent D-D tid 2,
	 * then later on inode flush tid 2.  If we were to set mirror_tid
	 * to 1 during inode flush tid 1 the blockrefs would only be partially
	 * updated (and likely panic).
	 *
	 * We must update chain->update_xlo here to prevent re-entry in this
	 * flush transaction.
	 *
	 * (vchain and fchain are exceptions since they cannot be duplicated)
	 */
	if (chain->modify_xid > info->sync_xid &&
	    chain != &hmp->fchain && chain != &hmp->vchain) {
		/* do not update bref.mirror_tid, pass2 ignores chain */
		/* chain->update_xlo = info->sync_xid; */
		return;
	}

	/*
	 * (2) Recurse downward and check recursion depth.
	 * (3) Flush live children
	 * (4) Flush deleted children
	 *
	 * We adjust update_xlo if not deferring chain to prevent re-entry
	 * in this flush cycle, but it must be set AFTER the flush in case
	 * a deeper flush hits the chain.  Otherwise the deeper flush cannot
	 * complete.  We re-check the condition after finishing the flushes.
	 *
	 * update_xhi was already checked and prevents initial recursions on
	 * subtrees which have not been modified.
	 *
	 * Save and reset parent/domodify; restored at 'done'.
	 */
	saved_parent = info->parent;
	saved_domodify = info->domodify;
	info->parent = chain;
	info->domodify = 0;

	if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
		++info->diddeferral;
	} else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
		/*
		 * NOTE(review): the DEFERRED re-test below is redundant --
		 * the outer 'if' already established the bit is clear.
		 */
		if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
			hammer2_chain_ref(chain);
			TAILQ_INSERT_TAIL(&info->flush_list,
					  chain, flush_node);
			atomic_set_int(&chain->flags,
				       HAMMER2_CHAIN_DEFERRED);
		}
		++info->diddeferral;
	} else {
		hammer2_chain_t *scan;

		/*
		 * The flush is queue-agnostic when running pass1, but order
		 * is important to catch any races where an existing
		 * flush-visible child is moved from rbtree->dbtree/dbq.
		 *
		 * New children added by concurrent operations are not visible
		 * to the flush anyway so we don't care about those races.
		 * However, the flush itself can move a child from dbq to
		 * dbtree (rare in pass1 but it is possible).
		 *
		 * pass1 can handle re-execution of a child.
		 */
		spin_lock(&core->cst.spin);
		KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
		RB_SCAN(hammer2_chain_tree, &core->rbtree,
			NULL, hammer2_flush_pass1, info);
		RB_SCAN(hammer2_chain_tree, &core->dbtree,
			NULL, hammer2_flush_pass1, info);
		/*
		 * dbq walk restarts from the head whenever pass1 pulls the
		 * current element off the queue (ONDBQ cleared).
		 */
		scan = TAILQ_FIRST(&core->dbq);
		while (scan) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
			hammer2_flush_pass1(scan, info);
			if (scan->flags & HAMMER2_CHAIN_ONDBQ)
				scan = TAILQ_NEXT(scan, db_entry);
			else
				scan = TAILQ_FIRST(&core->dbq);
		}
		spin_unlock(&core->cst.spin);
	}

	/*
	 * Stop if deferred, do not update update_xlo.
	 */
	if (info->diddeferral) {
		goto done;
	}

	/*
	 * If a block table update is needed place the parent in a modified
	 * state, which might delete-duplicate it.
	 *
	 * - To prevent loops and other confusion, we synchronize update_xlo
	 *   for the original chain.
	 *
	 * - The original parent will not be used by the flush so we can
	 *   clear its MODIFIED bit.
	 */
	if (info->domodify) {
		hammer2_chain_modify(info->trans, &info->parent, 0);
		if (info->parent != chain) {
			/*
			 * chain		- old
			 * info->parent		- new
			 *
			 * NOTE: bref.mirror_tid cannot be updated
			 *	 unless MODIFIED is cleared or already
			 *	 clear.
			 */
			chain->inode_reason += 0x10000000;
			info->parent->inode_reason += 0x100;
			KKASSERT(info->parent->core == chain->core);
			if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_MODIFIED);
				hammer2_pfs_memory_wakeup(pmp);
				hammer2_chain_drop(chain);
			}
#if 0
			if (chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) {
				atomic_clear_int(&chain->flags,
						HAMMER2_CHAIN_FLUSH_CREATE);
				hammer2_chain_drop(chain);
			}
			if (info->parent->flags & HAMMER2_CHAIN_FLUSH_DELETE) {
				atomic_clear_int(&info->parent->flags,
						HAMMER2_CHAIN_FLUSH_DELETE);
				hammer2_chain_drop(info->parent);
			}
#endif
			if (chain->update_xlo < info->sync_xid)
				chain->update_xlo = info->sync_xid;
			KKASSERT(info->parent->update_xlo < info->sync_xid);
			hammer2_chain_drop(chain);
			hammer2_chain_ref(info->parent);
		}
		chain = info->parent;
	}

	/*
	 * If a blocktable update is needed determine if this is the last
	 * parent requiring modification (check all parents using the core).
	 *
	 * Set bit 1 (0x02) of domodify if this is the last parent,
	 * which will cause scan2 to clear FLUSH_CREATE and FLUSH_DELETE.
	 */
	if (1) {
		hammer2_chain_t *scan;

		spin_lock(&core->cst.spin);
		TAILQ_FOREACH(scan, &core->ownerq, core_entry) {
			/*
			 * Ignore the current parent being processed (we do
			 * not adjust update_xlo until after the fixup).
			 */
			if (scan == chain)
				continue;

			/*
			 * Ignore chains which have already been updated
			 * Ignore unmodified chains (lo >= hi).
			 */
			if ((scan->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
			    (scan->update_xlo >= info->sync_xid ||
			     scan->update_xlo >= scan->update_xhi)) {
				continue;
			}

			/*
			 * Cannot exhaust all parents if one is not visible
			 * to the flush.  The root chains are special-cased
			 * because they cannot really be delete-duplicated.
			 */
			if (scan != &scan->hmp->fchain &&
			    scan != &scan->hmp->vchain &&
			    scan->modify_xid > info->sync_xid) {
				break;
			}

			/*
			 * Fail if update_xlo has not been synchronized to
			 * at least our sync_xid on any modified parent chain.
			 */
			if (scan->update_xlo < info->sync_xid)
				break;
		}
		spin_unlock(&core->cst.spin);
		/* scan == NULL means the FOREACH ran to completion */
		if (scan == NULL)
			info->domodify |= 2;
	}

	/*
	 * (5) Calculate block table updates or child cleanups.
	 *     (this whole operation has to be atomic)
	 *
	 * domodify 0x01 - block table updates
	 * 	    0x02 - child cleanups
	 *
	 *	pass2 - Process deletions from dbtree and dbq.
	 *	pass3 - Process insertions from rbtree, dbtree, and dbq.
	 *	pass4 - Cleanup child flags on the last parent and
	 *		Adjust queues on the live parent (deletions).
	 *	pass5 - Cleanup child flags on the last parent and
	 *		Adjust queues on the live parent (insertions).
	 *
	 *	Queue adjustments had to be separated into deletions and
	 *	insertions because both can occur on dbtree.
	 */
	if (info->domodify) {
		hammer2_chain_t *scan;

		spin_lock(&core->cst.spin);

		while ((info->domodify & 1) && info->parent) {
			/* PASS2 - Deletions */
			RB_SCAN(hammer2_chain_tree, &core->rbtree,
				NULL, hammer2_flush_pass2, info);
			RB_SCAN(hammer2_chain_tree, &core->dbtree,
				NULL, hammer2_flush_pass2, info);
			/*
			 * NOTE(review): this TAILQ_FIRST assignment is dead
			 * code -- the TAILQ_FOREACH below re-initializes
			 * scan (contrast with the PASS4/PASS5 loops which
			 * use it).
			 */
			scan = TAILQ_FIRST(&core->dbq);
			TAILQ_FOREACH(scan, &core->dbq, db_entry) {
				KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
				hammer2_flush_pass2(scan, info);
			}

			/* PASS3 - Insertions */
			RB_SCAN(hammer2_chain_tree, &core->rbtree,
				NULL, hammer2_flush_pass3, info);
			RB_SCAN(hammer2_chain_tree, &core->dbtree,
				NULL, hammer2_flush_pass3, info);
			TAILQ_FOREACH(scan, &core->dbq, db_entry) {
				KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
				hammer2_flush_pass3(scan, info);
			}
			info->parent = TAILQ_NEXT(info->parent, core_entry);
			if (info->parent)
				kprintf("FLUSH SPECIAL UPDATE (%p) %p.%d %08x\n",
					chain, info->parent,
					info->parent->bref.type,
					info->parent->flags);
		}
		info->parent = chain;

		/* PASS4 - Cleanup */
		RB_SCAN(hammer2_chain_tree, &core->rbtree,
			NULL, hammer2_flush_pass4, info);
		scan = TAILQ_FIRST(&core->dbq);
		while (scan) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
			hammer2_flush_pass4(scan, info);
			/* restart from head if pass4 pulled scan off dbq */
			if (scan->flags & HAMMER2_CHAIN_ONDBQ)
				scan = TAILQ_NEXT(scan, db_entry);
			else
				scan = TAILQ_FIRST(&core->dbq);
		}
		RB_SCAN(hammer2_chain_tree, &core->dbtree,
			NULL, hammer2_flush_pass4, info);

		/* PASS5 - Cleanup */
		RB_SCAN(hammer2_chain_tree, &core->rbtree,
			NULL, hammer2_flush_pass5, info);
		scan = TAILQ_FIRST(&core->dbq);
		while (scan) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
			hammer2_flush_pass5(scan, info);
			/* restart from head if pass5 pulled scan off dbq */
			if (scan->flags & HAMMER2_CHAIN_ONDBQ)
				scan = TAILQ_NEXT(scan, db_entry);
			else
				scan = TAILQ_FIRST(&core->dbq);
		}
		RB_SCAN(hammer2_chain_tree, &core->dbtree,
			NULL, hammer2_flush_pass5, info);

		spin_unlock(&core->cst.spin);
	}

	/*
	 * Synchronize update_xlo to prevent reentrant block updates of this
	 * parent.
	 */
	chain->update_xlo = info->sync_xid;

	/*
	 * Skip the flush if the chain was not placed in a modified state
	 * or was not already in a modified state.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
		goto done;

	/*
	 * FLUSH THE CHAIN (on the way back up the recursion)
	 *
	 * Chain is now deterministically being flushed and not being deferred.
	 * We've finished running the recursion and the blockref update.
	 *
	 * update bref.mirror_tid.  update_xlo has already been updated.
	 */
	chain->bref.mirror_tid = pmp->flush_tid;

	/*
	 * Dispose of the modified bit.  FLUSH_CREATE should already be
	 * set.
	 */
	KKASSERT((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) ||
		 chain == &hmp->vchain);
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
	hammer2_pfs_memory_wakeup(pmp);

	if ((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) ||
	    chain == &hmp->vchain ||
	    chain == &hmp->fchain) {
		/*
		 * Drop the ref from the MODIFIED bit we cleared,
		 * net -1 ref.
		 */
		hammer2_chain_drop(chain);
	} else {
		/*
		 * Drop the ref from the MODIFIED bit we cleared and
		 * set a ref for the FLUSH_CREATE bit we are setting.
		 * Net 0 refs.
		 */
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_CREATE);
	}

	/*
	 * Skip the actual flush operation if the chain has been deleted
	 * in our flush view.  There will be no block table entry that
	 * references it.
	 */
	if (h2ignore_deleted(info, chain))
		goto done;

	/*
	 * Issue flush.
	 *
	 * A DELETED node that reaches this point must be flushed for
	 * synchronization point consistency.
	 *
	 * Update bref.mirror_tid, clear MODIFIED, and set MOVED.
	 *
	 * The caller will update the parent's reference to this chain
	 * by testing MOVED as long as the modification was in-bounds.
	 *
	 * MOVED is never set on the volume root as there is no parent
	 * to adjust.
	 */
	if (hammer2_debug & 0x1000) {
		kprintf("Flush %p.%d %016jx/%d sync_xid=%08x data=%016jx\n",
			chain, chain->bref.type,
			chain->bref.key, chain->bref.keybits,
			info->sync_xid, chain->bref.data_off);
	}
	if (hammer2_debug & 0x2000) {
		Debugger("Flush hell");
	}

	/*
	 * If this is part of a recursive flush we can go ahead and write
	 * out the buffer cache buffer and pass a new bref back up the chain
	 * via the MOVED bit.
	 *
	 * Volume headers are NOT flushed here as they require special
	 * processing.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_FREEMAP:
		KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
		hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		/*
		 * The free block table is flushed by hammer2_vfs_sync()
		 * before it flushes vchain.  We must still hold fchain
		 * locked while copying voldata to volsync, however.
		 */
		hammer2_voldata_lock(hmp);
		hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
#if 0
		if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
		    hmp->voldata.freemap_tid < info->trans->sync_tid) {
			/* this will modify vchain as a side effect */
			hammer2_chain_t *tmp = &hmp->fchain;
			hammer2_chain_flush(info->trans, &tmp);
			KKASSERT(tmp == &hmp->fchain);
		}
#endif

		/*
		 * There is no parent to our root vchain and fchain to
		 * synchronize the bref to, their updated mirror_tid's
		 * must be synchronized to the volume header.
		 */
		hmp->voldata.mirror_tid = chain->bref.mirror_tid;
		hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
		/*
		 * NOTE(review): %016llx here vs %016jx used elsewhere in
		 * this file for the same field -- confirm the intended
		 * format for hammer2_tid_t.
		 */
		kprintf("mirror_tid %016llx\n", chain->bref.mirror_tid);

		/*
		 * The volume header is flushed manually by the syncer, not
		 * here.  All we do here is adjust the crc's.
		 */
		KKASSERT(chain->data != NULL);
		KKASSERT(chain->dio == NULL);

		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC1_OFF,
				HAMMER2_VOLUME_ICRC1_SIZE);
		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC0_OFF,
				HAMMER2_VOLUME_ICRC0_SIZE);
		hmp->voldata.icrc_volheader =
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRCVH_OFF,
				HAMMER2_VOLUME_ICRCVH_SIZE);
		hmp->volsync = hmp->voldata;
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
		hammer2_chain_unlock(&hmp->fchain);
		hammer2_voldata_unlock(hmp);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		/*
		 * Data elements have already been flushed via the logical
		 * file buffer cache.  Their hash was set in the bref by
		 * the vop_write code.
		 *
		 * Make sure any device buffer(s) have been flushed out here.
		 * (there aren't usually any to flush).
		 */
		break;
#if 0
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Indirect blocks may be in an INITIAL state.  Use the
		 * chain_lock() call to ensure that the buffer has been
		 * instantiated (even though it is already locked the buffer
		 * might not have been instantiated).
		 *
		 * Only write the buffer out if it is dirty, it is possible
		 * the operating system had already written out the buffer.
		 */
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
		KKASSERT(chain->dio != NULL);

		chain->data = NULL;
		hammer2_io_bqrelse(&chain->dio);
		hammer2_chain_unlock(chain);
		break;
#endif
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
		break;
	case HAMMER2_BREF_TYPE_INODE:
		if (chain->data->ipdata.op_flags & HAMMER2_OPFLAG_PFSROOT) {
			/*
			 * non-NULL pmp if mounted as a PFS.  We must sync
			 * fields cached in the pmp.
			 */
			hammer2_inode_data_t *ipdata;

			ipdata = &chain->data->ipdata;
			ipdata->pfs_inum = pmp->inode_tid;
		} else {
			/* can't be mounted as a PFS */
			KKASSERT((chain->flags & HAMMER2_CHAIN_PFSROOT) == 0);
		}
		KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
		break;
	default:
		KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
		panic("hammer2_flush_core: unsupported embedded bref %d",
		      chain->bref.type);
		/* NOT REACHED */
	}

	/*
	 * Final cleanup after flush
	 */
done:
	KKASSERT(chain->refs > 1);
	info->domodify = saved_domodify;
	info->parent = saved_parent;
	*chainp = chain;

	KKASSERT(chain->bref.mirror_tid <= chain->pmp->flush_tid);
}

/*
 * Flush helper pass1 (recursive)
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_tid.  Set info->domodify if the child's blockref must propagate
 * back up to the parent.
 *
 * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 * flush scan order prevents any chains from being lost.  A child can be
 * executed more than once (update_xlo is used to prevent infinite recursions).
 *
 * WARNING!
If we do not call hammer2_flush_core() we must update 1078 * bref.mirror_tid ourselves to indicate that the flush has 1079 * processed the child. 1080 * 1081 * WARNING! parent->core spinlock is held on entry and return. 1082 * 1083 * WARNING! Flushes do not cross PFS boundaries. Specifically, a flush must 1084 * not cross a pfs-root boundary. 1085 */ 1086 static int 1087 hammer2_flush_pass1(hammer2_chain_t *child, void *data) 1088 { 1089 hammer2_flush_info_t *info = data; 1090 hammer2_trans_t *trans = info->trans; 1091 hammer2_chain_t *parent = info->parent; 1092 1093 /* 1094 * Child modified in a later transactions, nothing to flush in this 1095 * transaction. 1096 * 1097 * Remember that modifications generally delete-duplicate so if the 1098 * sub-tree is dirty another child will get us there. But not this 1099 * one. 1100 * 1101 * (child can never be fchain or vchain so a special check isn't 1102 * needed). 1103 */ 1104 if (child->modify_xid > trans->sync_xid) { 1105 KKASSERT(child->delete_xid >= child->modify_xid); 1106 /*child->update_xlo = info->sync_xid;*/ 1107 /* do not update mirror_tid, pass2 will ignore chain */ 1108 return (0); 1109 } 1110 1111 /* 1112 * We must ref the child before unlocking the spinlock. 1113 * 1114 * The caller has added a ref to the parent so we can temporarily 1115 * unlock it in order to lock the child. 1116 */ 1117 hammer2_chain_ref(child); 1118 spin_unlock(&parent->core->cst.spin); 1119 1120 hammer2_chain_unlock(parent); 1121 hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE); 1122 1123 /* 1124 * Never recurse across a mounted PFS boundary. 1125 * 1126 * Recurse and collect deferral data. We only recursively sync 1127 * (basically) if update_xlo has not been updated, indicating that 1128 * the child has not already been processed. 
1129 */ 1130 if ((child->flags & HAMMER2_CHAIN_PFSBOUNDARY) == 0 || 1131 child->pmp == NULL) { 1132 if ((child->flags & HAMMER2_CHAIN_MODIFIED) || 1133 (child->update_xlo < info->sync_xid && 1134 child->update_xlo < child->update_xhi)) { 1135 ++info->depth; 1136 hammer2_flush_core(info, &child, 0); /* XXX deleting */ 1137 --info->depth; 1138 } 1139 } 1140 1141 /* 1142 * Determine if domodify should be set. Do not otherwise adjust 1143 * the child or pass2 will get confused. 1144 * 1145 * Insertion: 1146 * - child is flagged as possibly needing block table insertion. 1147 * - child not deleted or deletion is beyond transaction id 1148 * - child created beyond parent synchronization point 1149 * - parent not deleted as-of this transaction 1150 */ 1151 if ((child->flags & HAMMER2_CHAIN_FLUSH_CREATE) && 1152 child->delete_xid > trans->sync_xid && 1153 child->modify_xid > parent->update_xlo && 1154 parent->delete_xid > trans->sync_xid) { 1155 info->domodify = 1; 1156 } 1157 1158 /* 1159 * Removal: 1160 * - child is flagged as possibly needing block table removal. 
1161 * - child deleted before or during this transaction 1162 * - child created prior or during parent synchronization point 1163 * - parent not yet synchronized to child deletion 1164 * - parent not deleted as-of this transaction 1165 */ 1166 if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) && 1167 child->delete_xid <= trans->sync_xid && 1168 child->modify_xid <= parent->update_xlo && 1169 child->delete_xid > parent->update_xlo && 1170 parent->delete_xid > trans->sync_xid) { 1171 info->domodify = 1; 1172 } 1173 1174 /* 1175 * Relock to continue the loop 1176 */ 1177 hammer2_chain_unlock(child); 1178 hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE); 1179 hammer2_chain_drop(child); 1180 KKASSERT(info->parent == parent); 1181 1182 spin_lock(&parent->core->cst.spin); 1183 return (0); 1184 } 1185 1186 /* 1187 * PASS2 - BLOCKTABLE DELETIONS 1188 */ 1189 static int 1190 hammer2_flush_pass2(hammer2_chain_t *child, void *data) 1191 { 1192 hammer2_flush_info_t *info = data; 1193 hammer2_chain_t *parent = info->parent; 1194 hammer2_mount_t *hmp = child->hmp; 1195 hammer2_trans_t *trans = info->trans; 1196 hammer2_blockref_t *base; 1197 int count; 1198 1199 /* 1200 * Prefilter - Ignore children not flagged as needing a parent 1201 * blocktable update. 1202 */ 1203 if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) == 0) 1204 return (0); 1205 1206 /* 1207 * Prefilter - Ignore children created after our flush_tid (not 1208 * visible to our flush). 1209 */ 1210 if (child->modify_xid > trans->sync_xid) { 1211 KKASSERT(child->delete_xid >= child->modify_xid); 1212 return 0; 1213 } 1214 1215 /* 1216 * Prefilter - Don't bother updating the blockrefs for a deleted 1217 * parent (from the flush's perspective). Otherwise, 1218 * we need to be COUNTEDBREFS synchronized for the 1219 * hammer2_base_*() functions. 1220 * 1221 * NOTE: This test must match the similar one in flush_core. 
1222 */ 1223 if (h2ignore_deleted(info, parent)) 1224 return 0; 1225 1226 /* 1227 * Calculate blockmap pointer 1228 */ 1229 switch(parent->bref.type) { 1230 case HAMMER2_BREF_TYPE_INODE: 1231 /* 1232 * Access the inode's block array. However, there is no 1233 * block array if the inode is flagged DIRECTDATA. The 1234 * DIRECTDATA case typicaly only occurs when a hardlink has 1235 * been shifted up the tree and the original inode gets 1236 * replaced with an OBJTYPE_HARDLINK placeholding inode. 1237 */ 1238 if (parent->data && 1239 (parent->data->ipdata.op_flags & 1240 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 1241 base = &parent->data->ipdata.u.blockset.blockref[0]; 1242 } else { 1243 base = NULL; 1244 } 1245 count = HAMMER2_SET_COUNT; 1246 break; 1247 case HAMMER2_BREF_TYPE_INDIRECT: 1248 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1249 if (parent->data) 1250 base = &parent->data->npdata[0]; 1251 else 1252 base = NULL; 1253 count = parent->bytes / sizeof(hammer2_blockref_t); 1254 break; 1255 case HAMMER2_BREF_TYPE_VOLUME: 1256 base = &hmp->voldata.sroot_blockset.blockref[0]; 1257 count = HAMMER2_SET_COUNT; 1258 break; 1259 case HAMMER2_BREF_TYPE_FREEMAP: 1260 base = &parent->data->npdata[0]; 1261 count = HAMMER2_SET_COUNT; 1262 break; 1263 default: 1264 base = NULL; 1265 count = 0; 1266 panic("hammer2_flush_pass2: unrecognized blockref type: %d", 1267 parent->bref.type); 1268 } 1269 1270 /* 1271 * Removal 1272 * - child is flagged for removal 1273 * - child deleted before or during this transaction 1274 * - child created prior or during parent synchronization point 1275 * - parent not yet synchronized to child's deletion 1276 */ 1277 if (child->delete_xid <= trans->sync_xid && 1278 child->modify_xid <= parent->update_xlo && 1279 child->delete_xid > parent->update_xlo) { 1280 /* can't assert BMAPPED because state adjustment may occur 1281 * before we are done, and BMAPPED only applies to the live 1282 * parent. 
1283 *KKASSERT(child->flags & HAMMER2_CHAIN_BMAPPED);*/ 1284 if (base) { 1285 hammer2_rollup_stats(parent, child, -1); 1286 hammer2_base_delete(trans, parent, base, count, 1287 &info->cache_index, child); 1288 } 1289 } 1290 1291 return 0; 1292 } 1293 1294 /* 1295 * PASS3 - BLOCKTABLE INSERTIONS 1296 */ 1297 static int 1298 hammer2_flush_pass3(hammer2_chain_t *child, void *data) 1299 { 1300 hammer2_flush_info_t *info = data; 1301 hammer2_chain_t *parent = info->parent; 1302 hammer2_mount_t *hmp = child->hmp; 1303 hammer2_trans_t *trans = info->trans; 1304 hammer2_blockref_t *base; 1305 int count; 1306 1307 /* 1308 * Prefilter - Ignore children not flagged as needing a parent 1309 * blocktable update. 1310 */ 1311 if ((child->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0) 1312 return (0); 1313 1314 /* 1315 * Prefilter - Ignore children created after our flush_tid (not 1316 * visible to our flush). 1317 */ 1318 if (child->modify_xid > trans->sync_xid) { 1319 KKASSERT(child->delete_xid >= child->modify_xid); 1320 return 0; 1321 } 1322 1323 /* 1324 * Prefilter - Don't bother updating the blockrefs for a deleted 1325 * parent (from the flush's perspective). Otherwise, 1326 * we need to be COUNTEDBREFS synchronized for the 1327 * hammer2_base_*() functions. 1328 * 1329 * NOTE: This test must match the similar one in flush_core. 1330 */ 1331 if (h2ignore_deleted(info, parent)) 1332 return 0; 1333 1334 /* 1335 * Calculate blockmap pointer 1336 */ 1337 switch(parent->bref.type) { 1338 case HAMMER2_BREF_TYPE_INODE: 1339 /* 1340 * Access the inode's block array. However, there is no 1341 * block array if the inode is flagged DIRECTDATA. The 1342 * DIRECTDATA case typicaly only occurs when a hardlink has 1343 * been shifted up the tree and the original inode gets 1344 * replaced with an OBJTYPE_HARDLINK placeholding inode. 
1345 */ 1346 if (parent->data && 1347 (parent->data->ipdata.op_flags & 1348 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 1349 base = &parent->data->ipdata.u.blockset.blockref[0]; 1350 } else { 1351 base = NULL; 1352 } 1353 count = HAMMER2_SET_COUNT; 1354 break; 1355 case HAMMER2_BREF_TYPE_INDIRECT: 1356 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1357 if (parent->data) 1358 base = &parent->data->npdata[0]; 1359 else 1360 base = NULL; 1361 count = parent->bytes / sizeof(hammer2_blockref_t); 1362 break; 1363 case HAMMER2_BREF_TYPE_VOLUME: 1364 base = &hmp->voldata.sroot_blockset.blockref[0]; 1365 count = HAMMER2_SET_COUNT; 1366 break; 1367 case HAMMER2_BREF_TYPE_FREEMAP: 1368 base = &parent->data->npdata[0]; 1369 count = HAMMER2_SET_COUNT; 1370 break; 1371 default: 1372 base = NULL; 1373 count = 0; 1374 panic("hammer2_flush_pass3: " 1375 "unrecognized blockref type: %d", 1376 parent->bref.type); 1377 } 1378 1379 /* 1380 * Insertion 1381 * - child is flagged as possibly needing block table insertion. 1382 * - child not deleted or deletion is beyond transaction id 1383 * - child created beyond parent synchronization point 1384 */ 1385 if (child->delete_xid > trans->sync_xid && 1386 child->modify_xid > parent->update_xlo) { 1387 if (base) { 1388 hammer2_rollup_stats(parent, child, 1); 1389 hammer2_base_insert(trans, parent, base, count, 1390 &info->cache_index, child); 1391 } 1392 } 1393 1394 return 0; 1395 } 1396 1397 /* 1398 * PASS4 - CLEANUP CHILDREN (non-recursive, but CAN be re-entrant) 1399 * 1400 * Adjust queues and set or clear BMAPPED appropriately if processing 1401 * the live parent. pass4 handles deletions, pass5 handles insertions. 1402 * Separate passes are required because both deletions and insertions can 1403 * occur on dbtree. 1404 * 1405 * Cleanup FLUSH_CREATE/FLUSH_DELETE on the last parent. 
1406 */ 1407 static int 1408 hammer2_flush_pass4(hammer2_chain_t *child, void *data) 1409 { 1410 hammer2_flush_info_t *info = data; 1411 hammer2_chain_t *parent = info->parent; 1412 hammer2_chain_core_t *above = child->above; 1413 hammer2_trans_t *trans = info->trans; 1414 1415 /* 1416 * Prefilter - Ignore children created after our flush_tid (not 1417 * visible to our flush). 1418 */ 1419 if (child->modify_xid > trans->sync_xid) { 1420 KKASSERT(child->delete_xid >= child->modify_xid); 1421 return 0; 1422 } 1423 1424 /* 1425 * Ref and lock child for operation, spinlock must be temporarily 1426 * Make sure child is referenced before we unlock. 1427 */ 1428 hammer2_chain_ref(child); 1429 spin_unlock(&above->cst.spin); 1430 hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER); 1431 KKASSERT(child->above == above); 1432 KKASSERT(parent->core == above); 1433 1434 /* 1435 * Adjust BMAPPED state and rbtree/queue only when we hit the 1436 * actual live parent. 1437 */ 1438 if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) { 1439 spin_lock(&above->cst.spin); 1440 1441 /* 1442 * Deleting from blockmap, move child out of dbtree 1443 * and clear BMAPPED. Child should not be on RBTREE. 
1444 */ 1445 if (child->delete_xid <= trans->sync_xid && 1446 child->modify_xid <= parent->update_xlo && 1447 child->delete_xid > parent->update_xlo && 1448 (child->flags & HAMMER2_CHAIN_BMAPPED)) { 1449 KKASSERT(child->flags & HAMMER2_CHAIN_ONDBTREE); 1450 RB_REMOVE(hammer2_chain_tree, &above->dbtree, child); 1451 atomic_clear_int(&child->flags, HAMMER2_CHAIN_ONDBTREE); 1452 atomic_clear_int(&child->flags, HAMMER2_CHAIN_BMAPPED); 1453 } 1454 1455 /* 1456 * Not on any list, place child on DBQ 1457 */ 1458 if ((child->flags & (HAMMER2_CHAIN_ONRBTREE | 1459 HAMMER2_CHAIN_ONDBTREE | 1460 HAMMER2_CHAIN_ONDBQ)) == 0) { 1461 KKASSERT((child->flags & HAMMER2_CHAIN_BMAPPED) == 0); 1462 TAILQ_INSERT_TAIL(&above->dbq, child, db_entry); 1463 atomic_set_int(&child->flags, HAMMER2_CHAIN_ONDBQ); 1464 } 1465 spin_unlock(&above->cst.spin); 1466 } 1467 1468 /* 1469 * Unlock the child. This can wind up dropping the child's 1470 * last ref, removing it from the parent's RB tree, and deallocating 1471 * the structure. The RB_SCAN() our caller is doing handles the 1472 * situation. 1473 */ 1474 hammer2_chain_unlock(child); 1475 hammer2_chain_drop(child); 1476 spin_lock(&above->cst.spin); 1477 1478 /* 1479 * The parent may have been delete-duplicated. 1480 */ 1481 return (0); 1482 } 1483 1484 static int 1485 hammer2_flush_pass5(hammer2_chain_t *child, void *data) 1486 { 1487 hammer2_flush_info_t *info = data; 1488 hammer2_chain_t *parent = info->parent; 1489 hammer2_chain_t *xchain; 1490 hammer2_chain_core_t *above = child->above; 1491 hammer2_trans_t *trans = info->trans; 1492 1493 /* 1494 * Prefilter - Ignore children created after our flush_tid (not 1495 * visible to our flush). 1496 */ 1497 if (child->modify_xid > trans->sync_xid) { 1498 KKASSERT(child->delete_xid >= child->modify_xid); 1499 return 0; 1500 } 1501 1502 /* 1503 * Ref and lock child for operation, spinlock must be temporarily 1504 * Make sure child is referenced before we unlock. 
1505 */ 1506 hammer2_chain_ref(child); 1507 spin_unlock(&above->cst.spin); 1508 hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER); 1509 KKASSERT(child->above == above); 1510 KKASSERT(parent->core == above); 1511 1512 /* 1513 * Adjust BMAPPED state and rbtree/queue only when we hit the 1514 * actual live parent. 1515 */ 1516 if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) { 1517 spin_lock(&above->cst.spin); 1518 1519 /* 1520 * Inserting into blockmap, place child in rbtree or dbtree. 1521 */ 1522 if (child->delete_xid > trans->sync_xid && 1523 child->modify_xid > parent->update_xlo && 1524 (child->flags & HAMMER2_CHAIN_BMAPPED) == 0) { 1525 if (child->flags & HAMMER2_CHAIN_ONDBQ) { 1526 TAILQ_REMOVE(&above->dbq, child, db_entry); 1527 atomic_clear_int(&child->flags, 1528 HAMMER2_CHAIN_ONDBQ); 1529 } 1530 if ((child->flags & HAMMER2_CHAIN_DELETED) == 0 && 1531 (child->flags & HAMMER2_CHAIN_ONRBTREE) == 0) { 1532 KKASSERT((child->flags & 1533 (HAMMER2_CHAIN_ONDBTREE | 1534 HAMMER2_CHAIN_ONDBQ)) == 0); 1535 xchain = RB_INSERT(hammer2_chain_tree, 1536 &above->rbtree, child); 1537 KKASSERT(xchain == NULL); 1538 atomic_set_int(&child->flags, 1539 HAMMER2_CHAIN_ONRBTREE); 1540 } else 1541 if ((child->flags & HAMMER2_CHAIN_DELETED) && 1542 (child->flags & HAMMER2_CHAIN_ONDBTREE) == 0) { 1543 KKASSERT((child->flags & 1544 (HAMMER2_CHAIN_ONRBTREE | 1545 HAMMER2_CHAIN_ONDBQ)) == 0); 1546 xchain = RB_INSERT(hammer2_chain_tree, 1547 &above->dbtree, child); 1548 KKASSERT(xchain == NULL); 1549 atomic_set_int(&child->flags, 1550 HAMMER2_CHAIN_ONDBTREE); 1551 } 1552 atomic_set_int(&child->flags, HAMMER2_CHAIN_BMAPPED); 1553 KKASSERT(child->flags & 1554 (HAMMER2_CHAIN_ONRBTREE | 1555 HAMMER2_CHAIN_ONDBTREE | 1556 HAMMER2_CHAIN_ONDBQ)); 1557 } 1558 1559 /* 1560 * Not on any list, place child on DBQ 1561 */ 1562 if ((child->flags & (HAMMER2_CHAIN_ONRBTREE | 1563 HAMMER2_CHAIN_ONDBTREE | 1564 HAMMER2_CHAIN_ONDBQ)) == 0) { 1565 KKASSERT((child->flags & HAMMER2_CHAIN_BMAPPED) == 0); 1566 
TAILQ_INSERT_TAIL(&above->dbq, child, db_entry); 1567 atomic_set_int(&child->flags, HAMMER2_CHAIN_ONDBQ); 1568 } 1569 spin_unlock(&above->cst.spin); 1570 } 1571 1572 /* 1573 * Cleanup flags on last parent iterated for flush. 1574 */ 1575 if (info->domodify & 2) { 1576 if (child->flags & HAMMER2_CHAIN_FLUSH_CREATE) { 1577 atomic_clear_int(&child->flags, 1578 HAMMER2_CHAIN_FLUSH_CREATE); 1579 hammer2_chain_drop(child); 1580 } 1581 if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) && 1582 child->delete_xid <= trans->sync_xid) { 1583 KKASSERT((parent->flags & HAMMER2_CHAIN_DELETED) || 1584 (child->flags & HAMMER2_CHAIN_ONDBTREE) == 0); 1585 /* XXX delete-duplicate chain insertion mech wrong */ 1586 KKASSERT((parent->flags & HAMMER2_CHAIN_DELETED) || 1587 (child->flags & HAMMER2_CHAIN_BMAPPED) == 0); 1588 atomic_clear_int(&child->flags, 1589 HAMMER2_CHAIN_FLUSH_DELETE); 1590 hammer2_chain_drop(child); 1591 } 1592 } 1593 1594 /* 1595 * Unlock the child. This can wind up dropping the child's 1596 * last ref, removing it from the parent's RB tree, and deallocating 1597 * the structure. The RB_SCAN() our caller is doing handles the 1598 * situation. 1599 */ 1600 hammer2_chain_unlock(child); 1601 hammer2_chain_drop(child); 1602 spin_lock(&above->cst.spin); 1603 1604 /* 1605 * The parent may have been delete-duplicated. 
1606 */ 1607 return (0); 1608 } 1609 1610 void 1611 hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how) 1612 { 1613 #if 0 1614 hammer2_chain_t *grandp; 1615 #endif 1616 1617 parent->data_count += child->data_count; 1618 parent->inode_count += child->inode_count; 1619 child->data_count = 0; 1620 child->inode_count = 0; 1621 if (how < 0) { 1622 parent->data_count -= child->bytes; 1623 if (child->bref.type == HAMMER2_BREF_TYPE_INODE) { 1624 parent->inode_count -= 1; 1625 #if 0 1626 /* XXX child->data may be NULL atm */ 1627 parent->data_count -= child->data->ipdata.data_count; 1628 parent->inode_count -= child->data->ipdata.inode_count; 1629 #endif 1630 } 1631 } else if (how > 0) { 1632 parent->data_count += child->bytes; 1633 if (child->bref.type == HAMMER2_BREF_TYPE_INODE) { 1634 parent->inode_count += 1; 1635 #if 0 1636 /* XXX child->data may be NULL atm */ 1637 parent->data_count += child->data->ipdata.data_count; 1638 parent->inode_count += child->data->ipdata.inode_count; 1639 #endif 1640 } 1641 } 1642 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) { 1643 parent->data->ipdata.data_count += parent->data_count; 1644 parent->data->ipdata.inode_count += parent->inode_count; 1645 #if 0 1646 for (grandp = parent->above->first_parent; 1647 grandp; 1648 grandp = grandp->next_parent) { 1649 grandp->data_count += parent->data_count; 1650 grandp->inode_count += parent->inode_count; 1651 } 1652 #endif 1653 parent->data_count = 0; 1654 parent->inode_count = 0; 1655 } 1656 } 1657