1 /* 2 * Copyright (c) 2011-2013 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
	hammer2_chain_t *parent;	/* chain currently being recursed */
	hammer2_trans_t *trans;		/* transaction this flush runs under */
	int		depth;		/* recursion depth (deferral limit) */
	int		diddeferral;	/* count of deferrals in this pass */
	int		pass;		/* scan2 pass number (1 or 2) */
	int		cache_index;
	int		domodify;	/* parent must be marked modified */
	struct h2_flush_deferral_list flush_list; /* too-deep chains */
	hammer2_tid_t	sync_tid;	/* flush synchronization point */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t **chainp);
static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);
static void hammer2_flush_core_update(hammer2_chain_core_t *core,
				hammer2_flush_info_t *info);
static void hammer2_rollup_stats(hammer2_chain_t *parent,
				hammer2_chain_t *child, int how);

/*
 * Can we ignore a chain for the purposes of flushing modifications
 * to the media?
 *
 * Normally we don't bother to flush deleted chains.  However, a chain
 * associated with a deleted inode may still be active due to open
 * descriptors.  A flush is needed in this situation to prevent unbounded
 * memory use by the filesystem.
 *
 * This test is fragile as active inodes can still be delete-duplicated and
 * the older deleted chains are, in fact, actual deletions.  These will be
 * flagged DUPLICATED.  The live inode at the end of the del-dup chain
 * will be marked destroyed on last close.
 *
 * XXX inode deletions need to be deleted normally and then duplicated to
 *     a special 'live-but-unlinked' spot somewhere until last-close.
 * XXX bulk free code has a conniption fit too.
 */
static __inline
int
h2ignore_deleted(hammer2_flush_info_t *info, hammer2_chain_t *chain)
{
	/*
	 * Ignorable when deleted as-of this flush's synchronization point,
	 * UNLESS it is an inode that is neither a stale del-dup copy
	 * (DUPLICATED) nor finally destroyed (DESTROYED) -- such an inode
	 * may still be held open and must be flushed.
	 */
	return (chain->delete_tid <= info->sync_tid &&
		(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
		 (chain->flags & HAMMER2_CHAIN_DUPLICATED) ||
		 (chain->flags & HAMMER2_CHAIN_DESTROYED)));
}

#if 0
static __inline
void
hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
		    int how)
{
	hammer2_key_t bytes;

	if (bref->type != 0) {
		bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
		if (bref->type == HAMMER2_BREF_TYPE_INODE)
			info->inode_count += how;
		if (how < 0)
			info->data_count -= bytes;
		else
			info->data_count += bytes;
	}
}
#endif

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 * media target.  The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations.  We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 *
 * sync_tid vs real_tid.  For flush transactions ONLY, the flush operation
 * actually uses two transaction ids, one for the flush operation itself,
 * and <N+1> for any freemap allocations made as a side-effect.  real_tid
 * is fixed at <N>, sync_tid is adjusted dynamically as-needed.
 *
 * NOTE: The sync_tid for a flush's freemap allocation will match the
 *	 sync_tid of the following <concurrent_fs_ops> transaction(s).
 *	 The freemap topology will be out-of-step by one transaction id
 *	 in order to give the flusher a stable freemap topology to flush
 *	 out.  This is fixed up at mount-time using a quick incremental
 *	 scan.
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
		   hammer2_mount_t *hmp, int flags)
{
	hammer2_trans_t *head;

	bzero(trans, sizeof(*trans));
	if (pmp) {
		/* cluster transaction; derive hmp from the first chain */
		trans->pmp = pmp;
		KKASSERT(hmp == NULL);
		hmp = pmp->cluster.chains[0]->hmp;	/* XXX */
	} else {
		/* single-media transaction (recovery code) */
		trans->hmp_single = hmp;
		KKASSERT(hmp);
	}

	/* voldata lock also protects hmp->transq and flushcnt */
	hammer2_voldata_lock(hmp);
	trans->flags = flags;
	trans->td = curthread;
	/*trans->delete_gen = 0;*/	/* multiple deletions within trans */

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If multiple flushes are trying to run we have to
		 * wait until it is our turn.  All flushes are serialized.
		 *
		 * We queue ourselves and then wait to become the head
		 * of the queue, allowing all prior flushes to complete.
		 *
		 * A unique transaction id is required to avoid confusion
		 * when updating the block tables.
		 *
		 * alloc_tid is bumped twice: the first bump yields this
		 * flush's sync_tid/real_tid <N>, the second reserves <N+1>
		 * for freemap allocations made as a flush side-effect
		 * (see the sync_tid vs real_tid discussion above).
		 */
		++hmp->flushcnt;
		++hmp->voldata.alloc_tid;
		trans->sync_tid = hmp->voldata.alloc_tid;
		trans->real_tid = trans->sync_tid;
		++hmp->voldata.alloc_tid;
		TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
		if (TAILQ_FIRST(&hmp->transq) != trans) {
			/* sleep until a prior flush unblocks us */
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_tid, &hmp->voldatalk,
					0, "h2multf", hz);
			}
		}
	} else if (hmp->flushcnt == 0) {
		/*
		 * No flushes are pending, we can go.
		 */
		TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
		trans->sync_tid = hmp->voldata.alloc_tid;
		trans->real_tid = trans->sync_tid;

		/* XXX improve/optimize inode allocation */
	} else {
		/*
		 * One or more flushes are pending.  We insert after
		 * the current flush and may block.  We have priority
		 * over any flushes that are not the current flush.
		 *
		 * TRANS_BUFCACHE transactions cannot block.
		 */
		TAILQ_FOREACH(head, &hmp->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);	/* flushcnt != 0 guarantees a flush queued */
		TAILQ_INSERT_AFTER(&hmp->transq, head, trans, entry);
		trans->sync_tid = head->real_tid + 1;
		trans->real_tid = trans->sync_tid;

		if ((trans->flags & HAMMER2_TRANS_BUFCACHE) == 0) {
			if (TAILQ_FIRST(&hmp->transq) != head) {
				/* block until the flush becomes the head */
				trans->blocked = 1;
				while (trans->blocked) {
					lksleep(&trans->sync_tid,
						&hmp->voldatalk, 0,
						"h2multf", hz);
				}
			}
		}
	}
	if (flags & HAMMER2_TRANS_NEWINODE)
		trans->inode_tid = hmp->voldata.inode_tid++;
	hammer2_voldata_unlock(hmp, 0);
}

/*
 * Terminate a transaction: dequeue it from hmp->transq and wake up any
 * transactions that were blocked behind it, up to (and including) the
 * next queued flush.
 */
void
hammer2_trans_done(hammer2_trans_t *trans)
{
	hammer2_mount_t *hmp;
	hammer2_trans_t *head;
	hammer2_trans_t *scan;

	/* recover the hmp the same way trans_init derived it */
	if (trans->pmp)
		hmp = trans->pmp->cluster.chains[0]->hmp;
	else
		hmp = trans->hmp_single;

	/*
	 * Remove and adjust flushcnt
	 */
	hammer2_voldata_lock(hmp);
	TAILQ_REMOVE(&hmp->transq, trans, entry);
	if (trans->flags & HAMMER2_TRANS_ISFLUSH)
		--hmp->flushcnt;

	/*
	 * Unblock the head of the queue and any additional transactions
	 * up to the next flush.  The wakeup channel matches the lksleep
	 * channel used in hammer2_trans_init() (&...->sync_tid).
	 */
	head = TAILQ_FIRST(&hmp->transq);
	if (head && head->blocked) {
		head->blocked = 0;
		wakeup(&head->sync_tid);

		scan = TAILQ_NEXT(head, entry);
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			if (scan->blocked) {
				scan->blocked = 0;
				wakeup(&scan->sync_tid);
			}
			scan = TAILQ_NEXT(scan, entry);
		}
	}
	hammer2_voldata_unlock(hmp, 0);
}

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point (sync_tid), propagating parent chain modifications
 * and mirror_tid updates back up as needed.  Since we are recursing downward
 * we do not have to deal with the complexities of multi-homed chains (chains
 * with multiple parents).
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whos modify_tid values are less than or equal
 * to the passed sync_tid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from the hammer2_vop_reclaim() function.  We want to try to completely
 * clean out the inode structure to prevent disconnected inodes from
 * building up and blowing out the kmalloc pool.  However, it is not actually
 * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery
 * capability.
 *
 * chain is locked on call and will remain locked on return.  If a flush
 * occured, the chain's MOVED bit will be set indicating that its parent
 * (which is not part of the flush) should be updated.  The chain may be
 * replaced by the call.
 */
void
hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
{
	hammer2_chain_t *chain = *chainp;
	hammer2_chain_t *scan;
	hammer2_chain_core_t *core;
	hammer2_flush_info_t info;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flush_list);
	info.trans = trans;
	info.sync_tid = trans->sync_tid;
	info.cache_index = -1;

	core = chain->core;	/* only consumed by FLUSH_DEBUG kprintf */
#if FLUSH_DEBUG
	kprintf("CHAIN FLUSH trans %p.%016jx chain %p.%d mod %016jx upd %016jx\n", trans, trans->sync_tid, chain, chain->bref.type, chain->modify_tid, core->update_lo);
#endif

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	loops = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave MOVED set for these chains, which will be
		 * handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flush_list, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE: hammer2_chain_flush() may replace scan.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_drop(scan);	/* ref from deferral */
			hammer2_chain_flush(trans, &scan);
			hammer2_chain_unlock(scan);
		}

		/*
		 * [re]flush chain.  flush_core may replace chain and may
		 * queue new deferrals onto info.flush_list.
		 */
		info.diddeferral = 0;
		hammer2_chain_flush_core(&info, &chain);
#if FLUSH_DEBUG
		kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n",
			chain, chain->bref.type, chain->flags);
#endif

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flush_list))
			break;

		/* diagnostic only; loop proceeds regardless */
		if (++loops % 1000 == 0) {
			kprintf("hammer2_chain_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
	hammer2_chain_drop(chain);	/* balance the extra ref above */
	*chainp = chain;		/* chain may have been replaced */
}

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.
 *
 * If the flush accomplished any work chain will be flagged MOVED
 * indicating a copy-on-write propagation back up is required.
 * Deep sub-nodes may also have been entered onto the deferral list.
 * MOVED is never set on the volume root.
 *
 * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
 *	 only when a chain is specifically modified, and not updated
 *	 for copy-on-write propagations.  MODIFIED is set on any modification
 *	 including copy-on-write propagations.
 *
 * NOTE: We are responsible for updating chain->bref.mirror_tid and
 *	 core->update_lo.  The caller is responsible for processing us into
 *	 our parent (if any).
 *
 *	 We are also responsible for updating chain->core->update_lo to
 *	 prevent repeated recursions due to deferrals.
 *
 * WARNING! bref.mirror_tid may only be updated if either the MODIFIED bit
 *	    is already zero or if we clear it.
423 * 424 */ 425 static void 426 hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp) 427 { 428 hammer2_chain_t *chain = *chainp; 429 hammer2_chain_t *saved_parent; 430 hammer2_mount_t *hmp; 431 hammer2_blockref_t *bref; 432 hammer2_chain_core_t *core; 433 char *bdata; 434 hammer2_io_t *dio; 435 int error; 436 int diddeferral; 437 438 hmp = chain->hmp; 439 core = chain->core; 440 diddeferral = info->diddeferral; 441 442 /* 443 * Check if we even have any work to do. 444 * 445 * We do not update core->update_lo because there might be other 446 * paths to the core and we haven't actually checked it. 447 * 448 * This bit of code is capable of short-cutting entire sub-trees 449 * if they have not been touched. 450 */ 451 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0 && 452 (core->update_lo >= info->sync_tid || 453 chain->bref.mirror_tid >= info->sync_tid || 454 chain->bref.mirror_tid >= core->update_hi)) { 455 KKASSERT(chain->modify_tid <= info->sync_tid); 456 /* don't update update_lo, there may be other paths to core */ 457 /* don't update bref.mirror_tid, scan2 is not called */ 458 return; 459 } 460 461 /* 462 * Ignore chains which have already been flushed through the current 463 * synchronization point. 464 */ 465 KKASSERT (chain->bref.mirror_tid <= info->sync_tid); 466 if (chain->bref.mirror_tid == info->sync_tid) { 467 /* do not update core->update_lo, there may be another path */ 468 return; 469 } 470 471 /* 472 * Ignore chains modified beyond the current flush point. These 473 * will be treated as if they did not exist. Subchains with lower 474 * modify_tid's will still be accessible via other parents. 475 * 476 * Do not update bref.mirror_tid here, it will interfere with 477 * synchronization. e.g. inode flush tid 1, concurrent D-D tid 2, 478 * then later on inode flush tid 2. If we were to set mirror_tid 479 * to 1 during inode flush tid 1 the blockrefs would only be partially 480 * updated (and likely panic). 
481 * 482 * Do not update core->update_lo here, there might be other paths 483 * to the core and we haven't actually flushed it. 484 * 485 * (vchain and fchain are exceptions since they cannot be duplicated) 486 */ 487 if (chain->modify_tid > info->sync_tid && 488 chain != &hmp->fchain && chain != &hmp->vchain) { 489 /* do not update bref.mirror_tid, scan2 ignores chain */ 490 /* do not update core->update_lo, there may be another path */ 491 return; 492 } 493 494 saved_parent = info->parent; 495 info->parent = chain; 496 retry: 497 498 /* 499 * Chains deleted as-of the flush synchronization point require some 500 * special early handling to avoid double flushing because multiple 501 * deletions are sometimes forced within the same transaction. 502 * Allowing the flush to proceed through more than one path can wind 503 * up updating the chain's block table twice and cause an assertion. 504 * 505 * We don't check the 'same transaction' part but simply punt in this 506 * situation. We must still check for multiple deletions, since any 507 * terminal (non-stale) deletion still requires processing to at 508 * least clean up the children, and also (for inodes) there might 509 * still be an open descriptor. 510 * 511 * Clear MODIFIED but set MOVED to ensure that the parent still 512 * deals with it. 513 */ 514 if (chain->delete_tid <= info->sync_tid && 515 (chain->flags & HAMMER2_CHAIN_DUPLICATED)) { 516 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 517 #if 0 518 /* 519 * XXX should be able to invalidate the buffer here. 520 * XXX problem if reused, snapshotted, or reactivated. 
521 */ 522 if (chain->dio) { 523 hammer2_io_setinval(chain->dio, chain->bytes); 524 } 525 #endif 526 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 527 hammer2_chain_ref(chain); 528 atomic_set_int(&chain->flags, 529 HAMMER2_CHAIN_MOVED); 530 } 531 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 532 hammer2_chain_memory_wakeup(chain->pmp); 533 hammer2_chain_drop(chain); 534 } 535 536 /* 537 * Update mirror_tid, indicating that chain is synchronized 538 * on its modification and block table. This probably isn't 539 * needed since scan2 should ignore deleted chains anyway. 540 * 541 * NOTE: bref.mirror_tid cannot be updated 542 * unless MODIFIED is cleared or already 543 * clear. 544 */ 545 if (chain->bref.mirror_tid < info->sync_tid) 546 chain->bref.mirror_tid = info->sync_tid; 547 /* do not update core->update_lo, there may be another path */ 548 info->parent = saved_parent; 549 return; 550 } 551 552 /* 553 * Recurse if we are not up-to-date. Once we are done we will 554 * update update_lo if there were no deferrals. update_lo can become 555 * higher than update_hi and is used to prevent re-recursions during 556 * the same flush cycle. 557 * 558 * update_hi was already checked and prevents initial recursions on 559 * subtrees which have not been modified. 560 * 561 * NOTE: We must recurse whether chain is flagged DELETED or not. 562 * However, if it is flagged DELETED we limit sync_tid to 563 * delete_tid to ensure that the chain's bref.mirror_tid is 564 * not fully updated and causes it to miss the non-DELETED 565 * path. 566 * 567 * NOTE: If a deferral occurs hammer2_chain_flush() will flush the 568 * deferred chain independently which will update it's 569 * bref.mirror_tid and prevent it from deferred again. 
570 */ 571 if (chain->bref.mirror_tid < info->sync_tid && 572 chain->bref.mirror_tid < core->update_hi) { 573 hammer2_chain_layer_t *layer; 574 int saved_domodify; 575 int save_gen; 576 577 /* 578 * Races will bump update_hi above trans->sync_tid and should 579 * not affect this test. 580 * 581 * We don't want to set our chain to MODIFIED gratuitously. 582 * 583 * We need an extra ref on chain because we are going to 584 * release its lock temporarily in our child loop. 585 */ 586 587 /* 588 * Run two passes. The first pass handles MODIFIED and 589 * update_lo recursions while the second pass handles 590 * MOVED chains on the way back up. 591 * 592 * If the stack gets too deep we defer the chain. Since 593 * hammer2_chain_core's can be shared at multiple levels 594 * in the tree, we may encounter a chain that we had already 595 * deferred. We could undefer it but it will probably just 596 * defer again so it is best to leave it deferred. 597 * 598 * Scan1 is recursive. 599 * 600 * NOTE: The act of handling a modified/submodified chain can 601 * cause the MOVED Flag to be set. It can also be set 602 * via hammer2_chain_delete() and in other situations. 603 * 604 * NOTE: RB_SCAN() must be used instead of RB_FOREACH() 605 * because children can be physically removed during 606 * the scan. 607 * 608 * NOTE: We would normally not care about insertions except 609 * that some insertions might occur from the flush 610 * itself, so loop on generation number changes. 
611 */ 612 saved_domodify = info->domodify; 613 info->domodify = 0; 614 615 if (chain->flags & HAMMER2_CHAIN_DEFERRED) { 616 ++info->diddeferral; 617 } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) { 618 if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) { 619 hammer2_chain_ref(chain); 620 TAILQ_INSERT_TAIL(&info->flush_list, 621 chain, flush_node); 622 atomic_set_int(&chain->flags, 623 HAMMER2_CHAIN_DEFERRED); 624 } 625 ++info->diddeferral; 626 } else { 627 spin_lock(&core->cst.spin); 628 KKASSERT(core->good == 0x1234 && core->sharecnt > 0); 629 do { 630 save_gen = core->generation; 631 TAILQ_FOREACH_REVERSE(layer, &core->layerq, 632 h2_layer_list, entry) { 633 ++layer->refs; 634 KKASSERT(layer->good == 0xABCD); 635 RB_SCAN(hammer2_chain_tree, 636 &layer->rbtree, 637 NULL, hammer2_chain_flush_scan1, 638 info); 639 --layer->refs; 640 } 641 } while (core->generation != save_gen); 642 spin_unlock(&core->cst.spin); 643 } 644 645 if (info->parent != chain) { 646 kprintf("ZZZ\n"); 647 hammer2_chain_drop(chain); 648 hammer2_chain_ref(info->parent); 649 } 650 chain = info->parent; 651 652 /* 653 * chain was unlocked during the scan1 recursion and may 654 * have been deleted, destroyed, or even synchronously 655 * flushed due to aliasing. 656 * 657 * The flush continues normally in the first two places as 658 * the deletion or destruction does NOT affect the flush 659 * as-of the flush synchronization point. 660 * 661 * We must detect the last case and avoid flushing chain twice. 
662 */ 663 #if 0 664 if (chain->delete_tid <= info->sync_tid && 665 (chain->flags & HAMMER2_CHAIN_DUPLICATED)) { 666 kprintf("xxx\n"); 667 goto retry; 668 } 669 #endif 670 if (chain->bref.mirror_tid >= info->sync_tid || 671 chain->bref.mirror_tid >= core->update_hi) { 672 kprintf("yyy\n"); 673 goto retry; 674 } 675 676 /* 677 * If any deferral occurred we must set domodify to 0 to avoid 678 * potentially modifying the parent twice (now and when we run 679 * the deferral list), as doing so could cause the blockref 680 * update to run on a block array which has already been 681 * updated. 682 */ 683 if (info->domodify && diddeferral != info->diddeferral) 684 info->domodify = 0; 685 686 /* 687 * THIS IS THE ONLY POINT IN THE FLUSH WHERE A PARENT IN THE 688 * NOMINAL TOPOLOGY, OTHER THAN FREEMAP ALLOCATIONS, IS 689 * MODIFIED. FREEMAP ALLOCATIONS WILL MODIFY THE FREEMAP 690 * TOPOLOGY WITH SYNC_TID+1 AND DO NOT AFFECT THE CURRENT 691 * FLUSH. 692 * 693 * Modifying the parent can create issues if the current 694 * parent is already in a modified state with an earlier 695 * transaction id. We want to avoid an endless flush loop 696 * on the original parent so we must clear its modified bit 697 * after creating the new parent, if they wind up being 698 * different. Care must also be taken to avoid flushing the 699 * same parent twice. 700 * 701 * We are responsible for setting the parent into a modified 702 * state before we scan the children to update the parent's 703 * block table. This must essentially be done as an atomic 704 * operation (the parent must remain locked throughout the 705 * operation), otherwise other transactions can squeeze a 706 * delete-duplicate in and create block table havoc. 707 * 708 * NOTE: Blockrefs are only updated on live chains. 709 * 710 * NOTE: Modifying the parent generally causes a 711 * delete-duplicate to occur from within the flush 712 * itself, with an allocation from the freemap occuring 713 * as an additional side-effect. 
714 * 715 * NOTE: If the parent was deleted our modified chain will 716 * also be marked deleted, but since it inherits the 717 * parent's delete_tid it will still appear to be 718 * 'live' for the purposes of the flush. 719 */ 720 if (info->domodify && !h2ignore_deleted(info, chain)) { 721 KKASSERT(chain->modify_tid < info->sync_tid); 722 723 /* 724 * The scan1 loop and/or flush_core is reentrant, 725 * particularly when core->generation changes. To 726 * avoid havoc we have to prevent repetitive 727 * delete-duplicates of the same chain. 728 * 729 * After executing the modify set the original chain's 730 * bref.mirror_tid to prevent any reentrancy during 731 * the current flush cycle. 732 */ 733 hammer2_chain_modify(info->trans, &info->parent, 734 HAMMER2_MODIFY_NO_MODIFY_TID); 735 if (info->parent != chain) { 736 /* 737 * NOTE: bref.mirror_tid cannot be updated 738 * unless MODIFIED is cleared or already 739 * clear. 740 */ 741 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 742 atomic_clear_int(&chain->flags, 743 HAMMER2_CHAIN_MODIFIED); 744 hammer2_chain_memory_wakeup(chain->pmp); 745 hammer2_chain_drop(chain); 746 } 747 if (chain->bref.mirror_tid < info->sync_tid) 748 chain->bref.mirror_tid = info->sync_tid; 749 hammer2_chain_drop(chain); 750 hammer2_chain_ref(info->parent); 751 } 752 chain = info->parent; 753 } 754 755 KKASSERT(chain == info->parent); 756 757 /* 758 * Handle successfully flushed children who are in the MOVED 759 * state on the way back up the recursion. This can have 760 * the side-effect of clearing MOVED. 761 * 762 * Scan2 may replace info->parent. If it does it will also 763 * replace the extra ref we made. 764 * 765 * Scan2 is non-recursive. 
766 */ 767 if (diddeferral != info->diddeferral) { 768 spin_lock(&core->cst.spin); 769 } else { 770 KKASSERT(chain == info->parent); 771 KKASSERT(info->domodify == 0 || 772 (chain->flags & HAMMER2_CHAIN_FLUSHED) == 0); 773 atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSHED); 774 spin_lock(&core->cst.spin); 775 KKASSERT(core->good == 0x1234 && core->sharecnt > 0); 776 KKASSERT(info->parent->core == core); 777 TAILQ_FOREACH_REVERSE(layer, &core->layerq, 778 h2_layer_list, entry) { 779 info->pass = 1; 780 ++layer->refs; 781 KKASSERT(layer->good == 0xABCD); 782 RB_SCAN(hammer2_chain_tree, &layer->rbtree, 783 NULL, hammer2_chain_flush_scan2, info); 784 info->pass = 2; 785 RB_SCAN(hammer2_chain_tree, &layer->rbtree, 786 NULL, hammer2_chain_flush_scan2, info); 787 --layer->refs; 788 } 789 } 790 791 /* 792 * info->parent must not have been replaced again 793 */ 794 KKASSERT(info->parent == chain); 795 796 *chainp = chain; 797 798 hammer2_chain_layer_check_locked(chain->hmp, core); 799 spin_unlock(&core->cst.spin); 800 801 /* 802 * Update the core only if no deferrals occurred. Otherwise 803 * we could end up clearing the MOVED bit in the children 804 * prematurely. 805 */ 806 if (diddeferral == info->diddeferral) 807 hammer2_flush_core_update(core, info); 808 809 info->domodify = saved_domodify; 810 KKASSERT(chain->refs > 1); 811 } else { 812 /* 813 * Update the core, no deferrals occurred in this path. 814 */ 815 hammer2_flush_core_update(core, info); 816 } 817 info->parent = saved_parent; 818 819 #if FLUSH_DEBUG 820 kprintf("POP %p.%d defer=%d\n", chain, chain->bref.type, diddeferral); 821 #endif 822 823 /* 824 * Do not flush the chain if there were any deferrals. It will be 825 * retried later after the deferrals are independently handled. 826 * Do not update update_lo or bref.mirror_tid. 
827 */ 828 if (diddeferral != info->diddeferral) { 829 if (hammer2_debug & 0x0008) { 830 kprintf("%*.*s} %p/%d %04x (deferred)", 831 info->depth, info->depth, "", 832 chain, chain->refs, chain->flags); 833 } 834 /* do not update core->update_lo */ 835 /* do not update bref.mirror_tid */ 836 return; 837 } 838 839 KKASSERT(chain->bref.mirror_tid < info->sync_tid); 840 841 /* 842 * Non-deferral path, chain is now deterministically being flushed. 843 * We've finished running the recursion and the blockref update. 844 * 845 * update bref.mirror_tid. update_lo has already been updated. 846 * 847 * After this point we MUST dipose of the MODIFIED bit on chain. 848 */ 849 if (chain->bref.mirror_tid < info->sync_tid) 850 chain->bref.mirror_tid = info->sync_tid; 851 852 /* 853 * Deal with deleted and destroyed chains on the way back up. 854 * 855 * Otherwise a deleted chain can be optimized by clearing MODIFIED 856 * without bothering to write it out. 857 * 858 * NOTE: We optimize this by noting that only 'inode' chains require 859 * this treatment. When a file with an open descriptor is 860 * deleted only its inode is marked deleted. Other deletions, 861 * such as indirect block deletions, will no longer be visible 862 * to the live filesystem and do not need to be updated. 863 */ 864 if (h2ignore_deleted(info, chain)) { 865 /* 866 * At the moment we unconditionally set the MOVED bit because 867 * there are situations where it might not have been set due 868 * to similar delete-destroyed optimizations, and the parent 869 * of the parent still may need to be notified of the deletion. 870 */ 871 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 872 hammer2_chain_ref(chain); 873 atomic_set_int(&chain->flags, 874 HAMMER2_CHAIN_MOVED); 875 } 876 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 877 #if 0 878 /* 879 * XXX should be able to invalidate the buffer here. 880 * XXX problem if reused, snapshotted, or reactivated. 
881 */ 882 if (chain->dio) { 883 hammer2_io_setinval(chain->dio, chain->bytes); 884 } 885 #endif 886 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 887 hammer2_chain_memory_wakeup(chain->pmp); 888 hammer2_chain_drop(chain); 889 } 890 return; 891 } 892 893 /* 894 * A degenerate flush might not have flushed anything and thus not 895 * processed modified blocks on the way back up. Detect the case. 896 * 897 * This case can occur when a create, modify, and rename (to a 898 * different part of the topology) occurs in the same flush, 899 * resulting in a parent which effectively needs no modification. 900 */ 901 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { 902 kprintf("chain %p.%d %08x recursed but wasn't " 903 "modified mirr=%016jx " 904 "update_lo=%016jx synctid=%016jx\n", 905 chain, chain->bref.type, chain->flags, 906 chain->bref.mirror_tid, 907 core->update_lo, info->sync_tid); 908 #if 0 909 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 910 hammer2_chain_ref(chain); 911 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); 912 } 913 #endif 914 return; 915 } 916 917 /* 918 * Issue flush. 919 * 920 * A DESTROYED node that reaches this point must be flushed for 921 * synchronization point consistency. 922 * 923 * Update bref.mirror_tid, clear MODIFIED, and set MOVED. 924 * 925 * The caller will update the parent's reference to this chain 926 * by testing MOVED as long as the modification was in-bounds. 927 * 928 * MOVED is never set on the volume root as there is no parent 929 * to adjust. 
930 */ 931 if (hammer2_debug & 0x1000) { 932 kprintf("Flush %p.%d %016jx/%d sync_tid=%016jx data=%016jx\n", 933 chain, chain->bref.type, 934 chain->bref.key, chain->bref.keybits, 935 info->sync_tid, chain->bref.data_off); 936 } 937 if (hammer2_debug & 0x2000) { 938 Debugger("Flush hell"); 939 } 940 941 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 942 hammer2_chain_memory_wakeup(chain->pmp); 943 944 if ((chain->flags & HAMMER2_CHAIN_MOVED) || 945 chain == &hmp->vchain || 946 chain == &hmp->fchain) { 947 /* 948 * Drop the ref from the MODIFIED bit we cleared, 949 * net -1 ref. 950 */ 951 hammer2_chain_drop(chain); 952 } else { 953 /* 954 * Drop the ref from the MODIFIED bit we cleared and 955 * set a ref for the MOVED bit we are setting. Net 0 refs. 956 */ 957 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); 958 } 959 960 /* 961 * If this is part of a recursive flush we can go ahead and write 962 * out the buffer cache buffer and pass a new bref back up the chain 963 * via the MOVED bit. 964 * 965 * Volume headers are NOT flushed here as they require special 966 * processing. 967 */ 968 switch(chain->bref.type) { 969 case HAMMER2_BREF_TYPE_FREEMAP: 970 hammer2_modify_volume(hmp); 971 hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid; 972 break; 973 case HAMMER2_BREF_TYPE_VOLUME: 974 /* 975 * The free block table is flushed by hammer2_vfs_sync() 976 * before it flushes vchain. We must still hold fchain 977 * locked while copying voldata to volsync, however. 
978 */ 979 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS); 980 #if 0 981 if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) || 982 hmp->voldata.freemap_tid < info->trans->sync_tid) { 983 /* this will modify vchain as a side effect */ 984 hammer2_chain_t *tmp = &hmp->fchain; 985 hammer2_chain_flush(info->trans, &tmp); 986 KKASSERT(tmp == &hmp->fchain); 987 } 988 #endif 989 990 /* 991 * There is no parent to our root vchain and fchain to 992 * synchronize the bref to, their updated mirror_tid's 993 * must be synchronized to the volume header. 994 */ 995 hmp->voldata.mirror_tid = chain->bref.mirror_tid; 996 /*hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;*/ 997 998 /* 999 * The volume header is flushed manually by the syncer, not 1000 * here. All we do here is adjust the crc's. 1001 */ 1002 KKASSERT(chain->data != NULL); 1003 KKASSERT(chain->dio == NULL); 1004 1005 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]= 1006 hammer2_icrc32( 1007 (char *)&hmp->voldata + 1008 HAMMER2_VOLUME_ICRC1_OFF, 1009 HAMMER2_VOLUME_ICRC1_SIZE); 1010 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]= 1011 hammer2_icrc32( 1012 (char *)&hmp->voldata + 1013 HAMMER2_VOLUME_ICRC0_OFF, 1014 HAMMER2_VOLUME_ICRC0_SIZE); 1015 hmp->voldata.icrc_volheader = 1016 hammer2_icrc32( 1017 (char *)&hmp->voldata + 1018 HAMMER2_VOLUME_ICRCVH_OFF, 1019 HAMMER2_VOLUME_ICRCVH_SIZE); 1020 hmp->volsync = hmp->voldata; 1021 atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC); 1022 hammer2_chain_unlock(&hmp->fchain); 1023 break; 1024 case HAMMER2_BREF_TYPE_DATA: 1025 /* 1026 * Data elements have already been flushed via the logical 1027 * file buffer cache. Their hash was set in the bref by 1028 * the vop_write code. 1029 * 1030 * Make sure any device buffer(s) have been flushed out here. 1031 * (there aren't usually any to flush). 
1032 */ 1033 #if 0 1034 /* XXX */ 1035 /* chain and chain->bref, NOWAIT operation */ 1036 #endif 1037 break; 1038 #if 0 1039 case HAMMER2_BREF_TYPE_INDIRECT: 1040 /* 1041 * Indirect blocks may be in an INITIAL state. Use the 1042 * chain_lock() call to ensure that the buffer has been 1043 * instantiated (even though it is already locked the buffer 1044 * might not have been instantiated). 1045 * 1046 * Only write the buffer out if it is dirty, it is possible 1047 * the operating system had already written out the buffer. 1048 */ 1049 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); 1050 KKASSERT(chain->dio != NULL); 1051 1052 chain->data = NULL; 1053 hammer2_io_bqrelse(&chain->dio); 1054 hammer2_chain_unlock(chain); 1055 break; 1056 #endif 1057 case HAMMER2_BREF_TYPE_INDIRECT: 1058 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1059 /* 1060 * Device-backed. Buffer will be flushed by the sync 1061 * code XXX. 1062 */ 1063 KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0); 1064 break; 1065 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1066 default: 1067 /* 1068 * Embedded elements have to be flushed out. 1069 * (Basically just BREF_TYPE_INODE). 1070 */ 1071 KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED); 1072 KKASSERT(chain->data != NULL); 1073 KKASSERT(chain->dio == NULL); 1074 bref = &chain->bref; 1075 1076 KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0); 1077 KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) == 1078 HAMMER2_CHECK_ISCSI32 || 1079 HAMMER2_DEC_CHECK(chain->bref.methods) == 1080 HAMMER2_CHECK_FREEMAP); 1081 1082 /* 1083 * The data is embedded, we have to acquire the 1084 * buffer cache buffer and copy the data into it. 1085 */ 1086 error = hammer2_io_bread(hmp, bref->data_off, chain->bytes, 1087 &dio); 1088 KKASSERT(error == 0); 1089 bdata = hammer2_io_data(dio, bref->data_off); 1090 1091 /* 1092 * Copy the data to the buffer, mark the buffer 1093 * dirty, and convert the chain to unmodified. 
		 */
		bcopy(chain->data, bdata, chain->bytes);
		hammer2_io_bdwrite(&dio);

		/*
		 * Recompute the check code for the embedded data now that
		 * it has been written to the device buffer.  Only CRC-style
		 * methods are expected here (asserted above).
		 */
		switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
		case HAMMER2_CHECK_FREEMAP:
			chain->bref.check.freemap.icrc32 =
				hammer2_icrc32(chain->data, chain->bytes);
			break;
		case HAMMER2_CHECK_ISCSI32:
			chain->bref.check.iscsi32.value =
				hammer2_icrc32(chain->data, chain->bytes);
			break;
		default:
			panic("hammer2_flush_core: bad crc type");
			break;	/* NOT REACHED */
		}
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
			++hammer2_iod_meta_write;
		else
			++hammer2_iod_indr_write;
	}
}

/*
 * Flush helper scan1 (recursive)
 *
 * Flushes the children of the caller's chain (parent) and updates
 * the blockref, restricted by sync_tid.
 *
 * Ripouts during the loop should not cause any problems.  Because we are
 * flushing to a synchronization point, modification races will occur after
 * sync_tid and do not have to be flushed anyway.
 *
 * It is also ok if the parent is chain_duplicate()'d while unlocked because
 * the delete/duplication will install a delete_tid that is still larger than
 * our current sync_tid.
 *
 * WARNING! If we do not call chain_flush_core we must update bref.mirror_tid
 *	    ourselves.
 */
static int
hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_trans_t *trans = info->trans;
	hammer2_chain_t *parent = info->parent;
	int diddeferral;

	if (hammer2_debug & 0x80000)
		Debugger("hell3");
	/*
	 * NOTE(review): diddeferral snapshots the deferral count but is not
	 * re-tested in this function; the deferral comparison is performed
	 * by flush_core after the recursion -- confirm it is intentionally
	 * unused here.
	 */
	diddeferral = info->diddeferral;

	/*
	 * Child is beyond the flush synchronization zone, don't pursue.
	 * Remember that modifications generally delete-duplicate so if the
	 * sub-tree is dirty another child will get us there.  But not this
	 * one.
	 *
	 * Or MODIFIED is not set and child is already fully synchronized
	 * with its sub-tree.  Don't pursue.
	 *
	 * (child can never be fchain or vchain so a special check isn't
	 * needed).
	 */
	if (child->modify_tid > trans->sync_tid) {
		KKASSERT(child->delete_tid >= child->modify_tid);
		/* do not update child->core->update_lo, core not flushed */
		/* do not update core->update_lo, there may be another path */
		/* do not update mirror_tid, scan2 will ignore chain */
		return (0);
	}

	/*
	 * We must ref the child before unlocking the spinlock.
	 *
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.
	 */
	hammer2_chain_ref(child);
	spin_unlock(&parent->core->cst.spin);

	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

#if 0
	/*
	 * This isn't working atm, it seems to be causing necessary
	 * updates to be thrown away, probably due to aliasing, resulting
	 * base_insert/base_delete panics.
	 */
	/*
	 * The DESTROYED flag can only be initially set on an unreferenced
	 * deleted inode and will propagate downward via the mechanic below.
	 * Such inode chains have been deleted for good and should no longer
	 * be subject to delete/duplication.
	 *
	 * This optimization allows the inode reclaim (destroy unlinked file
	 * on vnode reclamation after last close) to be flagged by just
	 * setting HAMMER2_CHAIN_DESTROYED at the top level and then will
	 * cause the chains to be terminated and related buffers to be
	 * invalidated and not flushed out.
	 *
	 * We have to be careful not to propagate the DESTROYED flag if
	 * the destruction occurred after our flush sync_tid.
	 */
	if (parent->delete_tid <= trans->sync_tid &&
	    (parent->flags & HAMMER2_CHAIN_DESTROYED) &&
	    (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
		/*
		 * Force downward recursion by bringing update_hi up to
		 * at least sync_tid, and setting the DESTROYED flag.
		 * Parent's mirror_tid has not yet been updated.
		 *
		 * We do not mark the child DELETED because this would
		 * cause unnecessary modifications/updates.  Instead, the
		 * DESTROYED flag propagates downward and causes the flush
		 * to ignore any pre-existing modified chains.
		 *
		 * Vnode reclamation may have forced update_hi to MAX_TID
		 * (we do this because there was no transaction at the time).
		 * In this situation bring it down to something reasonable
		 * so the elements being destroyed can be retired.
		 */
		atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROYED);
		spin_lock(&child->core->cst.spin);
		if (child->core->update_hi < trans->sync_tid)
			child->core->update_hi = trans->sync_tid;
		spin_unlock(&child->core->cst.spin);
	}
#endif

	/*
	 * No recursion needed if neither the child or anything under it
	 * was messed with.
	 */
	if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
	    child->core->update_lo >= info->sync_tid) {
		KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
		if (child->bref.mirror_tid < info->sync_tid)
			child->bref.mirror_tid = info->sync_tid;
		goto skip;
	}

	/*
	 * Re-check original pre-lock conditions after locking.
	 * (the parent was unlocked while we acquired the child lock,
	 * so the child may have been delete-duplicated in the interim).
	 */
	if (child->modify_tid > trans->sync_tid) {
		hammer2_chain_unlock(child);
		hammer2_chain_drop(child);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
		spin_lock(&parent->core->cst.spin);
		return (0);
	}

	if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
	    child->core->update_lo >= info->sync_tid) {
		KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
		if (child->bref.mirror_tid < info->sync_tid)
			child->bref.mirror_tid = info->sync_tid;
		goto skip;
	}

	/*
	 * Recurse and collect deferral data.
	 */
	++info->depth;
	hammer2_chain_flush_core(info, &child);
	--info->depth;

skip:
	/*
	 * Check the conditions that could cause SCAN2 to modify the parent.
	 * Modify the parent here instead of in SCAN2, which would cause
	 * rollup chicken-and-egg races.
	 *
	 * Scan2 is expected to update bref.mirror_tid in the domodify case,
	 * but will skip the child otherwise giving us the responsibility to
	 * update bref.mirror_tid.
	 *
	 * WARNING!  Do NOT update the child's bref.mirror_tid right here,
	 *	     even if there was no deferral.  Doing so would cause
	 *	     confusion with the child's block array state in a
	 *	     future flush.
	 */
	if (h2ignore_deleted(info, parent)) {
		/*
		 * Special optimization matching similar tests done in
		 * flush_core, scan1, and scan2.  Avoid updating the block
		 * table in the parent if the parent is no longer visible.
		 * A deleted parent is no longer visible unless it's an
		 * inode (in which case it might have an open fd).. the
		 * DESTROYED flag must also be checked for inodes.
		 */
		;
	} else if (child->delete_tid <= trans->sync_tid &&
		   child->delete_tid > parent->bref.mirror_tid &&
		   child->modify_tid <= parent->bref.mirror_tid) {
		info->domodify = 1;		/* base deletion needed */
	} else if (child->delete_tid > trans->sync_tid &&
		   child->modify_tid > parent->bref.mirror_tid) {
		info->domodify = 1;		/* base insertion */
	}

	/*
	 * Relock to continue the loop.  The parent lock and its core
	 * spinlock must be reacquired before returning to the RB_SCAN.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop(child);
	KKASSERT(info->parent == parent);

	spin_lock(&parent->core->cst.spin);
	return (0);
}

/*
 * Flush helper scan2 (non-recursive)
 *
 * This pass on a chain's children propagates any MOVED or DELETED
 * elements back up the chain towards the root after those elements have
 * been fully flushed.  Unlike scan1, this function is NOT recursive and
 * the parent remains locked across the entire scan.
 *
 * SCAN2 is called twice, once with pass set to 1 and once with it set to 2.
 * We have to do this so base[] elements can be deleted in pass 1 to make
 * room for adding new elements in pass 2.
 * (A third invocation with pass set to 3, from hammer2_flush_core_update(),
 * handles clearing the MOVED bit).
 *
 * This function also rolls up storage statistics.
 *
 * NOTE! A deletion is a visibility issue, there can still be references to
 *	 deleted elements (for example, to an unlinked file which is still
 *	 open), and there can also be multiple chains pointing to the same
 *	 bref where some are deleted and some are not (for example due to
 *	 a rename).   So a chain marked for deletion is basically considered
 *	 to be live until it is explicitly destroyed or until its ref-count
 *	 reaches zero (also implying that MOVED and MODIFIED are clear).
 *
 * NOTE! Info->parent will be locked but will only be instantiated/modified
 *	 if it is either MODIFIED or if scan1 determined that block table
 *	 updates will occur.
 *
 * NOTE!
 *	 SCAN2 is responsible for updating child->bref.mirror_tid only in
 *	 the case where it modifies the parent (does a base insertion
 *	 or deletion).  SCAN1 handled all other cases.
 */
static int
hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;
	hammer2_chain_core_t *above = child->above;
	hammer2_mount_t *hmp = child->hmp;
	hammer2_trans_t *trans = info->trans;
	hammer2_blockref_t *base;
	int count;
	int ok;

#if FLUSH_DEBUG
	kprintf("SCAN2 %p.%d %08x mod=%016jx del=%016jx trans=%016jx\n",
		child, child->bref.type, child->flags,
		child->modify_tid, child->delete_tid, info->trans->sync_tid);
#endif
	/*
	 * Ignore children created after our flush point, treating them as
	 * if they did not exist).  These children will not cause the parent
	 * to be updated.
	 *
	 * Children deleted after our flush point are treated as having been
	 * created for the purposes of the flush.  The parent's update_hi
	 * will already be higher than our trans->sync_tid so the path for
	 * the next flush is left intact.
	 *
	 * When we encounter such children and the parent chain has not been
	 * deleted, delete/duplicated, or delete/duplicated-for-move, then
	 * the parent may be used to funnel through several flush points.
	 * These chains will still be visible to later flushes due to having
	 * a higher update_hi than we can set in the current flush.
	 */
	if (child->modify_tid > trans->sync_tid) {
		KKASSERT(child->delete_tid >= child->modify_tid);
		goto finalize;
	}

#if 0
	/*
	 * Ignore children which have not changed.  The parent's block table
	 * is already correct.
	 *
	 * XXX The MOVED bit is only cleared when all multi-homed parents
	 *     have flushed, creating a situation where a re-flush can occur
	 *     via a parent which has already flushed.  The hammer2_base_*()
	 *     functions currently have a hack to deal with this case but
	 *     we need something better.
	 */
	if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
		KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
		goto finalize;
	}
#endif

	/*
	 * Make sure child is referenced before we unlock.
	 */
	hammer2_chain_ref(child);
	spin_unlock(&above->cst.spin);

	/*
	 * Parent reflushed after the child has passed them by should skip
	 * due to the modify_tid test. XXX
	 */
	hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
	KKASSERT(child->above == above);
	KKASSERT(parent->core == above);

	/*
	 * The parent's blockref to the child must be deleted or updated.
	 *
	 * This point is not reached on successful DESTROYED optimizations
	 * but can be reached on recursive deletions and restricted flushes.
	 *
	 * The chain_modify here may delete-duplicate the block.  This can
	 * cause a multitude of issues if the block was already modified
	 * by a later (post-flush) transaction.  Primarily blockrefs in
	 * the later block can be out-of-date, so if the situation occurs
	 * we can't throw away the MOVED bit on the current blocks until
	 * the later blocks are flushed (so as to be able to regenerate all
	 * the changes that were made).
	 *
	 * Because flushes are ordered we do not have to make a
	 * modify/duplicate of indirect blocks.  That is, the flush
	 * code does not have to kmalloc or duplicate anything.  We
	 * can adjust the indirect block table in-place and reuse the
	 * chain.  It IS possible that the chain has already been duplicated
	 * or may wind up being duplicated on-the-fly by modifying code
	 * on the frontend.  We simply use the original and ignore such
	 * chains.  However, it does mean we can't clear the MOVED bit.
	 *
	 * XXX recursive deletions not optimized.
	 */

	/*
	 * Locate the parent's blockref array (base) and its element
	 * count depending on the parent's bref type.  base may be NULL
	 * if the parent's media data is not instantiated.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Access the inode's block array.  However, there is no
		 * block array if the inode is flagged DIRECTDATA.  The
		 * DIRECTDATA case typically only occurs when a hardlink has
		 * been shifted up the tree and the original inode gets
		 * replaced with an OBJTYPE_HARDLINK placeholding inode.
		 */
		if (parent->data &&
		    (parent->data->ipdata.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
			base = &parent->data->ipdata.u.blockset.blockref[0];
		} else {
			base = NULL;
		}
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		if (parent->data)
			base = &parent->data->npdata[0];
		else
			base = NULL;
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->npdata[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		base = NULL;
		count = 0;
		panic("hammer2_chain_flush_scan2: "
		      "unrecognized blockref type: %d",
		      parent->bref.type);
	}

	/*
	 * Don't bother updating a deleted + destroyed parent's blockrefs.
	 * We MUST update deleted + non-destroyed parent's blockrefs since
	 * they could represent an open file.
	 *
	 * Otherwise, we need to be COUNTEDBREFS synchronized for the
	 * hammer2_base_*() functions.
	 *
	 * NOTE: We optimize this by noting that only 'inode' chains require
	 *	 this treatment.  When a file with an open descriptor is
	 *	 deleted only its inode is marked deleted.  Other deletions,
	 *	 such as indirect block deletions, will no longer be visible
	 *	 to the live filesystem and do not need to be updated.
	 *
	 * rm -rf's generally wind up setting DESTROYED on the way down
	 * and the result is typically that no disk I/O is needed at all
	 * when rm -rf'ing an entire directory topology.
	 *
	 * This test must match the similar one in flush_core.
	 */
#if FLUSH_DEBUG
	kprintf("SCAN2 base=%p pass=%d PARENT %p.%d DTID=%016jx SYNC=%016jx\n",
		base,
		info->pass, parent, parent->bref.type,
		parent->delete_tid, trans->sync_tid);
#endif
	if (h2ignore_deleted(info, parent))
		base = NULL;

	/*
	 * Update the parent's blockref table and propagate mirror_tid.
	 *
	 * NOTE! Children with modify_tid's beyond our flush point are
	 *	 considered to not exist for the purposes of updating the
	 *	 parent's blockref array.
	 *
	 * NOTE! SCAN1 has already put the parent in a modified state
	 *	 so if it isn't we panic.
	 *
	 * NOTE! chain->modify_tid vs chain->bref.modify_tid.  The chain's
	 *	 internal modify_tid is always updated based on creation
	 *	 or delete-duplicate.  However, the bref.modify_tid is NOT
	 *	 updated due to simple blockref updates.
	 */
#if FLUSH_DEBUG
	kprintf("chain %p->%p pass %d trans %016jx sync %p.%d %016jx/%d "
		"C=%016jx D=%016jx PMIRROR %016jx\n",
		parent, child,
		info->pass, trans->sync_tid,
		child, child->bref.type,
		child->bref.key, child->bref.keybits,
		child->modify_tid, child->delete_tid, parent->bref.mirror_tid);
#endif

	if (info->pass == 1 && child->delete_tid <= trans->sync_tid) {
		/*
		 * Deleting.  The block array is expected to contain the
		 * child's entry if:
		 *
		 * (1) The deletion occurred after the parent's block table
		 *     was last synchronized (delete_tid), and
		 *
		 * (2) The creation occurred before or during the parent's
		 *     last block table synchronization.
		 */
#if FLUSH_DEBUG
		kprintf("S2A %p.%d b=%p d/b=%016jx/%016jx m/b=%016jx/%016jx\n",
			child, child->bref.type,
			base, child->delete_tid, parent->bref.mirror_tid,
			child->modify_tid, parent->bref.mirror_tid);
#endif
		if (base &&
		    child->delete_tid > parent->bref.mirror_tid &&
		    child->modify_tid <= parent->bref.mirror_tid) {
			KKASSERT(child->flags & HAMMER2_CHAIN_MOVED);
			KKASSERT(parent->modify_tid == trans->sync_tid ||
				 (parent == &hmp->vchain ||
				  parent == &hmp->fchain));
			hammer2_rollup_stats(parent, child, -1);
			spin_lock(&above->cst.spin);
#if FLUSH_DEBUG
			kprintf("trans %jx parent %p.%d child %p.%d "
				"m/d %016jx/%016jx "
				"flg=%08x %016jx/%d delete\n",
				trans->sync_tid,
				parent, parent->bref.type,
				child, child->bref.type,
				child->modify_tid, child->delete_tid,
				child->flags,
				child->bref.key, child->bref.keybits);
#endif
			hammer2_base_delete(trans, parent, base, count,
					    &info->cache_index, child);
			spin_unlock(&above->cst.spin);
		}
	} else if (info->pass == 2 && child->delete_tid > trans->sync_tid) {
		/*
		 * Inserting.  The block array is expected to NOT contain
		 * the child's entry if:
		 *
		 * (1) The creation occurred after the parent's block table
		 *     was last synchronized (modify_tid), and
		 *
		 * (2) The child is not being deleted in the same
		 *     transaction.
		 */
#if FLUSH_DEBUG
		kprintf("S2B %p.%d b=%p d/b=%016jx/%016jx m/b=%016jx/%016jx\n",
			child, child->bref.type,
			base, child->delete_tid, parent->bref.mirror_tid,
			child->modify_tid, parent->bref.mirror_tid);
#endif
		if (base &&
		    child->modify_tid > parent->bref.mirror_tid) {
			KKASSERT(child->flags & HAMMER2_CHAIN_MOVED);
			KKASSERT(parent->modify_tid == trans->sync_tid ||
				 (parent == &hmp->vchain ||
				  parent == &hmp->fchain));
			hammer2_rollup_stats(parent, child, 1);
			spin_lock(&above->cst.spin);
#if FLUSH_DEBUG
			kprintf("trans %jx parent %p.%d child %p.%d "
				"m/d %016jx/%016jx "
				"flg=%08x %016jx/%d insert\n",
				trans->sync_tid,
				parent, parent->bref.type,
				child, child->bref.type,
				child->modify_tid, child->delete_tid,
				child->flags,
				child->bref.key, child->bref.keybits);
#endif
			hammer2_base_insert(trans, parent, base, count,
					    &info->cache_index, child);
			spin_unlock(&above->cst.spin);
		}
	} else if (info->pass == 3 &&
		   (child->delete_tid == HAMMER2_MAX_TID ||
		    child->delete_tid <= trans->sync_tid) &&
		   (child->flags & HAMMER2_CHAIN_MOVED)) {
		/*
		 * We can't clear the MOVED bit on children whose modify_tid
		 * is beyond our current trans (was tested at top of scan2),
		 * or on deleted children which have not yet been flushed
		 * (handled above).
		 *
		 * Scan all parents of this child and determine if any of
		 * them still need the child's MOVED bit.
		 */
		hammer2_chain_t *scan;

		if (hammer2_debug & 0x4000)
			kprintf("CHECKMOVED %p (parent=%p)", child, parent);

		ok = 1;
		spin_lock(&above->cst.spin);
		TAILQ_FOREACH(scan, &above->ownerq, core_entry) {
			/*
			 * Can't clear child's MOVED until all parents have
			 * synchronized with it.
			 *
			 * Ignore deleted parents as-of this flush TID.
			 * Ignore the current parent being flushed.
			 */
			if (h2ignore_deleted(info, scan))
				continue;
			if (scan == parent)
				continue;

			/*
			 * For parents not already synchronized check to see
			 * if the flush has gotten past them yet or not.
			 *
			 * This must roughly mimic the tests that
			 * hammer2_chain_flush_core() runs or we could leave
			 * children hanging around with MOVED set and cause
			 * a memory leak.
			 */
			if (scan->bref.mirror_tid >= trans->sync_tid)
				continue;
			if (scan->bref.mirror_tid >= above->update_hi)
				continue;

			if (hammer2_debug & 0x4000) {
				kprintf("(fail scan %p %016jx/%016jx)",
					scan, scan->bref.mirror_tid,
					child->modify_tid);
			}
			ok = 0;
			break;
		}
		if (hammer2_debug & 0x4000)
			kprintf("\n");
		spin_unlock(&above->cst.spin);

		/*
		 * Can we finally clear MOVED?
		 */
		if (ok) {
			if (hammer2_debug & 0x4000)
				kprintf("clear moved %p.%d %016jx/%d\n",
					child, child->bref.type,
					child->bref.key, child->bref.keybits);
			atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
			if (child->flags & HAMMER2_CHAIN_MODIFIED) {
				kprintf("modified child %p all parents updated\n",
					child);
				atomic_clear_int(&child->flags,
						 HAMMER2_CHAIN_MODIFIED);
				hammer2_chain_memory_wakeup(child->pmp);
				hammer2_chain_drop(child);/* cleared MODIFIED */
			}
			hammer2_chain_drop(child);	/* cleared MOVED */
		} else {
			if (hammer2_debug & 0x4000)
				kprintf("keep moved %p.%d %016jx/%d\n",
					child, child->bref.type,
					child->bref.key, child->bref.keybits);
		}
	}

	/*
	 * Unlock the child.  This can wind up dropping the child's
	 * last ref, removing it from the parent's RB tree, and deallocating
	 * the structure.  The RB_SCAN() our caller is doing handles the
	 * situation.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_drop(child);
	spin_lock(&above->cst.spin);

	/*
	 * The parent may have been delete-duplicated.
	 */
	info->parent = parent;
finalize:
	return (0);
}

/*
 * Update core->update_lo and attempt to clear the MOVED bit
 * for its children by running scan2 in pass 3 over every layer
 * of the core.
 *
 * This routine is only called after a sub-tree has been fully flushed
 * up to the current flush synchronization point.  Calling it under any
 * other condition will blow up flush tracking.
 */
static
void
hammer2_flush_core_update(hammer2_chain_core_t *core,
			  hammer2_flush_info_t *info)
{
	hammer2_chain_layer_t *layer;

	spin_lock(&core->cst.spin);
	if (core->update_lo < info->sync_tid)
		core->update_lo = info->sync_tid;
	/*
	 * Iterate layers in reverse; hold a ref on each layer while
	 * scanning so it cannot be ripped out from under us.
	 */
	TAILQ_FOREACH_REVERSE(layer, &core->layerq,
			      h2_layer_list, entry) {
		info->pass = 3;
		++layer->refs;
		KKASSERT(layer->good == 0xABCD);
		RB_SCAN(hammer2_chain_tree, &layer->rbtree,
			NULL, hammer2_chain_flush_scan2, info);
		--layer->refs;
		KKASSERT(info->parent->core == core);
	}
	spin_unlock(&core->cst.spin);
}

/*
 * Roll up data_count/inode_count statistics from child to parent,
 * adjusting for a base deletion (how < 0) or base insertion (how > 0).
 * When the parent is an inode the accumulated counts are folded into
 * the inode's media data and the in-memory counters are zeroed.
 */
static
void
hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
{
#if 0
	hammer2_chain_t *grandp;
#endif

	parent->data_count += child->data_count;
	parent->inode_count += child->inode_count;
	child->data_count = 0;
	child->inode_count = 0;
	if (how < 0) {
		parent->data_count -= child->bytes;
		if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
			parent->inode_count -= 1;
#if 0
			/* XXX child->data may be NULL atm */
			parent->data_count -= child->data->ipdata.data_count;
			parent->inode_count -= child->data->ipdata.inode_count;
#endif
		}
	} else if (how > 0) {
		parent->data_count += child->bytes;
		if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
			parent->inode_count += 1;
#if 0
			/* XXX child->data may be NULL atm */
			parent->data_count += child->data->ipdata.data_count;
			parent->inode_count += child->data->ipdata.inode_count;
#endif
		}
	}
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		parent->data->ipdata.data_count += parent->data_count;
		parent->data->ipdata.inode_count += parent->inode_count;
#if 0
		for (grandp = parent->above->first_parent;
		     grandp;
		     grandp = grandp->next_parent) {
			grandp->data_count += parent->data_count;
			grandp->inode_count += parent->inode_count;
		}
#endif
		parent->data_count = 0;
		parent->inode_count = 0;
	}
}