1 /* 2 * Copyright (c) 2011-2013 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 */ 35 36 #include <sys/cdefs.h> 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/types.h> 40 #include <sys/lock.h> 41 #include <sys/uuid.h> 42 43 #include "hammer2.h" 44 45 #define FLUSH_DEBUG 0 46 47 /* 48 * Recursively flush the specified chain. The chain is locked and 49 * referenced by the caller and will remain so on return. The chain 50 * will remain referenced throughout but can temporarily lose its 51 * lock during the recursion to avoid unnecessarily stalling user 52 * processes. 53 */ 54 struct hammer2_flush_info { 55 hammer2_chain_t *parent; 56 hammer2_trans_t *trans; 57 int depth; 58 int diddeferral; 59 int pass; 60 int cache_index; 61 int domodify; 62 struct h2_flush_deferral_list flush_list; 63 hammer2_tid_t sync_tid; /* flush synchronization point */ 64 }; 65 66 typedef struct hammer2_flush_info hammer2_flush_info_t; 67 68 static void hammer2_chain_flush_core(hammer2_flush_info_t *info, 69 hammer2_chain_t **chainp); 70 static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data); 71 static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data); 72 static void hammer2_flush_core_update(hammer2_chain_core_t *core, 73 hammer2_flush_info_t *info); 74 static void hammer2_rollup_stats(hammer2_chain_t *parent, 75 hammer2_chain_t *child, int how); 76 77 /* 78 * Can we ignore a chain for the purposes of flushing modifications 79 * to the media? 80 * 81 * This code is now degenerate. We used to have to distinguish between 82 * deleted chains and deleted chains associated with inodes that were 83 * still open. This mechanic has been fixed so the function is now 84 * a simple test. 
85 */ 86 static __inline 87 int 88 h2ignore_deleted(hammer2_flush_info_t *info, hammer2_chain_t *chain) 89 { 90 return (chain->delete_tid <= info->sync_tid); 91 } 92 93 #if 0 94 static __inline 95 void 96 hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref, 97 int how) 98 { 99 hammer2_key_t bytes; 100 101 if (bref->type != 0) { 102 bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX); 103 if (bref->type == HAMMER2_BREF_TYPE_INODE) 104 info->inode_count += how; 105 if (how < 0) 106 info->data_count -= bytes; 107 else 108 info->data_count += bytes; 109 } 110 } 111 #endif 112 113 /* 114 * Transaction support functions for writing to the filesystem. 115 * 116 * Initializing a new transaction allocates a transaction ID. Typically 117 * passed a pmp (hmp passed as NULL), indicating a cluster transaction. Can 118 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single 119 * media target. The latter mode is used by the recovery code. 120 * 121 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the 122 * other is a set of any number of concurrent filesystem operations. We 123 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops> 124 * or we can have <running_flush> + <concurrent_fs_ops>. 125 * 126 * During a flush, new fs_ops are only blocked until the fs_ops prior to 127 * the flush complete. The new fs_ops can then run concurrent with the flush. 128 * 129 * Buffer-cache transactions operate as fs_ops but never block. A 130 * buffer-cache flush will run either before or after the current pending 131 * flush depending on its state. 132 * 133 * sync_tid vs real_tid. For flush transactions ONLY, the flush operation 134 * actually uses two transaction ids, one for the flush operation itself, 135 * and <N+1> for any freemap allocations made as a side-effect. real_tid 136 * is fixed at <N>, sync_tid is adjusted dynamically as-needed. 
137 * 138 * NOTE: The sync_tid for a flush's freemap allocation will match the 139 * sync_tid of the following <concurrent_fs_ops> transaction(s). 140 * The freemap topology will be out-of-step by one transaction id 141 * in order to give the flusher a stable freemap topology to flush 142 * out. This is fixed up at mount-time using a quick incremental 143 * scan. 144 */ 145 void 146 hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, 147 hammer2_mount_t *hmp, int flags) 148 { 149 hammer2_trans_t *head; 150 151 bzero(trans, sizeof(*trans)); 152 if (pmp) { 153 trans->pmp = pmp; 154 KKASSERT(hmp == NULL); 155 hmp = pmp->cluster.chains[0]->hmp; /* XXX */ 156 } else { 157 trans->hmp_single = hmp; 158 KKASSERT(hmp); 159 } 160 161 hammer2_voldata_lock(hmp); 162 trans->flags = flags; 163 trans->td = curthread; 164 /*trans->delete_gen = 0;*/ /* multiple deletions within trans */ 165 166 if (flags & HAMMER2_TRANS_ISFLUSH) { 167 /* 168 * If multiple flushes are trying to run we have to 169 * wait until it is our turn. All flushes are serialized. 170 * 171 * We queue ourselves and then wait to become the head 172 * of the queue, allowing all prior flushes to complete. 173 * 174 * A unique transaction id is required to avoid confusion 175 * when updating the block tables. 176 */ 177 ++hmp->flushcnt; 178 ++hmp->voldata.alloc_tid; 179 trans->sync_tid = hmp->voldata.alloc_tid; 180 trans->real_tid = trans->sync_tid; 181 ++hmp->voldata.alloc_tid; 182 TAILQ_INSERT_TAIL(&hmp->transq, trans, entry); 183 if (TAILQ_FIRST(&hmp->transq) != trans) { 184 trans->blocked = 1; 185 while (trans->blocked) { 186 lksleep(&trans->sync_tid, &hmp->voldatalk, 187 0, "h2multf", hz); 188 } 189 } 190 } else if (hmp->flushcnt == 0) { 191 /* 192 * No flushes are pending, we can go. 
193 */ 194 TAILQ_INSERT_TAIL(&hmp->transq, trans, entry); 195 trans->sync_tid = hmp->voldata.alloc_tid; 196 trans->real_tid = trans->sync_tid; 197 198 /* XXX improve/optimize inode allocation */ 199 } else { 200 /* 201 * One or more flushes are pending. We insert after 202 * the current flush and may block. We have priority 203 * over any flushes that are not the current flush. 204 * 205 * TRANS_BUFCACHE transactions cannot block. 206 */ 207 TAILQ_FOREACH(head, &hmp->transq, entry) { 208 if (head->flags & HAMMER2_TRANS_ISFLUSH) 209 break; 210 } 211 KKASSERT(head); 212 TAILQ_INSERT_AFTER(&hmp->transq, head, trans, entry); 213 trans->sync_tid = head->real_tid + 1; 214 trans->real_tid = trans->sync_tid; 215 216 if ((trans->flags & HAMMER2_TRANS_BUFCACHE) == 0) { 217 if (TAILQ_FIRST(&hmp->transq) != head) { 218 trans->blocked = 1; 219 while (trans->blocked) { 220 lksleep(&trans->sync_tid, 221 &hmp->voldatalk, 0, 222 "h2multf", hz); 223 } 224 } 225 } 226 } 227 if (flags & HAMMER2_TRANS_NEWINODE) { 228 if (hmp->voldata.inode_tid < HAMMER2_INODE_START) 229 hmp->voldata.inode_tid = HAMMER2_INODE_START; 230 trans->inode_tid = hmp->voldata.inode_tid++; 231 } 232 hammer2_voldata_unlock(hmp, 0); 233 } 234 235 void 236 hammer2_trans_done(hammer2_trans_t *trans) 237 { 238 hammer2_mount_t *hmp; 239 hammer2_trans_t *head; 240 hammer2_trans_t *scan; 241 242 if (trans->pmp) 243 hmp = trans->pmp->cluster.chains[0]->hmp; 244 else 245 hmp = trans->hmp_single; 246 247 /* 248 * Remove and adjust flushcnt 249 */ 250 hammer2_voldata_lock(hmp); 251 TAILQ_REMOVE(&hmp->transq, trans, entry); 252 if (trans->flags & HAMMER2_TRANS_ISFLUSH) 253 --hmp->flushcnt; 254 255 /* 256 * Unblock the head of the queue and any additional transactions 257 * up to the next flush. 
258 */ 259 head = TAILQ_FIRST(&hmp->transq); 260 if (head && head->blocked) { 261 head->blocked = 0; 262 wakeup(&head->sync_tid); 263 264 scan = TAILQ_NEXT(head, entry); 265 while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) { 266 if (scan->blocked) { 267 scan->blocked = 0; 268 wakeup(&scan->sync_tid); 269 } 270 scan = TAILQ_NEXT(scan, entry); 271 } 272 } 273 hammer2_voldata_unlock(hmp, 0); 274 } 275 276 /* 277 * Flush the chain and all modified sub-chains through the specified 278 * synchronization point (sync_tid), propagating parent chain modifications 279 * and mirror_tid updates back up as needed. Since we are recursing downward 280 * we do not have to deal with the complexities of multi-homed chains (chains 281 * with multiple parents). 282 * 283 * Caller must have interlocked against any non-flush-related modifying 284 * operations in progress whos modify_tid values are less than or equal 285 * to the passed sync_tid. 286 * 287 * Caller must have already vetted synchronization points to ensure they 288 * are properly flushed. Only snapshots and cluster flushes can create 289 * these sorts of synchronization points. 290 * 291 * This routine can be called from several places but the most important 292 * is from the hammer2_vop_reclaim() function. We want to try to completely 293 * clean out the inode structure to prevent disconnected inodes from 294 * building up and blowing out the kmalloc pool. However, it is not actually 295 * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery 296 * capability. 297 * 298 * chain is locked on call and will remain locked on return. If a flush 299 * occured, the chain's MOVED bit will be set indicating that its parent 300 * (which is not part of the flush) should be updated. The chain may be 301 * replaced by the call. 
302 */ 303 void 304 hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp) 305 { 306 hammer2_chain_t *chain = *chainp; 307 hammer2_chain_t *scan; 308 hammer2_chain_core_t *core; 309 hammer2_flush_info_t info; 310 int loops; 311 312 /* 313 * Execute the recursive flush and handle deferrals. 314 * 315 * Chains can be ridiculously long (thousands deep), so to 316 * avoid blowing out the kernel stack the recursive flush has a 317 * depth limit. Elements at the limit are placed on a list 318 * for re-execution after the stack has been popped. 319 */ 320 bzero(&info, sizeof(info)); 321 TAILQ_INIT(&info.flush_list); 322 info.trans = trans; 323 info.sync_tid = trans->sync_tid; 324 info.cache_index = -1; 325 326 core = chain->core; 327 #if FLUSH_DEBUG 328 kprintf("CHAIN FLUSH trans %p.%016jx chain %p.%d mod %016jx upd %016jx\n", trans, trans->sync_tid, chain, chain->bref.type, chain->modify_tid, core->update_lo); 329 #endif 330 331 /* 332 * Extra ref needed because flush_core expects it when replacing 333 * chain. 334 */ 335 hammer2_chain_ref(chain); 336 loops = 0; 337 338 for (;;) { 339 /* 340 * Unwind deep recursions which had been deferred. This 341 * can leave MOVED set for these chains, which will be 342 * handled when we [re]flush chain after the unwind. 343 */ 344 while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) { 345 KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED); 346 TAILQ_REMOVE(&info.flush_list, scan, flush_node); 347 atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED); 348 349 /* 350 * Now that we've popped back up we can do a secondary 351 * recursion on the deferred elements. 352 * 353 * NOTE: hammer2_chain_flush() may replace scan. 354 */ 355 if (hammer2_debug & 0x0040) 356 kprintf("deferred flush %p\n", scan); 357 hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE); 358 hammer2_chain_drop(scan); /* ref from deferral */ 359 hammer2_chain_flush(trans, &scan); 360 hammer2_chain_unlock(scan); 361 } 362 363 /* 364 * [re]flush chain. 
365 */ 366 info.diddeferral = 0; 367 hammer2_chain_flush_core(&info, &chain); 368 #if FLUSH_DEBUG 369 kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n", 370 chain, chain->bref.type, chain->flags); 371 #endif 372 373 /* 374 * Only loop if deep recursions have been deferred. 375 */ 376 if (TAILQ_EMPTY(&info.flush_list)) 377 break; 378 379 if (++loops % 1000 == 0) { 380 kprintf("hammer2_chain_flush: excessive loops on %p\n", 381 chain); 382 if (hammer2_debug & 0x100000) 383 Debugger("hell4"); 384 } 385 } 386 hammer2_chain_drop(chain); 387 *chainp = chain; 388 } 389 390 /* 391 * This is the core of the chain flushing code. The chain is locked by the 392 * caller and must also have an extra ref on it by the caller, and remains 393 * locked and will have an extra ref on return. 394 * 395 * If the flush accomplished any work chain will be flagged MOVED 396 * indicating a copy-on-write propagation back up is required. 397 * Deep sub-nodes may also have been entered onto the deferral list. 398 * MOVED is never set on the volume root. 399 * 400 * NOTE: modify_tid is different from MODIFIED. modify_tid is updated 401 * only when a chain is specifically modified, and not updated 402 * for copy-on-write propagations. MODIFIED is set on any modification 403 * including copy-on-write propagations. 404 * 405 * NOTE: We are responsible for updating chain->bref.mirror_tid and 406 * core->update_lo The caller is responsible for processing us into 407 * our parent (if any). 408 * 409 * We are also responsible for updating chain->core->update_lo to 410 * prevent repeated recursions due to deferrals. 411 * 412 * WARNING! bref.mirror_tid may only be updated if either the MODIFIED bit 413 * is already zero or if we clear it. 
414 * 415 */ 416 static void 417 hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp) 418 { 419 hammer2_chain_t *chain = *chainp; 420 hammer2_chain_t *saved_parent; 421 hammer2_mount_t *hmp; 422 hammer2_chain_core_t *core; 423 #if 0 424 hammer2_blockref_t *bref; 425 char *bdata; 426 hammer2_io_t *dio; 427 int error; 428 #endif 429 int diddeferral; 430 431 hmp = chain->hmp; 432 core = chain->core; 433 diddeferral = info->diddeferral; 434 435 /* 436 * Check if we even have any work to do. 437 * 438 * We do not update core->update_lo because there might be other 439 * paths to the core and we haven't actually checked it. 440 * 441 * This bit of code is capable of short-cutting entire sub-trees 442 * if they have not been touched. 443 */ 444 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0 && 445 (core->update_lo >= info->sync_tid || 446 chain->bref.mirror_tid >= info->sync_tid || 447 chain->bref.mirror_tid >= core->update_hi)) { 448 KKASSERT(chain->modify_tid <= info->sync_tid); 449 /* don't update update_lo, there may be other paths to core */ 450 /* don't update bref.mirror_tid, scan2 is not called */ 451 return; 452 } 453 454 /* 455 * Ignore chains which have already been flushed through the current 456 * synchronization point. 457 */ 458 KKASSERT (chain->bref.mirror_tid <= info->sync_tid); 459 if (chain->bref.mirror_tid == info->sync_tid) { 460 /* do not update core->update_lo, there may be another path */ 461 return; 462 } 463 464 /* 465 * Ignore chains modified beyond the current flush point. These 466 * will be treated as if they did not exist. Subchains with lower 467 * modify_tid's will still be accessible via other parents. 468 * 469 * Do not update bref.mirror_tid here, it will interfere with 470 * synchronization. e.g. inode flush tid 1, concurrent D-D tid 2, 471 * then later on inode flush tid 2. 
If we were to set mirror_tid 472 * to 1 during inode flush tid 1 the blockrefs would only be partially 473 * updated (and likely panic). 474 * 475 * Do not update core->update_lo here, there might be other paths 476 * to the core and we haven't actually flushed it. 477 * 478 * (vchain and fchain are exceptions since they cannot be duplicated) 479 */ 480 if (chain->modify_tid > info->sync_tid && 481 chain != &hmp->fchain && chain != &hmp->vchain) { 482 /* do not update bref.mirror_tid, scan2 ignores chain */ 483 /* do not update core->update_lo, there may be another path */ 484 return; 485 } 486 487 saved_parent = info->parent; 488 info->parent = chain; 489 retry: 490 491 /* 492 * Chains deleted as-of the flush synchronization point require some 493 * special early handling to avoid double flushing because multiple 494 * deletions are sometimes forced within the same transaction. 495 * Allowing the flush to proceed through more than one path can wind 496 * up updating the chain's block table twice and cause an assertion. 497 * 498 * We don't check the 'same transaction' part but simply punt in this 499 * situation. We must still check for multiple deletions, since any 500 * terminal (non-stale) deletion still requires processing to at 501 * least clean up the children, and also (for inodes) there might 502 * still be an open descriptor. 503 * 504 * Clear MODIFIED but set MOVED to ensure that the parent still 505 * deals with it. 506 */ 507 if (chain->delete_tid <= info->sync_tid && 508 (chain->flags & HAMMER2_CHAIN_DUPLICATED)) { 509 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 510 #if 0 511 /* 512 * XXX should be able to invalidate the buffer here. 513 * XXX problem if reused, snapshotted, or reactivated. 
514 */ 515 if (chain->dio) { 516 hammer2_io_setinval(chain->dio, chain->bytes); 517 } 518 #endif 519 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 520 hammer2_chain_ref(chain); 521 atomic_set_int(&chain->flags, 522 HAMMER2_CHAIN_MOVED); 523 } 524 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 525 hammer2_chain_memory_wakeup(chain->pmp); 526 hammer2_chain_drop(chain); 527 } 528 529 /* 530 * Update mirror_tid, indicating that chain is synchronized 531 * on its modification and block table. This probably isn't 532 * needed since scan2 should ignore deleted chains anyway. 533 * 534 * NOTE: bref.mirror_tid cannot be updated 535 * unless MODIFIED is cleared or already 536 * clear. 537 */ 538 if (chain->bref.mirror_tid < info->sync_tid) 539 chain->bref.mirror_tid = info->sync_tid; 540 /* do not update core->update_lo, there may be another path */ 541 info->parent = saved_parent; 542 return; 543 } 544 545 /* 546 * Recurse if we are not up-to-date. Once we are done we will 547 * update update_lo if there were no deferrals. update_lo can become 548 * higher than update_hi and is used to prevent re-recursions during 549 * the same flush cycle. 550 * 551 * update_hi was already checked and prevents initial recursions on 552 * subtrees which have not been modified. 553 * 554 * NOTE: We must recurse whether chain is flagged DELETED or not. 555 * However, if it is flagged DELETED we limit sync_tid to 556 * delete_tid to ensure that the chain's bref.mirror_tid is 557 * not fully updated and causes it to miss the non-DELETED 558 * path. 559 * 560 * NOTE: If a deferral occurs hammer2_chain_flush() will flush the 561 * deferred chain independently which will update it's 562 * bref.mirror_tid and prevent it from deferred again. 
563 */ 564 if (chain->bref.mirror_tid < info->sync_tid && 565 chain->bref.mirror_tid < core->update_hi) { 566 hammer2_chain_layer_t *layer; 567 int saved_domodify; 568 int save_gen; 569 570 /* 571 * Races will bump update_hi above trans->sync_tid and should 572 * not affect this test. 573 * 574 * We don't want to set our chain to MODIFIED gratuitously. 575 * 576 * We need an extra ref on chain because we are going to 577 * release its lock temporarily in our child loop. 578 */ 579 580 /* 581 * Run two passes. The first pass handles MODIFIED and 582 * update_lo recursions while the second pass handles 583 * MOVED chains on the way back up. 584 * 585 * If the stack gets too deep we defer the chain. Since 586 * hammer2_chain_core's can be shared at multiple levels 587 * in the tree, we may encounter a chain that we had already 588 * deferred. We could undefer it but it will probably just 589 * defer again so it is best to leave it deferred. 590 * 591 * Scan1 is recursive. 592 * 593 * NOTE: The act of handling a modified/submodified chain can 594 * cause the MOVED Flag to be set. It can also be set 595 * via hammer2_chain_delete() and in other situations. 596 * 597 * NOTE: RB_SCAN() must be used instead of RB_FOREACH() 598 * because children can be physically removed during 599 * the scan. 600 * 601 * NOTE: We would normally not care about insertions except 602 * that some insertions might occur from the flush 603 * itself, so loop on generation number changes. 
604 */ 605 saved_domodify = info->domodify; 606 info->domodify = 0; 607 608 if (chain->flags & HAMMER2_CHAIN_DEFERRED) { 609 ++info->diddeferral; 610 } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) { 611 if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) { 612 hammer2_chain_ref(chain); 613 TAILQ_INSERT_TAIL(&info->flush_list, 614 chain, flush_node); 615 atomic_set_int(&chain->flags, 616 HAMMER2_CHAIN_DEFERRED); 617 } 618 ++info->diddeferral; 619 } else { 620 spin_lock(&core->cst.spin); 621 KKASSERT(core->good == 0x1234 && core->sharecnt > 0); 622 do { 623 save_gen = core->generation; 624 TAILQ_FOREACH_REVERSE(layer, &core->layerq, 625 h2_layer_list, entry) { 626 ++layer->refs; 627 KKASSERT(layer->good == 0xABCD); 628 RB_SCAN(hammer2_chain_tree, 629 &layer->rbtree, 630 NULL, hammer2_chain_flush_scan1, 631 info); 632 --layer->refs; 633 } 634 } while (core->generation != save_gen); 635 spin_unlock(&core->cst.spin); 636 } 637 638 if (info->parent != chain) { 639 kprintf("ZZZ\n"); 640 hammer2_chain_drop(chain); 641 hammer2_chain_ref(info->parent); 642 } 643 chain = info->parent; 644 645 /* 646 * chain was unlocked during the scan1 recursion and may 647 * have been deleted, destroyed, or even synchronously 648 * flushed due to aliasing. 649 * 650 * The flush continues normally in the first two places as 651 * the deletion or destruction does NOT affect the flush 652 * as-of the flush synchronization point. 653 * 654 * We must detect the last case and avoid flushing chain twice. 
655 */ 656 #if 0 657 if (chain->delete_tid <= info->sync_tid && 658 (chain->flags & HAMMER2_CHAIN_DUPLICATED)) { 659 kprintf("xxx\n"); 660 goto retry; 661 } 662 #endif 663 if (chain->bref.mirror_tid >= info->sync_tid || 664 chain->bref.mirror_tid >= core->update_hi) { 665 kprintf("yyy\n"); 666 goto retry; 667 } 668 669 /* 670 * If any deferral occurred we must set domodify to 0 to avoid 671 * potentially modifying the parent twice (now and when we run 672 * the deferral list), as doing so could cause the blockref 673 * update to run on a block array which has already been 674 * updated. 675 */ 676 if (info->domodify && diddeferral != info->diddeferral) 677 info->domodify = 0; 678 679 /* 680 * THIS IS THE ONLY POINT IN THE FLUSH WHERE A PARENT IN THE 681 * NOMINAL TOPOLOGY, OTHER THAN FREEMAP ALLOCATIONS, IS 682 * MODIFIED. FREEMAP ALLOCATIONS WILL MODIFY THE FREEMAP 683 * TOPOLOGY WITH SYNC_TID+1 AND DO NOT AFFECT THE CURRENT 684 * FLUSH. 685 * 686 * Modifying the parent can create issues if the current 687 * parent is already in a modified state with an earlier 688 * transaction id. We want to avoid an endless flush loop 689 * on the original parent so we must clear its modified bit 690 * after creating the new parent, if they wind up being 691 * different. Care must also be taken to avoid flushing the 692 * same parent twice. 693 * 694 * We are responsible for setting the parent into a modified 695 * state before we scan the children to update the parent's 696 * block table. This must essentially be done as an atomic 697 * operation (the parent must remain locked throughout the 698 * operation), otherwise other transactions can squeeze a 699 * delete-duplicate in and create block table havoc. 700 * 701 * NOTE: Blockrefs are only updated on live chains. 702 * 703 * NOTE: Modifying the parent generally causes a 704 * delete-duplicate to occur from within the flush 705 * itself, with an allocation from the freemap occuring 706 * as an additional side-effect. 
707 * 708 * NOTE: If the parent was deleted our modified chain will 709 * also be marked deleted, but since it inherits the 710 * parent's delete_tid it will still appear to be 711 * 'live' for the purposes of the flush. 712 */ 713 if (info->domodify && !h2ignore_deleted(info, chain)) { 714 KKASSERT(chain->modify_tid < info->sync_tid); 715 716 /* 717 * The scan1 loop and/or flush_core is reentrant, 718 * particularly when core->generation changes. To 719 * avoid havoc we have to prevent repetitive 720 * delete-duplicates of the same chain. 721 * 722 * After executing the modify set the original chain's 723 * bref.mirror_tid to prevent any reentrancy during 724 * the current flush cycle. 725 */ 726 hammer2_chain_modify(info->trans, &info->parent, 727 HAMMER2_MODIFY_NO_MODIFY_TID); 728 if (info->parent != chain) { 729 /* 730 * NOTE: bref.mirror_tid cannot be updated 731 * unless MODIFIED is cleared or already 732 * clear. 733 */ 734 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 735 atomic_clear_int(&chain->flags, 736 HAMMER2_CHAIN_MODIFIED); 737 hammer2_chain_memory_wakeup(chain->pmp); 738 hammer2_chain_drop(chain); 739 } 740 if (chain->bref.mirror_tid < info->sync_tid) 741 chain->bref.mirror_tid = info->sync_tid; 742 hammer2_chain_drop(chain); 743 hammer2_chain_ref(info->parent); 744 } 745 chain = info->parent; 746 } 747 748 KKASSERT(chain == info->parent); 749 750 /* 751 * Handle successfully flushed children who are in the MOVED 752 * state on the way back up the recursion. This can have 753 * the side-effect of clearing MOVED. 754 * 755 * Scan2 may replace info->parent. If it does it will also 756 * replace the extra ref we made. 757 * 758 * Scan2 is non-recursive. 
759 */ 760 if (diddeferral != info->diddeferral) { 761 spin_lock(&core->cst.spin); 762 } else { 763 KKASSERT(chain == info->parent); 764 KKASSERT(info->domodify == 0 || 765 (chain->flags & HAMMER2_CHAIN_FLUSHED) == 0); 766 atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSHED); 767 spin_lock(&core->cst.spin); 768 KKASSERT(core->good == 0x1234 && core->sharecnt > 0); 769 KKASSERT(info->parent->core == core); 770 TAILQ_FOREACH_REVERSE(layer, &core->layerq, 771 h2_layer_list, entry) { 772 info->pass = 1; 773 ++layer->refs; 774 KKASSERT(layer->good == 0xABCD); 775 RB_SCAN(hammer2_chain_tree, &layer->rbtree, 776 NULL, hammer2_chain_flush_scan2, info); 777 info->pass = 2; 778 RB_SCAN(hammer2_chain_tree, &layer->rbtree, 779 NULL, hammer2_chain_flush_scan2, info); 780 --layer->refs; 781 } 782 } 783 784 /* 785 * info->parent must not have been replaced again 786 */ 787 KKASSERT(info->parent == chain); 788 789 *chainp = chain; 790 791 hammer2_chain_layer_check_locked(chain->hmp, core); 792 spin_unlock(&core->cst.spin); 793 794 /* 795 * Update the core only if no deferrals occurred. Otherwise 796 * we could end up clearing the MOVED bit in the children 797 * prematurely. 798 */ 799 if (diddeferral == info->diddeferral) 800 hammer2_flush_core_update(core, info); 801 802 info->domodify = saved_domodify; 803 KKASSERT(chain->refs > 1); 804 } else { 805 /* 806 * Update the core, no deferrals occurred in this path. 807 */ 808 hammer2_flush_core_update(core, info); 809 } 810 info->parent = saved_parent; 811 812 #if FLUSH_DEBUG 813 kprintf("POP %p.%d defer=%d\n", chain, chain->bref.type, diddeferral); 814 #endif 815 816 /* 817 * Do not flush the chain if there were any deferrals. It will be 818 * retried later after the deferrals are independently handled. 819 * Do not update update_lo or bref.mirror_tid. 
820 */ 821 if (diddeferral != info->diddeferral) { 822 if (hammer2_debug & 0x0008) { 823 kprintf("%*.*s} %p/%d %04x (deferred)", 824 info->depth, info->depth, "", 825 chain, chain->refs, chain->flags); 826 } 827 /* do not update core->update_lo */ 828 /* do not update bref.mirror_tid */ 829 return; 830 } 831 832 KKASSERT(chain->bref.mirror_tid < info->sync_tid); 833 834 /* 835 * Non-deferral path, chain is now deterministically being flushed. 836 * We've finished running the recursion and the blockref update. 837 * 838 * update bref.mirror_tid. update_lo has already been updated. 839 * 840 * After this point we MUST dipose of the MODIFIED bit on chain. 841 */ 842 if (chain->bref.mirror_tid < info->sync_tid) 843 chain->bref.mirror_tid = info->sync_tid; 844 845 /* 846 * Deal with deleted and destroyed chains on the way back up. 847 * 848 * Otherwise a deleted chain can be optimized by clearing MODIFIED 849 * without bothering to write it out. 850 * 851 * NOTE: We optimize this by noting that only 'inode' chains require 852 * this treatment. When a file with an open descriptor is 853 * deleted only its inode is marked deleted. Other deletions, 854 * such as indirect block deletions, will no longer be visible 855 * to the live filesystem and do not need to be updated. 856 */ 857 if (h2ignore_deleted(info, chain)) { 858 /* 859 * At the moment we unconditionally set the MOVED bit because 860 * there are situations where it might not have been set due 861 * to similar delete-destroyed optimizations, and the parent 862 * of the parent still may need to be notified of the deletion. 863 */ 864 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 865 hammer2_chain_ref(chain); 866 atomic_set_int(&chain->flags, 867 HAMMER2_CHAIN_MOVED); 868 } 869 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 870 #if 0 871 /* 872 * XXX should be able to invalidate the buffer here. 873 * XXX problem if reused, snapshotted, or reactivated. 
874 */ 875 if (chain->dio) { 876 hammer2_io_setinval(chain->dio, chain->bytes); 877 } 878 #endif 879 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 880 hammer2_chain_memory_wakeup(chain->pmp); 881 hammer2_chain_drop(chain); 882 } 883 return; 884 } 885 886 /* 887 * A degenerate flush might not have flushed anything and thus not 888 * processed modified blocks on the way back up. Detect the case. 889 * 890 * This case can occur when a create, modify, and rename (to a 891 * different part of the topology) occurs in the same flush, 892 * resulting in a parent which effectively needs no modification. 893 */ 894 if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { 895 #if 0 896 kprintf("chain %p.%d %08x recursed but wasn't " 897 "modified mirr=%016jx " 898 "update_lo=%016jx synctid=%016jx\n", 899 chain, chain->bref.type, chain->flags, 900 chain->bref.mirror_tid, 901 core->update_lo, info->sync_tid); 902 #endif 903 #if 0 904 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 905 hammer2_chain_ref(chain); 906 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); 907 } 908 #endif 909 return; 910 } 911 912 /* 913 * Issue flush. 914 * 915 * A DELETED node that reaches this point must be flushed for 916 * synchronization point consistency. 917 * 918 * Update bref.mirror_tid, clear MODIFIED, and set MOVED. 919 * 920 * The caller will update the parent's reference to this chain 921 * by testing MOVED as long as the modification was in-bounds. 922 * 923 * MOVED is never set on the volume root as there is no parent 924 * to adjust. 
925 */ 926 if (hammer2_debug & 0x1000) { 927 kprintf("Flush %p.%d %016jx/%d sync_tid=%016jx data=%016jx\n", 928 chain, chain->bref.type, 929 chain->bref.key, chain->bref.keybits, 930 info->sync_tid, chain->bref.data_off); 931 } 932 if (hammer2_debug & 0x2000) { 933 Debugger("Flush hell"); 934 } 935 936 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 937 hammer2_chain_memory_wakeup(chain->pmp); 938 939 if ((chain->flags & HAMMER2_CHAIN_MOVED) || 940 chain == &hmp->vchain || 941 chain == &hmp->fchain) { 942 /* 943 * Drop the ref from the MODIFIED bit we cleared, 944 * net -1 ref. 945 */ 946 hammer2_chain_drop(chain); 947 } else { 948 /* 949 * Drop the ref from the MODIFIED bit we cleared and 950 * set a ref for the MOVED bit we are setting. Net 0 refs. 951 */ 952 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); 953 } 954 955 /* 956 * If this is part of a recursive flush we can go ahead and write 957 * out the buffer cache buffer and pass a new bref back up the chain 958 * via the MOVED bit. 959 * 960 * Volume headers are NOT flushed here as they require special 961 * processing. 962 */ 963 switch(chain->bref.type) { 964 case HAMMER2_BREF_TYPE_FREEMAP: 965 hammer2_modify_volume(hmp); 966 hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid; 967 break; 968 case HAMMER2_BREF_TYPE_VOLUME: 969 /* 970 * The free block table is flushed by hammer2_vfs_sync() 971 * before it flushes vchain. We must still hold fchain 972 * locked while copying voldata to volsync, however. 
973 */ 974 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS); 975 #if 0 976 if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) || 977 hmp->voldata.freemap_tid < info->trans->sync_tid) { 978 /* this will modify vchain as a side effect */ 979 hammer2_chain_t *tmp = &hmp->fchain; 980 hammer2_chain_flush(info->trans, &tmp); 981 KKASSERT(tmp == &hmp->fchain); 982 } 983 #endif 984 985 /* 986 * There is no parent to our root vchain and fchain to 987 * synchronize the bref to, their updated mirror_tid's 988 * must be synchronized to the volume header. 989 */ 990 hmp->voldata.mirror_tid = chain->bref.mirror_tid; 991 /*hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;*/ 992 993 /* 994 * The volume header is flushed manually by the syncer, not 995 * here. All we do here is adjust the crc's. 996 */ 997 KKASSERT(chain->data != NULL); 998 KKASSERT(chain->dio == NULL); 999 1000 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]= 1001 hammer2_icrc32( 1002 (char *)&hmp->voldata + 1003 HAMMER2_VOLUME_ICRC1_OFF, 1004 HAMMER2_VOLUME_ICRC1_SIZE); 1005 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]= 1006 hammer2_icrc32( 1007 (char *)&hmp->voldata + 1008 HAMMER2_VOLUME_ICRC0_OFF, 1009 HAMMER2_VOLUME_ICRC0_SIZE); 1010 hmp->voldata.icrc_volheader = 1011 hammer2_icrc32( 1012 (char *)&hmp->voldata + 1013 HAMMER2_VOLUME_ICRCVH_OFF, 1014 HAMMER2_VOLUME_ICRCVH_SIZE); 1015 hmp->volsync = hmp->voldata; 1016 atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC); 1017 hammer2_chain_unlock(&hmp->fchain); 1018 break; 1019 case HAMMER2_BREF_TYPE_DATA: 1020 /* 1021 * Data elements have already been flushed via the logical 1022 * file buffer cache. Their hash was set in the bref by 1023 * the vop_write code. 1024 * 1025 * Make sure any device buffer(s) have been flushed out here. 1026 * (there aren't usually any to flush). 
1027 */ 1028 #if 0 1029 /* XXX */ 1030 /* chain and chain->bref, NOWAIT operation */ 1031 #endif 1032 break; 1033 #if 0 1034 case HAMMER2_BREF_TYPE_INDIRECT: 1035 /* 1036 * Indirect blocks may be in an INITIAL state. Use the 1037 * chain_lock() call to ensure that the buffer has been 1038 * instantiated (even though it is already locked the buffer 1039 * might not have been instantiated). 1040 * 1041 * Only write the buffer out if it is dirty, it is possible 1042 * the operating system had already written out the buffer. 1043 */ 1044 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS); 1045 KKASSERT(chain->dio != NULL); 1046 1047 chain->data = NULL; 1048 hammer2_io_bqrelse(&chain->dio); 1049 hammer2_chain_unlock(chain); 1050 break; 1051 #endif 1052 case HAMMER2_BREF_TYPE_INDIRECT: 1053 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1054 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 1055 case HAMMER2_BREF_TYPE_INODE: 1056 /* 1057 * Device-backed. Buffer will be flushed by the sync 1058 * code XXX. 1059 */ 1060 KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0); 1061 break; 1062 default: 1063 KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED); 1064 panic("hammer2_chain_flush_core: unsupported embedded bref %d", 1065 chain->bref.type); 1066 /* NOT REACHED */ 1067 #if 0 1068 /* 1069 * Embedded elements have to be flushed out. 1070 * (Basically just BREF_TYPE_INODE). 1071 */ 1072 KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED); 1073 KKASSERT(chain->data != NULL); 1074 KKASSERT(chain->dio == NULL); 1075 bref = &chain->bref; 1076 1077 KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0); 1078 KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) == 1079 HAMMER2_CHECK_ISCSI32 || 1080 HAMMER2_DEC_CHECK(chain->bref.methods) == 1081 HAMMER2_CHECK_FREEMAP); 1082 1083 /* 1084 * The data is embedded, we have to acquire the 1085 * buffer cache buffer and copy the data into it. 
1086 */ 1087 error = hammer2_io_bread(hmp, bref->data_off, chain->bytes, 1088 &dio); 1089 KKASSERT(error == 0); 1090 bdata = hammer2_io_data(dio, bref->data_off); 1091 1092 /* 1093 * Copy the data to the buffer, mark the buffer 1094 * dirty, and convert the chain to unmodified. 1095 */ 1096 bcopy(chain->data, bdata, chain->bytes); 1097 hammer2_io_bdwrite(&dio); 1098 1099 switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { 1100 case HAMMER2_CHECK_FREEMAP: 1101 chain->bref.check.freemap.icrc32 = 1102 hammer2_icrc32(chain->data, chain->bytes); 1103 break; 1104 case HAMMER2_CHECK_ISCSI32: 1105 chain->bref.check.iscsi32.value = 1106 hammer2_icrc32(chain->data, chain->bytes); 1107 break; 1108 default: 1109 panic("hammer2_flush_core: bad crc type"); 1110 break; /* NOT REACHED */ 1111 } 1112 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) 1113 ++hammer2_iod_meta_write; 1114 else 1115 ++hammer2_iod_indr_write; 1116 #endif 1117 } 1118 } 1119 1120 /* 1121 * Flush helper scan1 (recursive) 1122 * 1123 * Flushes the children of the caller's chain (parent) and updates 1124 * the blockref, restricted by sync_tid. 1125 * 1126 * Ripouts during the loop should not cause any problems. Because we are 1127 * flushing to a synchronization point, modification races will occur after 1128 * sync_tid and do not have to be flushed anyway. 1129 * 1130 * It is also ok if the parent is chain_duplicate()'d while unlocked because 1131 * the delete/duplication will install a delete_tid that is still larger than 1132 * our current sync_tid. 1133 * 1134 * WARNING! If we do not call chain_flush_core we must update bref.mirror_tid 1135 * ourselves. 
 */
/*
 * RB_SCAN callback, invoked with parent->core->cst.spin held.  The
 * spinlock is dropped while the child is locked and recursed upon and
 * is re-acquired before returning.  Always returns 0 so the RB_SCAN
 * continues with the next child.
 */
static int
hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_trans_t *trans = info->trans;
	hammer2_chain_t *parent = info->parent;
	int diddeferral;

	if (hammer2_debug & 0x80000)
		Debugger("hell3");

	/*
	 * NOTE(review): this snapshot of info->diddeferral is never read
	 * again within this function; presumably vestigial - confirm
	 * before removing.
	 */
	diddeferral = info->diddeferral;

	/*
	 * Child is beyond the flush synchronization zone, don't pursue.
	 * Remember that modifications generally delete-duplicate so if the
	 * sub-tree is dirty another child will get us there.  But not this
	 * one.
	 *
	 * Or MODIFIED is not set and child is already fully synchronized
	 * with its sub-tree.  Don't pursue.
	 *
	 * (child can never be fchain or vchain so a special check isn't
	 * needed).
	 */
	if (child->modify_tid > trans->sync_tid) {
		KKASSERT(child->delete_tid >= child->modify_tid);
		/* do not update child->core->update_lo, core not flushed */
		/* do not update core->update_lo, there may be another path */
		/* do not update mirror_tid, scan2 will ignore chain */
		return (0);
	}

	/*
	 * We must ref the child before unlocking the spinlock.
	 *
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.
	 */
	hammer2_chain_ref(child);
	spin_unlock(&parent->core->cst.spin);

	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

	/*
	 * No recursion needed if neither the child or anything under it
	 * was messed with.
	 */
	if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
	    child->core->update_lo >= info->sync_tid) {
		KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
		if (child->bref.mirror_tid < info->sync_tid)
			child->bref.mirror_tid = info->sync_tid;
		goto skip;
	}

	/*
	 * XXX delete child if parent is deleted.  Propagate deletion
	 * downward.  TODO
	 */

	/*
	 * Re-check original pre-lock conditions after locking.  Both tests
	 * above were made without the child lock held and may have raced
	 * against concurrent modifications.
	 */
	if (child->modify_tid > trans->sync_tid) {
		hammer2_chain_unlock(child);
		hammer2_chain_drop(child);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
		spin_lock(&parent->core->cst.spin);
		return (0);
	}

	if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
	    child->core->update_lo >= info->sync_tid) {
		KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
		if (child->bref.mirror_tid < info->sync_tid)
			child->bref.mirror_tid = info->sync_tid;
		goto skip;
	}

	/*
	 * Recurse and collect deferral data.
	 */
	++info->depth;
	hammer2_chain_flush_core(info, &child);
	--info->depth;

skip:
	/*
	 * Check the conditions that could cause SCAN2 to modify the parent.
	 * Modify the parent here instead of in SCAN2, which would cause
	 * rollup chicken-and-egg races.
	 *
	 * Scan2 is expected to update bref.mirror_tid in the domodify case,
	 * but will skip the child otherwise giving us the responsibility to
	 * update bref.mirror_tid.
	 *
	 * WARNING! Do NOT update the child's bref.mirror_tid right here,
	 *	    even if there was no deferral.  Doing so would cause
	 *	    confusion with the child's block array state in a
	 *	    future flush.
	 */
	if (h2ignore_deleted(info, parent)) {
		/*
		 * Special optimization matching similar tests done in
		 * flush_core, scan1, and scan2.  Avoid updating the block
		 * table in the parent if the parent is no longer visible.
		 */
		;
	} else if (child->delete_tid <= trans->sync_tid &&
		   child->delete_tid > parent->bref.mirror_tid &&
		   child->modify_tid <= parent->bref.mirror_tid) {
		info->domodify = 1;		/* base deletion (pass 1) */
	} else if (child->delete_tid > trans->sync_tid &&
		   child->modify_tid > parent->bref.mirror_tid) {
		info->domodify = 1;		/* base insertion (pass 2) */
	}

	/*
	 * Relock the parent to continue the caller's RB_SCAN loop, then
	 * drop the temporary child ref acquired above.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop(child);
	KKASSERT(info->parent == parent);

	spin_lock(&parent->core->cst.spin);
	return (0);
}

/*
 * Flush helper scan2 (non-recursive)
 *
 * This pass on a chain's children propagates any MOVED or DELETED
 * elements back up the chain towards the root after those elements have
 * been fully flushed.  Unlike scan1, this function is NOT recursive and
 * the parent remains locked across the entire scan.
 *
 * SCAN2 is called twice, once with pass set to 1 and once with it set to 2.
 * We have to do this so base[] elements can be deleted in pass 1 to make
 * room for adding new elements in pass 2.  It is additionally invoked with
 * pass set to 3 (via hammer2_flush_core_update) to clear the children's
 * MOVED bits once all parents have synchronized.
 *
 * This function also rolls up storage statistics.
 *
 * NOTE!  A deletion is a visibility issue, there can still be references to
 *	  deleted elements (for example, to an unlinked file which is still
 *	  open), and there can also be multiple chains pointing to the same
 *	  bref where some are deleted and some are not (for example due to
 *	  a rename).   So a chain marked for deletion is basically considered
 *	  to be live until it is explicitly destroyed or until its ref-count
 *	  reaches zero (also implying that MOVED and MODIFIED are clear).
 *
 * NOTE!  Info->parent will be locked but will only be instantiated/modified
 *	  if it is either MODIFIED or if scan1 determined that block table
 *	  updates will occur.
 *
 * NOTE!
 *	  SCAN2 is responsible for updating child->bref.mirror_tid only in
 *	  the case where it modifies the parent (does a base insertion
 *	  or deletion).  SCAN1 handled all other cases.
 */
static int
hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;
	hammer2_chain_core_t *above = child->above;
	hammer2_mount_t *hmp = child->hmp;
	hammer2_trans_t *trans = info->trans;
	hammer2_blockref_t *base;
	int count;
	int ok;		/* only used by the pass-3 MOVED-clearing logic */

#if FLUSH_DEBUG
	kprintf("SCAN2 %p.%d %08x mod=%016jx del=%016jx trans=%016jx\n", child, child->bref.type, child->flags, child->modify_tid, child->delete_tid, info->trans->sync_tid);
#endif
	/*
	 * Ignore children created after our flush point, treating them as
	 * if they did not exist.  These children will not cause the parent
	 * to be updated.
	 *
	 * Children deleted after our flush point are treated as having been
	 * created for the purposes of the flush.  The parent's update_hi
	 * will already be higher than our trans->sync_tid so the path for
	 * the next flush is left intact.
	 *
	 * When we encounter such children and the parent chain has not been
	 * deleted, delete/duplicated, or delete/duplicated-for-move, then
	 * the parent may be used to funnel through several flush points.
	 * These chains will still be visible to later flushes due to having
	 * a higher update_hi than we can set in the current flush.
	 */
	if (child->modify_tid > trans->sync_tid) {
		KKASSERT(child->delete_tid >= child->modify_tid);
		goto finalize;
	}

#if 0
	/*
	 * Ignore children which have not changed.  The parent's block table
	 * is already correct.
	 *
	 * XXX The MOVED bit is only cleared when all multi-homed parents
	 *     have flushed, creating a situation where a re-flush can occur
	 *     via a parent which has already flushed.  The hammer2_base_*()
	 *     functions currently have a hack to deal with this case but
	 *     we need something better.
	 */
	if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
		KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
		goto finalize;
	}
#endif

	/*
	 * Make sure child is referenced before we unlock.
	 */
	hammer2_chain_ref(child);
	spin_unlock(&above->cst.spin);

	/*
	 * Parent reflushed after the child has passed them by should skip
	 * due to the modify_tid test. XXX
	 */
	hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
	KKASSERT(child->above == above);
	KKASSERT(parent->core == above);

	/*
	 * The parent's blockref to the child must be deleted or updated.
	 *
	 * This point is not reached on successful DELETED optimizations
	 * but can be reached on recursive deletions and restricted flushes.
	 *
	 * The chain_modify here may delete-duplicate the block.  This can
	 * cause a multitude of issues if the block was already modified
	 * by a later (post-flush) transaction.  Primarily blockrefs in
	 * the later block can be out-of-date, so if the situation occurs
	 * we can't throw away the MOVED bit on the current blocks until
	 * the later blocks are flushed (so as to be able to regenerate all
	 * the changes that were made).
	 *
	 * Because flushes are ordered we do not have to make a
	 * modify/duplicate of indirect blocks.  That is, the flush
	 * code does not have to kmalloc or duplicate anything.  We
	 * can adjust the indirect block table in-place and reuse the
	 * chain.  It IS possible that the chain has already been duplicated
	 * or may wind up being duplicated on-the-fly by modifying code
	 * on the frontend.  We simply use the original and ignore such
	 * chains.  However, it does mean we can't clear the MOVED bit.
	 *
	 * XXX recursive deletions not optimized.
	 */

	/*
	 * Locate the parent's blockref array (base) and its element count
	 * according to the parent's bref type.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Access the inode's block array.  However, there is no
		 * block array if the inode is flagged DIRECTDATA.  The
		 * DIRECTDATA case typically only occurs when a hardlink has
		 * been shifted up the tree and the original inode gets
		 * replaced with an OBJTYPE_HARDLINK placeholding inode.
		 */
		if (parent->data &&
		    (parent->data->ipdata.op_flags &
		     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
			base = &parent->data->ipdata.u.blockset.blockref[0];
		} else {
			base = NULL;
		}
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		if (parent->data)
			base = &parent->data->npdata[0];
		else
			base = NULL;
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP:
		base = &parent->data->npdata[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		base = NULL;
		count = 0;
		panic("hammer2_chain_flush_scan2: "
		      "unrecognized blockref type: %d",
		      parent->bref.type);
	}

	/*
	 * Don't bother updating a deleted parent's blockrefs.
	 *
	 * Otherwise, we need to be COUNTEDBREFS synchronized for the
	 * hammer2_base_*() functions.
	 *
	 * This test must match the similar one in flush_core.
	 */
#if FLUSH_DEBUG
	kprintf("SCAN2 base=%p pass=%d PARENT %p.%d DTID=%016jx SYNC=%016jx\n",
		base,
		info->pass, parent, parent->bref.type, parent->delete_tid, trans->sync_tid);
#endif
	if (h2ignore_deleted(info, parent))
		base = NULL;

	/*
	 * Update the parent's blockref table and propagate mirror_tid.
	 *
	 * NOTE! Children with modify_tid's beyond our flush point are
	 *	 considered to not exist for the purposes of updating the
	 *	 parent's blockref array.
	 *
	 * NOTE! SCAN1 has already put the parent in a modified state
	 *	 so if it isn't we panic.
	 *
	 * NOTE! chain->modify_tid vs chain->bref.modify_tid.  The chain's
	 *	 internal modify_tid is always updated based on creation
	 *	 or delete-duplicate.  However, the bref.modify_tid is NOT
	 *	 updated due to simple blockref updates.
	 */
#if FLUSH_DEBUG
	kprintf("chain %p->%p pass %d trans %016jx sync %p.%d %016jx/%d C=%016jx D=%016jx PMIRROR %016jx\n",
		parent, child,
		info->pass, trans->sync_tid,
		child, child->bref.type,
		child->bref.key, child->bref.keybits,
		child->modify_tid, child->delete_tid, parent->bref.mirror_tid);
#endif

	if (info->pass == 1 && child->delete_tid <= trans->sync_tid) {
		/*
		 * Deleting.  The block array is expected to contain the
		 * child's entry if:
		 *
		 * (1) The deletion occurred after the parent's block table
		 *     was last synchronized (delete_tid), and
		 *
		 * (2) The creation occurred before or during the parent's
		 *     last block table synchronization.
		 */
#if FLUSH_DEBUG
		kprintf("S2A %p.%d b=%p d/b=%016jx/%016jx m/b=%016jx/%016jx\n",
			child, child->bref.type,
			base, child->delete_tid, parent->bref.mirror_tid,
			child->modify_tid, parent->bref.mirror_tid);
#endif
		if (base &&
		    child->delete_tid > parent->bref.mirror_tid &&
		    child->modify_tid <= parent->bref.mirror_tid) {
			KKASSERT(child->flags & HAMMER2_CHAIN_MOVED);
			KKASSERT(parent->modify_tid == trans->sync_tid ||
				 (parent == &hmp->vchain ||
				  parent == &hmp->fchain));
			hammer2_rollup_stats(parent, child, -1);
			spin_lock(&above->cst.spin);
#if FLUSH_DEBUG
			kprintf("trans %jx parent %p.%d child %p.%d m/d %016jx/%016jx "
				"flg=%08x %016jx/%d delete\n",
				trans->sync_tid,
				parent, parent->bref.type,
				child, child->bref.type,
				child->modify_tid, child->delete_tid,
				child->flags,
				child->bref.key, child->bref.keybits);
#endif
			hammer2_base_delete(trans, parent, base, count,
					    &info->cache_index, child);
			spin_unlock(&above->cst.spin);
		}
	} else if (info->pass == 2 && child->delete_tid > trans->sync_tid) {
		/*
		 * Inserting.  The block array is expected to NOT contain
		 * the child's entry if:
		 *
		 * (1) The creation occurred after the parent's block table
		 *     was last synchronized (modify_tid), and
		 *
		 * (2) The child is not being deleted in the same
		 *     transaction.
		 */
#if FLUSH_DEBUG
		kprintf("S2B %p.%d b=%p d/b=%016jx/%016jx m/b=%016jx/%016jx\n",
			child, child->bref.type,
			base, child->delete_tid, parent->bref.mirror_tid,
			child->modify_tid, parent->bref.mirror_tid);
#endif
		if (base &&
		    child->modify_tid > parent->bref.mirror_tid) {
			KKASSERT(child->flags & HAMMER2_CHAIN_MOVED);
			KKASSERT(parent->modify_tid == trans->sync_tid ||
				 (parent == &hmp->vchain ||
				  parent == &hmp->fchain));
			hammer2_rollup_stats(parent, child, 1);
			spin_lock(&above->cst.spin);
#if FLUSH_DEBUG
			kprintf("trans %jx parent %p.%d child %p.%d m/d %016jx/%016jx "
				"flg=%08x %016jx/%d insert\n",
				trans->sync_tid,
				parent, parent->bref.type,
				child, child->bref.type,
				child->modify_tid, child->delete_tid,
				child->flags,
				child->bref.key, child->bref.keybits);
#endif
			hammer2_base_insert(trans, parent, base, count,
					    &info->cache_index, child);
			spin_unlock(&above->cst.spin);
		}
	} else if (info->pass == 3 &&
		   (child->delete_tid == HAMMER2_MAX_TID ||
		    child->delete_tid <= trans->sync_tid) &&
		   (child->flags & HAMMER2_CHAIN_MOVED)) {
		/*
		 * We can't clear the MOVED bit on children whos modify_tid
		 * is beyond our current trans (was tested at top of scan2),
		 * or on deleted children which have not yet been flushed
		 * (handled above).
		 *
		 * Scan all parents of this child and determine if any of
		 * them still need the child's MOVED bit.
		 */
		hammer2_chain_t *scan;

		if (hammer2_debug & 0x4000)
			kprintf("CHECKMOVED %p (parent=%p)", child, parent);

		ok = 1;
		spin_lock(&above->cst.spin);
		TAILQ_FOREACH(scan, &above->ownerq, core_entry) {
			/*
			 * Can't clear child's MOVED until all parent's have
			 * synchronized with it.
			 *
			 * Ignore deleted parents as-of this flush TID.
			 * Ignore the current parent being flushed.
			 */
			if (h2ignore_deleted(info, scan))
				continue;
			if (scan == parent)
				continue;

			/*
			 * For parents not already synchronized check to see
			 * if the flush has gotten past them yet or not.
			 *
			 * This must roughly mimic the tests that
			 * hammer2_chain_flush_core() runs or we could leave
			 * children hanging around with MOVED set and cause
			 * a memory leak.
			 */
			if (scan->bref.mirror_tid >= trans->sync_tid)
				continue;
			if (scan->bref.mirror_tid >= above->update_hi)
				continue;

			if (hammer2_debug & 0x4000) {
				kprintf("(fail scan %p %016jx/%016jx)",
					scan, scan->bref.mirror_tid,
					child->modify_tid);
			}
			ok = 0;
			break;
		}
		if (hammer2_debug & 0x4000)
			kprintf("\n");
		spin_unlock(&above->cst.spin);

		/*
		 * Can we finally clear MOVED?
		 */
		if (ok) {
			if (hammer2_debug & 0x4000)
				kprintf("clear moved %p.%d %016jx/%d\n",
					child, child->bref.type,
					child->bref.key, child->bref.keybits);
			atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
			if (child->flags & HAMMER2_CHAIN_MODIFIED) {
				kprintf("modified child %p all parents updated\n",
					child);
				atomic_clear_int(&child->flags,
						 HAMMER2_CHAIN_MODIFIED);
				hammer2_chain_memory_wakeup(child->pmp);
				hammer2_chain_drop(child);/* cleared MODIFIED */
			}
			hammer2_chain_drop(child);	/* cleared MOVED */
		} else {
			if (hammer2_debug & 0x4000)
				kprintf("keep moved %p.%d %016jx/%d\n",
					child, child->bref.type,
					child->bref.key, child->bref.keybits);
		}
	}

	/*
	 * Unlock the child.  This can wind up dropping the child's
	 * last ref, removing it from the parent's RB tree, and deallocating
	 * the structure.  The RB_SCAN() our caller is doing handles the
	 * situation.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_drop(child);
	spin_lock(&above->cst.spin);

	/*
	 * The parent may have been delete-duplicated.
	 */
	info->parent = parent;
finalize:
	return (0);
}

/*
 * Update core->update_lo and attempt to clear the MOVED bit
 * for its children.
 *
 * This is done by running scan2 over every layer of the core with
 * info->pass set to 3, which only executes the MOVED-clearing branch
 * of that function.
 *
 * This routine is only called after a sub-tree has been fully flushed
 * up to the current flush synchronization point.  Calling it under any
 * other condition will blow up flush tracking.
 */
static
void
hammer2_flush_core_update(hammer2_chain_core_t *core,
			  hammer2_flush_info_t *info)
{
	hammer2_chain_layer_t *layer;

	spin_lock(&core->cst.spin);
	if (core->update_lo < info->sync_tid)
		core->update_lo = info->sync_tid;
	TAILQ_FOREACH_REVERSE(layer, &core->layerq,
			      h2_layer_list, entry) {
		info->pass = 3;
		/* hold the layer across the scan (good == 0xABCD sanity) */
		++layer->refs;
		KKASSERT(layer->good == 0xABCD);
		RB_SCAN(hammer2_chain_tree, &layer->rbtree,
			NULL, hammer2_chain_flush_scan2, info);
		--layer->refs;
		KKASSERT(info->parent->core == core);
	}
	spin_unlock(&core->cst.spin);
}

/*
 * Roll up data_count/inode_count statistics from child into parent.
 *
 * (how) is +1 when the child's blockref is being inserted into the
 * parent's block table and -1 when it is being deleted (see the scan2
 * call sites).  When the parent is an inode the accumulated counts are
 * folded into the inode's media data and reset.
 */
static
void
hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
{
#if 0
	hammer2_chain_t *grandp;
#endif

	parent->data_count += child->data_count;
	parent->inode_count += child->inode_count;
	child->data_count = 0;
	child->inode_count = 0;
	if (how < 0) {
		parent->data_count -= child->bytes;
		if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
			parent->inode_count -= 1;
#if 0
			/* XXX child->data may be NULL atm */
			parent->data_count -= child->data->ipdata.data_count;
			parent->inode_count -= child->data->ipdata.inode_count;
#endif
		}
	} else if (how > 0) {
		parent->data_count += child->bytes;
		if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
			parent->inode_count += 1;
#if 0
			/* XXX child->data may be NULL atm */
			parent->data_count += child->data->ipdata.data_count;
			parent->inode_count += child->data->ipdata.inode_count;
#endif
		}
	}
	if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
		parent->data->ipdata.data_count += parent->data_count;
		parent->data->ipdata.inode_count += parent->inode_count;
#if 0
		for (grandp = parent->above->first_parent;
		     grandp;
		     grandp = grandp->next_parent) {
			grandp->data_count += parent->data_count;
			grandp->inode_count += parent->inode_count;
		}
#endif
		parent->data_count = 0;
		parent->inode_count = 0;
	}
}