1 /* 2 * Copyright (c) 2011-2015 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * TRANSACTION AND FLUSH HANDLING 37 * 38 * Deceptively simple but actually fairly difficult to implement properly is 39 * how I would describe it. 
 *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells how to recurse downward to find these chains.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

#define HAMMER2_FLUSH_DEPTH_LIMIT	60	/* stack recursion limit */


/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 *
 * Per-flush state shared by hammer2_flush() and its downward recursion
 * (hammer2_flush_core / hammer2_flush_recurse).
 */
struct hammer2_flush_info {
	hammer2_chain_t *parent;	/* parent of chain being flushed, referenced, can be NULL */
	int		depth;		/* recursion depth, bounded by HAMMER2_FLUSH_DEPTH_LIMIT */
	long		diddeferral;	/* non-zero if any sub-chain was deferred this pass */
	int		error;		/* cumulative error */
	int		flags;		/* HAMMER2_FLUSH_* flags for the recursion */
#ifdef HAMMER2_SCAN_DEBUG
	/* debug statistics accumulated during the scan, reported by hammer2_flush() */
	long		scan_count;
	long		scan_mod_count;
	long		scan_upd_count;
	long		scan_onf_count;
	long		scan_del_count;
	long		scan_btype[7];
	long		flushq_count;
#endif
	struct h2_flush_list flushq;	/* chains deferred due to the depth limit */
	hammer2_chain_t *debug;		/* first skipped chain when (hammer2_debug & 0x200) */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t *chain, int flags);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);

/*
 * Any per-pfs transaction initialization goes here.
 */
void
hammer2_trans_manage_init(hammer2_pfs_t *pmp)
{
}

/*
 * Transaction support for any modifying operation.  Transactions are used
 * in the pmp layer by the frontend and in the spmp layer by the backend.
 *
 * 0			- Normal transaction, interlocked against flush
 *			  transaction.
 *
 * TRANS_ISFLUSH	- Flush transaction, interlocked against normal
 *			  transaction.
 *
 * TRANS_BUFCACHE	- Buffer cache transaction, no interlock.
 *
 * Initializing a new transaction allocates a transaction ID.  Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 * media target.  The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations.  We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
void
hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
{
	uint32_t oflags;
	uint32_t nflags;
	int dowait;

	/*
	 * Lock-free update loop.  Compute the desired new flags word
	 * (nflags) from a snapshot (oflags) and publish it with a CAS,
	 * retrying from scratch on any interference.  The low bits
	 * (HAMMER2_TRANS_MASK) count active transactions.
	 */
	for (;;) {
		oflags = pmp->trans.flags;
		cpu_ccfence();
		dowait = 0;

		if (flags & HAMMER2_TRANS_ISFLUSH) {
			/*
			 * Requesting flush transaction.  Wait for all
			 * currently running transactions to finish.
			 * Afterwards, normal transactions will be
			 * interlocked.
			 */
			if (oflags & HAMMER2_TRANS_MASK) {
				nflags = oflags | HAMMER2_TRANS_FPENDING |
						  HAMMER2_TRANS_WAITING;
				dowait = 1;
			} else {
				nflags = (oflags | flags) + 1;
			}
		} else if (flags & HAMMER2_TRANS_BUFCACHE) {
			/*
			 * Requesting strategy transaction from buffer-cache,
			 * or a VM getpages/putpages through the buffer cache.
			 * We must allow such transactions in all situations
			 * to avoid deadlocks.
			 */
			nflags = (oflags | flags) + 1;
#if 0
			/*
			 * (old) previous code interlocked against the main
			 *	 flush pass.
			 */
			if ((oflags & (HAMMER2_TRANS_ISFLUSH |
				       HAMMER2_TRANS_PREFLUSH)) ==
			    HAMMER2_TRANS_ISFLUSH) {
				nflags = oflags | HAMMER2_TRANS_WAITING;
				dowait = 1;
			} else {
				nflags = (oflags | flags) + 1;
			}
#endif
		} else {
			/*
			 * Requesting normal modifying transaction (read-only
			 * operations do not use transactions).  Waits for
			 * any flush to finish before allowing.  Multiple
			 * modifying transactions can run concurrently.
			 */
			if (oflags & HAMMER2_TRANS_ISFLUSH) {
				nflags = oflags | HAMMER2_TRANS_WAITING;
				dowait = 1;
			} else {
				nflags = (oflags | flags) + 1;
			}
		}

		/*
		 * Interlock the sleep address BEFORE the CAS publishes the
		 * WAITING bit so a wakeup() arriving between the CAS and
		 * the tsleep() below is not lost (standard tsleep_interlock
		 * / PINTERLOCKED protocol).
		 */
		if (dowait)
			tsleep_interlock(&pmp->trans.sync_wait, 0);
		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
			if (dowait == 0)
				break;
			tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
			       "h2trans", hz);
			/* woken (or timed out); re-evaluate from the top */
		} else {
			cpu_pause();
		}
		/* retry */
	}
}

/*
 * Start a sub-transaction, there is no 'subdone' function.  This will
 * issue a new modify_tid (mtid) for the current transaction, which is a
 * CLC (cluster level change) id and not a per-node id.
 *
 * This function must be called for each XOP when multiple XOPs are run in
 * sequence within a transaction.
 *
 * Callers typically update the inode with the transaction mtid manually
 * to enforce sequencing.
 */
hammer2_tid_t
hammer2_trans_sub(hammer2_pfs_t *pmp)
{
	hammer2_tid_t mtid;

	/* atomically allocate the next cluster-level modify_tid */
	mtid = atomic_fetchadd_64(&pmp->modify_tid, 1);

	return (mtid);
}

/*
 * Terminate a transaction started by hammer2_trans_init().  Decrements
 * the active-transaction count in pmp->trans.flags.  When the last
 * transaction completes the interlock bits are cleared and any waiters
 * blocked in hammer2_trans_init() are woken up.
 */
void
hammer2_trans_done(hammer2_pfs_t *pmp)
{
	uint32_t oflags;
	uint32_t nflags;

	/* lock-free CAS update loop, mirrors hammer2_trans_init() */
	for (;;) {
		oflags = pmp->trans.flags;
		cpu_ccfence();
		KKASSERT(oflags & HAMMER2_TRANS_MASK);
		if ((oflags & HAMMER2_TRANS_MASK) == 1) {
			/*
			 * This was the last transaction
			 */
			nflags = (oflags - 1) & ~(HAMMER2_TRANS_ISFLUSH |
						  HAMMER2_TRANS_BUFCACHE |
						  HAMMER2_TRANS_FPENDING |
						  HAMMER2_TRANS_WAITING);
		} else {
			/*
			 * Still transactions pending
			 */
			nflags = oflags - 1;
		}
		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
			/* wake waiters only on the last transaction */
			if ((nflags & HAMMER2_TRANS_MASK) == 0 &&
			    (oflags & HAMMER2_TRANS_WAITING)) {
				wakeup(&pmp->trans.sync_wait);
			}
			break;
		} else {
			cpu_pause();
		}
		/* retry */
	}
}

/*
 * Obtain new, unique inode number (not serialized by caller).
 */
hammer2_tid_t
hammer2_trans_newinum(hammer2_pfs_t *pmp)
{
	hammer2_tid_t tid;

	tid = atomic_fetchadd_64(&pmp->inode_tid, 1);

	return tid;
}

/*
 * Assert that a strategy call is ok here.  Currently we allow strategy
 * calls in all situations, including during flushes.  Previously:
 *	(old) (1) In a normal transaction.
 *	(old) (2) In a flush transaction only if PREFLUSH is also set.
 */
void
hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
{
#if 0
	KKASSERT((pmp->trans.flags & HAMMER2_TRANS_ISFLUSH) == 0 ||
		 (pmp->trans.flags & HAMMER2_TRANS_PREFLUSH));
#endif
}


/*
 * Chains undergoing destruction are removed from the in-memory topology.
 * To avoid getting lost these chains are placed on the delayed flush
 * queue which will properly dispose of them.
296 * 297 * We do this instead of issuing an immediate flush in order to give 298 * recursive deletions (rm -rf, etc) a chance to remove more of the 299 * hierarchy, potentially allowing an enormous amount of write I/O to 300 * be avoided. 301 */ 302 void 303 hammer2_delayed_flush(hammer2_chain_t *chain) 304 { 305 if ((chain->flags & HAMMER2_CHAIN_DELAYED) == 0) { 306 hammer2_spin_ex(&chain->hmp->list_spin); 307 if ((chain->flags & (HAMMER2_CHAIN_DELAYED | 308 HAMMER2_CHAIN_DEFERRED)) == 0) { 309 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELAYED | 310 HAMMER2_CHAIN_DEFERRED); 311 TAILQ_INSERT_TAIL(&chain->hmp->flushq, 312 chain, flush_node); 313 hammer2_chain_ref(chain); 314 } 315 hammer2_spin_unex(&chain->hmp->list_spin); 316 hammer2_voldata_modify(chain->hmp); 317 } 318 } 319 320 /* 321 * Flush the chain and all modified sub-chains through the specified 322 * synchronization point, propagating blockref updates back up. As 323 * part of this propagation, mirror_tid and inode/data usage statistics 324 * propagates back upward. 325 * 326 * Returns a HAMMER2 error code, 0 if no error. Note that I/O errors from 327 * buffers dirtied during the flush operation can occur later. 328 * 329 * modify_tid (clc - cluster level change) is not propagated. 330 * 331 * update_tid (clc) is used for validation and is not propagated by this 332 * function. 333 * 334 * This routine can be called from several places but the most important 335 * is from VFS_SYNC (frontend) via hammer2_inode_xop_flush (backend). 336 * 337 * chain is locked on call and will remain locked on return. The chain's 338 * UPDATE flag indicates that its parent's block table (which is not yet 339 * part of the flush) should be updated. 340 */ 341 int 342 hammer2_flush(hammer2_chain_t *chain, int flags) 343 { 344 hammer2_chain_t *scan; 345 hammer2_flush_info_t info; 346 hammer2_dev_t *hmp; 347 int loops; 348 349 /* 350 * Execute the recursive flush and handle deferrals. 
351 * 352 * Chains can be ridiculously long (thousands deep), so to 353 * avoid blowing out the kernel stack the recursive flush has a 354 * depth limit. Elements at the limit are placed on a list 355 * for re-execution after the stack has been popped. 356 */ 357 bzero(&info, sizeof(info)); 358 TAILQ_INIT(&info.flushq); 359 info.flags = flags & ~HAMMER2_FLUSH_TOP; 360 361 /* 362 * Calculate parent (can be NULL), if not NULL the flush core 363 * expects the parent to be referenced so it can easily lock/unlock 364 * it without it getting ripped up. 365 */ 366 if ((info.parent = chain->parent) != NULL) 367 hammer2_chain_ref(info.parent); 368 369 /* 370 * Extra ref needed because flush_core expects it when replacing 371 * chain. 372 */ 373 hammer2_chain_ref(chain); 374 hmp = chain->hmp; 375 loops = 0; 376 377 for (;;) { 378 /* 379 * Move hmp->flushq to info.flushq if non-empty so it can 380 * be processed. 381 */ 382 if (TAILQ_FIRST(&hmp->flushq) != NULL) { 383 hammer2_spin_ex(&chain->hmp->list_spin); 384 TAILQ_CONCAT(&info.flushq, &hmp->flushq, flush_node); 385 hammer2_spin_unex(&chain->hmp->list_spin); 386 } 387 388 /* 389 * Unwind deep recursions which had been deferred. This 390 * can leave the FLUSH_* bits set for these chains, which 391 * will be handled when we [re]flush chain after the unwind. 392 */ 393 while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) { 394 KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED); 395 TAILQ_REMOVE(&info.flushq, scan, flush_node); 396 #ifdef HAMMER2_SCAN_DEBUG 397 ++info.flushq_count; 398 #endif 399 atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED | 400 HAMMER2_CHAIN_DELAYED); 401 402 /* 403 * Now that we've popped back up we can do a secondary 404 * recursion on the deferred elements. 405 * 406 * NOTE: hammer2_flush() may replace scan. 
407 */ 408 if (hammer2_debug & 0x0040) 409 kprintf("deferred flush %p\n", scan); 410 hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE); 411 if (scan->error == 0) { 412 hammer2_flush(scan, flags & ~HAMMER2_FLUSH_TOP); 413 hammer2_chain_unlock(scan); 414 hammer2_chain_drop(scan);/* ref from defer */ 415 } else { 416 info.error |= scan->error; 417 } 418 } 419 420 /* 421 * [re]flush chain. 422 */ 423 info.diddeferral = 0; 424 hammer2_flush_core(&info, chain, flags); 425 426 /* 427 * Only loop if deep recursions have been deferred. 428 */ 429 if (TAILQ_EMPTY(&info.flushq)) 430 break; 431 432 if (++loops % 1000 == 0) { 433 kprintf("hammer2_flush: excessive loops on %p\n", 434 chain); 435 if (hammer2_debug & 0x100000) 436 Debugger("hell4"); 437 } 438 } 439 #ifdef HAMMER2_SCAN_DEBUG 440 if (info.scan_count >= 10) 441 kprintf("hammer2_flush: scan_count %ld (%ld,%ld,%ld,%ld) " 442 "bt(%ld,%ld,%ld,%ld,%ld,%ld) flushq %ld\n", 443 info.scan_count, 444 info.scan_mod_count, 445 info.scan_upd_count, 446 info.scan_onf_count, 447 info.scan_del_count, 448 info.scan_btype[1], 449 info.scan_btype[2], 450 info.scan_btype[3], 451 info.scan_btype[4], 452 info.scan_btype[5], 453 info.scan_btype[6], 454 info.flushq_count); 455 #endif 456 hammer2_chain_drop(chain); 457 if (info.parent) 458 hammer2_chain_drop(info.parent); 459 return (info.error); 460 } 461 462 /* 463 * This is the core of the chain flushing code. The chain is locked by the 464 * caller and must also have an extra ref on it by the caller, and remains 465 * locked and will have an extra ref on return. info.parent is referenced 466 * but not locked. 467 * 468 * Upon return, the caller can test the UPDATE bit on the chain to determine 469 * if the parent needs updating. 470 * 471 * (1) Determine if this node is a candidate for the flush, return if it is 472 * not. fchain and vchain are always candidates for the flush. 
473 * 474 * (2) If we recurse too deep the chain is entered onto the deferral list and 475 * the current flush stack is aborted until after the deferral list is 476 * run. 477 * 478 * (3) Recursively flush live children (rbtree). This can create deferrals. 479 * A successful flush clears the MODIFIED and UPDATE bits on the children 480 * and typically causes the parent to be marked MODIFIED as the children 481 * update the parent's block table. A parent might already be marked 482 * MODIFIED due to a deletion (whos blocktable update in the parent is 483 * handled by the frontend), or if the parent itself is modified by the 484 * frontend for other reasons. 485 * 486 * (4) Permanently disconnected sub-trees are cleaned up by the front-end. 487 * Deleted-but-open inodes can still be individually flushed via the 488 * filesystem syncer. 489 * 490 * (5) Delete parents on the way back up if they are normal indirect blocks 491 * and have no children. 492 * 493 * (6) Note that an unmodified child may still need the block table in its 494 * parent updated (e.g. rename/move). The child will have UPDATE set 495 * in this case. 496 * 497 * WARNING ON BREF MODIFY_TID/MIRROR_TID 498 * 499 * blockref.modify_tid is consistent only within a PFS, and will not be 500 * consistent during synchronization. mirror_tid is consistent across the 501 * block device regardless of the PFS. 502 */ 503 static void 504 hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain, 505 int flags) 506 { 507 hammer2_chain_t *parent; 508 hammer2_dev_t *hmp; 509 int save_error; 510 511 /* 512 * (1) Optimize downward recursion to locate nodes needing action. 513 * Nothing to do if none of these flags are set. 
514 */ 515 if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) { 516 if (hammer2_debug & 0x200) { 517 if (info->debug == NULL) 518 info->debug = chain; 519 } else { 520 return; 521 } 522 } 523 524 hmp = chain->hmp; 525 parent = info->parent; /* can be NULL */ 526 KKASSERT(chain->parent == parent); 527 528 /* 529 * Downward search recursion 530 */ 531 if (chain->flags & (HAMMER2_CHAIN_DEFERRED | HAMMER2_CHAIN_DELAYED)) { 532 /* 533 * Already deferred. 534 */ 535 ++info->diddeferral; 536 } else if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) && 537 (flags & HAMMER2_FLUSH_ALL) == 0 && 538 (flags & HAMMER2_FLUSH_TOP) == 0 && 539 chain->pmp && chain->pmp->mp) { 540 /* 541 * If FLUSH_ALL is not specified the caller does not want 542 * to recurse through PFS roots that have been mounted. 543 * 544 * (If the PFS has not been mounted there may not be 545 * anything monitoring its chains and its up to us 546 * to flush it). 547 * 548 * The typical sequence is to flush dirty PFS's starting at 549 * their root downward, then flush the device root (vchain). 550 * It is this second flush that typically leaves out the 551 * ALL flag. 552 * 553 * However we must still process the PFSROOT chains for block 554 * table updates in their parent (which IS part of our flush). 555 * 556 * NOTE: The volume root, vchain, does not set PFSBOUNDARY. 557 * 558 * NOTE: This test must be done before the depth-limit test, 559 * else it might become the top on a flushq iteration. 560 * 561 * NOTE: We must re-set ONFLUSH in the parent to retain if 562 * this chain (that we are skipping) requires work. 563 */ 564 if (chain->flags & (HAMMER2_CHAIN_ONFLUSH | 565 HAMMER2_CHAIN_DESTROY | 566 HAMMER2_CHAIN_MODIFIED)) { 567 hammer2_chain_setflush(parent); 568 } 569 } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) { 570 /* 571 * Recursion depth reached. 
572 */ 573 KKASSERT((chain->flags & HAMMER2_CHAIN_DELAYED) == 0); 574 hammer2_chain_ref(chain); 575 TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node); 576 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED); 577 ++info->diddeferral; 578 } else if (chain->flags & (HAMMER2_CHAIN_ONFLUSH | 579 HAMMER2_CHAIN_DESTROY)) { 580 /* 581 * Downward recursion search (actual flush occurs bottom-up). 582 * pre-clear ONFLUSH. It can get set again due to races or 583 * flush errors, which we want so the scan finds us again in 584 * the next flush. 585 * 586 * We must also recurse if DESTROY is set so we can finally 587 * get rid of the related children, otherwise the node will 588 * just get re-flushed on lastdrop. 589 * 590 * WARNING! The recursion will unlock/relock info->parent 591 * (which is 'chain'), potentially allowing it 592 * to be ripped up. 593 */ 594 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH); 595 save_error = info->error; 596 info->error = 0; 597 info->parent = chain; 598 599 /* 600 * We may have to do this twice to catch any indirect 601 * block maintenance that occurs. Other conditions which 602 * can keep setting ONFLUSH (such as deferrals) ought to 603 * be handled by the flushq code. XXX needs more help 604 */ 605 hammer2_spin_ex(&chain->core.spin); 606 RB_SCAN(hammer2_chain_tree, &chain->core.rbtree, 607 NULL, hammer2_flush_recurse, info); 608 if (chain->flags & HAMMER2_CHAIN_ONFLUSH) { 609 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH); 610 RB_SCAN(hammer2_chain_tree, &chain->core.rbtree, 611 NULL, hammer2_flush_recurse, info); 612 } 613 hammer2_spin_unex(&chain->core.spin); 614 info->parent = parent; 615 616 /* 617 * Re-set the flush bits if the flush was incomplete or 618 * an error occurred. If an error occurs it is typically 619 * an allocation error. Errors do not cause deferrals. 
620 */ 621 if (info->error) 622 hammer2_chain_setflush(chain); 623 info->error |= save_error; 624 if (info->diddeferral) 625 hammer2_chain_setflush(chain); 626 627 /* 628 * If we lost the parent->chain association we have to 629 * stop processing this chain because it is no longer 630 * in this recursion. If it moved, it will be handled 631 * by the ONFLUSH flag elsewhere. 632 */ 633 if (chain->parent != parent) { 634 kprintf("LOST CHILD2 %p->%p (actual parent %p)\n", 635 parent, chain, chain->parent); 636 goto done; 637 } 638 } 639 640 /* 641 * Now we are in the bottom-up part of the recursion. 642 * 643 * Do not update chain if lower layers were deferred. We continue 644 * to try to update the chain on lower-level errors, but the flush 645 * code may decide not to flush the volume root. 646 * 647 * XXX should we continue to try to update the chain if an error 648 * occurred? 649 */ 650 if (info->diddeferral) 651 goto done; 652 653 /* 654 * Both parent and chain must be locked in order to flush chain, 655 * in order to properly update the parent under certain conditions. 656 * 657 * In addition, we can't safely unlock/relock the chain once we 658 * start flushing the chain itself, which we would have to do later 659 * on in order to lock the parent if we didn't do that now. 660 */ 661 hammer2_chain_ref_hold(chain); 662 hammer2_chain_unlock(chain); 663 if (parent) 664 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 665 hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE); 666 hammer2_chain_drop_unhold(chain); 667 668 /* 669 * Can't process if we can't access their content. 
670 */ 671 if ((parent && parent->error) || chain->error) { 672 kprintf("hammer2: chain error during flush\n"); 673 info->error |= chain->error; 674 if (parent) { 675 info->error |= parent->error; 676 hammer2_chain_unlock(parent); 677 } 678 goto done; 679 } 680 681 if (chain->parent != parent) { 682 kprintf("LOST CHILD3 %p->%p (actual parent %p)\n", 683 parent, chain, chain->parent); 684 KKASSERT(parent != NULL); 685 hammer2_chain_unlock(parent); 686 if ((chain->flags & HAMMER2_CHAIN_DELAYED) == 0) { 687 hammer2_chain_ref(chain); 688 TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node); 689 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED); 690 ++info->diddeferral; 691 } 692 goto done; 693 } 694 695 /* 696 * Propagate the DESTROY flag downwards. This dummies up the flush 697 * code and tries to invalidate related buffer cache buffers to 698 * avoid the disk write. 699 */ 700 if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY)) 701 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); 702 703 /* 704 * Dispose of the modified bit. 705 * 706 * If parent is present, the UPDATE bit should already be set. 707 * UPDATE should already be set. 708 * bref.mirror_tid should already be set. 709 */ 710 if (chain->flags & HAMMER2_CHAIN_MODIFIED) { 711 KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) || 712 chain->parent == NULL); 713 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); 714 atomic_add_long(&hammer2_count_modified_chains, -1); 715 716 /* 717 * Manage threads waiting for excessive dirty memory to 718 * be retired. 719 */ 720 if (chain->pmp) 721 hammer2_pfs_memory_wakeup(chain->pmp); 722 723 #if 0 724 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0 && 725 chain != &hmp->vchain && 726 chain != &hmp->fchain) { 727 /* 728 * Set UPDATE bit indicating that the parent block 729 * table requires updating. 730 */ 731 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 732 } 733 #endif 734 735 /* 736 * Issue the flush. This is indirect via the DIO. 
737 * 738 * NOTE: A DELETED node that reaches this point must be 739 * flushed for synchronization point consistency. 740 * 741 * NOTE: Even though MODIFIED was already set, the related DIO 742 * might not be dirty due to a system buffer cache 743 * flush and must be set dirty if we are going to make 744 * further modifications to the buffer. Chains with 745 * embedded data don't need this. 746 */ 747 if (hammer2_debug & 0x1000) { 748 kprintf("Flush %p.%d %016jx/%d data=%016jx\n", 749 chain, chain->bref.type, 750 (uintmax_t)chain->bref.key, 751 chain->bref.keybits, 752 (uintmax_t)chain->bref.data_off); 753 } 754 if (hammer2_debug & 0x2000) { 755 Debugger("Flush hell"); 756 } 757 758 /* 759 * Update chain CRCs for flush. 760 * 761 * NOTE: Volume headers are NOT flushed here as they require 762 * special processing. 763 */ 764 switch(chain->bref.type) { 765 case HAMMER2_BREF_TYPE_FREEMAP: 766 /* 767 * Update the volume header's freemap_tid to the 768 * freemap's flushing mirror_tid. 769 * 770 * (note: embedded data, do not call setdirty) 771 */ 772 KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED); 773 KKASSERT(chain == &hmp->fchain); 774 hmp->voldata.freemap_tid = chain->bref.mirror_tid; 775 if (hammer2_debug & 0x8000) { 776 /* debug only, avoid syslogd loop */ 777 kprintf("sync freemap mirror_tid %08jx\n", 778 (intmax_t)chain->bref.mirror_tid); 779 } 780 781 /* 782 * The freemap can be flushed independently of the 783 * main topology, but for the case where it is 784 * flushed in the same transaction, and flushed 785 * before vchain (a case we want to allow for 786 * performance reasons), make sure modifications 787 * made during the flush under vchain use a new 788 * transaction id. 789 * 790 * Otherwise the mount recovery code will get confused. 791 */ 792 ++hmp->voldata.mirror_tid; 793 break; 794 case HAMMER2_BREF_TYPE_VOLUME: 795 /* 796 * The free block table is flushed by 797 * hammer2_vfs_sync() before it flushes vchain. 
798 * We must still hold fchain locked while copying 799 * voldata to volsync, however. 800 * 801 * These do not error per-say since their data does 802 * not need to be re-read from media on lock. 803 * 804 * (note: embedded data, do not call setdirty) 805 */ 806 hammer2_chain_lock(&hmp->fchain, 807 HAMMER2_RESOLVE_ALWAYS); 808 hammer2_voldata_lock(hmp); 809 if (hammer2_debug & 0x8000) { 810 /* debug only, avoid syslogd loop */ 811 kprintf("sync volume mirror_tid %08jx\n", 812 (intmax_t)chain->bref.mirror_tid); 813 } 814 815 /* 816 * Update the volume header's mirror_tid to the 817 * main topology's flushing mirror_tid. It is 818 * possible that voldata.mirror_tid is already 819 * beyond bref.mirror_tid due to the bump we made 820 * above in BREF_TYPE_FREEMAP. 821 */ 822 if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) { 823 hmp->voldata.mirror_tid = 824 chain->bref.mirror_tid; 825 } 826 827 /* 828 * The volume header is flushed manually by the 829 * syncer, not here. All we do here is adjust the 830 * crc's. 
831 */ 832 KKASSERT(chain->data != NULL); 833 KKASSERT(chain->dio == NULL); 834 835 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]= 836 hammer2_icrc32( 837 (char *)&hmp->voldata + 838 HAMMER2_VOLUME_ICRC1_OFF, 839 HAMMER2_VOLUME_ICRC1_SIZE); 840 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]= 841 hammer2_icrc32( 842 (char *)&hmp->voldata + 843 HAMMER2_VOLUME_ICRC0_OFF, 844 HAMMER2_VOLUME_ICRC0_SIZE); 845 hmp->voldata.icrc_volheader = 846 hammer2_icrc32( 847 (char *)&hmp->voldata + 848 HAMMER2_VOLUME_ICRCVH_OFF, 849 HAMMER2_VOLUME_ICRCVH_SIZE); 850 851 if (hammer2_debug & 0x8000) { 852 /* debug only, avoid syslogd loop */ 853 kprintf("syncvolhdr %016jx %016jx\n", 854 hmp->voldata.mirror_tid, 855 hmp->vchain.bref.mirror_tid); 856 } 857 hmp->volsync = hmp->voldata; 858 atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC); 859 hammer2_voldata_unlock(hmp); 860 hammer2_chain_unlock(&hmp->fchain); 861 break; 862 case HAMMER2_BREF_TYPE_DATA: 863 /* 864 * Data elements have already been flushed via the 865 * logical file buffer cache. Their hash was set in 866 * the bref by the vop_write code. Do not re-dirty. 867 * 868 * Make sure any device buffer(s) have been flushed 869 * out here (there aren't usually any to flush) XXX. 870 */ 871 break; 872 case HAMMER2_BREF_TYPE_INDIRECT: 873 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 874 case HAMMER2_BREF_TYPE_FREEMAP_LEAF: 875 /* 876 * Buffer I/O will be cleaned up when the volume is 877 * flushed (but the kernel is free to flush it before 878 * then, as well). 879 */ 880 KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0); 881 hammer2_chain_setcheck(chain, chain->data); 882 break; 883 case HAMMER2_BREF_TYPE_DIRENT: 884 /* 885 * A directory entry can use the check area to store 886 * the filename for filenames <= 64 bytes, don't blow 887 * it up! 
888 */ 889 KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0); 890 if (chain->bytes) 891 hammer2_chain_setcheck(chain, chain->data); 892 break; 893 case HAMMER2_BREF_TYPE_INODE: 894 /* 895 * NOTE: We must call io_setdirty() to make any late 896 * changes to the inode data, the system might 897 * have already flushed the buffer. 898 */ 899 if (chain->data->ipdata.meta.op_flags & 900 HAMMER2_OPFLAG_PFSROOT) { 901 /* 902 * non-NULL pmp if mounted as a PFS. We must 903 * sync fields cached in the pmp? XXX 904 */ 905 hammer2_inode_data_t *ipdata; 906 907 hammer2_io_setdirty(chain->dio); 908 ipdata = &chain->data->ipdata; 909 if (chain->pmp) { 910 ipdata->meta.pfs_inum = 911 chain->pmp->inode_tid; 912 } 913 } else { 914 /* can't be mounted as a PFS */ 915 } 916 917 KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0); 918 hammer2_chain_setcheck(chain, chain->data); 919 920 hammer2_inode_data_t *ipdata; 921 ipdata = &chain->data->ipdata; 922 break; 923 default: 924 KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED); 925 panic("hammer2_flush_core: unsupported " 926 "embedded bref %d", 927 chain->bref.type); 928 /* NOT REACHED */ 929 } 930 931 /* 932 * If the chain was destroyed try to avoid unnecessary I/O 933 * that might not have yet occurred. Remove the data range 934 * from dedup candidacy and attempt to invalidation that 935 * potentially dirty portion of the I/O buffer. 
936 */ 937 if (chain->flags & HAMMER2_CHAIN_DESTROY) { 938 hammer2_io_dedup_delete(hmp, 939 chain->bref.type, 940 chain->bref.data_off, 941 chain->bytes); 942 #if 0 943 hammer2_io_t *dio; 944 if (chain->dio) { 945 hammer2_io_inval(chain->dio, 946 chain->bref.data_off, 947 chain->bytes); 948 } else if ((dio = hammer2_io_getquick(hmp, 949 chain->bref.data_off, 950 chain->bytes, 951 1)) != NULL) { 952 hammer2_io_inval(dio, 953 chain->bref.data_off, 954 chain->bytes); 955 hammer2_io_putblk(&dio); 956 } 957 #endif 958 } 959 } 960 961 /* 962 * If UPDATE is set the parent block table may need to be updated. 963 * This can fail if the hammer2_chain_modify() fails. 964 * 965 * NOTE: UPDATE may be set on vchain or fchain in which case 966 * parent could be NULL. It's easiest to allow the case 967 * and test for NULL. parent can also wind up being NULL 968 * due to a deletion so we need to handle the case anyway. 969 * 970 * If no parent exists we can just clear the UPDATE bit. If the 971 * chain gets reattached later on the bit will simply get set 972 * again. 973 */ 974 if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) 975 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 976 977 /* 978 * The chain may need its blockrefs updated in the parent. 979 */ 980 if (chain->flags & HAMMER2_CHAIN_UPDATE) { 981 hammer2_blockref_t *base; 982 int count; 983 984 /* 985 * Clear UPDATE flag, mark parent modified, update its 986 * modify_tid if necessary, and adjust the parent blockmap. 987 */ 988 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 989 990 /* 991 * (optional code) 992 * 993 * Avoid actually modifying and updating the parent if it 994 * was flagged for destruction. This can greatly reduce 995 * disk I/O in large tree removals because the 996 * hammer2_io_setinval() call in the upward recursion 997 * (see MODIFIED code above) can only handle a few cases. 
998 */ 999 if (parent->flags & HAMMER2_CHAIN_DESTROY) { 1000 if (parent->bref.modify_tid < chain->bref.modify_tid) { 1001 parent->bref.modify_tid = 1002 chain->bref.modify_tid; 1003 } 1004 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED | 1005 HAMMER2_CHAIN_BMAPUPD); 1006 goto skipupdate; 1007 } 1008 1009 /* 1010 * The flusher is responsible for deleting empty indirect 1011 * blocks at this point. If we don't do this, no major harm 1012 * will be done but the empty indirect blocks will stay in 1013 * the topology and make it a messy and inefficient. 1014 * 1015 * The flusher is also responsible for collapsing the 1016 * content of an indirect block into its parent whenever 1017 * possible (with some hysteresis). Not doing this will also 1018 * not harm the topology, but would make it messy and 1019 * inefficient. 1020 */ 1021 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) { 1022 if (hammer2_chain_indirect_maintenance(parent, chain)) 1023 goto skipupdate; 1024 } 1025 1026 /* 1027 * We are updating the parent's blockmap, the parent must 1028 * be set modified. If this fails we re-set the UPDATE flag 1029 * in the child. 1030 * 1031 * NOTE! A modification error can be ENOSPC. We still want 1032 * to flush modified chains recursively, not break out, 1033 * so we just skip the update in this situation and 1034 * continue. That is, we still need to try to clean 1035 * out dirty chains and buffers. 1036 * 1037 * This may not help bulkfree though. 
XXX 1038 */ 1039 save_error = hammer2_chain_modify(parent, 0, 0, 0); 1040 if (save_error) { 1041 info->error |= save_error; 1042 kprintf("hammer2_flush: %016jx.%02x error=%08x\n", 1043 parent->bref.data_off, parent->bref.type, 1044 save_error); 1045 atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE); 1046 goto skipupdate; 1047 } 1048 if (parent->bref.modify_tid < chain->bref.modify_tid) 1049 parent->bref.modify_tid = chain->bref.modify_tid; 1050 1051 /* 1052 * Calculate blockmap pointer 1053 */ 1054 switch(parent->bref.type) { 1055 case HAMMER2_BREF_TYPE_INODE: 1056 /* 1057 * Access the inode's block array. However, there is 1058 * no block array if the inode is flagged DIRECTDATA. 1059 */ 1060 if (parent->data && 1061 (parent->data->ipdata.meta.op_flags & 1062 HAMMER2_OPFLAG_DIRECTDATA) == 0) { 1063 base = &parent->data-> 1064 ipdata.u.blockset.blockref[0]; 1065 } else { 1066 base = NULL; 1067 } 1068 count = HAMMER2_SET_COUNT; 1069 break; 1070 case HAMMER2_BREF_TYPE_INDIRECT: 1071 case HAMMER2_BREF_TYPE_FREEMAP_NODE: 1072 if (parent->data) 1073 base = &parent->data->npdata[0]; 1074 else 1075 base = NULL; 1076 count = parent->bytes / sizeof(hammer2_blockref_t); 1077 break; 1078 case HAMMER2_BREF_TYPE_VOLUME: 1079 base = &chain->hmp->voldata.sroot_blockset.blockref[0]; 1080 count = HAMMER2_SET_COUNT; 1081 break; 1082 case HAMMER2_BREF_TYPE_FREEMAP: 1083 base = &parent->data->npdata[0]; 1084 count = HAMMER2_SET_COUNT; 1085 break; 1086 default: 1087 base = NULL; 1088 count = 0; 1089 panic("hammer2_flush_core: " 1090 "unrecognized blockref type: %d", 1091 parent->bref.type); 1092 } 1093 1094 /* 1095 * Blocktable updates 1096 * 1097 * We synchronize pending statistics at this time. Delta 1098 * adjustments designated for the current and upper level 1099 * are synchronized. 
1100 */ 1101 if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) { 1102 if (chain->flags & HAMMER2_CHAIN_BMAPPED) { 1103 hammer2_spin_ex(&parent->core.spin); 1104 hammer2_base_delete(parent, base, count, chain); 1105 hammer2_spin_unex(&parent->core.spin); 1106 /* base_delete clears both bits */ 1107 } else { 1108 atomic_clear_int(&chain->flags, 1109 HAMMER2_CHAIN_BMAPUPD); 1110 } 1111 } 1112 if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) { 1113 hammer2_spin_ex(&parent->core.spin); 1114 hammer2_base_insert(parent, base, count, 1115 chain, &chain->bref); 1116 hammer2_spin_unex(&parent->core.spin); 1117 /* base_insert sets BMAPPED */ 1118 } 1119 } 1120 skipupdate: 1121 if (parent) 1122 hammer2_chain_unlock(parent); 1123 1124 /* 1125 * Final cleanup after flush 1126 */ 1127 done: 1128 KKASSERT(chain->refs > 0); 1129 if (hammer2_debug & 0x200) { 1130 if (info->debug == chain) 1131 info->debug = NULL; 1132 } 1133 } 1134 1135 /* 1136 * Flush recursion helper, called from flush_core, calls flush_core. 1137 * 1138 * Flushes the children of the caller's chain (info->parent), restricted 1139 * by sync_tid. Set info->domodify if the child's blockref must propagate 1140 * back up to the parent. 1141 * 1142 * This function may set info->error as a side effect. 1143 * 1144 * Ripouts can move child from rbtree to dbtree or dbq but the caller's 1145 * flush scan order prevents any chains from being lost. A child can be 1146 * executes more than once. 1147 * 1148 * WARNING! If we do not call hammer2_flush_core() we must update 1149 * bref.mirror_tid ourselves to indicate that the flush has 1150 * processed the child. 1151 * 1152 * WARNING! parent->core spinlock is held on entry and return. 
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;

#ifdef HAMMER2_SCAN_DEBUG
	/*
	 * Flush-scan statistics, compiled in only when scan debugging
	 * is enabled.
	 */
	++info->scan_count;
	if (child->flags & HAMMER2_CHAIN_MODIFIED)
		++info->scan_mod_count;
	if (child->flags & HAMMER2_CHAIN_UPDATE)
		++info->scan_upd_count;
	if (child->flags & HAMMER2_CHAIN_ONFLUSH)
		++info->scan_onf_count;
#endif

	/*
	 * (child can never be fchain or vchain so a special check isn't
	 * needed).
	 *
	 * We must ref the child before unlocking the spinlock.
	 *
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.  However, if it no longer
	 * winds up being the child of the parent we must skip this child.
	 *
	 * NOTE! chain locking errors are fatal.  They are never
	 *	 out-of-space errors.
	 */
	hammer2_chain_ref(child);
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_ref_hold(parent);
	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
	if (child->parent != parent) {
		/* Child was moved while the parent was unlocked; skip it. */
		kprintf("LOST CHILD1 %p->%p (actual parent %p)\n",
			parent, child, child->parent);
		goto done;
	}
	if (child->error) {
		kprintf("CHILD ERROR DURING FLUSH LOCK %p->%p\n",
			parent, child);
		info->error |= child->error;
		goto done;
	}

	/*
	 * Must propagate the DESTROY flag downwards, otherwise the
	 * parent could end up never being removed because it will
	 * be requeued to the flusher if it survives this run due to
	 * the flag.
	 *
	 * NOTE(review): parent was already dereferenced unconditionally
	 * above, so the NULL test here is redundant (but harmless).
	 */
	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
		atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROY);
#ifdef HAMMER2_SCAN_DEBUG
	if (child->flags & HAMMER2_CHAIN_DESTROY)
		++info->scan_del_count;
#endif

	/*
	 * Recurse and collect deferral data.  We're in the media flush,
	 * this can cross PFS boundaries.
	 */
	if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
#ifdef HAMMER2_SCAN_DEBUG
		if (child->bref.type < 7)
			++info->scan_btype[child->bref.type];
#endif
		++info->depth;
		hammer2_flush_core(info, child, info->flags);
		--info->depth;
	} else if (hammer2_debug & 0x200) {
		/*
		 * Debug mode: recurse even without flush flags, tracking
		 * the first such chain in info->debug for reporting.
		 */
		if (info->debug == NULL)
			info->debug = child;
		++info->depth;
		hammer2_flush_core(info, child, info->flags);
		--info->depth;
		if (info->debug == child)
			info->debug = NULL;
	}

done:
	/*
	 * Relock to continue the loop.  The parent's lock and spinlock
	 * must be reacquired (and its hold-ref released) before returning
	 * to the scan, per the WARNING in the function header.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop_unhold(parent);
	if (parent->error) {
		kprintf("PARENT ERROR DURING FLUSH LOCK %p->%p\n",
			parent, child);
		info->error |= parent->error;
	}
	hammer2_chain_drop(child);
	KKASSERT(info->parent == parent);
	hammer2_spin_ex(&parent->core.spin);

	return (0);
}

/*
 * flush helper (backend threaded)
 *
 * Flushes core chains, issues disk sync, flushes volume roots.
 *
 * Primarily called from vfs_sync().
1261 */ 1262 void 1263 hammer2_inode_xop_flush(hammer2_thread_t *thr, hammer2_xop_t *arg) 1264 { 1265 hammer2_xop_flush_t *xop = &arg->xop_flush; 1266 hammer2_chain_t *chain; 1267 hammer2_chain_t *parent; 1268 hammer2_dev_t *hmp; 1269 int flush_error = 0; 1270 int fsync_error = 0; 1271 int total_error = 0; 1272 int j; 1273 1274 /* 1275 * Flush core chains 1276 */ 1277 chain = hammer2_inode_chain(xop->head.ip1, thr->clindex, 1278 HAMMER2_RESOLVE_ALWAYS); 1279 if (chain) { 1280 hmp = chain->hmp; 1281 if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) || 1282 TAILQ_FIRST(&hmp->flushq) != NULL) { 1283 hammer2_flush(chain, HAMMER2_FLUSH_TOP); 1284 parent = chain->parent; 1285 KKASSERT(chain->pmp != parent->pmp); 1286 hammer2_chain_setflush(parent); 1287 } 1288 hammer2_chain_unlock(chain); 1289 hammer2_chain_drop(chain); 1290 chain = NULL; 1291 } else { 1292 hmp = NULL; 1293 } 1294 1295 /* 1296 * Flush volume roots. Avoid replication, we only want to 1297 * flush each hammer2_dev (hmp) once. 1298 */ 1299 for (j = thr->clindex - 1; j >= 0; --j) { 1300 if ((chain = xop->head.ip1->cluster.array[j].chain) != NULL) { 1301 if (chain->hmp == hmp) { 1302 chain = NULL; /* safety */ 1303 goto skip; 1304 } 1305 } 1306 } 1307 chain = NULL; /* safety */ 1308 1309 /* 1310 * spmp transaction. The super-root is never directly mounted so 1311 * there shouldn't be any vnodes, let alone any dirty vnodes 1312 * associated with it, so we shouldn't have to mess around with any 1313 * vnode flushes here. 1314 */ 1315 hammer2_trans_init(hmp->spmp, HAMMER2_TRANS_ISFLUSH); 1316 1317 /* 1318 * Media mounts have two 'roots', vchain for the topology 1319 * and fchain for the free block table. Flush both. 1320 * 1321 * Note that the topology and free block table are handled 1322 * independently, so the free block table can wind up being 1323 * ahead of the topology. We depend on the bulk free scan 1324 * code to deal with any loose ends. 
1325 * 1326 * vchain and fchain do not error on-lock since their data does 1327 * not have to be re-read from media. 1328 */ 1329 hammer2_chain_ref(&hmp->vchain); 1330 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS); 1331 hammer2_chain_ref(&hmp->fchain); 1332 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS); 1333 if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) { 1334 /* 1335 * This will also modify vchain as a side effect, 1336 * mark vchain as modified now. 1337 */ 1338 hammer2_voldata_modify(hmp); 1339 chain = &hmp->fchain; 1340 flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP); 1341 KKASSERT(chain == &hmp->fchain); 1342 } 1343 hammer2_chain_unlock(&hmp->fchain); 1344 hammer2_chain_unlock(&hmp->vchain); 1345 hammer2_chain_drop(&hmp->fchain); 1346 /* vchain dropped down below */ 1347 1348 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS); 1349 if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) { 1350 chain = &hmp->vchain; 1351 flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP); 1352 KKASSERT(chain == &hmp->vchain); 1353 } 1354 hammer2_chain_unlock(&hmp->vchain); 1355 hammer2_chain_drop(&hmp->vchain); 1356 1357 /* 1358 * We can't safely flush the volume header until we have 1359 * flushed any device buffers which have built up. 1360 * 1361 * XXX this isn't being incremental 1362 */ 1363 vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY); 1364 fsync_error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0); 1365 vn_unlock(hmp->devvp); 1366 if (fsync_error || flush_error) { 1367 kprintf("hammer2: sync error fsync=%d h2flush=0x%04x dev=%s\n", 1368 fsync_error, flush_error, hmp->devrepname); 1369 } 1370 1371 /* 1372 * The flush code sets CHAIN_VOLUMESYNC to indicate that the 1373 * volume header needs synchronization via hmp->volsync. 
1374 * 1375 * XXX synchronize the flag & data with only this flush XXX 1376 */ 1377 if (fsync_error == 0 && flush_error == 0 && 1378 (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) { 1379 struct buf *bp; 1380 int vol_error = 0; 1381 1382 /* 1383 * Synchronize the disk before flushing the volume 1384 * header. 1385 */ 1386 bp = getpbuf(NULL); 1387 bp->b_bio1.bio_offset = 0; 1388 bp->b_bufsize = 0; 1389 bp->b_bcount = 0; 1390 bp->b_cmd = BUF_CMD_FLUSH; 1391 bp->b_bio1.bio_done = biodone_sync; 1392 bp->b_bio1.bio_flags |= BIO_SYNC; 1393 vn_strategy(hmp->devvp, &bp->b_bio1); 1394 fsync_error = biowait(&bp->b_bio1, "h2vol"); 1395 relpbuf(bp, NULL); 1396 1397 /* 1398 * Then we can safely flush the version of the 1399 * volume header synchronized by the flush code. 1400 */ 1401 j = hmp->volhdrno + 1; 1402 if (j < 0) 1403 j = 0; 1404 if (j >= HAMMER2_NUM_VOLHDRS) 1405 j = 0; 1406 if (j * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE > 1407 hmp->volsync.volu_size) { 1408 j = 0; 1409 } 1410 if (hammer2_debug & 0x8000) { 1411 /* debug only, avoid syslogd loop */ 1412 kprintf("sync volhdr %d %jd\n", 1413 j, (intmax_t)hmp->volsync.volu_size); 1414 } 1415 bp = getblk(hmp->devvp, j * HAMMER2_ZONE_BYTES64, 1416 HAMMER2_PBUFSIZE, GETBLK_KVABIO, 0); 1417 atomic_clear_int(&hmp->vchain.flags, 1418 HAMMER2_CHAIN_VOLUMESYNC); 1419 bkvasync(bp); 1420 bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE); 1421 vol_error = bwrite(bp); 1422 hmp->volhdrno = j; 1423 if (vol_error) 1424 fsync_error = vol_error; 1425 } 1426 if (flush_error) 1427 total_error = flush_error; 1428 if (fsync_error) 1429 total_error = hammer2_errno_to_error(fsync_error); 1430 1431 hammer2_trans_done(hmp->spmp); /* spmp trans */ 1432 skip: 1433 hammer2_xop_feed(&xop->head, NULL, thr->clindex, total_error); 1434 } 1435