1 /* 2 * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * TRANSACTION AND FLUSH HANDLING 37 * 38 * Deceptively simple but actually fairly difficult to implement properly is 39 * how I would describe it. 
 *
 * The biggest issue is that each PFS may belong to a cluster so its media
 * modify_tid and mirror_tid fields are in a completely different domain
 * than the topology related to the super-root.
 *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells how to recurse downward to find these chains.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

#define HAMMER2_FLUSH_DEPTH_LIMIT	10	/* stack recursion limit */


/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 *
 * Per-flush state shared by hammer2_flush(), hammer2_flush_core(), and
 * hammer2_flush_recurse().  One instance lives on the stack for the
 * lifetime of a single hammer2_flush() call.
 */
struct hammer2_flush_info {
	hammer2_chain_t *parent;	/* current recursion parent, NULL at top */
	hammer2_trans_t	*trans;		/* owning transaction */
	int		depth;		/* recursion depth, capped by DEPTH_LIMIT */
	int		diddeferral;	/* non-zero if any child was deferred */
	int		cache_index;	/* blockref lookup cache for base ops */
	struct h2_flush_list flushq;	/* chains deferred due to depth limit */
	hammer2_xid_t	sync_xid;	/* memory synchronization point */
	hammer2_chain_t	*debug;		/* debug-only (hammer2_debug & 0x200) */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t *chain, int deleting);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);

/*
 * For now use a global transaction manager.  What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's.  So we can't just stuff tmanage into hammer2_mount or
 * hammer2_pfsmount.
96 */ 97 static hammer2_trans_manage_t tmanage; 98 99 void 100 hammer2_trans_manage_init(void) 101 { 102 lockinit(&tmanage.translk, "h2trans", 0, 0); 103 TAILQ_INIT(&tmanage.transq); 104 tmanage.flush_xid = 1; 105 tmanage.alloc_xid = tmanage.flush_xid + 1; 106 } 107 108 hammer2_xid_t 109 hammer2_trans_newxid(hammer2_pfsmount_t *pmp __unused) 110 { 111 hammer2_xid_t xid; 112 113 for (;;) { 114 xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1); 115 if (xid) 116 break; 117 } 118 return xid; 119 } 120 121 /* 122 * Transaction support functions for writing to the filesystem. 123 * 124 * Initializing a new transaction allocates a transaction ID. Typically 125 * passed a pmp (hmp passed as NULL), indicating a cluster transaction. Can 126 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single 127 * media target. The latter mode is used by the recovery code. 128 * 129 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the 130 * other is a set of any number of concurrent filesystem operations. We 131 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops> 132 * or we can have <running_flush> + <concurrent_fs_ops>. 133 * 134 * During a flush, new fs_ops are only blocked until the fs_ops prior to 135 * the flush complete. The new fs_ops can then run concurrent with the flush. 136 * 137 * Buffer-cache transactions operate as fs_ops but never block. A 138 * buffer-cache flush will run either before or after the current pending 139 * flush depending on its state. 
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;

	tman = &tmanage;

	/* Fresh transaction state; pmp may be NULL for media transactions */
	bzero(trans, sizeof(*trans));
	trans->pmp = pmp;
	trans->flags = flags;
	trans->td = curthread;

	lockmgr(&tman->translk, LK_EXCLUSIVE);

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If multiple flushes are trying to run we have to
		 * wait until it is our turn.  All flushes are serialized.
		 *
		 * We queue ourselves and then wait to become the head
		 * of the queue, allowing all prior flushes to complete.
		 *
		 * Multiple normal transactions can share the current
		 * transaction id but a flush transaction needs its own
		 * unique TID for proper block table update accounting.
		 *
		 * NOTE(review): this path dereferences pmp, so a flush
		 * transaction presumably always has a non-NULL pmp —
		 * confirm against callers.
		 */
		++tman->flushcnt;
		++pmp->alloc_tid;
		pmp->flush_tid = pmp->alloc_tid;
		tman->flush_xid = hammer2_trans_newxid(pmp);
		trans->sync_xid = tman->flush_xid;
		++pmp->alloc_tid;
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		if (TAILQ_FIRST(&tman->transq) != trans) {
			/* Another flush is ahead of us; sleep until woken
			 * by hammer2_trans_done() clearing trans->blocked. */
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	} else if (tman->flushcnt == 0) {
		/*
		 * No flushes are pending, we can go.  Use prior flush_xid + 1.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		trans->sync_xid = tman->flush_xid + 1;

		/* XXX improve/optimize inode allocation */
	} else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
		/*
		 * A buffer cache transaction is requested while a flush
		 * is in progress.  The flush's PREFLUSH flag must be set
		 * in this situation.
		 *
		 * The buffer cache flush takes on the main flush's
		 * transaction id.
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
		trans->flags |= HAMMER2_TRANS_PREFLUSH;
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;
		/* not allowed to block */
	} else {
		/*
		 * A normal transaction is requested while a flush is in
		 * progress.  We insert after the current flush and may
		 * block.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid + 1;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;

		/*
		 * XXX for now we must block new transactions, synchronous
		 * flush mode is on by default.
		 *
		 * If synchronous flush mode is enabled concurrent
		 * frontend transactions during the flush are not
		 * allowed (except we don't have a choice for buffer
		 * cache ops).
		 */
		if (hammer2_synchronous_flush > 0 ||
		    TAILQ_FIRST(&tman->transq) != head) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid,
					&tman->translk, 0,
					"h2multf", hz);
			}
		}
	}
	if (flags & HAMMER2_TRANS_NEWINODE) {
		if (pmp->spmp_hmp) {
			/*
			 * Super-root transaction, all new inodes have an
			 * inode number of 1.  Normal pfs inode cache
			 * semantics are not used.
			 */
			trans->inode_tid = 1;
		} else {
			/*
			 * Normal transaction: hand out the next per-PFS
			 * inode number, clamping to the reserved start.
			 */
			if (pmp->inode_tid < HAMMER2_INODE_START)
				pmp->inode_tid = HAMMER2_INODE_START;
			trans->inode_tid = pmp->inode_tid++;
		}
	}

	lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * This may only be called while in a flush transaction.
 * It's a bit of a hack but after flushing a PFS we need to flush each
 * volume root as part of the same transaction.
 *
 * Bumps the spmp's alloc_tid/flush_tid the same way hammer2_trans_init()
 * does for a flush, then retargets the transaction at the spmp.
 */
void
hammer2_trans_spmp(hammer2_trans_t *trans, hammer2_pfsmount_t *spmp)
{
	++spmp->alloc_tid;
	spmp->flush_tid = spmp->alloc_tid;
	++spmp->alloc_tid;
	trans->pmp = spmp;
}


/*
 * Terminate a transaction: remove it from the queue and wake up any
 * transactions that were blocked waiting on it.
 */
void
hammer2_trans_done(hammer2_trans_t *trans)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;
	hammer2_trans_t *scan;

	tman = &tmanage;

	/*
	 * Remove.
	 */
	lockmgr(&tman->translk, LK_EXCLUSIVE);
	TAILQ_REMOVE(&tman->transq, trans, entry);
	head = TAILQ_FIRST(&tman->transq);

	/*
	 * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
	 * up through the next flush.  (If the head is a flush then we
	 * stop there, unlike the unblock code following this section).
	 */
	if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
		--tman->flushcnt;
		scan = head;
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			atomic_clear_int(&scan->flags,
					 HAMMER2_TRANS_CONCURRENT);
			scan = TAILQ_NEXT(scan, entry);
		}
	}

	/*
	 * Unblock the head of the queue and any additional transactions
	 * up to the next flush.  The head can be a flush and it will be
	 * unblocked along with the non-flush transactions following it
	 * (which are allowed to run concurrently with it).
	 *
	 * In synchronous flush mode we stop if the head transaction is
	 * a flush.
	 */
	if (head && head->blocked) {
		head->blocked = 0;
		wakeup(&head->sync_xid);

		if (hammer2_synchronous_flush > 0)
			scan = head;
		else
			scan = TAILQ_NEXT(head, entry);
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			if (scan->blocked) {
				scan->blocked = 0;
				wakeup(&scan->sync_xid);
			}
			scan = TAILQ_NEXT(scan, entry);
		}
	}
	lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications and
 * mirror_tid updates back up as needed.
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose XXX values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  The chain's
 * UPDATE flag indicates that its parent's block table (which is not yet
 * part of the flush) should be updated.  The chain may be replaced by
 * the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	hammer2_chain_t *scan;
	hammer2_flush_info_t info;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flushq);
	info.trans = trans;
	info.sync_xid = trans->sync_xid;
	info.cache_index = -1;

	/*
	 * Calculate parent (can be NULL), if not NULL the flush core
	 * expects the parent to be referenced so it can easily lock/unlock
	 * it without it getting ripped up.
	 */
	if ((info.parent = chain->parent) != NULL)
		hammer2_chain_ref(info.parent);

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	loops = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave the FLUSH_* bits set for these chains, which
		 * will be handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flushq, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE: hammer2_flush() may replace scan.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_drop(scan);	/* ref from deferral */
			hammer2_flush(trans, scan);
			hammer2_chain_unlock(scan);
		}

		/*
		 * [re]flush chain.
		 */
		info.diddeferral = 0;
		hammer2_flush_core(&info, chain, 0);

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flushq))
			break;

		/* Safety valve: warn if the deferral loop fails to drain */
		if (++loops % 1000 == 0) {
			kprintf("hammer2_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
	hammer2_chain_drop(chain);
	if (info.parent)
		hammer2_chain_drop(info.parent);
}

/*
 * This is the core of the chain flushing code.
 * The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the UPDATE bit on the chain to determine if the parent needs updating.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not.  fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree).  This can create deferrals.
 *     A successful flush clears the MODIFIED and UPDATE bits on the children
 *     and typically causes the parent to be marked MODIFIED as the children
 *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
 *     handled by the frontend), or if the parent itself is modified by the
 *     frontend for other reasons.
 *
 * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
 *     Deleted-but-open inodes can still be individually flushed via the
 *     filesystem syncer.
 *
 * (5) Note that an unmodified child may still need the block table in its
 *     parent updated (e.g. rename/move).  The child will have UPDATE set
 *     in this case.
 *
 * WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid and blockref.mirror_tid are consistent only within a
 * PFS.  This is why we cannot cache sync_tid in the transaction structure.
 * Instead we access it from the pmp.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
		   int deleting)
{
	hammer2_chain_t *parent;
	hammer2_mount_t *hmp;
	hammer2_pfsmount_t *pmp;
	int diddeferral;	/* NOTE(review): saved but never read below */

	/*
	 * (1) Optimize downward recursion to locate nodes needing action.
	 *     Nothing to do if none of these flags are set.
	 *
	 * NOTE(review): 'deleting' is not referenced in this body; only
	 * callers pass 0 (see hammer2_flush_recurse's XXX).
	 */
	if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
		if (hammer2_debug & 0x200) {
			if (info->debug == NULL)
				info->debug = chain;
		} else {
			return;
		}
	}

	hmp = chain->hmp;
	pmp = chain->pmp;		/* can be NULL */
	diddeferral = info->diddeferral;
	parent = info->parent;		/* can be NULL */

	/*
	 * mirror_tid should not be forward-indexed
	 */
	KKASSERT(pmp == NULL || chain->bref.mirror_tid <= pmp->flush_tid);

	/*
	 * Downward search recursion
	 */
	if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
		/*
		 * Already deferred.
		 */
		++info->diddeferral;
	} else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
		/*
		 * Recursion depth reached.  Defer this chain (extra ref
		 * held by the flushq) and abort this branch of the flush.
		 */
		hammer2_chain_ref(chain);
		TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
		++info->diddeferral;
	} else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
		/*
		 * Downward recursion search (actual flush occurs bottom-up).
		 * pre-clear ONFLUSH.  It can get set again due to races,
		 * which we want so the scan finds us again in the next flush.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
		info->parent = chain;
		spin_lock(&chain->core.cst.spin);
		RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
			NULL, hammer2_flush_recurse, info);
		spin_unlock(&chain->core.cst.spin);
		info->parent = parent;
		if (info->diddeferral)
			hammer2_chain_setflush(info->trans, chain);
	}

	/*
	 * Now we are in the bottom-up part of the recursion.
	 *
	 * Do not update chain if lower layers were deferred.
	 */
	if (info->diddeferral)
		goto done;

	/*
	 * Propagate the DESTROY flag downwards.  This dummies up the flush
	 * code and tries to invalidate related buffer cache buffers to
	 * avoid the disk write.
	 */
	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);

	/*
	 * Chain was already modified or has become modified, flush it out.
	 * (We can loop back here from the UPDATE section below if a
	 * flush-mod race is detected.)
	 */
again:
	if ((hammer2_debug & 0x200) &&
	    info->debug &&
	    (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
		hammer2_chain_t *scan = chain;

		kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
		while (scan) {
			kprintf("    chain %p [%08x] bref=%016jx:%02x\n",
				scan, scan->flags,
				scan->bref.key, scan->bref.type);
			if (scan == info->debug)
				break;
			scan = scan->parent;
		}
	}

	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * Dispose of the modified bit.  UPDATE should already be
		 * set.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
			 chain == &hmp->vchain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		if (pmp) {
			hammer2_pfs_memory_wakeup(pmp);
			chain->bref.mirror_tid = pmp->flush_tid;
		}

		if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
		    chain == &hmp->vchain ||
		    chain == &hmp->fchain) {
			/*
			 * Drop the ref from the MODIFIED bit we cleared,
			 * net -1 ref.
			 */
			hammer2_chain_drop(chain);
		} else {
			/*
			 * Drop the ref from the MODIFIED bit we cleared and
			 * set a ref for the UPDATE bit we are setting.  Net
			 * 0 refs.
			 */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}

		/*
		 * Issue the flush.  This is indirect via the DIO.
		 *
		 * NOTE: A DELETED node that reaches this point must be
		 *	 flushed for synchronization point consistency.
		 *
		 * NOTE: Even though MODIFIED was already set, the related DIO
		 *	 might not be dirty due to a system buffer cache
		 *	 flush and must be set dirty if we are going to make
		 *	 further modifications to the buffer.  Chains with
		 *	 embedded data don't need this.
		 *
		 * Update bref.mirror_tid, clear MODIFIED, and set UPDATE.
		 */
		if (hammer2_debug & 0x1000) {
			kprintf("Flush %p.%d %016jx/%d sync_xid=%08x "
				"data=%016jx\n",
				chain, chain->bref.type,
				chain->bref.key, chain->bref.keybits,
				info->sync_xid,
				chain->bref.data_off);
		}
		if (hammer2_debug & 0x2000) {
			Debugger("Flush hell");
		}

		/*
		 * Update chain CRCs for flush.
		 *
		 * NOTE: Volume headers are NOT flushed here as they require
		 *	 special processing.
		 */
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * (note: embedded data, do not call setdirty)
			 */
			KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
			hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			/*
			 * The free block table is flushed by hammer2_vfs_sync()
			 * before it flushes vchain.  We must still hold fchain
			 * locked while copying voldata to volsync, however.
			 *
			 * (note: embedded data, do not call setdirty)
			 */
			hammer2_voldata_lock(hmp);
			hammer2_chain_lock(&hmp->fchain,
					   HAMMER2_RESOLVE_ALWAYS);
			/*
			 * There is no parent to our root vchain and fchain to
			 * synchronize the bref to, their updated mirror_tid's
			 * must be synchronized to the volume header.
			 */
			hmp->voldata.mirror_tid = chain->bref.mirror_tid;
			hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
			kprintf("mirror_tid %08jx\n",
				(intmax_t)chain->bref.mirror_tid);

			/*
			 * The volume header is flushed manually by the
			 * syncer, not here.  All we do here is adjust the
			 * crc's.
			 */
			KKASSERT(chain->data != NULL);
			KKASSERT(chain->dio == NULL);

			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC1_OFF,
					HAMMER2_VOLUME_ICRC1_SIZE);
			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC0_OFF,
					HAMMER2_VOLUME_ICRC0_SIZE);
			hmp->voldata.icrc_volheader =
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRCVH_OFF,
					HAMMER2_VOLUME_ICRCVH_SIZE);
			hmp->volsync = hmp->voldata;
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
			hammer2_chain_unlock(&hmp->fchain);
			hammer2_voldata_unlock(hmp);
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Data elements have already been flushed via the
			 * logical file buffer cache.  Their hash was set in
			 * the bref by the vop_write code.  Do not re-dirty.
			 *
			 * Make sure any device buffer(s) have been flushed
			 * out here (there aren't usually any to flush) XXX.
			 */
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			/*
			 * Buffer I/O will be cleaned up when the volume is
			 * flushed (but the kernel is free to flush it before
			 * then, as well).
			 */
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			hammer2_chain_setcheck(chain, chain->data);
			break;
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * NOTE: We must call io_setdirty() to make any late
			 *	 changes to the inode data, the system might
			 *	 have already flushed the buffer.
			 */
			if (chain->data->ipdata.op_flags &
			    HAMMER2_OPFLAG_PFSROOT) {
				/*
				 * non-NULL pmp if mounted as a PFS.  We must
				 * sync fields cached in the pmp? XXX
				 */
				hammer2_inode_data_t *ipdata;

				hammer2_io_setdirty(chain->dio);
				ipdata = &chain->data->ipdata;
				if (pmp)
					ipdata->pfs_inum = pmp->inode_tid;
			} else {
				/* can't be mounted as a PFS */
			}

			/*
			 * Update inode statistics.  Pending stats in chain
			 * are cleared out on UPDATE so expect that bit to
			 * be set here too or the statistics will not be
			 * rolled-up properly.
			 */
			if (chain->data_count || chain->inode_count) {
				hammer2_inode_data_t *ipdata;

				KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
				hammer2_io_setdirty(chain->dio);
				ipdata = &chain->data->ipdata;
				ipdata->data_count += chain->data_count;
				ipdata->inode_count += chain->inode_count;
			}
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			hammer2_chain_setcheck(chain, chain->data);
			break;
		default:
			KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
			panic("hammer2_flush_core: unsupported "
			      "embedded bref %d",
			      chain->bref.type);
			/* NOT REACHED */
		}

		/*
		 * If the chain was destroyed try to avoid unnecessary I/O.
		 * (this only really works if the DIO system buffer is the
		 * same size as chain->bytes).
		 */
		if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
			hammer2_io_setinval(chain->dio, chain->bytes);
		}
	}

	/*
	 * If UPDATE is set the parent block table may need to be updated.
	 *
	 * NOTE: UPDATE may be set on vchain or fchain in which case
	 *	 parent could be NULL.  It's easiest to allow the case
	 *	 and test for NULL.  parent can also wind up being NULL
	 *	 due to a deletion so we need to handle the case anyway.
	 *
	 * If no parent exists we can just clear the UPDATE bit.  If the
	 * chain gets reattached later on the bit will simply get set
	 * again.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		hammer2_chain_drop(chain);
	}

	/*
	 * The chain may need its blockrefs updated in the parent.  This
	 * requires some fancy footwork.
	 */
	if (chain->flags & HAMMER2_CHAIN_UPDATE) {
		hammer2_blockref_t *base;
		int count;

		/*
		 * Both parent and chain must be locked.  This requires
		 * temporarily unlocking the chain.  We have to deal with
		 * the case where the chain might be reparented or modified
		 * while it was unlocked.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
		if (chain->parent != parent) {
			/* reparented while unlocked; give up on this pass */
			kprintf("PARENT MISMATCH ch=%p p=%p/%p\n", chain, chain->parent, parent);
			hammer2_chain_unlock(parent);
			goto done;
		}

		/*
		 * Check race condition.  If someone got in and modified
		 * it again while it was unlocked, we have to loop up.
		 */
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			hammer2_chain_unlock(parent);
			kprintf("hammer2_flush: chain %p flush-mod race\n",
				chain);
			goto again;
		}

		/*
		 * Clear UPDATE flag (re-tested because the chain was
		 * briefly unlocked above; drop the ref it held)
		 */
		if (chain->flags & HAMMER2_CHAIN_UPDATE) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
			hammer2_chain_drop(chain);
		}
		hammer2_chain_modify(info->trans, parent, 0);

		/*
		 * Calculate blockmap pointer
		 */
		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there is
			 * no block array if the inode is flagged DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base = &parent->data->
					ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &chain->hmp->voldata.sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->npdata[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("hammer2_flush_core: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
		}

		/*
		 * Blocktable updates
		 *
		 * We synchronize pending statistics at this time.  Delta
		 * adjustments designated for the current and upper level
		 * are synchronized.
		 */
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
			if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
				hammer2_base_delete(info->trans, parent,
						    base, count,
						    &info->cache_index, chain);
				/* base_delete clears both bits */
			} else {
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_BMAPUPD);
			}
		}
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
			parent->data_count += chain->data_count +
					      chain->data_count_up;
			parent->inode_count += chain->inode_count +
					       chain->inode_count_up;
			chain->data_count = 0;
			chain->inode_count = 0;
			chain->data_count_up = 0;
			chain->inode_count_up = 0;
			hammer2_base_insert(info->trans, parent,
					    base, count,
					    &info->cache_index, chain);
			/* base_insert sets BMAPPED */
		}
		hammer2_chain_unlock(parent);
	}

	/*
	 * Final cleanup after flush
	 */
done:
	KKASSERT(chain->refs > 1);
	KKASSERT(pmp == NULL ||
		 chain->bref.mirror_tid <= chain->pmp->flush_tid);
	if (hammer2_debug & 0x200) {
		if (info->debug == chain)
			info->debug = NULL;
	}
}

/*
 * Flush recursion helper, called from flush_core, calls flush_core.
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_tid.  Set info->domodify if the child's blockref must propagate
 * back up to the parent.
 *
 * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 * flush scan order prevents any chains from being lost.  A child can be
 * executed more than once.
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *	    bref.mirror_tid ourselves to indicate that the flush has
 *	    processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 *
 * WARNING! Flushes do not cross PFS boundaries.  Specifically, a flush must
 *	    not cross a pfs-root boundary.
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	/*hammer2_trans_t *trans = info->trans;*/
	hammer2_chain_t *parent = info->parent;

	/*
	 * (child can never be fchain or vchain so a special check isn't
	 * needed).
	 *
	 * We must ref the child before unlocking the spinlock.
	 *
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.
	 */
	hammer2_chain_ref(child);
	spin_unlock(&parent->core.cst.spin);

	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

	/*
	 * Never recurse across a mounted PFS boundary.
	 *
	 * Recurse and collect deferral data.  The debug path (0x200)
	 * additionally recurses into children with no FLUSH_* bits set,
	 * tracking them via info->debug.
	 */
	if ((child->flags & HAMMER2_CHAIN_PFSBOUNDARY) == 0 ||
	    child->pmp == NULL) {
		if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
			++info->depth;
			hammer2_flush_core(info, child, 0); /* XXX deleting */
			--info->depth;
		} else if (hammer2_debug & 0x200) {
			if (info->debug == NULL)
				info->debug = child;
			++info->depth;
			hammer2_flush_core(info, child, 0); /* XXX deleting */
			--info->depth;
			if (info->debug == child)
				info->debug = NULL;
		}
	}

	/*
	 * Relock to continue the loop.  The child's ref is dropped only
	 * after the parent is relocked so the child cannot be ripped out
	 * from under the RB_SCAN.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop(child);
	KKASSERT(info->parent == parent);
	spin_lock(&parent->core.cst.spin);

	return (0);
}