1 /* 2 * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * TRANSACTION AND FLUSH HANDLING 37 * 38 * Deceptively simple but actually fairly difficult to implement properly is 39 * how I would describe it. 
 *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells how to recurse downward to find these chains.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

#define HAMMER2_FLUSH_DEPTH_LIMIT	10	/* stack recursion limit */


/*
 * Per-flush state shared by the recursive flush (hammer2_flush_core /
 * hammer2_flush_recurse).  The chain being flushed is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
	hammer2_chain_t *parent;	/* current parent during recursion,
					   can be NULL */
	hammer2_trans_t	*trans;		/* governing transaction */
	int		depth;		/* recursion depth, bounded by
					   HAMMER2_FLUSH_DEPTH_LIMIT */
	int		diddeferral;	/* non-zero if deeper levels were
					   deferred to flushq */
	int		cache_index;	/* blockref lookup cache hint */
	struct h2_flush_list flushq;	/* chains deferred at depth limit */
	hammer2_xid_t	sync_xid;	/* memory synchronization point */
	hammer2_tid_t	mirror_tid;	/* avoid digging through hmp */
	hammer2_tid_t	modify_tid;
	hammer2_chain_t	*debug;		/* debug-trace anchor when
					   (hammer2_debug & 0x200) */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t *chain, int deleting);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);

/*
 * For now use a global transaction manager.  What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's.  So we can't just stuff tmanage into hammer2_dev or
 * hammer2_pfs.
94 */ 95 static hammer2_trans_manage_t tmanage; 96 97 void 98 hammer2_trans_manage_init(void) 99 { 100 lockinit(&tmanage.translk, "h2trans", 0, 0); 101 TAILQ_INIT(&tmanage.transq); 102 tmanage.flush_xid = 1; 103 tmanage.alloc_xid = tmanage.flush_xid + 1; 104 } 105 106 hammer2_xid_t 107 hammer2_trans_newxid(hammer2_pfs_t *pmp __unused) 108 { 109 hammer2_xid_t xid; 110 111 for (;;) { 112 xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1); 113 if (xid) 114 break; 115 } 116 return xid; 117 } 118 119 /* 120 * Transaction support functions for writing to the filesystem. 121 * 122 * Initializing a new transaction allocates a transaction ID. Typically 123 * passed a pmp (hmp passed as NULL), indicating a cluster transaction. Can 124 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single 125 * media target. The latter mode is used by the recovery code. 126 * 127 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the 128 * other is a set of any number of concurrent filesystem operations. We 129 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops> 130 * or we can have <running_flush> + <concurrent_fs_ops>. 131 * 132 * During a flush, new fs_ops are only blocked until the fs_ops prior to 133 * the flush complete. The new fs_ops can then run concurrent with the flush. 134 * 135 * Buffer-cache transactions operate as fs_ops but never block. A 136 * buffer-cache flush will run either before or after the current pending 137 * flush depending on its state. 
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfs_t *pmp, int flags)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;

	tman = &tmanage;

	bzero(trans, sizeof(*trans));
	trans->pmp = pmp;
	trans->flags = flags;
	trans->td = curthread;

	lockmgr(&tman->translk, LK_EXCLUSIVE);

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If multiple flushes are trying to run we have to
		 * wait until it is our turn.  All flushes are serialized.
		 *
		 * We queue ourselves and then wait to become the head
		 * of the queue, allowing all prior flushes to complete.
		 *
		 * Multiple normal transactions can share the current
		 * transaction id but a flush transaction needs its own
		 * unique TID for proper block table update accounting.
		 */
		++tman->flushcnt;
		++pmp->modify_tid;
		tman->flush_xid = hammer2_trans_newxid(pmp);
		trans->sync_xid = tman->flush_xid;
		trans->modify_tid = pmp->modify_tid;
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		if (TAILQ_FIRST(&tman->transq) != trans) {
			trans->blocked = 1;
			while (trans->blocked) {
				/* woken by hammer2_trans_done() */
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	} else if (tman->flushcnt == 0) {
		/*
		 * No flushes are pending, we can go.  Use prior flush_xid + 1.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		trans->sync_xid = tman->flush_xid + 1;

		/* XXX improve/optimize inode allocation */
	} else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
		/*
		 * A buffer cache transaction is requested while a flush
		 * is in progress.  The flush's PREFLUSH flag must be set
		 * in this situation.
		 *
		 * The buffer cache flush takes on the main flush's
		 * transaction id.
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
		trans->flags |= HAMMER2_TRANS_PREFLUSH;
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid;
		trans->modify_tid = head->modify_tid;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;
		/* not allowed to block */
	} else {
		/*
		 * A normal transaction is requested while a flush is in
		 * progress.  We insert after the current flush and may
		 * block.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid + 1;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;

		/*
		 * XXX for now we must block new transactions, synchronous
		 * flush mode is on by default.
		 *
		 * If synchronous flush mode is enabled concurrent
		 * frontend transactions during the flush are not
		 * allowed (except we don't have a choice for buffer
		 * cache ops).
		 */
		if (hammer2_synchronous_flush > 0 ||
		    TAILQ_FIRST(&tman->transq) != head) {
			trans->blocked = 1;
			while (trans->blocked) {
				/* woken by hammer2_trans_done() */
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	}
	if (flags & HAMMER2_TRANS_NEWINODE) {
		if (pmp->spmp_hmp) {
			/*
			 * Super-root transaction, all new inodes have an
			 * inode number of 1.  Normal pfs inode cache
			 * semantics are not used.
			 */
			trans->inode_tid = 1;
		} else {
			/*
			 * Normal transaction: allocate the next per-PFS
			 * inode number, clamping up to the legal start.
			 */
			if (pmp->inode_tid < HAMMER2_INODE_START)
				pmp->inode_tid = HAMMER2_INODE_START;
			trans->inode_tid = pmp->inode_tid++;
		}
	}

	lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * Terminate a transaction: dequeue it, adjust the pending-flush count,
 * and unblock any transactions that were waiting behind us.
 */
void
hammer2_trans_done(hammer2_trans_t *trans)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;
	hammer2_trans_t *scan;

	tman = &tmanage;

	/*
	 * Remove the transaction from the queue.
	 */
	lockmgr(&tman->translk, LK_EXCLUSIVE);
	TAILQ_REMOVE(&tman->transq, trans, entry);
	head = TAILQ_FIRST(&tman->transq);

	/*
	 * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
	 * up through the next flush.  (If the head is a flush then we
	 * stop there, unlike the unblock code following this section).
	 */
	if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
		--tman->flushcnt;
		scan = head;
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			atomic_clear_int(&scan->flags,
					 HAMMER2_TRANS_CONCURRENT);
			scan = TAILQ_NEXT(scan, entry);
		}
	}

	/*
	 * Unblock the head of the queue and any additional transactions
	 * up to the next flush.  The head can be a flush and it will be
	 * unblocked along with the non-flush transactions following it
	 * (which are allowed to run concurrently with it).
	 *
	 * In synchronous flush mode we stop if the head transaction is
	 * a flush.
	 */
	if (head && head->blocked) {
		head->blocked = 0;
		wakeup(&head->sync_xid);

		if (hammer2_synchronous_flush > 0)
			scan = head;
		else
			scan = TAILQ_NEXT(head, entry);
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			if (scan->blocked) {
				scan->blocked = 0;
				wakeup(&scan->sync_xid);
			}
			scan = TAILQ_NEXT(scan, entry);
		}
	}
	lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications, modify_tid,
 * and mirror_tid updates back up as needed.
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose XXX values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  The chain's
 * UPDATE flag indicates that its parent's block table (which is not yet
 * part of the flush) should be updated.  The chain may be replaced by
 * the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	hammer2_chain_t *scan;
	hammer2_flush_info_t info;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flushq);
	info.trans = trans;
	info.sync_xid = trans->sync_xid;
	info.cache_index = -1;

	/*
	 * Calculate parent (can be NULL), if not NULL the flush core
	 * expects the parent to be referenced so it can easily lock/unlock
	 * it without it getting ripped up.
	 */
	if ((info.parent = chain->parent) != NULL)
		hammer2_chain_ref(info.parent);

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	loops = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave the FLUSH_* bits set for these chains, which
		 * will be handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flushq, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE: hammer2_flush() may replace scan.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_flush(trans, scan);
			hammer2_chain_unlock(scan);
			hammer2_chain_drop(scan);	/* ref from deferral */
		}

		/*
		 * [re]flush chain.
		 */
		info.diddeferral = 0;
		hammer2_flush_core(&info, chain, 0);

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flushq))
			break;

		if (++loops % 1000 == 0) {
			kprintf("hammer2_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
	hammer2_chain_drop(chain);
	if (info.parent)
		hammer2_chain_drop(info.parent);
}

/*
 * This is the core of the chain flushing code.
 * The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the UPDATE bit on the child to determine if the parent needs updating.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not.  fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree).  This can create deferrals.
 *     A successful flush clears the MODIFIED and UPDATE bits on the children
 *     and typically causes the parent to be marked MODIFIED as the children
 *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
 *     handled by the frontend), or if the parent itself is modified by the
 *     frontend for other reasons.
 *
 * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
 *     Deleted-but-open inodes can still be individually flushed via the
 *     filesystem syncer.
 *
 * (5) Note that an unmodified child may still need the block table in its
 *     parent updated (e.g. rename/move).  The child will have UPDATE set
 *     in this case.
 *
 * WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid is consistent only within a PFS, and will not be
 * consistent during synchronization.  mirror_tid is consistent across the
 * block device regardless of the PFS.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
		   int deleting)
{
	hammer2_chain_t *parent;
	hammer2_dev_t *hmp;
	int diddeferral;

	/*
	 * (1) Optimize downward recursion to locate nodes needing action.
	 *     Nothing to do if none of these flags are set.
	 */
	if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
		if (hammer2_debug & 0x200) {
			if (info->debug == NULL)
				info->debug = chain;
		} else {
			return;
		}
	}

	hmp = chain->hmp;
	diddeferral = info->diddeferral;
	parent = info->parent;		/* can be NULL */

	/*
	 * Downward search recursion
	 */
	if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
		/*
		 * Already deferred.
		 */
		++info->diddeferral;
	} else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
		/*
		 * Recursion depth reached.  Defer this chain (with a ref
		 * held by the flushq) for re-execution from the top.
		 */
		hammer2_chain_ref(chain);
		TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
		++info->diddeferral;
	} else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
		/*
		 * Downward recursion search (actual flush occurs bottom-up).
		 * pre-clear ONFLUSH.  It can get set again due to races,
		 * which we want so the scan finds us again in the next flush.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
		info->parent = chain;
		hammer2_spin_ex(&chain->core.spin);
		RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
			NULL, hammer2_flush_recurse, info);
		hammer2_spin_unex(&chain->core.spin);
		info->parent = parent;
		if (info->diddeferral)
			hammer2_chain_setflush(info->trans, chain);
	}

	/*
	 * Now we are in the bottom-up part of the recursion.
	 *
	 * Do not update chain if lower layers were deferred.
	 */
	if (info->diddeferral)
		goto done;

	/*
	 * Propagate the DESTROY flag downwards.  This dummies up the flush
	 * code and tries to invalidate related buffer cache buffers to
	 * avoid the disk write.
	 */
	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);

	/*
	 * Chain was already modified or has become modified, flush it out.
	 * (re-entered via 'again' if a flush-modify race is detected below)
	 */
again:
	if ((hammer2_debug & 0x200) &&
	    info->debug &&
	    (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
		hammer2_chain_t *scan = chain;

		kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
		while (scan) {
			kprintf("    chain %p [%08x] bref=%016jx:%02x\n",
				scan, scan->flags,
				scan->bref.key, scan->bref.type);
			if (scan == info->debug)
				break;
			scan = scan->parent;
		}
	}

	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * Dispose of the modified bit.
		 *
		 * UPDATE should already be set.
		 * bref.mirror_tid should already be set.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
			 chain == &hmp->vchain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);

		/*
		 * Manage threads waiting for excessive dirty memory to
		 * be retired.
		 */
		if (chain->pmp)
			hammer2_pfs_memory_wakeup(chain->pmp);

		if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
		    chain == &hmp->vchain ||
		    chain == &hmp->fchain) {
			/*
			 * Drop the ref from the MODIFIED bit we cleared,
			 * net -1 ref.
			 */
			hammer2_chain_drop(chain);
		} else {
			/*
			 * Drop the ref from the MODIFIED bit we cleared and
			 * set a ref for the UPDATE bit we are setting.  Net
			 * 0 refs.
			 */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}

		/*
		 * Issue the flush.  This is indirect via the DIO.
		 *
		 * NOTE: A DELETED node that reaches this point must be
		 *	 flushed for synchronization point consistency.
		 *
		 * NOTE: Even though MODIFIED was already set, the related DIO
		 *	 might not be dirty due to a system buffer cache
		 *	 flush and must be set dirty if we are going to make
		 *	 further modifications to the buffer.  Chains with
		 *	 embedded data don't need this.
		 */
		if (hammer2_debug & 0x1000) {
			kprintf("Flush %p.%d %016jx/%d sync_xid=%08x "
				"data=%016jx\n",
				chain, chain->bref.type,
				chain->bref.key, chain->bref.keybits,
				info->sync_xid,
				chain->bref.data_off);
		}
		if (hammer2_debug & 0x2000) {
			Debugger("Flush hell");
		}

		/*
		 * Update chain CRCs for flush.
		 *
		 * NOTE: Volume headers are NOT flushed here as they require
		 *	 special processing.
		 */
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * Update the volume header's freemap_tid to the
			 * freemap's flushing mirror_tid.
			 *
			 * (note: embedded data, do not call setdirty)
			 */
			KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
			KKASSERT(chain == &hmp->fchain);
			hmp->voldata.freemap_tid = chain->bref.mirror_tid;
			kprintf("sync freemap mirror_tid %08jx\n",
				(intmax_t)chain->bref.mirror_tid);

			/*
			 * The freemap can be flushed independently of the
			 * main topology, but for the case where it is
			 * flushed in the same transaction, and flushed
			 * before vchain (a case we want to allow for
			 * performance reasons), make sure modifications
			 * made during the flush under vchain use a new
			 * transaction id.
			 *
			 * Otherwise the mount recovery code will get confused.
			 */
			++hmp->voldata.mirror_tid;
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			/*
			 * The free block table is flushed by
			 * hammer2_vfs_sync() before it flushes vchain.
			 * We must still hold fchain locked while copying
			 * voldata to volsync, however.
			 *
			 * (note: embedded data, do not call setdirty)
			 */
			hammer2_voldata_lock(hmp);
			hammer2_chain_lock(&hmp->fchain,
					   HAMMER2_RESOLVE_ALWAYS);
			kprintf("sync volume mirror_tid %08jx\n",
				(intmax_t)chain->bref.mirror_tid);

			/*
			 * Update the volume header's mirror_tid to the
			 * main topology's flushing mirror_tid.  It is
			 * possible that voldata.mirror_tid is already
			 * beyond bref.mirror_tid due to the bump we made
			 * above in BREF_TYPE_FREEMAP.
			 */
			if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
				hmp->voldata.mirror_tid =
					chain->bref.mirror_tid;
			}

			/*
			 * The volume header is flushed manually by the
			 * syncer, not here.  All we do here is adjust the
			 * crc's.
			 */
			KKASSERT(chain->data != NULL);
			KKASSERT(chain->dio == NULL);

			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC1_OFF,
					HAMMER2_VOLUME_ICRC1_SIZE);
			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC0_OFF,
					HAMMER2_VOLUME_ICRC0_SIZE);
			hmp->voldata.icrc_volheader =
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRCVH_OFF,
					HAMMER2_VOLUME_ICRCVH_SIZE);

			kprintf("syncvolhdr %016jx %016jx\n",
				hmp->voldata.mirror_tid,
				hmp->vchain.bref.mirror_tid);
			hmp->volsync = hmp->voldata;
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
			hammer2_chain_unlock(&hmp->fchain);
			hammer2_voldata_unlock(hmp);
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Data elements have already been flushed via the
			 * logical file buffer cache.  Their hash was set in
			 * the bref by the vop_write code.  Do not re-dirty.
			 *
			 * Make sure any device buffer(s) have been flushed
			 * out here (there aren't usually any to flush) XXX.
			 */
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			/*
			 * Buffer I/O will be cleaned up when the volume is
			 * flushed (but the kernel is free to flush it before
			 * then, as well).
			 */
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			hammer2_chain_setcheck(chain, chain->data);
			break;
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * NOTE: We must call io_setdirty() to make any late
			 *	 changes to the inode data, the system might
			 *	 have already flushed the buffer.
			 */
			if (chain->data->ipdata.op_flags &
			    HAMMER2_OPFLAG_PFSROOT) {
				/*
				 * non-NULL pmp if mounted as a PFS.  We must
				 * sync fields cached in the pmp? XXX
				 */
				hammer2_inode_data_t *ipdata;

				hammer2_io_setdirty(chain->dio);
				ipdata = &chain->data->ipdata;
				if (chain->pmp) {
					ipdata->pfs_inum =
						chain->pmp->inode_tid;
				}
			} else {
				/* can't be mounted as a PFS */
			}

			/*
			 * Update inode statistics.  Pending stats in chain
			 * are cleared out on UPDATE so expect that bit to
			 * be set here too or the statistics will not be
			 * rolled-up properly.
			 */
			if (chain->data_count || chain->inode_count) {
				hammer2_inode_data_t *ipdata;

				KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
				hammer2_io_setdirty(chain->dio);
				ipdata = &chain->data->ipdata;
				ipdata->data_count += chain->data_count;
				ipdata->inode_count += chain->inode_count;
			}
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			hammer2_chain_setcheck(chain, chain->data);
			break;
		default:
			KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
			panic("hammer2_flush_core: unsupported "
			      "embedded bref %d",
			      chain->bref.type);
			/* NOT REACHED */
		}

		/*
		 * If the chain was destroyed try to avoid unnecessary I/O.
		 * (this only really works if the DIO system buffer is the
		 * same size as chain->bytes).
		 */
		if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
			hammer2_io_setinval(chain->dio, chain->bytes);
		}
	}

	/*
	 * If UPDATE is set the parent block table may need to be updated.
	 *
	 * NOTE: UPDATE may be set on vchain or fchain in which case
	 *	 parent could be NULL.  It's easiest to allow the case
	 *	 and test for NULL.  parent can also wind up being NULL
	 *	 due to a deletion so we need to handle the case anyway.
	 *
	 * If no parent exists we can just clear the UPDATE bit.  If the
	 * chain gets reattached later on the bit will simply get set
	 * again.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		hammer2_chain_drop(chain);
	}

	/*
	 * The chain may need its blockrefs updated in the parent.  This
	 * requires some fancy footwork.
	 */
	if (chain->flags & HAMMER2_CHAIN_UPDATE) {
		hammer2_blockref_t *base;
		int count;

		/*
		 * Both parent and chain must be locked.  This requires
		 * temporarily unlocking the chain.  We have to deal with
		 * the case where the chain might be reparented or modified
		 * while it was unlocked.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
		if (chain->parent != parent) {
			/* reparented while unlocked; give up on this pass */
			kprintf("PARENT MISMATCH ch=%p p=%p/%p\n", chain, chain->parent, parent);
			hammer2_chain_unlock(parent);
			goto done;
		}

		/*
		 * Check race condition.  If someone got in and modified
		 * it again while it was unlocked, we have to loop up.
		 */
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			hammer2_chain_unlock(parent);
			kprintf("hammer2_flush: chain %p flush-mod race\n",
				chain);
			goto again;
		}

		/*
		 * Clear UPDATE flag (drops the ref the UPDATE bit held)
		 */
		if (chain->flags & HAMMER2_CHAIN_UPDATE) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
			hammer2_chain_drop(chain);
		}
		hammer2_chain_modify(info->trans, parent, 0);

		/*
		 * Calculate blockmap pointer
		 */
		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there is
			 * no block array if the inode is flagged DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base = &parent->data->
					ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &chain->hmp->voldata.sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->npdata[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("hammer2_flush_core: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
		}

		/*
		 * Blocktable updates
		 *
		 * We synchronize pending statistics at this time.  Delta
		 * adjustments designated for the current and upper level
		 * are synchronized.
		 */
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
			if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
				hammer2_base_delete(info->trans, parent,
						    base, count,
						    &info->cache_index, chain);
				/* base_delete clears both bits */
			} else {
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_BMAPUPD);
			}
		}
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
			parent->data_count += chain->data_count +
					      chain->data_count_up;
			parent->inode_count += chain->inode_count +
					       chain->inode_count_up;
			chain->data_count = 0;
			chain->inode_count = 0;
			chain->data_count_up = 0;
			chain->inode_count_up = 0;
			hammer2_base_insert(info->trans, parent,
					    base, count,
					    &info->cache_index, chain);
			/* base_insert sets BMAPPED */
		}
		hammer2_chain_unlock(parent);
	}

	/*
	 * Final cleanup after flush
	 */
done:
	KKASSERT(chain->refs > 0);
	if (hammer2_debug & 0x200) {
		if (info->debug == chain)
			info->debug = NULL;
	}
}

/*
 * Flush recursion helper, called from flush_core, calls flush_core.
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_tid.  Set info->domodify if the child's blockref must propagate
 * back up to the parent.
 *
 * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 * flush scan order prevents any chains from being lost.  A child can be
 * executed more than once.
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *	    bref.mirror_tid ourselves to indicate that the flush has
 *	    processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 *
 * WARNING! Flushes do not cross PFS boundaries.  Specifically, a flush must
 *	    not cross a pfs-root boundary.
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	/*hammer2_trans_t *trans = info->trans;*/
	hammer2_chain_t *parent = info->parent;

	/*
	 * (child can never be fchain or vchain so a special check isn't
	 * needed).
	 *
	 * We must ref the child before unlocking the spinlock.
	 *
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.  Lock order: drop the
	 * parent's spinlock and chain lock before taking the child's
	 * chain lock.
	 */
	hammer2_chain_ref(child);
	hammer2_spin_unex(&parent->core.spin);

	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

	/*
	 * Recurse and collect deferral data.  We're in the media flush,
	 * this can cross PFS boundaries.  The debug branch forces the
	 * recursion even without FLUSH_* bits so the trace anchor can be
	 * maintained.
	 */
	if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
		++info->depth;
		hammer2_flush_core(info, child, 0); /* XXX deleting */
		--info->depth;
	} else if (hammer2_debug & 0x200) {
		if (info->debug == NULL)
			info->debug = child;
		++info->depth;
		hammer2_flush_core(info, child, 0); /* XXX deleting */
		--info->depth;
		if (info->debug == child)
			info->debug = NULL;
	}

	/*
	 * Relock to continue the loop: reacquire the parent's chain lock
	 * and spinlock, dropping the child's lock and our temporary ref.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop(child);
	KKASSERT(info->parent == parent);
	hammer2_spin_ex(&parent->core.spin);

	return (0);
}