1 /* 2 * Copyright (c) 2011-2013 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 */ 35 36 #include <sys/cdefs.h> 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/types.h> 40 #include <sys/lock.h> 41 #include <sys/uuid.h> 42 43 #include "hammer2.h" 44 45 /* 46 * Recursively flush the specified chain. The chain is locked and 47 * referenced by the caller and will remain so on return. The chain 48 * will remain referenced throughout but can temporarily lose its 49 * lock during the recursion to avoid unnecessarily stalling user 50 * processes. 51 */ 52 struct hammer2_flush_info { 53 hammer2_mount_t *hmp; 54 hammer2_chain_t *parent; 55 hammer2_trans_t *trans; 56 int depth; 57 int diddeferral; 58 struct flush_deferral_list flush_list; 59 hammer2_tid_t sync_tid; /* flush synchronization point */ 60 hammer2_tid_t mirror_tid; /* collect mirror TID updates */ 61 }; 62 63 typedef struct hammer2_flush_info hammer2_flush_info_t; 64 65 static void hammer2_chain_flush_core(hammer2_flush_info_t *info, 66 hammer2_chain_t *chain); 67 static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data); 68 static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data); 69 70 /* 71 * Transaction support functions for writing to the filesystem. 72 * 73 * Initializing a new transaction allocates a transaction ID. We 74 * don't bother marking the volume header MODIFIED. Instead, the volume 75 * will be synchronized at a later time as part of a larger flush sequence. 76 * 77 * Non-flush transactions can typically run concurrently. However if 78 * there are non-flush transaction both before AND after a flush trans, 79 * the transactions after stall until the ones before finish. 80 * 81 * Non-flush transactions occuring after a flush pointer can run concurrently 82 * with that flush. They only have to wait for transactions prior to the 83 * flush trans to complete before they unstall. 84 * 85 * WARNING! 
Modifications to the root volume cannot dup the root volume 86 * header to handle synchronization points, so alloc_tid can 87 * wind up (harmlessly) more advanced on flush. 88 * 89 * WARNING! Operations which might call inode_duplicate()/chain_duplicate() 90 * depend heavily on having a unique sync_tid to avoid duplication 91 * collisions (which key off of delete_tid). 92 */ 93 void 94 hammer2_trans_init(hammer2_mount_t *hmp, hammer2_trans_t *trans, int flags) 95 { 96 hammer2_trans_t *scan; 97 98 bzero(trans, sizeof(*trans)); 99 trans->hmp = hmp; 100 101 hammer2_voldata_lock(hmp); 102 trans->sync_tid = hmp->voldata.alloc_tid++; 103 trans->flags = flags; 104 trans->td = curthread; 105 TAILQ_INSERT_TAIL(&hmp->transq, trans, entry); 106 107 if (flags & HAMMER2_TRANS_ISFLUSH) { 108 /* 109 * If we are a flush we have to wait for all transactions 110 * prior to our flush synchronization point to complete 111 * before we can start our flush. 112 */ 113 ++hmp->flushcnt; 114 if (hmp->curflush == NULL) { 115 hmp->curflush = trans; 116 hmp->flush_tid = trans->sync_tid;; 117 } 118 while (TAILQ_FIRST(&hmp->transq) != trans) { 119 lksleep(&trans->sync_tid, &hmp->voldatalk, 120 0, "h2syncw", hz); 121 } 122 123 /* 124 * Once we become the running flush we can wakeup anyone 125 * who blocked on us. 126 */ 127 scan = trans; 128 while ((scan = TAILQ_NEXT(scan, entry)) != NULL) { 129 if (scan->flags & HAMMER2_TRANS_ISFLUSH) 130 break; 131 if (scan->blocked == 0) 132 break; 133 scan->blocked = 0; 134 wakeup(&scan->blocked); 135 } 136 } else { 137 /* 138 * If we are not a flush but our sync_tid is after a 139 * stalled flush, we have to wait until that flush unstalls 140 * (that is, all transactions prior to that flush complete), 141 * but then we can run concurrently with that flush. 142 * 143 * (flushcnt check only good as pre-condition, otherwise it 144 * may represent elements queued after us after we block). 
145 */ 146 if (hmp->flushcnt > 1 || 147 (hmp->curflush && 148 TAILQ_FIRST(&hmp->transq) != hmp->curflush)) { 149 trans->blocked = 1; 150 while (trans->blocked) { 151 lksleep(&trans->blocked, &hmp->voldatalk, 152 0, "h2trans", hz); 153 } 154 } 155 } 156 hammer2_voldata_unlock(hmp, 0); 157 } 158 159 void 160 hammer2_trans_done(hammer2_trans_t *trans) 161 { 162 hammer2_mount_t *hmp = trans->hmp; 163 hammer2_trans_t *scan; 164 165 hammer2_voldata_lock(hmp); 166 TAILQ_REMOVE(&hmp->transq, trans, entry); 167 if (trans->flags & HAMMER2_TRANS_ISFLUSH) { 168 /* 169 * If we were a flush we have to adjust curflush to the 170 * next flush. 171 * 172 * flush_tid is used to partition copy-on-write operations 173 * (mostly duplicate-on-modify ops), which is what allows 174 * us to execute a flush concurrent with modifying operations 175 * with higher TIDs. 176 */ 177 --hmp->flushcnt; 178 if (hmp->flushcnt) { 179 TAILQ_FOREACH(scan, &hmp->transq, entry) { 180 if (scan->flags & HAMMER2_TRANS_ISFLUSH) 181 break; 182 } 183 KKASSERT(scan); 184 hmp->curflush = scan; 185 hmp->flush_tid = scan->sync_tid; 186 } else { 187 /* 188 * Theoretically we don't have to clear flush_tid 189 * here since the flush will have synchronized 190 * all operations <= flush_tid already. But for 191 * now zero-it. 192 */ 193 hmp->curflush = NULL; 194 hmp->flush_tid = 0; 195 } 196 } else { 197 /* 198 * If we are not a flush but a flush is now at the head 199 * of the queue and we were previously blocking it, 200 * we can now unblock it. 
201 */ 202 if (hmp->flushcnt && 203 (scan = TAILQ_FIRST(&hmp->transq)) != NULL && 204 trans->sync_tid < scan->sync_tid && 205 (scan->flags & HAMMER2_TRANS_ISFLUSH)) { 206 wakeup(&scan->sync_tid); 207 } 208 } 209 hammer2_voldata_unlock(hmp, 0); 210 211 trans->hmp = NULL; 212 } 213 214 /* 215 * Flush the chain and all modified sub-chains through the specified 216 * synchronization point (sync_tid), propagating parent chain modifications 217 * and mirror_tid updates back up as needed. Since we are recursing downward 218 * we do not have to deal with the complexities of multi-homed chains (chains 219 * with multiple parents). 220 * 221 * Caller must have interlocked against any non-flush-related modifying 222 * operations in progress whos modify_tid values are less than or equal 223 * to the passed sync_tid. 224 * 225 * Caller must have already vetted synchronization points to ensure they 226 * are properly flushed. Only snapshots and cluster flushes can create 227 * these sorts of synchronization points. 228 * 229 * SUBMODIFIED is not cleared if modified elements with higher modify_tid 230 * values (thus not flushed) are still present after the flush. 231 * 232 * If a chain is unable to completely flush we have to be sure that 233 * SUBMODIFIED remains set up the parent chain, and that MOVED is not 234 * cleared or our desynchronized bref will not properly update in the 235 * parent. The parent's indirect block is copied-on-write and adjusted 236 * as needed so it no longer needs to be placemarked by the subchains, 237 * allowing the sub-chains to be cleaned out. 238 * 239 * This routine can be called from several places but the most important 240 * is from the hammer2_vop_reclaim() function. We want to try to completely 241 * clean out the inode structure to prevent disconnected inodes from 242 * building up and blowing out the kmalloc pool. However, it is not actually 243 * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery 244 * capability. 
 *
 * chain is locked on call and will remain locked on return.  If a flush
 * occurred, the chain's MOVED bit will be set indicating that its parent
 * (which is not part of the flush) should be updated.
 */
void
hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	hammer2_chain_t *scan;
	hammer2_flush_info_t info;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flush_list);
	info.hmp = trans->hmp;
	info.trans = trans;
	info.sync_tid = trans->sync_tid;
	info.mirror_tid = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave MOVED set for these chains, which will be
		 * handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flush_list, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("defered flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_flush(trans, scan);
			hammer2_chain_unlock(scan);
			hammer2_chain_drop(scan);	/* ref from deferral */
		}

		/*
		 * Flush pass1 on root.  SUBMODIFIED can remain set after
		 * this call for numerous reasons, including write failures,
		 * but most likely due to only a partial flush being
		 * requested or the chain element belongs to the wrong
		 * synchronization point.
		 */
		info.diddeferral = 0;
		hammer2_chain_flush_core(&info, chain);
#if FLUSH_DEBUG
		kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n",
			chain, chain->bref.type, chain->flags);
#endif

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flush_list))
			break;
	}

	/*
	 * SUBMODIFIED can be temporarily cleared and then re-set, which
	 * can prevent concurrent setsubmods from reaching all the way to
	 * the root.  If after the flush we find the node is still in need
	 * of flushing (though possibly due to modifications made outside
	 * the requested synchronization zone), we must call setsubmod again
	 * to cover the race.
	 */
	if (chain->flags & (HAMMER2_CHAIN_MOVED |
			    HAMMER2_CHAIN_DELETED |
			    HAMMER2_CHAIN_MODIFIED |
			    HAMMER2_CHAIN_SUBMODIFIED)) {
		hammer2_chain_parent_setsubmod(trans, chain);
	}
}

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and remains locked on return.  This function is keyed off of
 * the SUBMODIFIED bit but must make fine-grained choices based on the
 * synchronization point we are flushing to.
 *
 * If the flush accomplished any work chain will be flagged MOVED
 * indicating a copy-on-write propagation back up is required.
 * Deep sub-nodes may also have been entered onto the deferral list.
 * MOVED is never set on the volume root.
 *
 * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
 *	 only when a chain is specifically modified, and not updated
 *	 for copy-on-write propagations.  MODIFIED is set on any modification
 *	 including copy-on-write propagations.
 */
static void
hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
{
	hammer2_mount_t *hmp;
	hammer2_blockref_t *bref;
	hammer2_off_t pbase;
	hammer2_tid_t saved_sync;
	size_t bbytes;
	size_t boff;
	char *bdata;
	struct buf *bp;
	int error;
	int wasmodified;
	int diddeferral = 0;

	hmp = info->hmp;

#if FLUSH_DEBUG
	if (info->parent)
		kprintf("flush_core %p->%p.%d %08x (%s)\n",
			info->parent, chain, chain->bref.type,
			chain->flags,
			((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
				chain->data->ipdata.filename : "?"));
	else
		kprintf("flush_core NULL->%p.%d %08x (%s)\n",
			chain, chain->bref.type,
			chain->flags,
			((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
				chain->data->ipdata.filename : "?"));
#endif

#if 0
	/*
	 * A chain modified beyond our flush point is ignored by the current
	 * flush.  We could safely flush such chains if we wanted to (they
	 * just wouldn't propagate back up and be left with MOVED set), but
	 * doing so could lead to an infinite flush loop under heavy
	 * filesystem write loads.  By ignoring such elements the flush
	 * will only deal with changes as-of when the flush was started.
	 *
	 * NOTE: Unmodified chains set modify_tid to 0, allowing us to reach
	 *	 deeper chains.
	 */
	if (chain->modify_tid > info->sync_tid)
		return;
#endif

	/*
	 * Restrict the synchronization point when we encounter a
	 * delete/duplicate chain.  We do not do this for deletions
	 * at the end of the linked list because they represent an
	 * operation occurring within the flush range, whereas flushes
	 * through deleted chains which have been duplicated represent
	 * only changes made through that deletion point.
	 */
	saved_sync = info->sync_tid;
#if 0
	if (chain->duplink && (chain->flags & HAMMER2_CHAIN_DELETED) &&
	    chain->delete_tid < saved_sync) {
		info->sync_tid = chain->delete_tid;
	}
#endif

	/*
	 * If SUBMODIFIED is set we recurse the flush and adjust the
	 * blockrefs accordingly.
	 *
	 * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
	 *	 finishing in the face of filesystem activity.
	 */
	if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
		hammer2_chain_t *saved_parent;
		hammer2_tid_t saved_mirror;

		/*
		 * Clear SUBMODIFIED to catch races.  Note that any child
		 * with MODIFIED, DELETED, or MOVED set during Scan2, after
		 * it processes the child, will cause SUBMODIFIED to be
		 * re-set (for next time), but this does not stop us from
		 * synchronizing block updates which occurred.
		 *
		 * We don't want to set our chain to MODIFIED gratuitously.
		 *
		 * We need an extra ref on chain because we are going to
		 * release its lock temporarily in our child loop.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
		hammer2_chain_ref(chain);

		/*
		 * Run two passes.  The first pass handles MODIFIED and
		 * SUBMODIFIED chains and recurses while the second pass
		 * handles MOVED chains on the way back up.
		 *
		 * If the stack gets too deep we defer scan1, but must
		 * be sure to still run scan2 if on the next loop the
		 * deferred chain has been flushed and now needs MOVED
		 * handling on the way back up.
		 *
		 * Scan1 is recursive.
		 *
		 * NOTE: The act of handling a modified/submodified chain can
		 *	 cause the MOVED Flag to be set.  It can also be set
		 *	 via hammer2_chain_delete() and in other situations.
		 *
		 * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
		 *	 because children can be physically removed during
		 *	 the scan.
		 */
		saved_parent = info->parent;
		saved_mirror = info->mirror_tid;
		info->parent = chain;
		info->mirror_tid = chain->bref.mirror_tid;

		if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
			/* at the stack limit: queue for later, flat */
			if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
				hammer2_chain_ref(chain);
				TAILQ_INSERT_TAIL(&info->flush_list,
						  chain, flush_node);
				atomic_set_int(&chain->flags,
					       HAMMER2_CHAIN_DEFERRED);
			}
			diddeferral = 1;
		} else {
			info->diddeferral = 0;
			spin_lock(&chain->core->cst.spin);
			RB_SCAN(hammer2_chain_tree, &chain->core->rbtree,
				NULL, hammer2_chain_flush_scan1, info);
			spin_unlock(&chain->core->cst.spin);
			diddeferral += info->diddeferral;
		}

		/*
		 * Handle successfully flushed children who are in the MOVED
		 * state on the way back up the recursion.  This can have
		 * the side-effect of clearing MOVED.
		 *
		 * We execute this even if there were deferrals to try to
		 * keep the chain topology cleaner.
		 *
		 * Scan2 is non-recursive.
		 */
#if FLUSH_DEBUG
		kprintf("scan2_start parent %p %08x\n", chain, chain->flags);
#endif
		spin_lock(&chain->core->cst.spin);
		RB_SCAN(hammer2_chain_tree, &chain->core->rbtree,
			NULL, hammer2_chain_flush_scan2, info);
		spin_unlock(&chain->core->cst.spin);
#if FLUSH_DEBUG
		kprintf("scan2_stop parent %p %08x\n", chain, chain->flags);
#endif
		chain->bref.mirror_tid = info->mirror_tid;
		info->mirror_tid = saved_mirror;
		info->parent = saved_parent;
		hammer2_chain_drop(chain);
	}

	/*
	 * Restore sync_tid in case it was restricted by a delete/duplicate.
	 */
	info->sync_tid = saved_sync;

	/*
	 * Rollup diddeferral for caller.  Note direct assignment, not +=.
	 */
	info->diddeferral = diddeferral;

	/*
	 * Do not flush chain if there were any deferrals.  It will be
	 * retried later after the deferrals are independently handled.
	 */
	if (diddeferral) {
		if (hammer2_debug & 0x0008) {
			kprintf("%*.*s} %p/%d %04x (deferred)",
				info->depth, info->depth, "",
				chain, chain->refs, chain->flags);
		}
		return;
	}

	/*
	 * The DESTROYED flag is set when an inode is physically deleted
	 * and no longer referenced (no open descriptors).   We can
	 * safely clear the MODIFIED bit.
	 *
	 * The MOVED bit has to be left intact as this flags the zeroing
	 * of the bref in the parent chain.
	 *
	 * XXX
	 *
	 * Chain objects flagged for complete destruction recurse down from
	 * their inode.  The inode will have already been removed from
	 * its parent.  We have no need to disconnect the children from
	 * their parents or the inode in this situation (it would just
	 * waste time and storage with copy-on-write operations), so
	 * we can clear both the MODIFIED bit and the MOVED bit.
	 *
	 * DESTROYED chains stop processing here.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
	    (chain->flags & HAMMER2_CHAIN_DELETED)) {
#if 0
	    (chain->delete_tid <= info->sync_tid)) {
#endif
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			/* toss any dirty buffer; data is being destroyed */
			if (chain->bp)
				chain->bp->b_flags |= B_INVAL|B_RELBUF;
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			hammer2_chain_drop(chain);
		}
#if 0
		if (chain->flags & HAMMER2_CHAIN_MOVED) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
			hammer2_chain_drop(chain);
		}
#endif
		if (hammer2_debug & 0x0008) {
			kprintf("%*.*s} %p/%d %04x (destroyed)",
				info->depth, info->depth, "",
				chain, chain->refs, chain->flags);
		}
		return;
	}

	/*
	 * A degenerate flush might not have flushed anything and thus not
	 * processed modified blocks on the way back up.  Detect the case.
	 *
	 * Note that MOVED can be set without MODIFIED being set due to
	 * a deletion, in which case it is handled by Scan2 later on.
	 *
	 * Both bits can be set along with DELETED due to a deletion if
	 * modified data within the synchronization zone and the chain
	 * was then deleted beyond the zone, in which case we still have
	 * to flush for synchronization point consistency.  Otherwise though
	 * DELETED and MODIFIED are treated as separate flags.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
		return;

	/*
	 * Issue flush.
	 *
	 * A DESTROYED node that reaches this point must be flushed for
	 * synchronization point consistency.
	 */

	/*
	 * Update mirror_tid, clear MODIFIED, and set MOVED.
	 *
	 * The caller will update the parent's reference to this chain
	 * by testing MOVED as long as the modification was in-bounds.
	 *
	 * MOVED is never set on the volume root as there is no parent
	 * to adjust.
	 */
	if (chain->bref.mirror_tid < info->sync_tid)
		chain->bref.mirror_tid = info->sync_tid;
	wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
	if (chain == &hmp->vchain)
		kprintf("(FLUSHED VOLUME HEADER)\n");

	if ((chain->flags & HAMMER2_CHAIN_MOVED) ||
	    chain == &hmp->vchain) {
		/*
		 * Drop the ref from the MODIFIED bit we cleared.
		 */
		if (wasmodified)
			hammer2_chain_drop(chain);
	} else {
		/*
		 * If we were MODIFIED we inherit the ref from clearing
		 * that bit, otherwise we need another ref.
		 */
		if (wasmodified == 0)
			hammer2_chain_ref(chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}

	/*
	 * If this is part of a recursive flush we can go ahead and write
	 * out the buffer cache buffer and pass a new bref back up the chain
	 * via the MOVED bit.
	 *
	 * Volume headers are NOT flushed here as they require special
	 * processing.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
		/*
		 * The volume header is flushed manually by the syncer, not
		 * here.  All we do is adjust the crc's.
		 */
		KKASSERT(chain->data != NULL);
		KKASSERT(chain->bp == NULL);
		kprintf("volume header mirror_tid %jd\n",
			hmp->voldata.mirror_tid);

		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC1_OFF,
				HAMMER2_VOLUME_ICRC1_SIZE);
		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC0_OFF,
				HAMMER2_VOLUME_ICRC0_SIZE);
		hmp->voldata.icrc_volheader =
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRCVH_OFF,
				HAMMER2_VOLUME_ICRCVH_SIZE);
		hmp->volsync = hmp->voldata;
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
		break;
	case HAMMER2_BREF_TYPE_DATA:
		/*
		 * Data elements have already been flushed via the logical
		 * file buffer cache.  Their hash was set in the bref by
		 * the vop_write code.
		 *
		 * Make sure any device buffer(s) have been flushed out here.
		 * (there aren't usually any to flush).
		 */
		bbytes = chain->bytes;
		pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
		boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

		bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
		if (bp) {
			if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
			    (B_CACHE | B_DIRTY)) {
				cluster_awrite(bp);
			} else {
				bp->b_flags |= B_RELBUF;
				brelse(bp);
			}
		}
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Indirect blocks may be in an INITIAL state.  Use the
		 * chain_lock() call to ensure that the buffer has been
		 * instantiated (even though it is already locked the buffer
		 * might not have been instantiated).
		 *
		 * Only write the buffer out if it is dirty, it is possible
		 * the operating system had already written out the buffer.
		 */
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
		KKASSERT(chain->bp != NULL);

		bp = chain->bp;
		if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
		    (bp->b_flags & B_DIRTY)) {
			bdwrite(chain->bp);
		} else {
			brelse(chain->bp);
		}
		chain->bp = NULL;
		chain->data = NULL;
		hammer2_chain_unlock(chain);
		break;
	default:
		/*
		 * Embedded elements have to be flushed out.
		 */
		KKASSERT(chain->data != NULL);
		KKASSERT(chain->bp == NULL);
		bref = &chain->bref;

		KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
		KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) ==
			 HAMMER2_CHECK_ISCSI32);

		/*
		 * NOTE(review): chain->bp was asserted NULL above, so the
		 * else branch below appears unreachable; kept as-is.
		 */
		if (chain->bp == NULL) {
			/*
			 * The data is embedded, we have to acquire the
			 * buffer cache buffer and copy the data into it.
			 */
			if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
				bbytes = HAMMER2_MINIOSIZE;
			pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
			boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);

			/*
			 * The getblk() optimization can only be used if the
			 * physical block size matches the request.
			 */
			if (chain->bytes == bbytes) {
				bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
				error = 0;
			} else {
				error = bread(hmp->devvp, pbase, bbytes, &bp);
				KKASSERT(error == 0);
			}
			bdata = (char *)bp->b_data + boff;

			/*
			 * Copy the data to the buffer, mark the buffer
			 * dirty, and convert the chain to unmodified.
			 */
			bcopy(chain->data, bdata, chain->bytes);
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
			bp = NULL;
			chain->bref.check.iscsi32.value =
				hammer2_icrc32(chain->data, chain->bytes);
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				++hammer2_iod_meta_write;
			else
				++hammer2_iod_indr_write;
		} else {
			chain->bref.check.iscsi32.value =
				hammer2_icrc32(chain->data, chain->bytes);
		}
	}
	if (hammer2_debug & 0x0008) {
		kprintf("%*.*s} %p/%d %04x (flushed)",
			info->depth, info->depth, "",
			chain, chain->refs, chain->flags);
	}
}

/*
 * Flush helper scan1 (recursive)
 *
 * Flushes the children of the caller's chain (parent) and updates
 * the blockref, restricted by sync_tid.
 *
 * Ripouts during the loop should not cause any problems.  Because we are
 * flushing to a synchronization point, modification races will occur after
 * sync_tid and do not have to be flushed anyway.
 *
 * It is also ok if the parent is chain_duplicate()'d while unlocked because
 * the delete/duplication will install a delete_tid that is still larger than
 * our current sync_tid.
 */
static int
hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;
	/*hammer2_mount_t *hmp = info->hmp;*/
	int diddeferral;

#if 0
	kprintf("flush %p,%d [%08x] -> %p [%08x] %jx\n",
		parent, child->index, parent->flags,
		child, child->flags, child->bref.key);
#endif

	/*
	 * We should only need to recurse if SUBMODIFIED is set, but as
	 * a safety also recurse if MODIFIED is also set.  Return early
	 * if neither bit is set.
	 */
	if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
			     HAMMER2_CHAIN_MODIFIED)) == 0) {
		return (0);
	}
	hammer2_chain_ref(child);
	spin_unlock(&parent->core->cst.spin);

	/*
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.  Re-check the flags before
	 * continuing.
	 */
	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

	if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
			     HAMMER2_CHAIN_MODIFIED)) == 0) {
		hammer2_chain_unlock(child);
		hammer2_chain_drop(child);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
		spin_lock(&parent->core->cst.spin);
		return (0);
	}

	/*
	 * The DESTROYED flag can only be initially set on an unreferenced
	 * deleted inode and will propagate downward via the mechanic below.
	 * Such inode chains have been deleted for good and should no longer
	 * be subject to delete/duplication.
	 *
	 * This optimization allows the inode reclaim (destroy unlinked file
	 * on vnode reclamation after last close) to be flagged by just
	 * setting HAMMER2_CHAIN_DESTROYED at the top level and then will
	 * cause the chains to be terminated and related buffers to be
	 * invalidated and not flushed out.
	 *
	 * We have to be careful not to propagate the DESTROYED flag if
	 * the destruction occurred after our flush sync_tid.
	 */
	if ((parent->flags & HAMMER2_CHAIN_DESTROYED) &&
	    (child->flags & HAMMER2_CHAIN_DELETED) &&
#if 0
	    child->delete_tid <= info->sync_tid &&
#endif
	    (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
		KKASSERT(child->duplink == NULL);
		atomic_set_int(&child->flags,
			       HAMMER2_CHAIN_DESTROYED |
			       HAMMER2_CHAIN_SUBMODIFIED);
	}

	/*
	 * Recurse and collect deferral data.
	 */
	diddeferral = info->diddeferral;
	++info->depth;
	hammer2_chain_flush_core(info, child);
#if FLUSH_DEBUG
	kprintf("flush_core_done parent=%p flags=%08x child=%p.%d %08x\n",
		parent, parent->flags, child, child->bref.type, child->flags);
#endif
	--info->depth;
	info->diddeferral += diddeferral;

	hammer2_chain_unlock(child);
	hammer2_chain_drop(child);

	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);

	spin_lock(&parent->core->cst.spin);
	return (0);
}

/*
 * Flush helper scan2 (non-recursive)
 *
 * This pass on a chain's children propagates any MOVED or DELETED
 * elements back up the chain towards the root after those elements have
 * been fully flushed.  Unlike scan1, this function is NOT recursive and
 * the parent remains locked across the entire scan.
 *
 * Moves     - MOVED elements need to propagate their bref up to the parent.
 *	       all parents from element->parent through the duplink chain
 *	       must be updated.  The flag can only be reset once SUBMODIFIED
 *	       has been cleared for all parents in the chain.
 *
 *	       A secondary bcmp of the bref is made to catch out-of-order
 *	       flushes and not re-sync parents which are already correct.
 *
 * Deletions - Deletions are handled via delete_tid coupled with the MOVED
 *	       flag.  When a deletion is detected the parent's bref to the
 *	       child is properly cleared.  MOVED is always set when a deletion
 *	       is made.  A deleted element is an element where delete_tid !=
 *	       HAMMER2_MAX_TID.
 *
 * NOTE! We must re-set SUBMODIFIED on the parent(s) as appropriate, and
 *	 due to the above conditions it is possible to do this and still
 *	 have some children flagged MOVED depending on the synchronization.
 *
 * NOTE!
A deletion is a visbility issue, there can still be referenced to 910 * deleted elements (for example, to an unlinked file which is still 911 * open), and there can also be multiple chains pointing to the same 912 * bref where some are deleted and some are not (for example due to 913 * a rename). So a chain marked for deletion is basically considered 914 * to be live until it is explicitly destroyed. Such chains can also 915 * be freed once all MOVED and MODIFIED handling is done. 916 */ 917 static int 918 hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data) 919 { 920 hammer2_flush_info_t *info = data; 921 hammer2_chain_t *parent = info->parent; 922 hammer2_mount_t *hmp = info->hmp; 923 hammer2_blockref_t *base; 924 int count; 925 int child_flags; 926 927 /* 928 * Check update conditions prior to locking child. 929 * We may not be able to safely test the 64-bit TIDs 930 * but we can certainly test the flags. 931 * 932 * NOTE: DELETED always also sets MOVED. 933 */ 934 #if FLUSH_DEBUG 935 kprintf(" scan2 parent %p %08x child %p %08x ", parent, parent->flags, child, child->flags); 936 #endif 937 if (parent->flags & HAMMER2_CHAIN_DELETED) { 938 #if FLUSH_DEBUG 939 kprintf("A"); 940 #endif 941 child_flags = 0; 942 goto finalize; 943 } 944 945 /* 946 * Inodes with stale children that have been converted to DIRECTDATA 947 * mode (file extension or hardlink conversion typically) need to 948 * skipped right now before we start messing with a non-existant 949 * block table. 950 */ 951 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE && 952 (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)) { 953 #if FLUSH_DEBUG 954 kprintf("B"); 955 #endif 956 child_flags = 0; 957 goto finalize; 958 } 959 960 /* 961 * Ignore children modified beyond our flush point. If the parent 962 * is deleted within our flush we don't have to re-set SUBMODIFIED, 963 * otherwise we must set it according to the child's flags so 964 * SUBMODIFIED remains flagged for later flushes. 
965 * 966 * NOTE: modify_tid is only updated for modifications, NOT for 967 * deletions (delete_tid is updated for deletions). Also note 968 * that delete_tid will ALWAYS be >= modify_tid. 969 * 970 * XXX spin-lock on child->modify_tid ? 971 */ 972 #if 0 973 if (child->modify_tid > info->sync_tid) { 974 if (parent->delete_tid <= info->sync_tid) 975 child_flags = 0; 976 else 977 child_flags = child->flags; 978 #if FLUSH_DEBUG 979 kprintf("C"); 980 #endif 981 goto finalize; 982 } 983 #endif 984 985 if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) { 986 child_flags = child->flags; 987 #if FLUSH_DEBUG 988 kprintf("D"); 989 #endif 990 goto finalize; 991 } 992 993 hammer2_chain_ref(child); 994 spin_unlock(&parent->core->cst.spin); 995 996 /* 997 * The MOVED bit implies an additional reference which prevents 998 * the child from being destroyed out from under our operation 999 * so we can lock the child safely without worrying about it 1000 * getting ripped up (?). 1001 * 1002 * We can only update parents where child->parent matches. The 1003 * child->parent link will migrate along the chain but the flush 1004 * order must be enforced absolutely. Parent reflushed after the 1005 * child has passed them by should skip due to the modify_tid test. 1006 */ 1007 hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER); 1008 1009 #if 0 1010 if (child->parent != parent) { 1011 child_flags = child->flags; 1012 hammer2_chain_unlock(child); 1013 spin_lock(&parent->core->cst.spin); 1014 #if FLUSH_DEBUG 1015 kprintf("E"); 1016 #endif 1017 goto finalize; 1018 } 1019 #endif 1020 1021 /* 1022 * The parent's blockref to the child must be deleted or updated. 1023 * 1024 * This point is not reached on successful DESTROYED optimizations 1025 * but can be reached on recursive deletions. 1026 * 1027 * Because flushes are ordered we do not have to make a 1028 * modify/duplicate of indirect blocks. That is, the flush 1029 * code does not have to kmalloc or duplicate anything. 
We 1030 * can adjust the indirect block table in-place and reuse the 1031 * chain. It IS possible that the chain has already been duplicated 1032 * or may wind up being duplicated on-the-fly by modifying code 1033 * on the frontend. We simply use the original and ignore such 1034 * chains. However, it does mean we can't clear the MOVED bit. 1035 * 1036 * XXX recursive deletions not optimized. 1037 */ 1038 hammer2_chain_modify(info->trans, &parent, 1039 HAMMER2_MODIFY_NO_MODIFY_TID | 1040 HAMMER2_MODIFY_ASSERTNOCOPY); 1041 1042 switch(parent->bref.type) { 1043 case HAMMER2_BREF_TYPE_INODE: 1044 /* 1045 * XXX Should assert that OPFLAG_DIRECTDATA is 0 once we 1046 * properly duplicate the inode headers and do proper flush 1047 * range checks (all the children should be beyond the flush 1048 * point). For now just don't sync the non-applicable 1049 * children. 1050 * 1051 * XXX Can also occur due to hardlink consolidation. We 1052 * set OPFLAG_DIRECTDATA to prevent the indirect and data 1053 * blocks from syncing ot the hardlink pointer. 
1054 */ 1055 #if 0 1056 KKASSERT((parent->data->ipdata.op_flags & 1057 HAMMER2_OPFLAG_DIRECTDATA) == 0); 1058 #endif 1059 if (parent->data->ipdata.op_flags & 1060 HAMMER2_OPFLAG_DIRECTDATA) { 1061 base = NULL; 1062 } else { 1063 base = &parent->data->ipdata.u.blockset.blockref[0]; 1064 count = HAMMER2_SET_COUNT; 1065 } 1066 break; 1067 case HAMMER2_BREF_TYPE_INDIRECT: 1068 if (parent->data) { 1069 base = &parent->data->npdata.blockref[0]; 1070 } else { 1071 base = NULL; 1072 KKASSERT(child->flags & HAMMER2_CHAIN_DELETED); 1073 } 1074 count = parent->bytes / sizeof(hammer2_blockref_t); 1075 break; 1076 case HAMMER2_BREF_TYPE_VOLUME: 1077 base = &hmp->voldata.sroot_blockset.blockref[0]; 1078 count = HAMMER2_SET_COUNT; 1079 break; 1080 default: 1081 base = NULL; 1082 count = 0; 1083 panic("hammer2_chain_get: " 1084 "unrecognized blockref type: %d", 1085 parent->bref.type); 1086 } 1087 1088 /* 1089 * Update the parent's blockref table and propagate mirror_tid. 1090 * blockref updates do not touch modify_tid. Instead, mirroring 1091 * operations always reconcile the entire array during their 1092 * mirror_tid based recursion. 1093 * 1094 * WARNING! Deleted chains may still be used by the filesystem 1095 * in a later duplication, for example in a rename() 1096 * operation. Also any topological movement of the 1097 * related blocks. We only mess with the parent 1098 * block array, we do not mess with the child! 1099 * 1100 * We adjust the parent's bref pointer to the child but 1101 * we do not modify the contents of the child. 
1102 */ 1103 #if 0 1104 if (child->delete_tid <= info->sync_tid) { 1105 #endif 1106 if (child->flags & HAMMER2_CHAIN_DELETED) { 1107 if (base) { 1108 KKASSERT(child->index < count); 1109 bzero(&base[child->index], sizeof(child->bref)); 1110 if (info->mirror_tid < child->delete_tid) 1111 info->mirror_tid = child->delete_tid; 1112 } 1113 } else { 1114 if (base) { 1115 KKASSERT(child->index < count); 1116 base[child->index] = child->bref; 1117 if (info->mirror_tid < child->modify_tid) 1118 info->mirror_tid = child->modify_tid; 1119 } 1120 } 1121 KKASSERT(child->index >= 0); 1122 1123 /* 1124 * Propagate mirror_tid back up. 1125 */ 1126 if (info->mirror_tid < child->bref.mirror_tid) { 1127 info->mirror_tid = child->bref.mirror_tid; 1128 } 1129 if (parent->bref.type == HAMMER2_BREF_TYPE_VOLUME && 1130 hmp->voldata.mirror_tid < child->bref.mirror_tid) { 1131 hmp->voldata.mirror_tid = child->bref.mirror_tid; 1132 } 1133 1134 /*cleanup:*/ 1135 /* 1136 * Cleanup the children. Clear the MOVED flag 1137 * 1138 * XXXWe can only clear the MOVED flag when the child has progressed 1139 * to the last parent in the duplication chain. 1140 * 1141 * XXXMOVED might not be set if we are reflushing this chain due to 1142 * the previous chain overwriting the same index in the parent. 
1143 */ 1144 #if 0 1145 if (child->parent == parent && 1146 parent->duplink && (parent->flags & HAMMER2_CHAIN_DELETED)) { 1147 hammer2_chain_ref(parent->duplink); 1148 child->parent = parent->duplink; 1149 child->modify_tid = child->parent->modify_tid; 1150 hammer2_chain_drop(parent); 1151 } 1152 #endif 1153 if (child->flags & HAMMER2_CHAIN_MOVED) { 1154 atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED); 1155 hammer2_chain_drop(child); /* flag */ 1156 #if 0 1157 if (child->delete_tid == HAMMER2_MAX_TID) { 1158 atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED); 1159 hammer2_chain_drop(child); /* flag */ 1160 } else if (child->delete_tid <= info->sync_tid) { 1161 atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED); 1162 hammer2_chain_drop(child); /* flag */ 1163 } 1164 #endif 1165 } 1166 1167 /* 1168 * Unlock the child. This can wind up dropping the child's 1169 * last ref, removing it from the parent's RB tree, and deallocating 1170 * the structure. The RB_SCAN() our caller is doing handles the 1171 * situation. 1172 */ 1173 child_flags = child->flags; 1174 hammer2_chain_unlock(child); 1175 hammer2_chain_drop(child); 1176 spin_lock(&parent->core->cst.spin); 1177 #if FLUSH_DEBUG 1178 kprintf("F"); 1179 #endif 1180 1181 /* 1182 * The parent cleared SUBMODIFIED prior to the scan. If the child 1183 * still requires a flush (possibly due to being outside the current 1184 * synchronization zone), we must re-set SUBMODIFIED on the way back 1185 * up. 1186 */ 1187 finalize: 1188 #if FLUSH_DEBUG 1189 kprintf("G child %08x act=%08x\n", child_flags, child->flags); 1190 #endif 1191 if (child_flags & (HAMMER2_CHAIN_MOVED | 1192 HAMMER2_CHAIN_DELETED | 1193 HAMMER2_CHAIN_MODIFIED | 1194 HAMMER2_CHAIN_SUBMODIFIED)) { 1195 atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED); 1196 } 1197 1198 return (0); 1199 } 1200