/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.45 2008/07/31 04:42:04 dillon Exp $
 */
/*
 * HAMMER dependency flusher thread
 *
 * Meta-data updates create buffer dependencies which are arranged as a
 * hierarchy of lists.
 */

#include "hammer.h"

static void hammer_flusher_master_thread(void *arg);
static void hammer_flusher_slave_thread(void *arg);
static int hammer_flusher_flush(hammer_mount_t hmp, int *nomorep);
static int hammer_flusher_flush_inode(hammer_inode_t ip, void *data);

RB_GENERATE(hammer_fls_rb_tree, hammer_inode, rb_flsnode,
	    hammer_ino_rb_compare);

/*
 * Support structures for the flusher threads.
 */
struct hammer_flusher_info {
	TAILQ_ENTRY(hammer_flusher_info) entry;
	struct hammer_mount *hmp;
	thread_t	td;
	int		runstate;
	hammer_flush_group_t flg;
	struct hammer_transaction trans;	/* per-slave transaction */
};

typedef struct hammer_flusher_info *hammer_flusher_info_t;

/*
 * Sync all inodes pending on the flusher.
 *
 * All flush groups will be flushed.  This does not queue dirty inodes
 * to the flush groups, it just flushes out what has already been queued!
 */
void
hammer_flusher_sync(hammer_mount_t hmp)
{
	int seq;

	seq = hammer_flusher_async(hmp, NULL);
	hammer_flusher_wait(hmp, seq);
}

/*
 * Sync all flush groups through to close_flg - return immediately.
 * If close_flg is NULL all flush groups are synced.
 *
 * Returns the sequence number of the last closed flush group,
 * which may be close_flg.
 * When syncing to the end, if there are no flush groups pending, we
 * still cycle the flusher and must allocate a sequence number to
 * placemark the spot even though no flush group will ever be
 * associated with it.
 */
int
hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t close_flg)
{
	hammer_flush_group_t flg;
	int seq;

	/*
	 * Already closed
	 */
	if (close_flg && close_flg->closed)
		return(close_flg->seq);

	/*
	 * Close flush groups until we hit the end of the list
	 * or close_flg.
	 */
	while ((flg = hmp->next_flush_group) != NULL) {
		KKASSERT(flg->closed == 0 && flg->running == 0);
		flg->closed = 1;
		hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
		if (flg == close_flg)
			break;
	}

	if (hmp->flusher.td) {
		if (hmp->flusher.signal++ == 0)
			wakeup(&hmp->flusher.signal);
		if (flg) {
			seq = flg->seq;
		} else {
			seq = hmp->flusher.next;
			++hmp->flusher.next;
		}
	} else {
		seq = hmp->flusher.done;
	}
	return(seq);
}

/*
 * Flush the current/next flushable flg.  This function is typically called
 * in a loop along with hammer_flusher_wait(hmp, returned_seq) to iterate
 * flush groups until specific conditions are met.
 *
 * If a flush is currently in progress its seq is returned.
 *
 * If no flush is currently in progress the next available flush group
 * will be flushed and its seq returned.
 *
 * If no flush groups are present a dummy seq will be allocated and
 * returned and the flusher will be activated (e.g. to flush the
 * UNDO/REDO FIFO and the volume header).
 */
int
hammer_flusher_async_one(hammer_mount_t hmp)
{
	hammer_flush_group_t flg;
	int seq;

	if (hmp->flusher.td) {
		flg = TAILQ_FIRST(&hmp->flush_group_list);
		seq = hammer_flusher_async(hmp, flg);
	} else {
		seq = hmp->flusher.done;
	}
	return(seq);
}

/*
 * Wait for the flusher to finish flushing the specified sequence
 * number.  The flush is already running and will signal us on
 * each completion.
 */
void
hammer_flusher_wait(hammer_mount_t hmp, int seq)
{
	while (seq - hmp->flusher.done > 0)
		tsleep(&hmp->flusher.done, 0, "hmrfls", 0);
}
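
/*
 * Illustrative caller pattern for the async/wait pair above (a sketch,
 * not code from this file; drain_condition() is a placeholder for the
 * caller's termination test):
 *
 *	while (drain_condition(hmp)) {
 *		seq = hammer_flusher_async_one(hmp);
 *		hammer_flusher_wait(hmp, seq);
 *	}
 *
 * hammer_flusher_wait_next() below packages one iteration of exactly
 * this loop.
 */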

/*
 * Returns non-zero if the flusher is currently running.  Used for
 * time-domain multiplexing of frontend operations in order to avoid
 * starving the backend flusher.
 */
int
hammer_flusher_running(hammer_mount_t hmp)
{
	int seq = hmp->flusher.next - 1;

	if (seq - hmp->flusher.done > 0)
		return(1);
	return(0);
}

void
hammer_flusher_wait_next(hammer_mount_t hmp)
{
	int seq;

	seq = hammer_flusher_async_one(hmp);
	hammer_flusher_wait(hmp, seq);
}

void
hammer_flusher_create(hammer_mount_t hmp)
{
	hammer_flusher_info_t info;
	int i;

	hmp->flusher.signal = 0;
	hmp->flusher.done = 0;
	hmp->flusher.next = 1;
	hammer_ref(&hmp->flusher.finalize_lock);
	TAILQ_INIT(&hmp->flusher.run_list);
	TAILQ_INIT(&hmp->flusher.ready_list);

	lwkt_create(hammer_flusher_master_thread, hmp,
		    &hmp->flusher.td, NULL, 0, -1, "hammer-M");
	for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
		info = kmalloc(sizeof(*info), hmp->m_misc, M_WAITOK|M_ZERO);
		info->hmp = hmp;
		TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
		lwkt_create(hammer_flusher_slave_thread, info,
			    &info->td, NULL, 0, -1, "hammer-S%d", i);
	}
}

void
hammer_flusher_destroy(hammer_mount_t hmp)
{
	hammer_flusher_info_t info;

	/*
	 * Kill the master
	 */
	hmp->flusher.exiting = 1;
	while (hmp->flusher.td) {
		++hmp->flusher.signal;
		wakeup(&hmp->flusher.signal);
		tsleep(&hmp->flusher.exiting, 0, "hmrwex", hz);
	}

	/*
	 * Kill the slaves
	 */
	while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) != NULL) {
		KKASSERT(info->runstate == 0);
		TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
		info->runstate = -1;
		wakeup(&info->runstate);
		while (info->td)
			tsleep(&info->td, 0, "hmrwwc", 0);
		kfree(info, hmp->m_misc);
	}
}

/*
 * The master flusher thread manages the flusher sequence id and
 * synchronization with the slave work threads.
 */
static void
hammer_flusher_master_thread(void *arg)
{
	hammer_mount_t hmp;
	int seq;
	int nomore;

	hmp = arg;

	lwkt_gettoken(&hmp->fs_token);

	for (;;) {
		/*
		 * Flush all sequence numbers up to but not including .next,
		 * or until an open flush group is encountered.
		 */
		for (;;) {
			while (hmp->flusher.group_lock)
				tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
			hammer_flusher_clean_loose_ios(hmp);

			seq = hammer_flusher_flush(hmp, &nomore);
			hmp->flusher.done = seq;
			wakeup(&hmp->flusher.done);

			if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
				break;
			if (nomore)
				break;
		}

		/*
		 * Wait for activity.
		 */
		if (hmp->flusher.exiting && TAILQ_EMPTY(&hmp->flush_group_list))
			break;
		while (hmp->flusher.signal == 0)
			tsleep(&hmp->flusher.signal, 0, "hmrwwa", 0);
		hmp->flusher.signal = 0;
	}

	/*
	 * And we are done.
	 */
	hmp->flusher.td = NULL;
	wakeup(&hmp->flusher.exiting);
	lwkt_reltoken(&hmp->fs_token);
	lwkt_exit();
}
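
/*
 * Sequence accounting sketch (descriptive, inferred from the fields used
 * above): hmp->flusher.next is the next sequence number to be allocated,
 * hmp->flusher.done is the most recently completed sequence number, and
 * hmp->flusher.signal counts wakeup requests for the master thread.
 * Every sequence number still owed to a waiter satisfies
 *
 *	(seq - done) > 0 && (next - 1 - seq) >= 0
 *
 * using signed differences so that wrap-around of the int sequence space
 * is harmless; this is why hammer_flusher_wait() and
 * hammer_flusher_running() avoid direct magnitude comparisons.
 */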

/*
 * Flush the next sequence number until an open flush group is encountered
 * or we reach (next).  Not all sequence numbers will have flush groups
 * associated with them.  These require that the UNDO/REDO FIFO still be
 * flushed since it can take at least one additional run to synchronize
 * the FIFO, and more to also synchronize the reserve structures.
 */
static int
hammer_flusher_flush(hammer_mount_t hmp, int *nomorep)
{
	hammer_flusher_info_t info;
	hammer_flush_group_t flg;
	hammer_reserve_t resv;
	int count;
	int seq;

	/*
	 * Just in case there's a flush race on mount.  The seq number
	 * does not change.
	 */
	if (TAILQ_FIRST(&hmp->flusher.ready_list) == NULL) {
		*nomorep = 1;
		return (hmp->flusher.done);
	}
	*nomorep = 0;

	/*
	 * Flush the next sequence number.  Sequence numbers can exist
	 * without an assigned flush group, indicating that just a FIFO flush
	 * should occur.
	 */
	seq = hmp->flusher.done + 1;
	flg = TAILQ_FIRST(&hmp->flush_group_list);
	if (flg == NULL) {
		if (seq == hmp->flusher.next) {
			*nomorep = 1;
			return (hmp->flusher.done);
		}
	} else if (seq == flg->seq) {
		if (flg->closed) {
			KKASSERT(flg->running == 0);
			flg->running = 1;
			if (hmp->fill_flush_group == flg) {
				hmp->fill_flush_group =
					TAILQ_NEXT(flg, flush_entry);
			}
		} else {
			*nomorep = 1;
			return (hmp->flusher.done);
		}
	} else {
		/*
		 * Sequence number problems can only happen if a critical
		 * filesystem error occurred which forced the filesystem into
		 * read-only mode.
		 */
		KKASSERT(flg->seq - seq > 0 || hmp->ronly >= 2);
		flg = NULL;
	}

	/*
	 * We only do one flg but we may have to loop/retry.
	 *
	 * Due to various races it is possible to come across a flush
	 * group which has not yet been closed.
	 */
	count = 0;
	while (flg && flg->running) {
		++count;
		if (hammer_debug_general & 0x0001) {
			hdkprintf("%d ttl=%d recs=%d\n",
				  flg->seq, flg->total_count, flg->refs);
		}
		if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
			break;
		hammer_start_transaction_fls(&hmp->flusher.trans, hmp);

		/*
		 * If the previous flush cycle just about exhausted our
		 * UNDO space we may have to do a dummy cycle to move the
		 * first_offset up before actually digging into a new cycle,
		 * or the new cycle will not have sufficient undo space.
		 */
		if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3))
			hammer_flusher_finalize(&hmp->flusher.trans, 0);

		KKASSERT(hmp->next_flush_group != flg);

		/*
		 * Place the flg in the flusher structure and start the
		 * slaves running.  The slaves will compete for inodes
		 * to flush.
		 *
		 * Make a per-thread copy of the transaction.
		 */
		while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) != NULL) {
			TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
			info->flg = flg;
			info->runstate = 1;
			info->trans = hmp->flusher.trans;
			TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
			wakeup(&info->runstate);
		}

		/*
		 * Wait for all slaves to finish running
		 */
		while (TAILQ_FIRST(&hmp->flusher.run_list) != NULL)
			tsleep(&hmp->flusher.ready_list, 0, "hmrfcc", 0);

		/*
		 * Do the final finalization, clean up
		 */
		hammer_flusher_finalize(&hmp->flusher.trans, 1);
		hmp->flusher.tid = hmp->flusher.trans.tid;

		hammer_done_transaction(&hmp->flusher.trans);

		/*
		 * Loop up on the same flg.  If the flg is done clean it up
		 * and break out.  We only flush one flg.
		 */
		if (RB_EMPTY(&flg->flush_tree)) {
			KKASSERT(flg->refs == 0);
			TAILQ_REMOVE(&hmp->flush_group_list, flg, flush_entry);
			kfree(flg, hmp->m_misc);
			break;
		}
		KKASSERT(TAILQ_FIRST(&hmp->flush_group_list) == flg);
	}

	/*
	 * We may have pure meta-data to flush, or we may have to finish
	 * cycling the UNDO FIFO, even if there were no flush groups.
	 */
	if (count == 0 && hammer_flusher_haswork(hmp)) {
		hammer_start_transaction_fls(&hmp->flusher.trans, hmp);
		hammer_flusher_finalize(&hmp->flusher.trans, 1);
		hammer_done_transaction(&hmp->flusher.trans);
	}

	/*
	 * Clean up any freed big-blocks (typically zone-2).
	 * resv->flush_group is typically set several flush groups ahead
	 * of the free to ensure that the freed block is not reused until
	 * it can no longer be reused.
	 */
	while ((resv = TAILQ_FIRST(&hmp->delay_list)) != NULL) {
		if (resv->flg_no - seq > 0)
			break;
		hammer_reserve_clrdelay(hmp, resv);
	}
	return (seq);
}

/*
 * The slave flusher thread pulls work off the master flush list until no
 * work is left.
 */
static void
hammer_flusher_slave_thread(void *arg)
{
	hammer_flush_group_t flg;
	hammer_flusher_info_t info;
	hammer_mount_t hmp;

	info = arg;
	hmp = info->hmp;
	lwkt_gettoken(&hmp->fs_token);

	for (;;) {
		while (info->runstate == 0)
			tsleep(&info->runstate, 0, "hmrssw", 0);
		if (info->runstate < 0)
			break;
		flg = info->flg;

		RB_SCAN(hammer_fls_rb_tree, &flg->flush_tree, NULL,
			hammer_flusher_flush_inode, info);

		info->runstate = 0;
		info->flg = NULL;
		TAILQ_REMOVE(&hmp->flusher.run_list, info, entry);
		TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
		wakeup(&hmp->flusher.ready_list);
	}
	info->td = NULL;
	wakeup(&info->td);
	lwkt_reltoken(&hmp->fs_token);
	lwkt_exit();
}

void
hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
{
	hammer_buffer_t buffer;
	hammer_io_t io;

	/*
	 * loose ends - buffers without bp's aren't tracked by the kernel
	 * and can build up, so clean them out.  This can occur when an
	 * IO completes on a buffer with no references left.
	 *
	 * The io_token is needed to protect the list.
	 */
	if ((io = RB_ROOT(&hmp->lose_root)) != NULL) {
		lwkt_gettoken(&hmp->io_token);
		while ((io = RB_ROOT(&hmp->lose_root)) != NULL) {
			KKASSERT(io->mod_root == &hmp->lose_root);
			RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
			io->mod_root = NULL;
			hammer_ref(&io->lock);
			buffer = (void *)io;
			hammer_rel_buffer(buffer, 0);
		}
		lwkt_reltoken(&hmp->io_token);
	}
}

/*
 * Flush a single inode that is part of a flush group.
 *
 * Flusher errors are extremely serious, even ENOSPC shouldn't occur because
 * the front-end should have reserved sufficient space on the media.  Any
 * error other than EWOULDBLOCK will force the mount to be read-only.
 */
static
int
hammer_flusher_flush_inode(hammer_inode_t ip, void *data)
{
	hammer_flusher_info_t info = data;
	hammer_mount_t hmp = info->hmp;
	hammer_transaction_t trans = &info->trans;
	int error;

	/*
	 * Several slaves are operating on the same flush group concurrently.
	 * The SLAVEFLUSH flag prevents them from tripping over each other.
	 *
	 * NOTE: It is possible for an EWOULDBLOCK'd ip returned by one slave
	 *	 to be resynced by another, but normally such inodes are not
	 *	 revisited until the master loop gets to them.
	 */
	if (ip->flags & HAMMER_INODE_SLAVEFLUSH)
		return(0);
	ip->flags |= HAMMER_INODE_SLAVEFLUSH;
	++hammer_stats_inode_flushes;

	hammer_flusher_clean_loose_ios(hmp);
	vm_wait_nominal();
	error = hammer_sync_inode(trans, ip);

	/*
	 * EWOULDBLOCK can happen under normal operation, all other errors
	 * are considered extremely serious.  We must set WOULDBLOCK
	 * mechanics to deal with the mess left over from the abort of the
	 * previous flush.
	 */
	if (error) {
		ip->flags |= HAMMER_INODE_WOULDBLOCK;
		if (error == EWOULDBLOCK)
			error = 0;
	}
	hammer_flush_inode_done(ip, error);
	/* ip invalid */

	while (hmp->flusher.finalize_want)
		tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0);
	if (hammer_flusher_undo_exhausted(trans, 1)) {
		hkprintf("Warning: UNDO area too small!\n");
		hammer_flusher_finalize(trans, 1);
	} else if (hammer_flusher_meta_limit(trans->hmp)) {
		hammer_flusher_finalize(trans, 0);
	}
	return (0);
}

/*
 * Return non-zero if the UNDO area has less than (quarter / 4) of its
 * space left.
 *
 * 1/4 - Emergency free undo space level.  Below this point the flusher
 *	 will finalize even if directory dependencies have not been resolved.
 *
 * 2/4 - Used by the pruning and reblocking code.  These functions may be
 *	 running in parallel with a flush and cannot be allowed to drop
 *	 available undo space to emergency levels.
 *
 * 3/4 - Used at the beginning of a flush to force-sync the volume header
 *	 to give the flush plenty of runway to work in.
 */
int
hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter)
{
	if (hammer_undo_space(trans) <
	    hammer_undo_max(trans->hmp) * quarter / 4) {
		return(1);
	} else {
		return(0);
	}
}
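
/*
 * Worked example for the quarter thresholds above (the 1024MB figure is
 * hypothetical, not a HAMMER constant): if hammer_undo_max() returns
 * 1024MB, quarter 1 trips below 256MB of free undo space, quarter 2
 * below 512MB, and quarter 3 below 768MB.  The callers in this file
 * match the table: hammer_flusher_flush() pre-finalizes at quarter 3
 * and hammer_flusher_flush_inode() emergency-finalizes at quarter 1.
 */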

/*
 * Flush all pending UNDOs, wait for write completion, update the volume
 * header with the new UNDO end position, and flush it.  Then
 * asynchronously flush the meta-data.
 *
 * If this is the last finalization in a flush group we also synchronize
 * our cached blockmap and set hmp->flusher_undo_start and our cached undo
 * fifo first_offset so the next flush resets the FIFO pointers.
 *
 * If this is not final it is being called because too many dirty meta-data
 * buffers have built up and must be flushed with UNDO synchronization to
 * avoid a buffer cache deadlock.
 */
void
hammer_flusher_finalize(hammer_transaction_t trans, int final)
{
	hammer_volume_t root_volume;
	hammer_blockmap_t cundomap, dundomap;
	hammer_mount_t hmp;
	hammer_io_t io;
	hammer_off_t save_undo_next_offset;
	int count;
	int i;

	hmp = trans->hmp;
	root_volume = trans->rootvol;

	/*
	 * Exclusively lock the flusher.  This guarantees that all dirty
	 * buffers will be idled (have a mod-count of 0).
	 */
	++hmp->flusher.finalize_want;
	hammer_lock_ex(&hmp->flusher.finalize_lock);

	/*
	 * If this isn't the final sync several threads may have hit the
	 * meta-limit at the same time and raced.  Only sync if we really
	 * have to, after acquiring the lock.
	 */
	if (final == 0 && !hammer_flusher_meta_limit(hmp))
		goto done;

	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		goto done;

	/*
	 * Flush data buffers.  This can occur asynchronously and at any
	 * time.  We must interlock against the frontend direct-data write
	 * but do not have to acquire the sync-lock yet.
	 *
	 * These data buffers have already been collected prior to the
	 * related inode(s) getting queued to the flush group.
	 */
	count = 0;
	while ((io = RB_FIRST(hammer_mod_rb_tree, &hmp->data_root)) != NULL) {
		if (io->ioerror)
			break;
		hammer_ref(&io->lock);
		hammer_io_write_interlock(io);
		KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
		hammer_io_flush(io, 0);
		hammer_io_done_interlock(io);
		hammer_rel_buffer(HAMMER_ITOB(io), 0);
		hammer_io_limit_backlog(hmp);
		++count;
	}

	/*
	 * The sync-lock is required for the remaining sequence.  This lock
	 * prevents meta-data from being modified.
	 */
	hammer_sync_lock_ex(trans);

	/*
	 * If we have been asked to finalize the volume header, sync the
	 * cached blockmap to the on-disk blockmap.  Generate an UNDO
	 * record for the update.
	 */
	if (final) {
		cundomap = &hmp->blockmap[0];
		dundomap = &root_volume->ondisk->vol0_blockmap[0];
		if (root_volume->io.modified) {
			hammer_modify_volume(trans, root_volume,
					     dundomap, sizeof(hmp->blockmap));
			for (i = 0; i < HAMMER_MAX_ZONES; ++i)
				hammer_crc_set_blockmap(&cundomap[i]);
			bcopy(cundomap, dundomap, sizeof(hmp->blockmap));
			hammer_modify_volume_done(root_volume);
		}
	}

	/*
	 * Flush UNDOs.  This can occur concurrently with the data flush
	 * because data writes never overwrite.
	 *
	 * This also waits for I/Os to complete and flushes the cache on
	 * the target disk.
	 *
	 * Record the UNDO append point as this can continue to change
	 * after we have flushed the UNDOs.
	 */
	cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
	hammer_lock_ex(&hmp->undo_lock);
	save_undo_next_offset = cundomap->next_offset;
	hammer_unlock(&hmp->undo_lock);
	hammer_flusher_flush_undos(hmp, HAMMER_FLUSH_UNDOS_FORCED);

	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		goto failed;

	/*
	 * HAMMER VERSION < 4:
	 *	Update the on-disk volume header with new UNDO FIFO end
	 *	position (do not generate new UNDO records for this change).
	 *	We have to do this for the UNDO FIFO whether (final) is
	 *	set or not in order for the UNDOs to be recognized on
	 *	recovery.
	 *
	 * HAMMER VERSION >= 4:
	 *	The UNDO FIFO data written above will be recognized on
	 *	recovery without us having to sync the volume header.
	 *
	 * Also update the on-disk next_tid field.  This does not require
	 * an UNDO.  However, because our TID is generated before we get
	 * the sync lock another sync may have beat us to the punch.
	 *
	 * This also has the side effect of updating first_offset based on
	 * a prior finalization when the first finalization of the next flush
	 * cycle occurs, removing any undo info from the prior finalization
	 * from consideration.
	 *
	 * The volume header will be flushed out synchronously.
	 */
	dundomap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

	if (dundomap->first_offset != cundomap->first_offset ||
	    dundomap->next_offset != save_undo_next_offset) {
		hammer_modify_volume_noundo(NULL, root_volume);
		dundomap->first_offset = cundomap->first_offset;
		dundomap->next_offset = save_undo_next_offset;
		hammer_crc_set_blockmap(dundomap);
		hammer_modify_volume_done(root_volume);
	}

	/*
	 * vol0_next_tid is used for TID selection and is updated without
	 * an UNDO so we do not reuse a TID that may have been rolled-back.
	 *
	 * vol0_last_tid is the highest fully-synchronized TID.  It is
	 * set up when the UNDO FIFO is fully synced, later on (not here).
	 *
	 * The root volume can be open for modification by other threads
	 * generating UNDO or REDO records.  For example, reblocking,
	 * pruning, REDO mode fast-fsyncs, so the write interlock is
	 * mandatory.
	 */
	if (root_volume->io.modified) {
		hammer_modify_volume_noundo(NULL, root_volume);
		if (root_volume->ondisk->vol0_next_tid < trans->tid)
			root_volume->ondisk->vol0_next_tid = trans->tid;
		hammer_crc_set_volume(root_volume->ondisk);
		hammer_modify_volume_done(root_volume);
		hammer_io_write_interlock(&root_volume->io);
		hammer_io_flush(&root_volume->io, 0);
		hammer_io_done_interlock(&root_volume->io);
	}

	/*
	 * Wait for I/Os to complete.
	 *
	 * For HAMMER VERSION 4+ filesystems we do not have to wait for
	 * the I/O to complete as the new UNDO FIFO entries are recognized
	 * even without the volume header update.  This allows the volume
	 * header to be flushed along with meta-data, significantly reducing
	 * flush overheads.
	 */
	hammer_flusher_clean_loose_ios(hmp);
	if (hmp->version < HAMMER_VOL_VERSION_FOUR)
		hammer_io_wait_all(hmp, "hmrfl3", 1);

	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		goto failed;

	/*
	 * Flush meta-data.  The meta-data will be undone if we crash
	 * so we can safely flush it asynchronously.  There is no need
	 * to wait for I/O to complete (or issue a synchronous disk flush).
	 *
	 * In fact, even if we did wait the meta-data will still be undone
	 * by a crash up until the next flush cycle due to the first_offset
	 * in the volume header for the UNDO FIFO not being adjusted until
	 * the following flush cycle.
	 *
	 * No io interlock is needed, bioops callbacks will not mess with
	 * meta-data buffers.
	 */
	count = 0;
	while ((io = RB_FIRST(hammer_mod_rb_tree, &hmp->meta_root)) != NULL) {
		if (io->ioerror)
			break;
		KKASSERT(io->modify_refs == 0);
		hammer_ref(&io->lock);
		KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
		hammer_io_flush(io, 0);
		hammer_rel_buffer(HAMMER_ITOB(io), 0);
		hammer_io_limit_backlog(hmp);
		++count;
	}

	/*
	 * If this is the final finalization for the flush group set
	 * up for the next sequence by setting a new first_offset in
	 * our cached blockmap and clearing the undo history.
	 *
	 * Even though we have updated our cached first_offset, the on-disk
	 * first_offset still governs available-undo-space calculations.
	 *
	 * We synchronize to save_undo_next_offset rather than
	 * cundomap->next_offset because that is what we flushed out
	 * above.
	 *
	 * NOTE! UNDOs can only be added with the sync_lock held
	 *	 so we can clear the undo history without racing.
	 *	 REDOs can be added at any time which is why we
	 *	 have to be careful and use save_undo_next_offset
	 *	 when setting the new first_offset.
	 */
	if (final) {
		cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
		if (cundomap->first_offset != save_undo_next_offset) {
			cundomap->first_offset = save_undo_next_offset;
			hmp->hflags |= HMNT_UNDO_DIRTY;
		} else if (cundomap->first_offset != cundomap->next_offset) {
			hmp->hflags |= HMNT_UNDO_DIRTY;
		} else {
			hmp->hflags &= ~HMNT_UNDO_DIRTY;
		}
		hammer_clear_undo_history(hmp);

		/*
		 * Flush tid sequencing.  flush_tid1 is fully synchronized,
		 * meaning a crash will not roll it back.  flush_tid2 has
		 * been written out asynchronously and a crash will roll
		 * it back.  flush_tid1 is used for all mirroring masters.
		 */
		if (hmp->flush_tid1 != hmp->flush_tid2) {
			hmp->flush_tid1 = hmp->flush_tid2;
			wakeup(&hmp->flush_tid1);
		}
		hmp->flush_tid2 = trans->tid;

		/*
		 * Clear the REDO SYNC flag.  This flag is used to ensure
		 * that the recovery span in the UNDO/REDO FIFO contains
		 * at least one REDO SYNC record.
		 */
		hmp->flags &= ~HAMMER_MOUNT_REDO_SYNC;
	}

	/*
	 * Cleanup.  Report any critical errors.
	 */
failed:
	hammer_sync_unlock(trans);

	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) {
		hvkprintf(root_volume,
			  "Critical write error during flush, "
			  "refusing to sync UNDO FIFO\n");
	}

done:
	hammer_unlock(&hmp->flusher.finalize_lock);

	if (--hmp->flusher.finalize_want == 0)
		wakeup(&hmp->flusher.finalize_want);
	hammer_stats_commits += final;
}

/*
 * Flush UNDOs.
 */
void
hammer_flusher_flush_undos(hammer_mount_t hmp, int mode)
{
	hammer_io_t io;
	int count;

	count = 0;
	while ((io = RB_FIRST(hammer_mod_rb_tree, &hmp->undo_root)) != NULL) {
		if (io->ioerror)
			break;
		hammer_ref(&io->lock);
		KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
		hammer_io_write_interlock(io);
		hammer_io_flush(io, hammer_undo_reclaim(io));
		hammer_io_done_interlock(io);
		hammer_rel_buffer(HAMMER_ITOB(io), 0);
		hammer_io_limit_backlog(hmp);
		++count;
	}
	hammer_flusher_clean_loose_ios(hmp);
	if (mode == HAMMER_FLUSH_UNDOS_FORCED ||
	    (mode == HAMMER_FLUSH_UNDOS_AUTO && count)) {
		hammer_io_wait_all(hmp, "hmrfl1", 1);
	} else {
		hammer_io_wait_all(hmp, "hmrfl2", 0);
	}
}

/*
 * Return non-zero if too many dirty meta-data buffers have built up.
 *
 * Since we cannot allow such buffers to flush until we have dealt with
 * the UNDOs, we risk deadlocking the kernel's buffer cache.
 */
int
hammer_flusher_meta_limit(hammer_mount_t hmp)
{
	if (hmp->locked_dirty_space + hmp->io_running_space >
	    hammer_limit_dirtybufspace) {
		return(1);
	}
	return(0);
}

/*
 * Return non-zero if too many dirty meta-data buffers have built up.
 *
 * This version is used by background operations (mirror, prune, reblock)
 * to leave room for foreground operations.
 */
int
hammer_flusher_meta_halflimit(hammer_mount_t hmp)
{
	if (hmp->locked_dirty_space + hmp->io_running_space >
	    hammer_limit_dirtybufspace / 2) {
		return(1);
	}
	return(0);
}
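
/*
 * Illustrative throttle using the half-limit above (a sketch, not code
 * from this file): a background operation such as reblocking can yield
 * to the flusher with
 *
 *	while (hammer_flusher_meta_halflimit(hmp))
 *		hammer_flusher_wait_next(hmp);
 *
 * letting the backend drain dirty meta-data before the scan continues.
 */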

/*
 * Return non-zero if the flusher still has something to flush.
 */
int
hammer_flusher_haswork(hammer_mount_t hmp)
{
	if (hmp->ronly)
		return(0);
	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		return(0);
	if (TAILQ_FIRST(&hmp->flush_group_list) ||	/* dirty inodes */
	    RB_ROOT(&hmp->volu_root) ||			/* dirty buffers */
	    RB_ROOT(&hmp->undo_root) ||
	    RB_ROOT(&hmp->data_root) ||
	    RB_ROOT(&hmp->meta_root) ||
	    (hmp->hflags & HMNT_UNDO_DIRTY)) {		/* UNDO FIFO sync */
		return(1);
	}
	return(0);
}

int
hammer_flush_dirty(hammer_mount_t hmp, int max_count)
{
	int count = 0;
	int dummy;

	while (hammer_flusher_haswork(hmp)) {
		hammer_flusher_sync(hmp);
		++count;
		if (count >= 5) {
			if (count == 5)
				hkprintf("flushing.");
			else
				kprintf(".");
			tsleep(&dummy, 0, "hmrufl", hz);
		}
		if (max_count != -1 && count == max_count) {
			kprintf("giving up");
			break;
		}
	}
	if (count >= 5)
		kprintf("\n");

	/* a max_count of -1 means no limit, so do not report failure */
	if (max_count != -1 && count >= max_count)
		return(-1);
	return(0);
}