/*
 * Copyright (c) 2008 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.45 2008/07/31 04:42:04 dillon Exp $
 */
/*
 * HAMMER dependency flusher thread
 *
 * Meta-data updates create buffer dependencies which are arranged as a
 * hierarchy of lists.
 */

#include "hammer.h"

static void hammer_flusher_master_thread(void *arg);
static void hammer_flusher_slave_thread(void *arg);
static int hammer_flusher_flush(hammer_mount_t hmp, int *nomorep);
static int hammer_flusher_flush_inode(hammer_inode_t ip, void *data);

RB_GENERATE(hammer_fls_rb_tree, hammer_inode, rb_flsnode,
	    hammer_ino_rb_compare);

/*
 * Support structures for the flusher threads.
 */
typedef struct hammer_flusher_info {
	TAILQ_ENTRY(hammer_flusher_info) entry;
	hammer_mount_t	hmp;
	thread_t	td;
	int		runstate;
	hammer_flush_group_t flg;
	struct hammer_transaction trans;	/* per-slave transaction */
} *hammer_flusher_info_t;

/*
 * Sync all inodes pending on the flusher.
 *
 * All flush groups will be flushed. This does not queue dirty inodes
 * to the flush groups, it just flushes out what has already been queued!
 */
void
hammer_flusher_sync(hammer_mount_t hmp)
{
	int seq;

	seq = hammer_flusher_async(hmp, NULL);
	hammer_flusher_wait(hmp, seq);
}

/*
 * Sync all flush groups through to close_flg - return immediately.
 * If close_flg is NULL all flush groups are synced.
 *
 * Returns the sequence number of the last closed flush group,
 * which may be close_flg. When syncing to the end if there
 * are no flush groups pending we still cycle the flusher, and
 * must allocate a sequence number to placemark the spot even
 * though no flush group will ever be associated with it.
 */
int
hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t close_flg)
{
	hammer_flush_group_t flg;
	int seq;

	/*
	 * Already closed
	 */
	if (close_flg && close_flg->closed)
		return(close_flg->seq);

	/*
	 * Close flush groups until we hit the end of the list
	 * or close_flg.
	 */
	while ((flg = hmp->next_flush_group) != NULL) {
		KKASSERT(flg->closed == 0 && flg->running == 0);
		flg->closed = 1;
		hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
		if (flg == close_flg)
			break;
	}

	if (hmp->flusher.td) {
		if (hmp->flusher.signal++ == 0)
			wakeup(&hmp->flusher.signal);
		if (flg) {
			seq = flg->seq;
		} else {
			seq = hmp->flusher.next;
			++hmp->flusher.next;
		}
	} else {
		seq = hmp->flusher.done;
	}
	return(seq);
}

/*
 * Flush the current/next flushable flg. This function is typically called
 * in a loop along with hammer_flusher_wait(hmp, returned_seq) to iterate
 * flush groups until specific conditions are met.
 *
 * If a flush is currently in progress its seq is returned.
 *
 * If no flush is currently in progress the next available flush group
 * will be flushed and its seq returned.
 *
 * If no flush groups are present a dummy seq will be allocated and
 * returned and the flusher will be activated (e.g. to flush the
 * undo/redo and the volume header).
 */
int
hammer_flusher_async_one(hammer_mount_t hmp)
{
	hammer_flush_group_t flg;
	int seq;

	if (hmp->flusher.td) {
		flg = TAILQ_FIRST(&hmp->flush_group_list);
		seq = hammer_flusher_async(hmp, flg);
	} else {
		seq = hmp->flusher.done;
	}
	return(seq);
}

/*
 * Wait for the flusher to finish flushing the specified sequence
 * number. The flush is already running and will signal us on
 * each completion.
 */
void
hammer_flusher_wait(hammer_mount_t hmp, int seq)
{
	while (seq - hmp->flusher.done > 0)
		tsleep(&hmp->flusher.done, 0, "hmrfls", 0);
}

/*
 * Returns non-zero if the flusher is currently running. Used for
 * time-domain multiplexing of frontend operations in order to avoid
 * starving the backend flusher.
 */
int
hammer_flusher_running(hammer_mount_t hmp)
{
	int seq = hmp->flusher.next - 1;
	if (seq - hmp->flusher.done > 0)
		return(1);
	return (0);
}

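/*
 * Flush one flush group (or cycle the flusher if none are queued) and
 * wait for that flush to complete before returning.
 */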
void
hammer_flusher_wait_next(hammer_mount_t hmp)
{
	int seq;

	seq = hammer_flusher_async_one(hmp);
	hammer_flusher_wait(hmp, seq);
}

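/*
 * Initialize the flusher state and create the flusher master thread and
 * its pool of slave work threads.
 */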
void
hammer_flusher_create(hammer_mount_t hmp)
{
	hammer_flusher_info_t info;
	int i;

	hmp->flusher.signal = 0;
	hmp->flusher.done = 0;
	hmp->flusher.next = 1;
	hammer_ref(&hmp->flusher.finalize_lock);
	TAILQ_INIT(&hmp->flusher.run_list);
	TAILQ_INIT(&hmp->flusher.ready_list);

	lwkt_create(hammer_flusher_master_thread, hmp,
		    &hmp->flusher.td, NULL, 0, -1, "hammer-M");
	for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
		info = kmalloc(sizeof(*info), hmp->m_misc, M_WAITOK|M_ZERO);
		info->hmp = hmp;
		TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
		lwkt_create(hammer_flusher_slave_thread, info,
			    &info->td, NULL, 0, -1, "hammer-S%d", i);
	}
}

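/*
 * Shut down the flusher. The master thread is signalled to exit and each
 * slave is removed from the ready list, told to exit, and its info
 * structure is freed.
 */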
void
hammer_flusher_destroy(hammer_mount_t hmp)
{
	hammer_flusher_info_t info;

	/*
	 * Kill the master
	 */
	hmp->flusher.exiting = 1;
	while (hmp->flusher.td) {
		++hmp->flusher.signal;
		wakeup(&hmp->flusher.signal);
		tsleep(&hmp->flusher.exiting, 0, "hmrwex", hz);
	}

	/*
	 * Kill the slaves
	 */
	while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) != NULL) {
		KKASSERT(info->runstate == 0);
		TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
		info->runstate = -1;
		wakeup(&info->runstate);
		while (info->td)
			tsleep(&info->td, 0, "hmrwwc", 0);
		kfree(info, hmp->m_misc);
	}
}

/*
 * The master flusher thread manages the flusher sequence id and
 * synchronization with the slave work threads.
 */
static void
hammer_flusher_master_thread(void *arg)
{
	hammer_mount_t hmp;
	int seq;
	int nomore;

	hmp = arg;

	lwkt_gettoken(&hmp->fs_token);

	for (;;) {
		/*
		 * Flush all sequence numbers up to but not including .next,
		 * or until an open flush group is encountered.
		 */
		for (;;) {
			while (hmp->flusher.group_lock)
				tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
			hammer_flusher_clean_loose_ios(hmp);

			seq = hammer_flusher_flush(hmp, &nomore);
			hmp->flusher.done = seq;
			wakeup(&hmp->flusher.done);

			if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
				break;
			if (nomore)
				break;
		}

		/*
		 * Wait for activity.
		 */
		if (hmp->flusher.exiting && TAILQ_EMPTY(&hmp->flush_group_list))
			break;
		while (hmp->flusher.signal == 0)
			tsleep(&hmp->flusher.signal, 0, "hmrwwa", 0);
		hmp->flusher.signal = 0;
	}

	/*
	 * And we are done.
	 */
	hmp->flusher.td = NULL;
	wakeup(&hmp->flusher.exiting);
	lwkt_reltoken(&hmp->fs_token);
	lwkt_exit();
}

/*
 * Flush the next sequence number until an open flush group is encountered
 * or we reach (next). Not all sequence numbers will have flush groups
 * associated with them. These require that the UNDO/REDO FIFO still be
 * flushed since it can take at least one additional run to synchronize
 * the FIFO, and more to also synchronize the reserve structures.
 */
static int
hammer_flusher_flush(hammer_mount_t hmp, int *nomorep)
{
	hammer_flusher_info_t info;
	hammer_flush_group_t flg;
	hammer_reserve_t resv;
	int count;
	int seq;

	/*
	 * Just in case there's a flush race on mount. Seq number
	 * does not change.
	 */
	if (TAILQ_FIRST(&hmp->flusher.ready_list) == NULL) {
		*nomorep = 1;
		return (hmp->flusher.done);
	}
	*nomorep = 0;

	/*
	 * Flush the next sequence number. Sequence numbers can exist
	 * without an assigned flush group, indicating that just a FIFO flush
	 * should occur.
	 */
	seq = hmp->flusher.done + 1;
	flg = TAILQ_FIRST(&hmp->flush_group_list);
	if (flg == NULL) {
		if (seq == hmp->flusher.next) {
			*nomorep = 1;
			return (hmp->flusher.done);
		}
	} else if (seq == flg->seq) {
		if (flg->closed) {
			KKASSERT(flg->running == 0);
			flg->running = 1;
			if (hmp->fill_flush_group == flg) {
				hmp->fill_flush_group =
					TAILQ_NEXT(flg, flush_entry);
			}
		} else {
			*nomorep = 1;
			return (hmp->flusher.done);
		}
	} else {
		/*
		 * Sequence number problems can only happen if a critical
		 * filesystem error occurred which forced the filesystem into
		 * read-only mode.
		 */
		KKASSERT(flg->seq - seq > 0 || hmp->ronly >= 2);
		flg = NULL;
	}

	/*
	 * We only do one flg but we may have to loop/retry.
	 *
	 * Due to various races it is possible to come across a flush
	 * group which has not yet been closed.
	 */
	count = 0;
	while (flg && flg->running) {
		++count;
		if (hammer_debug_general & 0x0001) {
			hdkprintf("%d ttl=%d recs=%d\n",
				flg->seq, flg->total_count, flg->refs);
		}
		if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
			break;
		hammer_start_transaction_fls(&hmp->flusher.trans, hmp);

		/*
		 * If the previous flush cycle just about exhausted our
		 * UNDO space we may have to do a dummy cycle to move the
		 * first_offset up before actually digging into a new cycle,
		 * or the new cycle will not have sufficient undo space.
		 */
		if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3))
			hammer_flusher_finalize(&hmp->flusher.trans, 0);

		KKASSERT(hmp->next_flush_group != flg);

		/*
		 * Place the flg in the flusher structure and start the
		 * slaves running. The slaves will compete for inodes
		 * to flush.
		 *
		 * Make a per-thread copy of the transaction.
		 */
		while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) != NULL) {
			TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
			info->flg = flg;
			info->runstate = 1;
			info->trans = hmp->flusher.trans;
			TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
			wakeup(&info->runstate);
		}

		/*
		 * Wait for all slaves to finish running
		 */
		while (TAILQ_FIRST(&hmp->flusher.run_list) != NULL)
			tsleep(&hmp->flusher.ready_list, 0, "hmrfcc", 0);

		/*
		 * Do the final finalization, clean up
		 */
		hammer_flusher_finalize(&hmp->flusher.trans, 1);
		hmp->flusher.tid = hmp->flusher.trans.tid;

		hammer_done_transaction(&hmp->flusher.trans);

		/*
		 * Loop up on the same flg. If the flg is done clean it up
		 * and break out. We only flush one flg.
		 */
		if (RB_EMPTY(&flg->flush_tree)) {
			KKASSERT(flg->refs == 0);
			TAILQ_REMOVE(&hmp->flush_group_list, flg, flush_entry);
			kfree(flg, hmp->m_misc);
			break;
		}
		KKASSERT(TAILQ_FIRST(&hmp->flush_group_list) == flg);
	}

	/*
	 * We may have pure meta-data to flush, or we may have to finish
	 * cycling the UNDO FIFO, even if there were no flush groups.
	 */
	if (count == 0 && hammer_flusher_haswork(hmp)) {
		hammer_start_transaction_fls(&hmp->flusher.trans, hmp);
		hammer_flusher_finalize(&hmp->flusher.trans, 1);
		hammer_done_transaction(&hmp->flusher.trans);
	}

	/*
	 * Clean up any freed big-blocks (typically zone-2).
	 * resv->flg_no is typically set several flush groups ahead
	 * of the free to ensure that the freed block is not reused until
	 * it is certain it can no longer be referenced.
	 */
	while ((resv = TAILQ_FIRST(&hmp->delay_list)) != NULL) {
		if (resv->flg_no - seq > 0)
			break;
		hammer_reserve_clrdelay(hmp, resv);
	}
	return (seq);
}

/*
 * The slave flusher thread pulls work off the master flush list until no
 * work is left.
 */
static void
hammer_flusher_slave_thread(void *arg)
{
	hammer_flush_group_t flg;
	hammer_flusher_info_t info;
	hammer_mount_t hmp;

	info = arg;
	hmp = info->hmp;
	lwkt_gettoken(&hmp->fs_token);

	for (;;) {
		while (info->runstate == 0)
			tsleep(&info->runstate, 0, "hmrssw", 0);
		if (info->runstate < 0)
			break;
		flg = info->flg;

		RB_SCAN(hammer_fls_rb_tree, &flg->flush_tree, NULL,
			hammer_flusher_flush_inode, info);

		info->runstate = 0;
		info->flg = NULL;
		TAILQ_REMOVE(&hmp->flusher.run_list, info, entry);
		TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
		wakeup(&hmp->flusher.ready_list);
	}
	info->td = NULL;
	wakeup(&info->td);
	lwkt_reltoken(&hmp->fs_token);
	lwkt_exit();
}

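/*
 * Release loose-end buffers (buffers whose I/O completed with no
 * references remaining) which have accumulated on hmp->lose_root.
 */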
void
hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
{
	hammer_buffer_t buffer;
	hammer_io_t io;

	/*
	 * loose ends - buffers without bp's aren't tracked by the kernel
	 * and can build up, so clean them out. This can occur when an
	 * IO completes on a buffer with no references left.
	 *
	 * The io_token is needed to protect the list.
	 */
	if ((io = RB_ROOT(&hmp->lose_root)) != NULL) {
		lwkt_gettoken(&hmp->io_token);
		while ((io = RB_ROOT(&hmp->lose_root)) != NULL) {
			KKASSERT(io->mod_root == &hmp->lose_root);
			RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
			io->mod_root = NULL;
			hammer_ref(&io->lock);
			buffer = (void *)io;
			hammer_rel_buffer(buffer, 0);
		}
		lwkt_reltoken(&hmp->io_token);
	}
}

/*
 * Flush a single inode that is part of a flush group.
 *
 * Flusher errors are extremely serious; even ENOSPC shouldn't occur because
 * the front-end should have reserved sufficient space on the media. Any
 * error other than EWOULDBLOCK will force the mount to be read-only.
 */
static
int
hammer_flusher_flush_inode(hammer_inode_t ip, void *data)
{
	hammer_flusher_info_t info = data;
	hammer_mount_t hmp = info->hmp;
	hammer_transaction_t trans = &info->trans;
	int error;

	/*
	 * Several slaves are operating on the same flush group concurrently.
	 * The SLAVEFLUSH flag prevents them from tripping over each other.
	 *
	 * NOTE: It is possible for an EWOULDBLOCK'd ip returned by one slave
	 *	 to be resynced by another, but normally such inodes are not
	 *	 revisited until the master loop gets to them.
	 */
	if (ip->flags & HAMMER_INODE_SLAVEFLUSH)
		return(0);
	ip->flags |= HAMMER_INODE_SLAVEFLUSH;
	++hammer_stats_inode_flushes;

	hammer_flusher_clean_loose_ios(hmp);
	vm_wait_nominal();
	error = hammer_sync_inode(trans, ip);

	/*
	 * EWOULDBLOCK can happen under normal operation; all other errors
	 * are considered extremely serious. We must set WOULDBLOCK
	 * mechanics to deal with the mess left over from the abort of the
	 * previous flush.
	 */
	if (error) {
		ip->flags |= HAMMER_INODE_WOULDBLOCK;
		if (error == EWOULDBLOCK)
			error = 0;
	}
	hammer_sync_inode_done(ip, error);
	/* ip invalid */

	while (hmp->flusher.finalize_want)
		tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0);
	if (hammer_flusher_undo_exhausted(trans, 1)) {
		hkprintf("Warning: UNDO area too small!\n");
		hammer_flusher_finalize(trans, 1);
	} else if (hammer_flusher_meta_limit(trans->hmp)) {
		hammer_flusher_finalize(trans, 0);
	}
	return (0);
}

/*
 * Return non-zero if the UNDO area has less than (quarter / 4) of its
 * space left.
 *
 * 1/4 - Emergency free undo space level. Below this point the flusher
 *	 will finalize even if directory dependencies have not been resolved.
 *
 * 2/4 - Used by the pruning and reblocking code. These functions may be
 *	 running in parallel with a flush and cannot be allowed to drop
 *	 available undo space to emergency levels.
 *
 * 3/4 - Used at the beginning of a flush to force-sync the volume header
 *	 to give the flush plenty of runway to work in.
 */
int
hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter)
{
	if (hammer_undo_space(trans) <
	    hammer_undo_max(trans->hmp) * quarter / 4) {
		return(1);
	} else {
		return(0);
	}
}

/*
 * Flush all pending UNDOs, wait for write completion, update the volume
 * header with the new UNDO end position, and flush it. Then
 * asynchronously flush the meta-data.
 *
 * If this is the last finalization in a flush group we also synchronize
 * our cached blockmap and set hmp->flusher_undo_start and our cached undo
 * fifo first_offset so the next flush resets the FIFO pointers.
 *
 * If this is not final it is being called because too many dirty meta-data
 * buffers have built up and must be flushed with UNDO synchronization to
 * avoid a buffer cache deadlock.
 */
void
hammer_flusher_finalize(hammer_transaction_t trans, int final)
{
	hammer_volume_t root_volume;
	hammer_blockmap_t cundomap, dundomap;
	hammer_mount_t hmp;
	hammer_io_t io;
	hammer_off_t save_undo_next_offset;
	int count;
	int i;

	hmp = trans->hmp;
	root_volume = trans->rootvol;

	/*
	 * Exclusively lock the flusher. This guarantees that all dirty
	 * buffers will be idled (have a mod-count of 0).
	 */
	++hmp->flusher.finalize_want;
	hammer_lock_ex(&hmp->flusher.finalize_lock);

	/*
	 * If this isn't the final sync several threads may have hit the
	 * meta-limit at the same time and raced. Only sync if we really
	 * have to, after acquiring the lock.
	 */
	if (final == 0 && !hammer_flusher_meta_limit(hmp))
		goto done;

	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		goto done;

	/*
	 * Flush data buffers. This can occur asynchronously and at any
	 * time. We must interlock against the frontend direct-data write
	 * but do not have to acquire the sync-lock yet.
	 *
	 * These data buffers have already been collected prior to the
	 * related inode(s) getting queued to the flush group.
	 */
	count = 0;
	while ((io = RB_FIRST(hammer_mod_rb_tree, &hmp->data_root)) != NULL) {
		if (io->ioerror)
			break;
		hammer_ref(&io->lock);
		hammer_io_write_interlock(io);
		KKASSERT(io->type != HAMMER_IOTYPE_VOLUME);
		hammer_io_flush(io, 0);
		hammer_io_done_interlock(io);
		hammer_rel_buffer(HAMMER_ITOB(io), 0);
		hammer_io_limit_backlog(hmp);
		++count;
	}

	/*
	 * The sync-lock is required for the remaining sequence. This lock
	 * prevents meta-data from being modified.
	 */
	hammer_sync_lock_ex(trans);

	/*
	 * If we have been asked to finalize the volume header sync the
	 * cached blockmap to the on-disk blockmap. Generate an UNDO
	 * record for the update.
	 */
	if (final) {
		cundomap = &hmp->blockmap[0];
		dundomap = &root_volume->ondisk->vol0_blockmap[0];
		if (root_volume->io.modified) {
			hammer_modify_volume(trans, root_volume,
					     dundomap, sizeof(hmp->blockmap));
			for (i = 0; i < HAMMER_MAX_ZONES; ++i) {
				hammer_crc_set_blockmap(hmp->version,
							&cundomap[i]);
			}
			bcopy(cundomap, dundomap, sizeof(hmp->blockmap));
			hammer_modify_volume_done(root_volume);
		}
	}

	/*
	 * Flush UNDOs. This can occur concurrently with the data flush
	 * because data writes never overwrite.
	 *
	 * This also waits for I/Os to complete and flushes the cache on
	 * the target disk.
	 *
	 * Record the UNDO append point as this can continue to change
	 * after we have flushed the UNDOs.
	 */
	cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
	hammer_lock_ex(&hmp->undo_lock);
	save_undo_next_offset = cundomap->next_offset;
	hammer_unlock(&hmp->undo_lock);
	hammer_flusher_flush_undos(hmp, HAMMER_FLUSH_UNDOS_FORCED);

	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		goto failed;

	/*
	 * HAMMER VERSION < 4:
	 *	Update the on-disk volume header with new UNDO FIFO end
	 *	position (do not generate new UNDO records for this change).
	 *	We have to do this for the UNDO FIFO whether (final) is
	 *	set or not in order for the UNDOs to be recognized on
	 *	recovery.
	 *
	 * HAMMER VERSION >= 4:
	 *	The UNDO FIFO data written above will be recognized on
	 *	recovery without us having to sync the volume header.
	 *
	 * Also update the on-disk next_tid field. This does not require
	 * an UNDO. However, because our TID is generated before we get
	 * the sync lock another sync may have beat us to the punch.
	 *
	 * This also has the side effect of updating first_offset based on
	 * a prior finalization when the first finalization of the next flush
	 * cycle occurs, removing any undo info from the prior finalization
	 * from consideration.
	 *
	 * The volume header will be flushed out synchronously.
	 */
	dundomap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

	if (dundomap->first_offset != cundomap->first_offset ||
	    dundomap->next_offset != save_undo_next_offset) {
		hammer_modify_volume_noundo(NULL, root_volume);
		dundomap->first_offset = cundomap->first_offset;
		dundomap->next_offset = save_undo_next_offset;
		hammer_crc_set_blockmap(hmp->version, dundomap);
		hammer_modify_volume_done(root_volume);
	}

	/*
	 * vol0_next_tid is used for TID selection and is updated without
	 * an UNDO so we do not reuse a TID that may have been rolled-back.
	 *
	 * vol0_last_tid is the highest fully-synchronized TID. It is
	 * set-up when the UNDO fifo is fully synced, later on (not here).
	 *
	 * The root volume can be open for modification by other threads
	 * generating UNDO or REDO records. For example, reblocking,
	 * pruning, REDO mode fast-fsyncs, so the write interlock is
	 * mandatory.
	 */
	if (root_volume->io.modified) {
		hammer_modify_volume_noundo(NULL, root_volume);
		if (root_volume->ondisk->vol0_next_tid < trans->tid)
			root_volume->ondisk->vol0_next_tid = trans->tid;
		hammer_crc_set_volume(hmp->version, root_volume->ondisk);
		hammer_modify_volume_done(root_volume);
		hammer_io_write_interlock(&root_volume->io);
		hammer_io_flush(&root_volume->io, 0);
		hammer_io_done_interlock(&root_volume->io);
	}

	/*
	 * Wait for I/Os to complete.
	 *
	 * For HAMMER VERSION 4+ filesystems we do not have to wait for
	 * the I/O to complete as the new UNDO FIFO entries are recognized
	 * even without the volume header update. This allows the volume
	 * header to be flushed along with meta-data, significantly reducing
	 * flush overheads.
	 */
	hammer_flusher_clean_loose_ios(hmp);
	if (hmp->version < HAMMER_VOL_VERSION_FOUR)
		hammer_io_wait_all(hmp, "hmrfl3", 1);

	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		goto failed;

	/*
	 * Flush meta-data. The meta-data will be undone if we crash
	 * so we can safely flush it asynchronously. There is no need
	 * to wait for I/O to complete (or issue a synchronous disk flush).
	 *
	 * In fact, even if we did wait the meta-data will still be undone
	 * by a crash up until the next flush cycle due to the first_offset
	 * in the volume header for the UNDO FIFO not being adjusted until
	 * the following flush cycle.
	 *
	 * No io interlock is needed, bioops callbacks will not mess with
	 * meta data buffers.
	 */
	count = 0;
	while ((io = RB_FIRST(hammer_mod_rb_tree, &hmp->meta_root)) != NULL) {
		if (io->ioerror)
			break;
		KKASSERT(io->modify_refs == 0);
		hammer_ref(&io->lock);
		KKASSERT(io->type != HAMMER_IOTYPE_VOLUME);
		hammer_io_flush(io, 0);
		hammer_rel_buffer(HAMMER_ITOB(io), 0);
		hammer_io_limit_backlog(hmp);
		++count;
	}

	/*
	 * If this is the final finalization for the flush group set
	 * up for the next sequence by setting a new first_offset in
	 * our cached blockmap and clearing the undo history.
	 *
	 * Even though we have updated our cached first_offset, the on-disk
	 * first_offset still governs available-undo-space calculations.
	 *
	 * We synchronize to save_undo_next_offset rather than
	 * cundomap->next_offset because that is what we flushed out
	 * above.
	 *
	 * NOTE! UNDOs can only be added with the sync_lock held
	 *	 so we can clear the undo history without racing.
	 *	 REDOs can be added at any time which is why we
	 *	 have to be careful and use save_undo_next_offset
	 *	 when setting the new first_offset.
	 */
	if (final) {
		cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
		if (cundomap->first_offset != save_undo_next_offset) {
			cundomap->first_offset = save_undo_next_offset;
			hmp->hflags |= HMNT_UNDO_DIRTY;
		} else if (cundomap->first_offset != cundomap->next_offset) {
			hmp->hflags |= HMNT_UNDO_DIRTY;
		} else {
			hmp->hflags &= ~HMNT_UNDO_DIRTY;
		}
		hammer_clear_undo_history(hmp);

		/*
		 * Flush tid sequencing. flush_tid1 is fully synchronized,
		 * meaning a crash will not roll it back. flush_tid2 has
		 * been written out asynchronously and a crash will roll
		 * it back. flush_tid1 is used for all mirroring masters.
		 */
		if (hmp->flush_tid1 != hmp->flush_tid2) {
			hmp->flush_tid1 = hmp->flush_tid2;
			wakeup(&hmp->flush_tid1);
		}
		hmp->flush_tid2 = trans->tid;

		/*
		 * Clear the REDO SYNC flag. This flag is used to ensure
		 * that the recovery span in the UNDO/REDO FIFO contains
		 * at least one REDO SYNC record.
		 */
		hmp->flags &= ~HAMMER_MOUNT_REDO_SYNC;
	}

	/*
	 * Cleanup. Report any critical errors.
	 */
failed:
	hammer_sync_unlock(trans);

	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) {
		hvkprintf(root_volume,
			"Critical write error during flush, "
			"refusing to sync UNDO FIFO\n");
	}

done:
	hammer_unlock(&hmp->flusher.finalize_lock);

	if (--hmp->flusher.finalize_want == 0)
		wakeup(&hmp->flusher.finalize_want);
	hammer_stats_commits += final;
}

/*
 * Flush UNDOs.
 */
void
hammer_flusher_flush_undos(hammer_mount_t hmp, int mode)
{
	hammer_io_t io;
	int count;

	count = 0;
	while ((io = RB_FIRST(hammer_mod_rb_tree, &hmp->undo_root)) != NULL) {
		if (io->ioerror)
			break;
		hammer_ref(&io->lock);
		KKASSERT(io->type != HAMMER_IOTYPE_VOLUME);
		hammer_io_write_interlock(io);
		hammer_io_flush(io, hammer_undo_reclaim(io));
		hammer_io_done_interlock(io);
		hammer_rel_buffer(HAMMER_ITOB(io), 0);
		hammer_io_limit_backlog(hmp);
		++count;
	}
	hammer_flusher_clean_loose_ios(hmp);
	if (mode == HAMMER_FLUSH_UNDOS_FORCED ||
	    (mode == HAMMER_FLUSH_UNDOS_AUTO && count)) {
		hammer_io_wait_all(hmp, "hmrfl1", 1);
	} else {
		hammer_io_wait_all(hmp, "hmrfl2", 0);
	}
}

/*
 * Return non-zero if too many dirty meta-data buffers have built up.
 *
 * Since we cannot allow such buffers to flush until we have dealt with
 * the UNDOs, we risk deadlocking the kernel's buffer cache.
 */
int
hammer_flusher_meta_limit(hammer_mount_t hmp)
{
	if (hmp->locked_dirty_space + hmp->io_running_space >
	    hammer_limit_dirtybufspace) {
		return(1);
	}
	return(0);
}

/*
 * Return non-zero if too many dirty meta-data buffers have built up.
 *
 * This version is used by background operations (mirror, prune, reblock)
 * to leave room for foreground operations.
 */
int
hammer_flusher_meta_halflimit(hammer_mount_t hmp)
{
	if (hmp->locked_dirty_space + hmp->io_running_space >
	    hammer_limit_dirtybufspace / 2) {
		return(1);
	}
	return(0);
}

/*
 * Return non-zero if the flusher still has something to flush.
 */
int
hammer_flusher_haswork(hammer_mount_t hmp)
{
	if (hmp->ronly)
		return(0);
	if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
		return(0);
	if (TAILQ_FIRST(&hmp->flush_group_list) ||	/* dirty inodes */
	    RB_ROOT(&hmp->volu_root) ||			/* dirty buffers */
	    RB_ROOT(&hmp->undo_root) ||
	    RB_ROOT(&hmp->data_root) ||
	    RB_ROOT(&hmp->meta_root) ||
	    (hmp->hflags & HMNT_UNDO_DIRTY)) {		/* UNDO FIFO sync */
		return(1);
	}
	return(0);
}

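/*
 * Repeatedly sync the flusher until it reports no more work, making at
 * most max_count passes (-1 for no limit). A progress indicator is
 * printed after the fifth pass.
 */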
int
hammer_flush_dirty(hammer_mount_t hmp, int max_count)
{
	int count = 0;
	int dummy;

	while (hammer_flusher_haswork(hmp)) {
		hammer_flusher_sync(hmp);
		++count;
		if (count >= 5) {
			if (count == 5)
				hkprintf("flushing.");
			else
				kprintf(".");
			tsleep(&dummy, 0, "hmrufl", hz);
		}
		if (max_count != -1 && count == max_count) {
			kprintf("giving up");
			break;
		}
	}
	if (count >= 5)
		kprintf("\n");

	if (count >= max_count)
		return(-1);
	return(0);
}