/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.33 2008/07/07 00:24:31 dillon Exp $
 */
/*
 * HAMMER dependency flusher thread
 *
 * Meta-data updates create buffer dependencies which are arranged as a
 * hierarchy of lists.
 */

#include "hammer.h"

static void hammer_flusher_master_thread(void *arg);
static void hammer_flusher_slave_thread(void *arg);
static void hammer_flusher_flush(hammer_mount_t hmp);
static void hammer_flusher_flush_inode(hammer_inode_t ip,
                                        hammer_transaction_t trans);
static void hammer_flusher_finalize(hammer_transaction_t trans, int final);

/*
 * Support structures for the flusher threads.
 */
struct hammer_flusher_info {
        struct hammer_mount *hmp;
        thread_t        td;
        int             startit;
        hammer_inode_t  work_array[HAMMER_FLUSH_GROUP_SIZE];
};

typedef struct hammer_flusher_info *hammer_flusher_info_t;

/*
 * Sync all inodes pending on the flusher.  This routine may have to be
 * called twice to get them all as some may be queued to a later flush group.
 */
void
hammer_flusher_sync(hammer_mount_t hmp)
{
        int seq;

        if (hmp->flusher.td) {
                seq = hmp->flusher.next;
                if (hmp->flusher.signal++ == 0)
                        wakeup(&hmp->flusher.signal);
                while ((int)(seq - hmp->flusher.done) > 0)
                        tsleep(&hmp->flusher.done, 0, "hmrfls", 0);
        }
}
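
/*
 * Note on the loop test above: comparing the signed difference
 * (int)(seq - done) against zero keeps the wait correct even when the
 * 32 bit sequence counter wraps.  A small illustration:
 *
 *      done = 0x7fffffff       (last completed sequence)
 *      seq  = 0x80000000       (assigned one later, wraps to INT_MIN)
 *
 * Here (int)(seq - done) == 1 > 0, so the caller keeps waiting
 * (correct), whereas a direct (seq > done) comparison on the signed
 * values would be false and return before the flush completed.
 */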
/*
 * Sync all inodes pending on the flusher - return immediately.
 */
void
hammer_flusher_async(hammer_mount_t hmp)
{
        if (hmp->flusher.td) {
                if (hmp->flusher.signal++ == 0)
                        wakeup(&hmp->flusher.signal);
        }
}

void
hammer_flusher_create(hammer_mount_t hmp)
{
        hammer_flusher_info_t info;
        int i;

        hmp->flusher.signal = 0;
        hmp->flusher.act = 0;
        hmp->flusher.done = 0;
        hmp->flusher.next = 1;
        hmp->flusher.count = 0;
        hammer_ref(&hmp->flusher.finalize_lock);

        lwkt_create(hammer_flusher_master_thread, hmp,
                    &hmp->flusher.td, NULL, 0, -1, "hammer-M");
        for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
                info = kmalloc(sizeof(*info), M_HAMMER, M_WAITOK|M_ZERO);
                info->hmp = hmp;
                ++hmp->flusher.count;
                hmp->flusher.info[i] = info;
                lwkt_create(hammer_flusher_slave_thread, info,
                            &info->td, NULL, 0, -1, "hammer-S%d", i);
        }
}

void
hammer_flusher_destroy(hammer_mount_t hmp)
{
        hammer_flusher_info_t info;
        int i;

        /*
         * Kill the master
         */
        hmp->flusher.exiting = 1;
        while (hmp->flusher.td) {
                ++hmp->flusher.signal;
                wakeup(&hmp->flusher.signal);
                tsleep(&hmp->flusher.exiting, 0, "hmrwex", hz);
        }

        /*
         * Kill the slaves
         */
        for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
                if ((info = hmp->flusher.info[i]) != NULL) {
                        KKASSERT(info->startit == 0);
                        info->startit = -1;
                        wakeup(&info->startit);
                        while (info->td) {
                                tsleep(&info->td, 0, "hmrwwc", 0);
                        }
                        hmp->flusher.info[i] = NULL;
                        kfree(info, M_HAMMER);
                        --hmp->flusher.count;
                }
        }
        KKASSERT(hmp->flusher.count == 0);
}

/*
 * The master flusher thread manages the flusher sequence id and
 * synchronization with the slave work threads.
 */
static void
hammer_flusher_master_thread(void *arg)
{
        hammer_mount_t hmp = arg;

        for (;;) {
                while (hmp->flusher.group_lock)
                        tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
                hmp->flusher.act = hmp->flusher.next;
                ++hmp->flusher.next;
                hammer_flusher_clean_loose_ios(hmp);
                hammer_flusher_flush(hmp);
                hmp->flusher.done = hmp->flusher.act;
                wakeup(&hmp->flusher.done);

                /*
                 * Wait for activity.
                 */
                if (hmp->flusher.exiting && TAILQ_EMPTY(&hmp->flush_list))
                        break;

                /*
                 * This is a hack until we can dispose of frontend buffer
                 * cache buffers on the frontend.
                 */
                while (hmp->flusher.signal == 0)
                        tsleep(&hmp->flusher.signal, 0, "hmrwwa", 0);
                hmp->flusher.signal = 0;
        }

        /*
         * And we are done.
         */
        hmp->flusher.td = NULL;
        wakeup(&hmp->flusher.exiting);
        lwkt_exit();
}

/*
 * The slave flusher thread pulls work off the master flush_list until no
 * work is left.
 */
static void
hammer_flusher_slave_thread(void *arg)
{
        hammer_flusher_info_t info;
        hammer_mount_t hmp;
        hammer_inode_t ip;
        int c;
        int i;
        int n;

        info = arg;
        hmp = info->hmp;

        for (;;) {
                while (info->startit == 0)
                        tsleep(&info->startit, 0, "hmrssw", 0);
                if (info->startit < 0)
                        break;
                info->startit = 0;

                /*
                 * Try to pull out around ~64 inodes at a time to flush.
                 * The idea is to try to avoid deadlocks between the slaves.
                 */
                n = c = 0;
                while ((ip = TAILQ_FIRST(&hmp->flush_list)) != NULL) {
                        if (ip->flush_group != hmp->flusher.act)
                                break;
                        TAILQ_REMOVE(&hmp->flush_list, ip, flush_entry);
                        info->work_array[n++] = ip;
                        c += ip->rsv_recs;
                        if (n < HAMMER_FLUSH_GROUP_SIZE &&
                            c < HAMMER_FLUSH_GROUP_SIZE * 8) {
                                continue;
                        }
                        for (i = 0; i < n; ++i) {
                                hammer_flusher_flush_inode(info->work_array[i],
                                                        &hmp->flusher.trans);
                        }
                        n = c = 0;
                }
                for (i = 0; i < n; ++i) {
                        hammer_flusher_flush_inode(info->work_array[i],
                                                &hmp->flusher.trans);
                }
                if (--hmp->flusher.running == 0)
                        wakeup(&hmp->flusher.running);
        }
        info->td = NULL;
        wakeup(&info->td);
        lwkt_exit();
}
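
/*
 * Batching note for the slave loop above: a slave accumulates inodes
 * from the current flush group until it holds HAMMER_FLUSH_GROUP_SIZE
 * of them, or until their aggregate reserved-record count (ip->rsv_recs)
 * reaches eight times that figure, then flushes the whole batch before
 * returning to the shared flush_list.  Pulling work in chunks rather
 * than one inode at a time is what the "~64 inodes" comment refers to,
 * and is how deadlocks between slaves operating on closely related
 * inodes are avoided.
 */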
void
hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
{
        hammer_buffer_t buffer;
        hammer_io_t io;

        /*
         * loose ends - buffers without bp's aren't tracked by the kernel
         * and can build up, so clean them out.  This can occur when an
         * IO completes on a buffer with no references left.
         */
        if ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) {
                crit_enter();   /* biodone() race */
                while ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) {
                        KKASSERT(io->mod_list == &hmp->lose_list);
                        TAILQ_REMOVE(&hmp->lose_list, io, mod_entry);
                        io->mod_list = NULL;
                        if (io->lock.refs == 0)
                                ++hammer_count_refedbufs;
                        hammer_ref(&io->lock);
                        buffer = (void *)io;
                        hammer_rel_buffer(buffer, 0);
                }
                crit_exit();
        }
}

/*
 * Flush all inodes in the current flush group.
 */
static void
hammer_flusher_flush(hammer_mount_t hmp)
{
        hammer_flusher_info_t info;
        hammer_reserve_t resv;
        int i;
        int n;

        hammer_start_transaction_fls(&hmp->flusher.trans, hmp);

        /*
         * If the previous flush cycle just about exhausted our UNDO space
         * we may have to do a dummy cycle to move the first_offset up
         * before actually digging into a new cycle, or the new cycle will
         * not have sufficient undo space.
         */
        if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3)) {
                hammer_lock_ex(&hmp->flusher.finalize_lock);
                hammer_flusher_finalize(&hmp->flusher.trans, 0);
                hammer_unlock(&hmp->flusher.finalize_lock);
        }

        /*
         * Start work threads.
         */
        i = 0;
        n = hmp->count_iqueued / HAMMER_FLUSH_GROUP_SIZE;
        if (TAILQ_FIRST(&hmp->flush_list)) {
                for (i = 0; i <= n; ++i) {
                        if (i == HAMMER_MAX_FLUSHERS ||
                            hmp->flusher.info[i] == NULL) {
                                break;
                        }
                        info = hmp->flusher.info[i];
                        if (info->startit == 0) {
                                ++hmp->flusher.running;
                                info->startit = 1;
                                wakeup(&info->startit);
                        }
                }
        }
        while (hmp->flusher.running)
                tsleep(&hmp->flusher.running, 0, "hmrfcc", 0);

        hammer_flusher_finalize(&hmp->flusher.trans, 1);
        hmp->flusher.tid = hmp->flusher.trans.tid;

        /*
         * Clean up any freed big-blocks (typically zone-2).
         * resv->flush_group is typically set several flush groups ahead
         * of the free to ensure that the freed block is not reused until
         * it can no longer be reused.
         */
        while ((resv = TAILQ_FIRST(&hmp->delay_list)) != NULL) {
                if (resv->flush_group != hmp->flusher.act)
                        break;
                hammer_reserve_clrdelay(hmp, resv);
        }
        hammer_done_transaction(&hmp->flusher.trans);
}
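
/*
 * Sizing note for the "Start work threads" loop above: the number of
 * slaves woken scales with the backlog.  Taking the "~64 inodes" value
 * of HAMMER_FLUSH_GROUP_SIZE from the slave-thread comment, 500 queued
 * inodes give n = 500 / 64 = 7, so the (i <= n) loop wakes up to eight
 * slaves, clamped by HAMMER_MAX_FLUSHERS and by how many slave threads
 * were actually created.
 */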
/*
 * Flush a single inode that is part of a flush group.
 *
 * NOTE!  The sync code can return EWOULDBLOCK if the flush operation
 * would otherwise blow out the buffer cache.  hammer_flush_inode_done()
 * will re-queue the inode for the next flush sequence and force the
 * flusher to run again if this occurs.
 */
static
void
hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
{
        hammer_mount_t hmp = ip->hmp;
        int error;

        hammer_flusher_clean_loose_ios(hmp);
        hammer_lock_sh(&hmp->flusher.finalize_lock);
        error = hammer_sync_inode(ip);
        if (error != EWOULDBLOCK)
                ip->error = error;
        hammer_flush_inode_done(ip);
        hammer_unlock(&hmp->flusher.finalize_lock);
        while (hmp->flusher.finalize_want)
                tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0);
        if (hammer_flusher_undo_exhausted(trans, 1)) {
                hmp->flusher.finalize_want = 1;
                hammer_lock_ex(&hmp->flusher.finalize_lock);
                kprintf("HAMMER: Warning: UNDO area too small!\n");
                hammer_flusher_finalize(trans, 1);
                hammer_unlock(&hmp->flusher.finalize_lock);
                hmp->flusher.finalize_want = 0;
                wakeup(&hmp->flusher.finalize_want);
        } else if (hammer_flusher_meta_limit(trans->hmp)) {
                hmp->flusher.finalize_want = 1;
                hammer_lock_ex(&hmp->flusher.finalize_lock);
                hammer_flusher_finalize(trans, 0);
                hammer_unlock(&hmp->flusher.finalize_lock);
                hmp->flusher.finalize_want = 0;
                wakeup(&hmp->flusher.finalize_want);
        }
}

/*
 * Return non-zero if the UNDO area has less than (quarter / 4) of its
 * space left.
 *
 * 1/4 - Emergency free undo space level.  Below this point the flusher
 *       will finalize even if directory dependencies have not been resolved.
 *
 * 2/4 - Used by the pruning and reblocking code.  These functions may be
 *       running in parallel with a flush and cannot be allowed to drop
 *       available undo space to emergency levels.
 *
 * 3/4 - Used at the beginning of a flush to force-sync the volume header
 *       to give the flush plenty of runway to work in.
 */
int
hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter)
{
        if (hammer_undo_space(trans) <
            hammer_undo_max(trans->hmp) * quarter / 4) {
                kprintf("%c", '0' + quarter);
                return(1);
        } else {
                return(0);
        }
}
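
/*
 * Worked example: on a volume where hammer_undo_max() returns 1GB, a
 * call with quarter == 1 returns non-zero once free undo space drops
 * below 256MB (the emergency level), while quarter == 3 already fires
 * below 768MB, which is why hammer_flusher_flush() uses it up front to
 * make sure a new cycle starts with plenty of runway.  The kprintf of
 * '0' + quarter emits a single digit as a console diagnostic.
 */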
/*
 * Flush all pending UNDOs, wait for write completion, update the volume
 * header with the new UNDO end position, and flush it.  Then
 * asynchronously flush the meta-data.
 *
 * If this is the last finalization in a flush group we also synchronize
 * our cached blockmap and set hmp->flusher_undo_start and our cached undo
 * fifo first_offset so the next flush resets the FIFO pointers.
 */
static
void
hammer_flusher_finalize(hammer_transaction_t trans, int final)
{
        hammer_volume_t root_volume;
        hammer_blockmap_t cundomap, dundomap;
        hammer_mount_t hmp;
        hammer_io_t io;
        int count;
        int i;

        hmp = trans->hmp;
        root_volume = trans->rootvol;

        /*
         * Flush data buffers.  This can occur asynchronously and at any
         * time.  We must interlock against the frontend direct-data write
         * but do not have to acquire the sync-lock yet.
         */
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->data_list)) != NULL) {
                if (io->lock.refs == 0)
                        ++hammer_count_refedbufs;
                hammer_ref(&io->lock);
                hammer_io_write_interlock(io);
                KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
                hammer_io_flush(io);
                hammer_io_done_interlock(io);
                hammer_rel_buffer((hammer_buffer_t)io, 0);
                ++count;
        }

        /*
         * The sync-lock is required for the remaining sequence.  This lock
         * prevents meta-data from being modified.
         */
        hammer_sync_lock_ex(trans);

        /*
         * If we have been asked to finalize the volume header sync the
         * cached blockmap to the on-disk blockmap.  Generate an UNDO
         * record for the update.
         */
        if (final) {
                cundomap = &hmp->blockmap[0];
                dundomap = &root_volume->ondisk->vol0_blockmap[0];
                if (root_volume->io.modified) {
                        hammer_modify_volume(trans, root_volume,
                                             dundomap, sizeof(hmp->blockmap));
                        for (i = 0; i < HAMMER_MAX_ZONES; ++i)
                                hammer_crc_set_blockmap(&cundomap[i]);
                        bcopy(cundomap, dundomap, sizeof(hmp->blockmap));
                        hammer_modify_volume_done(root_volume);
                }
        }

        /*
         * Flush UNDOs
         */
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->undo_list)) != NULL) {
                KKASSERT(io->modify_refs == 0);
                if (io->lock.refs == 0)
                        ++hammer_count_refedbufs;
                hammer_ref(&io->lock);
                KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
                hammer_io_flush(io);
                hammer_rel_buffer((hammer_buffer_t)io, 0);
                ++count;
        }

        /*
         * Wait for I/Os to complete
         */
        hammer_flusher_clean_loose_ios(hmp);
        hammer_io_wait_all(hmp, "hmrfl1");

        /*
         * Update the on-disk volume header with new UNDO FIFO end position
         * (do not generate new UNDO records for this change).  We have to
         * do this for the UNDO FIFO whether (final) is set or not.
         *
         * Also update the on-disk next_tid field.  This does not require
         * an UNDO.  However, because our TID is generated before we get
         * the sync lock another sync may have beat us to the punch.
         *
         * This also has the side effect of updating first_offset based on
         * a prior finalization when the first finalization of the next flush
         * cycle occurs, removing any undo info from the prior finalization
         * from consideration.
         *
         * The volume header will be flushed out synchronously.
         */
        dundomap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

        if (dundomap->first_offset != cundomap->first_offset ||
            dundomap->next_offset != cundomap->next_offset) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
                dundomap->first_offset = cundomap->first_offset;
                dundomap->next_offset = cundomap->next_offset;
                hammer_crc_set_blockmap(dundomap);
                hammer_modify_volume_done(root_volume);
        }

        if (root_volume->io.modified) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
                if (root_volume->ondisk->vol0_next_tid < trans->tid)
                        root_volume->ondisk->vol0_next_tid = trans->tid;
                hammer_crc_set_volume(root_volume->ondisk);
                hammer_modify_volume_done(root_volume);
                hammer_io_flush(&root_volume->io);
        }

        /*
         * Wait for I/Os to complete
         */
        hammer_flusher_clean_loose_ios(hmp);
        hammer_io_wait_all(hmp, "hmrfl2");
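
        /*
         * Ordering recap: at this point the UNDO FIFO and the volume
         * header that references it are both durable, since each flush
         * above was followed by a hammer_io_wait_all() barrier.  Only
         * now is it safe to let modified meta-data reach the media,
         * because any meta-data write interrupted by a crash can be
         * backed out by recovery using the UNDOs just committed.
         */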
        /*
         * Flush meta-data.  The meta-data will be undone if we crash
         * so we can safely flush it asynchronously.
         *
         * Repeated catchups will wind up flushing this update's meta-data
         * and the UNDO buffers for the next update simultaneously.  This
         * is ok.
         */
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->meta_list)) != NULL) {
                KKASSERT(io->modify_refs == 0);
                if (io->lock.refs == 0)
                        ++hammer_count_refedbufs;
                hammer_ref(&io->lock);
                KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
                hammer_io_flush(io);
                hammer_rel_buffer((hammer_buffer_t)io, 0);
                ++count;
        }

        /*
         * If this is the final finalization for the flush group set
         * up for the next sequence by setting a new first_offset in
         * our cached blockmap and clearing the undo history.
         *
         * Even though we have updated our cached first_offset, the on-disk
         * first_offset still governs available-undo-space calculations.
         */
        if (final) {
                cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
                cundomap->first_offset = cundomap->next_offset;
                hammer_clear_undo_history(hmp);
        }

        hammer_sync_unlock(trans);
}

/*
 * Return non-zero if too many dirty meta-data buffers have built up.
 *
 * Since we cannot allow such buffers to flush until we have dealt with
 * the UNDOs, we risk deadlocking the kernel's buffer cache.
 */
int
hammer_flusher_meta_limit(hammer_mount_t hmp)
{
        if (hmp->locked_dirty_space + hmp->io_running_space >
            hammer_limit_dirtybufspace) {
                return(1);
        }
        return(0);
}
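
/*
 * Note on hammer_flusher_meta_limit(): the threshold,
 * hammer_limit_dirtybufspace, is defined outside this file.  When the
 * limit trips, hammer_flusher_flush_inode() above performs an early
 * non-final finalization so the accumulated UNDOs get written and the
 * dirty meta-data buffers can drain before the kernel's buffer cache
 * deadlocks.
 */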