1 /* 2 * Copyright (c) 2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * UNDO ALGORITHM: 37 * 38 * The UNDO algorithm is trivial. The nominal UNDO range in the 39 * FIFO is determined by taking the first/next offset stored in 40 * the volume header. 
The next offset may not be correct since 41 * UNDO flushes are not required to flush the volume header, so 42 * the code also scans forward until it finds a discontinuous 43 * sequence number. 44 * 45 * The UNDOs are then scanned and executed in reverse order. These 46 * UNDOs are effectively just data restorations based on HAMMER offsets. 47 * 48 * REDO ALGORITHM: 49 * 50 * REDO records are laid down in the UNDO/REDO FIFO for nominal 51 * writes, truncations, and file extension ops. On a per-inode 52 * basis two types of REDO records are generated, REDO_WRITE 53 * and REDO_TRUNC. 54 * 55 * Essentially the recovery block will contain UNDO records backing 56 * out partial operations and REDO records to regenerate those partial 57 * operations guaranteed by the filesystem during recovery. 58 * 59 * REDO generation is optional, and can also be started and then 60 * later stopped due to excessive write()s inbetween fsyncs, or not 61 * started at all. Because of this the recovery code must determine 62 * when REDOs are valid and when they are not. Additional records are 63 * generated to help figure it out. 64 * 65 * The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated 66 * during a flush cycle indicating which records the flush cycle 67 * has synched meta-data for, and HAMMER_REDO_SYNC is generated in 68 * each flush cycle to indicate how far back in the UNDO/REDO FIFO 69 * the recovery code must go to find the earliest applicable REDO 70 * record. Applicable REDO records can be far outside the nominal 71 * UNDO recovery range, for example if a write() lays down a REDO but 72 * the related file is not flushed for several cycles. 73 * 74 * The SYNC reference is to a point prior to the nominal UNDO FIFO 75 * range, creating an extended REDO range which must be scanned. 
76 * 77 * Any REDO_WRITE/REDO_TRUNC encountered within the extended range 78 * which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records 79 * prior to the start of the nominal UNDO range are applicable. 80 * That is, any REDO_TERM_* records in the extended range but not in 81 * the nominal undo range will mask any redo operations for prior REDO 82 * records. This is necessary because once the TERM is laid down 83 * followup operations may make additional changes to the related 84 * records but not necessarily record them as REDOs (because REDOs are 85 * optional). 86 * 87 * REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range 88 * must be ignored since they represent meta-data flushes which are 89 * undone by the UNDOs in that nominal UNDO range by the recovery 90 * code. Only REDO_TERM_* records in the extended range but not 91 * in the nominal undo range are applicable. 92 * 93 * The REDO_SYNC record itself always exists in the nominal UNDO range 94 * (this is how the extended range is determined). For recovery 95 * purposes the most recent REDO_SYNC record is always used if several 96 * are found. 97 * 98 * CRASHES DURING UNDO/REDO 99 * 100 * A crash during the UNDO phase requires no additional effort. The 101 * UNDOs will simply be re-run again. The state of the UNDO/REDO fifo 102 * remains unchanged and has no re-crash issues. 103 * 104 * A crash during the REDO phase is more complex because the REDOs 105 * run normal filesystem ops and generate additional UNDO/REDO records. 106 * REDO is disabled during REDO recovery and any SYNC records generated 107 * by flushes during REDO recovery must continue to reference the 108 * original extended range. 109 * 110 * If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery 111 * may become impossible. This is detected when the start of the 112 * extended range fails to have monotonically increasing sequence 113 * numbers leading into the nominal undo range. 
 */

#include "hammer.h"

/*
 * Specify the way we want to handle stage2 errors.
 *
 * Following values are accepted:
 *
 * 0 - Run redo recovery normally and fail to mount if
 *     the operation fails (default).
 * 1 - Run redo recovery, but don't fail to mount if the
 *     operation fails.
 * 2 - Completely skip redo recovery (only for severe error
 *     conditions and/or debugging).
 */
static int hammer_skip_redo = 0;
TUNABLE_INT("vfs.hammer.skip_redo", &hammer_skip_redo);

/*
 * Each rterm entry has a list of fifo offsets indicating termination
 * points.  These are stripped as the scan progresses.
 */
typedef struct hammer_rterm_entry {
	struct hammer_rterm_entry *next;
	hammer_off_t		fifo_offset;
} *hammer_rterm_entry_t;

/*
 * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
 * TRUNC entries ignore the offset.
 */
typedef struct hammer_rterm {
	RB_ENTRY(hammer_rterm)	rb_node;
	int64_t			redo_objid;
	u_int32_t		redo_localization;
	u_int32_t		redo_flags;
	hammer_off_t		redo_offset;
	hammer_rterm_entry_t	term_list;
} *hammer_rterm_t;

static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
struct hammer_rterm_rb_tree;
RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
			hammer_off_t end_off);
static int hammer_check_head_signature(hammer_fifo_head_t head,
			hammer_off_t beg_off);
static void hammer_recover_copy_undo(hammer_off_t undo_offset,
			char *src, char *dst, int bytes);
static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
			hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp);
static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
			hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp);
#if 0
static void hammer_recover_debug_dump(int w, char *buf, int bytes);
#endif
static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_fifo_undo_t undo);
static int hammer_recover_redo_rec(hammer_mount_t hmp,
			struct hammer_rterm_rb_tree *root,
			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static int hammer_recover_redo_run(hammer_mount_t hmp,
			struct hammer_rterm_rb_tree *root,
			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static void hammer_recover_redo_exec(hammer_mount_t hmp,
			hammer_fifo_redo_t redo);

RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

/*
 * Recover filesystem meta-data on mount.  This procedure figures out the
 * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
 * resynchronized by this procedure.
 *
 * This procedure is run near the beginning of the mount sequence, before
 * any B-Tree or high-level accesses are enabled, and is responsible for
 * restoring the meta-data to a consistent state.  High level HAMMER data
 * structures (such as the B-Tree) cannot be accessed here.
 *
 * NOTE: No information from the root volume has been cached in the
 *	 hammer_mount structure yet, so we need to access the root volume's
 *	 buffer directly.
 *
 * Returns 0 on success or an errno (typically EIO) on failure.  On
 * success the rootmap's first_offset/next_offset are rewritten to the
 * actual (scanned) undo range.
 */
int
hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
{
	hammer_blockmap_t rootmap;
	hammer_buffer_t buffer;		/* shared with scan helpers via &buffer */
	hammer_off_t scan_offset;
	hammer_off_t scan_offset_save;
	hammer_off_t bytes;
	hammer_fifo_any_t head;
	hammer_off_t first_offset;
	hammer_off_t last_offset;
	u_int32_t seqno;
	int error;
	int degenerate_case = 0;	/* set when the FIFO turns out empty */

	/*
	 * Examine the UNDO FIFO indices in the volume header.
	 */
	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	first_offset = rootmap->first_offset;
	last_offset = rootmap->next_offset;
	buffer = NULL;
	error = 0;

	hmp->recover_stage2_offset = 0;

	if (first_offset > rootmap->alloc_offset ||
	    last_offset > rootmap->alloc_offset) {
		hvkprintf(root_volume,
			"Illegal UNDO FIFO index range "
			"%016jx, %016jx limit %016jx\n",
			(intmax_t)first_offset,
			(intmax_t)last_offset,
			(intmax_t)rootmap->alloc_offset);
		error = EIO;
		goto done;
	}

	/*
	 * In HAMMER version 4+ filesystems the volume header does NOT
	 * contain definitive UNDO FIFO state.  In particular, the
	 * rootmap->next_offset may not be indexed completely to the
	 * end of the active UNDO FIFO.
	 */
	if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
		/*
		 * To find the definitive range we must first scan backwards
		 * from first_offset to locate the first real record and
		 * extract the sequence number from it.  This record is not
		 * part of the active undo space.
		 *
		 * The loop skips over any PAD records encountered during
		 * the backwards scan.
		 */
		scan_offset = first_offset;
		seqno = 0;

		for (;;) {
			head = hammer_recover_scan_rev(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				seqno = head->head.hdr_seq;
				break;
			}
		}
		if (error) {
			hvkprintf(root_volume,
				"recovery failure during seqno backscan\n");
			goto done;
		}

		/*
		 * Scan forwards from first_offset and (seqno+1) looking
		 * for a sequence space discontinuity.  This denotes the
		 * end of the active FIFO area.
		 *
		 * NOTE: For the case where the FIFO is empty the very first
		 *	 record we find will be discontinuous.
		 *
		 * NOTE: Do not include trailing PADs in the scan range,
		 *	 and remember the returned scan_offset after a
		 *	 fwd iteration points to the end of the returned
		 *	 record.
		 */
		hvkprintf(root_volume, "recovery check seqno=%08x\n", seqno);

		scan_offset = first_offset;
		scan_offset_save = scan_offset;
		++seqno;
		hmp->recover_stage2_seqno = seqno;

		for (;;) {
			head = hammer_recover_scan_fwd(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				if (seqno != head->head.hdr_seq) {
					/* discontinuity: back up to the end
					 * of the last in-sequence record */
					scan_offset = scan_offset_save;
					break;
				}
				scan_offset_save = scan_offset;
				++seqno;
			}

#if 0
			/*
			 * If the forward scan is grossly ahead of last_offset
			 * then something is wrong.  last_offset is supposed
			 * to be flushed out
			 */
			if (last_offset >= scan_offset) {
				bytes = last_offset - scan_offset;
			} else {
				bytes = rootmap->alloc_offset - scan_offset +
					(last_offset & HAMMER_OFF_LONG_MASK);
			}
			if (bytes >
			    (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
			    4 / 5) {
				hvkprintf(root_volume,
					"recovery forward scan is "
					"grossly beyond the last_offset in "
					"the volume header, this can't be "
					"right.\n");
				error = EIO;
				break;
			}
#endif
		}

		/*
		 * Store the seqno.  This will be the next seqno we lay down
		 * when generating new UNDOs.
		 */
		hmp->undo_seqno = seqno;
		if (error) {
			hvkprintf(root_volume,
				"recovery failure during seqno fwdscan\n");
			goto done;
		}
		last_offset = scan_offset;
		hvkprintf(root_volume,
			"recovery range %016jx-%016jx\n",
			(intmax_t)first_offset,
			(intmax_t)last_offset);
		hvkprintf(root_volume,
			"recovery nexto %016jx endseqno=%08x\n",
			(intmax_t)rootmap->next_offset,
			seqno);
	}

	/*
	 * Calculate the size of the active portion of the FIFO.  If the
	 * FIFO is empty the filesystem is clean and no further action is
	 * needed.  (The range may wrap around alloc_offset, hence the
	 * two-armed computation.)
	 */
	if (last_offset >= first_offset) {
		bytes = last_offset - first_offset;
	} else {
		bytes = rootmap->alloc_offset - first_offset +
			(last_offset & HAMMER_OFF_LONG_MASK);
	}
	if (bytes == 0) {
		degenerate_case = 1;
		error = 0;
		goto done;
	}

	hvkprintf(root_volume,
		"recovery undo  %016jx-%016jx (%jd bytes)%s\n",
		(intmax_t)first_offset,
		(intmax_t)last_offset,
		(intmax_t)bytes,
		(hmp->ronly ? " (RO)" : "(RW)"));
	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
		hkprintf("Undo size is absurd, unable to mount\n");
		error = EIO;
		goto done;
	}

	/*
	 * Scan the UNDOs backwards.
	 */
	scan_offset = last_offset;

	while ((int64_t)bytes > 0) {
		KKASSERT(scan_offset != first_offset);
		head = hammer_recover_scan_rev(hmp, root_volume,
					       &scan_offset, &error, &buffer);
		if (error)
			break;

		/*
		 * Normal UNDO
		 */
		error = hammer_recover_undo(hmp, root_volume, &head->undo);
		if (error) {
			hvkprintf(root_volume,
				"UNDO record at %016jx failed\n",
				(intmax_t)scan_offset - head->head.hdr_size);
			break;
		}

		/*
		 * The first REDO_SYNC record encountered (scanning backwards)
		 * enables REDO processing.
		 */
		if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
		    head->redo.redo_flags == HAMMER_REDO_SYNC) {
			if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
				hvkprintf(root_volume,
					"Ignoring extra REDO_SYNC "
					"records in UNDO/REDO FIFO.\n");
			} else {
				hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
				hmp->recover_stage2_offset =
					head->redo.redo_offset;
				hvkprintf(root_volume,
					"Found REDO_SYNC %016jx\n",
					(intmax_t)head->redo.redo_offset);
			}
		}

		bytes -= head->head.hdr_size;

		/*
		 * If too many dirty buffers have built up we have to flush'm
		 * out.  As long as we do not flush out the volume header
		 * a crash here should not cause any problems.
		 *
		 * buffer must be released so the flush can assert that
		 * all buffers are idle.
		 */
		if (hammer_flusher_meta_limit(hmp)) {
			if (buffer) {
				hammer_rel_buffer(buffer, 0);
				buffer = NULL;
			}
			if (hmp->ronly == 0) {
				hammer_recover_flush_buffers(hmp, root_volume,
							     0);
				hvkprintf(root_volume, "Continuing recovery\n");
			} else {
				hvkprintf(root_volume,
					"Recovery failure: "
					"Insufficient buffer cache to hold "
					"dirty buffers on read-only mount!\n");
				error = EIO;
				break;
			}
		}
	}
	KKASSERT(error || bytes == 0);
done:
	if (buffer) {
		hammer_rel_buffer(buffer, 0);
		buffer = NULL;
	}

	/*
	 * After completely flushing all the recovered buffers the volume
	 * header will also be flushed.  The extra volume ref pins it for
	 * the duration of recovery.
	 */
	if (root_volume->io.recovered == 0) {
		hammer_ref_volume(root_volume);
		root_volume->io.recovered = 1;
	}

	/*
	 * Finish up flushing (or discarding) recovered buffers.  FIFO
	 * indices in the volume header are updated to the actual undo
	 * range but will not be collapsed until stage 2.
	 */
	if (error == 0) {
		hammer_modify_volume_noundo(NULL, root_volume);
		rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
		rootmap->first_offset = first_offset;
		rootmap->next_offset = last_offset;
		hammer_modify_volume_done(root_volume);
		if (hmp->ronly == 0)
			hammer_recover_flush_buffers(hmp, root_volume, 1);
	} else {
		hammer_recover_flush_buffers(hmp, root_volume, -1);
	}
	if (degenerate_case == 0) {
		hvkprintf(root_volume, "recovery complete\n");
	} else {
		hvkprintf(root_volume, "mounted clean, no recovery needed\n");
	}
	return (error);
}

/*
 * Execute redo operations
 *
 * This procedure is run at the end of the mount sequence, after the hammer
 * mount structure has been completely initialized but before the filesystem
 * goes live.
 * It can access standard cursors, the B-Tree, flush the
 * filesystem, and so forth.
 *
 * This code may only be called for read-write mounts or when a mount
 * switches from read-only to read-write.  vnodes may or may not be present.
 *
 * The stage1 code will have already calculated the correct FIFO range
 * for the nominal UNDO FIFO and stored it in the rootmap.  The extended
 * range for REDO is stored in hmp->recover_stage2_offset.
 *
 * Returns 0 on success or an errno on failure.  When hammer_skip_redo
 * is non-zero any error is reported but 0 is returned so the mount can
 * proceed.
 */
int
hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
{
	hammer_blockmap_t rootmap;
	hammer_buffer_t buffer;		/* shared with scan helpers via &buffer */
	hammer_off_t scan_offset;
	hammer_off_t oscan_offset;
	hammer_off_t bytes;
	hammer_off_t ext_bytes;
	hammer_fifo_any_t head;
	hammer_off_t first_offset;
	hammer_off_t last_offset;
	hammer_off_t ext_offset;	/* start of the extended REDO range */
	struct hammer_rterm_rb_tree rterm_root;
	u_int32_t seqno;
	int error;
	int verbose = 0;
	int dorscan;			/* non-zero if a backwards TERM scan is needed */

	/*
	 * Stage 2 can only be run on a RW mount, or when the mount is
	 * switched from RO to RW.
	 */
	KKASSERT(hmp->ronly == 0);
	RB_INIT(&rterm_root);

	if (hammer_skip_redo == 1)
		hvkprintf(root_volume, "recovery redo marked as optional\n");

	if (hammer_skip_redo == 2) {
		hvkprintf(root_volume, "recovery redo skipped.\n");
		return (0);
	}

	/*
	 * Examine the UNDO FIFO.  If it is empty the filesystem is clean
	 * and no action need be taken.
	 */
	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	first_offset = rootmap->first_offset;
	last_offset = rootmap->next_offset;
	if (first_offset == last_offset) {
		KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
		return(0);
	}

	/*
	 * Stage2 must only be run once, and will not be run at all
	 * if Stage1 did not find a REDO_SYNC record.
	 */
	error = 0;
	buffer = NULL;

	if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
		goto done;
	hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
	hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
	ext_offset = hmp->recover_stage2_offset;
	if (ext_offset == 0) {
		hvkprintf(root_volume,
			"REDO stage specified but no REDO_SYNC "
			"offset, ignoring\n");
		goto done;
	}

	/*
	 * Calculate nominal UNDO range (this is not yet the extended
	 * range).
	 */
	if (last_offset >= first_offset) {
		bytes = last_offset - first_offset;
	} else {
		bytes = rootmap->alloc_offset - first_offset +
			(last_offset & HAMMER_OFF_LONG_MASK);
	}
	hvkprintf(root_volume,
		"recovery redo  %016jx-%016jx (%jd bytes)%s\n",
		(intmax_t)first_offset,
		(intmax_t)last_offset,
		(intmax_t)bytes,
		(hmp->ronly ? " (RO)" : "(RW)"));
	verbose = 1;
	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
		hkprintf("Undo size is absurd, unable to mount\n");
		error = EIO;
		goto fatal;
	}

	/*
	 * Scan the REDOs backwards collecting REDO_TERM_* information.
	 * This information is only collected for the extended range,
	 * non-inclusive of any TERMs in the nominal UNDO range.
	 *
	 * If the stage2 extended range is inside the nominal undo range
	 * we have nothing to scan.
	 *
	 * This must fit in memory!
	 *
	 * Sign convention: a negative ext_bytes means ext_offset falls
	 * inside the nominal UNDO range (no extra backwards scan needed,
	 * dorscan = 0); a positive ext_bytes is the size of the extension
	 * prior to first_offset (dorscan = 1).
	 */
	if (first_offset < last_offset) {
		/*
		 * [ first_offset........last_offset ]
		 */
		if (ext_offset < first_offset) {
			dorscan = 1;
			ext_bytes = first_offset - ext_offset;
		} else if (ext_offset > last_offset) {
			/* extension wraps around the end of the FIFO */
			dorscan = 1;
			ext_bytes = (rootmap->alloc_offset - ext_offset) +
				    (first_offset & HAMMER_OFF_LONG_MASK);
		} else {
			ext_bytes = -(ext_offset - first_offset);
			dorscan = 0;
		}
	} else {
		/*
		 * [......last_offset  first_offset.....]
		 */
		if (ext_offset < last_offset) {
			ext_bytes = -((rootmap->alloc_offset - first_offset) +
				      (ext_offset & HAMMER_OFF_LONG_MASK));
			dorscan = 0;
		} else if (ext_offset > first_offset) {
			ext_bytes = -(ext_offset - first_offset);
			dorscan = 0;
		} else {
			ext_bytes = first_offset - ext_offset;
			dorscan = 1;
		}
	}

	if (dorscan) {
		scan_offset = first_offset;
		hvkprintf(root_volume,
			"Find extended redo %016jx, %jd extbytes\n",
			(intmax_t)ext_offset,
			(intmax_t)ext_bytes);
		seqno = hmp->recover_stage2_seqno - 1;
		for (;;) {
			head = hammer_recover_scan_rev(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				/*
				 * The extended range must lead into the
				 * nominal range with monotonic seqnos,
				 * otherwise the FIFO wrapped and REDO
				 * recovery is impossible (see the file
				 * header comment on re-crashes).
				 */
				if (head->head.hdr_seq != seqno) {
					error = ERANGE;
					break;
				}
				error = hammer_recover_redo_rec(
						hmp, &rterm_root,
						scan_offset, &head->redo);
				--seqno;
			}
			if (scan_offset == ext_offset)
				break;
		}
		if (error) {
			hvkprintf(root_volume,
				"Find extended redo failed %d, "
				"unable to run REDO\n",
				error);
			goto done;
		}
	} else {
		hvkprintf(root_volume,
			"Embedded extended redo %016jx, %jd extbytes\n",
			(intmax_t)ext_offset,
			(intmax_t)ext_bytes);
	}

	/*
	 * Scan the REDO forwards through the entire extended range.
	 * Anything with a previously recorded matching TERM is discarded.
	 */
	scan_offset = ext_offset;
	bytes += ext_bytes;

	/*
	 * NOTE: when doing a forward scan the returned scan_offset is
	 *	 for the record following the returned record, so we
	 *	 have to play a bit.
	 */
	while ((int64_t)bytes > 0) {
		KKASSERT(scan_offset != last_offset);

		oscan_offset = scan_offset;
		head = hammer_recover_scan_fwd(hmp, root_volume,
					       &scan_offset, &error, &buffer);
		if (error)
			break;

		error = hammer_recover_redo_run(hmp, &rterm_root,
						oscan_offset, &head->redo);
		if (error) {
			/*
			 * NOTE(review): message says UNDO but this path
			 * processes REDO records.
			 */
			hvkprintf(root_volume,
				"UNDO record at %016jx failed\n",
				(intmax_t)scan_offset - head->head.hdr_size);
			break;
		}
		bytes -= head->head.hdr_size;
	}
	KKASSERT(error || bytes == 0);

done:
	if (buffer) {
		hammer_rel_buffer(buffer, 0);
		buffer = NULL;
	}

	/*
	 * Cleanup rterm tree
	 */
	{
		hammer_rterm_t rterm;
		hammer_rterm_entry_t rte;

		while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
			RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
			while ((rte = rterm->term_list) != NULL) {
				rterm->term_list = rte->next;
				kfree(rte, hmp->m_misc);
			}
			kfree(rterm, hmp->m_misc);
		}
	}

	/*
	 * Finish up flushing (or discarding) recovered buffers by executing
	 * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
	 * case tests and forces the flush in order to update the FIFO indices.
	 *
	 * If a crash occurs during the flush the entire undo/redo will be
	 * re-run during recovery on the next mount.
	 */
	if (error == 0) {
		if (rootmap->first_offset != rootmap->next_offset)
			hmp->hflags |= HMNT_UNDO_DIRTY;
		hammer_flusher_sync(hmp);
	}
fatal:
	hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
	if (verbose) {
		hvkprintf(root_volume, "End redo recovery\n");
	}

	if (error && hammer_skip_redo == 1)
		hvkprintf(root_volume,
			"recovery redo error %d, skipping.\n",
			error);

	return (hammer_skip_redo ? 0 : error);
}

/*
 * Scan backwards from *scan_offsetp, return the FIFO record prior to the
 * record at *scan_offsetp or NULL if an error occurred.
 *
 * On return *scan_offsetp will be the offset of the returned record.
 *
 * The returned record points into *bufferp; the caller is responsible
 * for eventually releasing the buffer (hammer_rel_buffer).
 */
hammer_fifo_any_t
hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp)
{
	hammer_off_t scan_offset;
	hammer_blockmap_t rootmap;
	hammer_fifo_any_t head;
	hammer_fifo_tail_t tail;

	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	scan_offset = *scan_offsetp;

	if (hammer_debug_general & 0x0080)
		hdkprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
	/*
	 * Wrap backwards from the start of the UNDO zone to the end of
	 * the allocated FIFO space.
	 */
	if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
		scan_offset = rootmap->alloc_offset;
	if (scan_offset - sizeof(*tail) <
	    HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
		hvkprintf(root_volume,
			"UNDO record at %016jx FIFO underflow\n",
			(intmax_t)scan_offset);
		*errorp = EIO;
		return (NULL);
	}
	/*
	 * Read the tail structure just before scan_offset and use it to
	 * locate the head of the record.
	 */
	tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
			    errorp, bufferp);
	if (*errorp) {
		hvkprintf(root_volume,
			"Unable to read UNDO TAIL at %016jx\n",
			(intmax_t)scan_offset - sizeof(*tail));
		return (NULL);
	}

	if (hammer_check_tail_signature(tail, scan_offset) != 0) {
		hvkprintf(root_volume,
			"Illegal UNDO TAIL signature at %016jx\n",
			(intmax_t)scan_offset - sizeof(*tail));
		*errorp = EIO;
		return (NULL);
	}
	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
	*scan_offsetp = scan_offset - head->head.hdr_size;

	return (head);
}

/*
 * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
 * an error occurred.
 *
 * On return *scan_offsetp will be the offset of the record following
 * the returned record.
 */
hammer_fifo_any_t
hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp)
{
	hammer_off_t scan_offset;
	hammer_blockmap_t rootmap;
	hammer_fifo_any_t head;

	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	scan_offset = *scan_offsetp;

	if (hammer_debug_general & 0x0080)
		hdkprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
	/*
	 * Wrap forwards from the end of the allocated FIFO space to the
	 * start of the UNDO zone.
	 */
	if (scan_offset == rootmap->alloc_offset)
		scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);

	head = hammer_bread(hmp, scan_offset, errorp, bufferp);
	if (*errorp) {
		hvkprintf(root_volume,
			"Unable to read UNDO HEAD at %016jx\n",
			(intmax_t)scan_offset);
		return (NULL);
	}

	if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
		/*
		 * NOTE(review): message says TAIL but this is the HEAD
		 * signature check.
		 */
		hvkprintf(root_volume,
			"Illegal UNDO TAIL signature at %016jx\n",
			(intmax_t)scan_offset);
		*errorp = EIO;
		return (NULL);
	}
	scan_offset += head->head.hdr_size;
	if (scan_offset == rootmap->alloc_offset)
		scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
	*scan_offsetp = scan_offset;

	return (head);
}

/*
 * Helper function for hammer_check_{head,tail}_signature().  Check stuff
 * once the head and tail has been established.
 *
 * This function validates the entire FIFO record wrapper.
 *
 * Returns 0 on success, non-zero (2, 3 or EIO depending on which test
 * failed) on any validation failure.
 */
static __inline
int
_hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
			hammer_off_t beg_off)
{
	hammer_off_t end_off;
	u_int32_t crc;
	int bytes;

	/*
	 * Check signatures.  The tail signature is allowed to be the
	 * head signature only for 8-byte PADs.
	 */
	if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
		hkprintf("FIFO record bad head signature %04x at %016jx\n",
			head->hdr_signature,
			(intmax_t)beg_off);
		return(2);
	}
	if (head->hdr_size < HAMMER_HEAD_ALIGN ||
	    (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
		hkprintf("FIFO record unaligned or bad size %04x at %016jx\n",
			head->hdr_size,
			(intmax_t)beg_off);
		return(2);
	}
	end_off = beg_off + head->hdr_size;

	/*
	 * Head/tail cross-check is skipped only for a minimal PAD whose
	 * total size is exactly sizeof(*tail).
	 */
	if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
	    (size_t)(end_off - beg_off) != sizeof(*tail)) {
		if (head->hdr_type != tail->tail_type) {
			hkprintf("FIFO record head/tail type mismatch "
				"%04x %04x at %016jx\n",
				head->hdr_type, tail->tail_type,
				(intmax_t)beg_off);
			return(2);
		}
		if (head->hdr_size != tail->tail_size) {
			hkprintf("FIFO record head/tail size mismatch "
				"%04x %04x at %016jx\n",
				head->hdr_size, tail->tail_size,
				(intmax_t)beg_off);
			return(2);
		}
		if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
			hkprintf("FIFO record bad tail signature "
				"%04x at %016jx\n",
				tail->tail_signature,
				(intmax_t)beg_off);
			return(3);
		}
	}

	/*
	 * Non-PAD records must have a CRC and must be sized at
	 * least large enough to fit the head and tail.
	 *
	 * NOTE(review): the CRC is computed before the minimum-size
	 * check below; this presumably relies on the earlier
	 * hdr_size >= HAMMER_HEAD_ALIGN test preventing the
	 * (hdr_size - sizeof(*head)) length from underflowing — confirm
	 * HAMMER_HEAD_ALIGN >= sizeof(*head).
	 */
	if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
		/* CRC covers the head (up to the crc field) plus payload */
		crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
		      crc32(head + 1, head->hdr_size - sizeof(*head));
		if (head->hdr_crc != crc) {
			hkprintf("FIFO record CRC failed %08x %08x at %016jx\n",
				head->hdr_crc, crc,
				(intmax_t)beg_off);
			return(EIO);
		}
		if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
			hkprintf("FIFO record too small %04x at %016jx\n",
				head->hdr_size,
				(intmax_t)beg_off);
			return(EIO);
		}
	}

	/*
	 * Check the tail (recomputed from the head, which may differ from
	 * the tail pointer passed in).
	 */
	bytes = head->hdr_size;
	tail = (void *)((char *)head + bytes - sizeof(*tail));
	if (tail->tail_size != head->hdr_size) {
		hkprintf("Bad tail size %04x vs %04x at %016jx\n",
			tail->tail_size, head->hdr_size,
			(intmax_t)beg_off);
		return(EIO);
	}
	if (tail->tail_type != head->hdr_type) {
		hkprintf("Bad tail type %04x vs %04x at %016jx\n",
			tail->tail_type, head->hdr_type,
			(intmax_t)beg_off);
		return(EIO);
	}

	return(0);
}

/*
 * Check that the FIFO record is in-bounds given the head and the
 * hammer offset.
 *
 * Also checks that the head and tail structures agree with each other,
 * but does not check beyond the signature, type, and size.
 *
 * Returns 0 on success, non-zero on failure (1 if the record crosses a
 * buffer boundary, otherwise the _hammer_check_signature() result).
 */
static int
hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
{
	hammer_fifo_tail_t tail;
	hammer_off_t end_off;

	/*
	 * head overlaps buffer boundary.  This could be a PAD so only
	 * check the minimum PAD size here.
	 */
	if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
		return(1);

	/*
	 * Calculate the ending offset and make sure the record does
	 * not cross a buffer boundary.
	 */
	end_off = beg_off + head->hdr_size;
	if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
		return(1);
	tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
	return (_hammer_check_signature(head, tail, beg_off));
}

/*
 * Check that the FIFO record is in-bounds given the tail and the
 * hammer offset.  The offset is pointing at the ending boundary of the
 * record.
 *
 * Also checks that the head and tail structures agree with each other,
 * but does not check beyond the signature, type, and size.
 *
 * Returns 0 on success, non-zero on failure (1 if the record crosses a
 * buffer boundary, otherwise the _hammer_check_signature() result).
 */
static int
hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
{
	hammer_fifo_head_t head;
	hammer_off_t beg_off;

	/*
	 * tail overlaps buffer boundary
	 */
	if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
		return(1);

	/*
	 * Calculate the beginning offset and make sure the record does
	 * not cross a buffer boundary.
	 */
	beg_off = end_off - tail->tail_size;
	if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
		return(1);
	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
	return (_hammer_check_signature(head, tail, beg_off));
}

/*
 * Apply a single UNDO record: restore the saved data payload to the
 * raw volume or raw buffer offset it was taken from.
 *
 * Returns 0 on success (non-UNDO records are silently skipped), or an
 * errno on a corrupt record or inaccessible target.
 */
static int
hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
		    hammer_fifo_undo_t undo)
{
	hammer_volume_t volume;
	hammer_buffer_t buffer;
	hammer_off_t buf_offset;
	int zone;
	int error;
	int vol_no;
	int bytes;
	u_int32_t offset;

	/*
	 * Only process UNDO records.  Flag if we find other records to
	 * optimize stage2 recovery.
	 */
	if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
		return(0);

	/*
	 * Validate the UNDO record: the stated data payload must fit
	 * between the undo header and the fifo tail.
	 */
	bytes = undo->head.hdr_size - sizeof(*undo) -
		sizeof(struct hammer_fifo_tail);
	if (bytes < 0 || undo->undo_data_bytes < 0 ||
	    undo->undo_data_bytes > bytes) {
		hkprintf("Corrupt UNDO record, undo_data_bytes %d/%d\n",
			undo->undo_data_bytes, bytes);
		return(EIO);
	}

	bytes = undo->undo_data_bytes;

	/*
	 * The undo offset may only be a zone-1 (raw volume, used for the
	 * volume header) or zone-2 (raw buffer) offset; both cases are
	 * handled in the switch below.
	 */
	zone = HAMMER_ZONE_DECODE(undo->undo_offset);
	offset = undo->undo_offset & HAMMER_BUFMASK;

	if (offset + bytes > HAMMER_BUFSIZE) {
		hkprintf("Corrupt UNDO record, bad offset\n");
		return (EIO);
	}

	switch(zone) {
	case HAMMER_ZONE_RAW_VOLUME_INDEX:
		vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
		volume = hammer_get_volume(hmp, vol_no, &error);
		if (volume == NULL) {
			hkprintf("UNDO record, cannot access volume %d\n",
				vol_no);
			break;
		}
		hammer_modify_volume_noundo(NULL, volume);
		hammer_recover_copy_undo(undo->undo_offset,
					 (char *)(undo + 1),
					 (char *)volume->ondisk + offset,
					 bytes);
		hammer_modify_volume_done(volume);

		/*
		 * Multiple modifications may be made to the same buffer.
		 * Also, the volume header cannot be written out until
		 * everything else has been flushed.  This also
		 * covers the read-only case by preventing the kernel from
		 * flushing the buffer.
		 *
		 * The first modification leaves the volume referenced
		 * (recovered flag set); subsequent ones release the
		 * extra ref taken by hammer_get_volume().
		 */
		if (volume->io.recovered == 0)
			volume->io.recovered = 1;
		else
			hammer_rel_volume(volume, 0);
		break;
	case HAMMER_ZONE_RAW_BUFFER_INDEX:
		buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
		buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
					   0, &error);
		if (buffer == NULL) {
			hkprintf("UNDO record, cannot access buffer %016jx\n",
				(intmax_t)undo->undo_offset);
			break;
		}
		hammer_modify_buffer_noundo(NULL, buffer);
		hammer_recover_copy_undo(undo->undo_offset,
					 (char *)(undo + 1),
					 (char *)buffer->ondisk + offset,
					 bytes);
		hammer_modify_buffer_done(buffer);

		/*
		 * Multiple modifications may be made to the same buffer,
		 * improve performance by delaying the flush.  This also
		 * covers the read-only case by preventing the kernel from
		 * flushing the buffer.
		 */
		if (buffer->io.recovered == 0)
			buffer->io.recovered = 1;
		else
			hammer_rel_buffer(buffer, 0);
		break;
	default:
		hkprintf("Corrupt UNDO record\n");
		error = EIO;
	}
	return (error);
}

/*
 * Copy an UNDO payload (src) back over the target media image (dst),
 * with optional debug logging.
 */
static void
hammer_recover_copy_undo(hammer_off_t undo_offset,
			 char *src, char *dst, int bytes)
{
	if (hammer_debug_general & 0x0080) {
		hdkprintf("UNDO %016jx: %d\n",
			(intmax_t)undo_offset, bytes);
	}
#if 0
	hkprintf("UNDO %016jx:", (intmax_t)undo_offset);
	hammer_recover_debug_dump(22, dst, bytes);
	kprintf("%22s", "to:");
	hammer_recover_debug_dump(22, src, bytes);
#endif
	bcopy(src, dst, bytes);
}

/*
 * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
 * during the backwards scan of the extended UNDO/REDO FIFO.  This scan
 * does not include the nominal UNDO range, just the extended range.
1162 */ 1163 int 1164 hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root, 1165 hammer_off_t scan_offset, hammer_fifo_redo_t redo) 1166 { 1167 hammer_rterm_t rterm; 1168 hammer_rterm_t nrterm; 1169 hammer_rterm_entry_t rte; 1170 1171 if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO) 1172 return(0); 1173 if (redo->redo_flags != HAMMER_REDO_TERM_WRITE && 1174 redo->redo_flags != HAMMER_REDO_TERM_TRUNC) { 1175 return(0); 1176 } 1177 1178 nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO); 1179 nrterm->redo_objid = redo->redo_objid; 1180 nrterm->redo_localization = redo->redo_localization; 1181 nrterm->redo_flags = redo->redo_flags; 1182 nrterm->redo_offset = redo->redo_offset; 1183 1184 rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm); 1185 if (rterm) 1186 kfree(nrterm, hmp->m_misc); 1187 else 1188 rterm = nrterm; 1189 1190 if (bootverbose) { 1191 hkprintf("record record %016jx objid %016jx " 1192 "offset %016jx flags %08x\n", 1193 (intmax_t)scan_offset, 1194 (intmax_t)redo->redo_objid, 1195 (intmax_t)redo->redo_offset, 1196 (int)redo->redo_flags); 1197 } 1198 1199 /* 1200 * Scan in reverse order, rte prepended, so the rte list will be 1201 * in forward order. 1202 */ 1203 rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO); 1204 rte->fifo_offset = scan_offset; 1205 rte->next = rterm->term_list; 1206 rterm->term_list = rte; 1207 1208 return(0); 1209 } 1210 1211 /* 1212 * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during 1213 * the forwards scan of the entire extended UNDO/REDO FIFO range. 1214 * 1215 * Records matching previously recorded TERMs have already been committed 1216 * and are ignored. 
 */
int
hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
			hammer_off_t scan_offset, hammer_fifo_redo_t redo)
{
	struct hammer_rterm rtval;
	hammer_rterm_t rterm;
	hammer_rterm_entry_t rte;

	if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
		return(0);

	switch(redo->redo_flags) {
	case HAMMER_REDO_WRITE:
	case HAMMER_REDO_TRUNC:
		/*
		 * We hit a REDO request.  The REDO request is only executed
		 * if there is no matching TERM.
		 *
		 * The lookup key uses the corresponding TERM flag because
		 * the rterm tree (built by hammer_recover_redo_rec) only
		 * contains TERM records.
		 */
		bzero(&rtval, sizeof(rtval));
		rtval.redo_objid = redo->redo_objid;
		rtval.redo_localization = redo->redo_localization;
		rtval.redo_offset = redo->redo_offset;
		rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
				   HAMMER_REDO_TERM_WRITE :
				   HAMMER_REDO_TERM_TRUNC;

		rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
		if (rterm) {
			if (bootverbose) {
				hkprintf("ignore record %016jx objid %016jx "
					"offset %016jx flags %08x\n",
					(intmax_t)scan_offset,
					(intmax_t)redo->redo_objid,
					(intmax_t)redo->redo_offset,
					(int)redo->redo_flags);
			}
			break;
		}
		if (bootverbose) {
			hkprintf("run record %016jx objid %016jx "
				"offset %016jx flags %08x\n",
				(intmax_t)scan_offset,
				(intmax_t)redo->redo_objid,
				(intmax_t)redo->redo_offset,
				(int)redo->redo_flags);
		}

		/*
		 * Redo stage2 can access a live filesystem, acquire the
		 * vnode.
		 */
		hammer_recover_redo_exec(hmp, redo);
		break;
	case HAMMER_REDO_TERM_WRITE:
	case HAMMER_REDO_TERM_TRUNC:
		/*
		 * As we encounter TERMs in the forward scan we remove
		 * them.  Once the forward scan hits the nominal undo range
		 * there will be no more recorded TERMs.
		 *
		 * Only the head of the rterm's entry list is consumed per
		 * TERM; entries were prepended in reverse scan order so
		 * the list head matches the current forward position.
		 */
		bzero(&rtval, sizeof(rtval));
		rtval.redo_objid = redo->redo_objid;
		rtval.redo_localization = redo->redo_localization;
		rtval.redo_flags = redo->redo_flags;
		rtval.redo_offset = redo->redo_offset;

		rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
		if (rterm) {
			if ((rte = rterm->term_list) != NULL) {
				KKASSERT(rte->fifo_offset == scan_offset);
				rterm->term_list = rte->next;
				kfree(rte, hmp->m_misc);
			}
		}
		break;
	}
	/* other redo_flags values are deliberately ignored */
	return(0);
}

/*
 * Execute a single REDO operation (WRITE or TRUNC) against the live
 * filesystem: look up the inode by objid/localization, acquire its
 * vnode, and replay the write or truncation.  Errors are logged but
 * not propagated -- REDO replay is best-effort.
 */
static void
hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
{
	struct hammer_transaction trans;
	struct vattr va;
	struct hammer_inode *ip;
	struct vnode *vp = NULL;
	int error;

	hammer_start_transaction(&trans, hmp);

	ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
			      HAMMER_MAX_TID, redo->redo_localization,
			      0, &error);
	if (ip == NULL) {
		hkprintf("unable to find objid %016jx:%08x\n",
			(intmax_t)redo->redo_objid, redo->redo_localization);
		goto done2;
	}
	error = hammer_get_vnode(ip, &vp);
	if (error) {
		hkprintf("unable to acquire vnode for %016jx:%08x\n",
			(intmax_t)redo->redo_objid, redo->redo_localization);
		goto done1;
	}

	switch(redo->redo_flags) {
	case HAMMER_REDO_WRITE:
		error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
		if (error) {
			hkprintf("vn_rdwr open %016jx:%08x returned %d\n",
				(intmax_t)redo->redo_objid,
				redo->redo_localization, error);
			break;
		}
		/*
		 * Drop the vnode lock around vn_rdwr (which manages its
		 * own locking), then re-lock for VOP_CLOSE/vput below.
		 * The REDO payload immediately follows the redo header.
		 */
		vn_unlock(vp);
		error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
				redo->redo_data_bytes,
				redo->redo_offset, UIO_SYSSPACE,
				0, proc0.p_ucred, NULL);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		if (error) {
			hkprintf("write %016jx:%08x returned %d\n",
				(intmax_t)redo->redo_objid,
				redo->redo_localization, error);
		}
		VOP_CLOSE(vp, FREAD|FWRITE, NULL);
		break;
	case HAMMER_REDO_TRUNC:
		/* replay the truncation as a size-only setattr */
		VATTR_NULL(&va);
		va.va_size = redo->redo_offset;
		error = VOP_SETATTR(vp, &va, proc0.p_ucred);
		if (error) {
			hkprintf("setattr offset %016jx error %d\n",
				(intmax_t)redo->redo_offset, error);
		}
		break;
	}
	vput(vp);
done1:
	hammer_rel_inode(ip, 0);
done2:
	hammer_done_transaction(&trans);
}

/*
 * RB tree compare function.  Note that REDO_TERM_TRUNC ops ignore
 * the offset.
 *
 * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc.
 *
 * Ordering: objid, then localization, then flags, then (except for
 * TERM_TRUNC) offset.
 */
static int
hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
{
	if (rt1->redo_objid < rt2->redo_objid)
		return(-1);
	if (rt1->redo_objid > rt2->redo_objid)
		return(1);
	if (rt1->redo_localization < rt2->redo_localization)
		return(-1);
	if (rt1->redo_localization > rt2->redo_localization)
		return(1);
	if (rt1->redo_flags < rt2->redo_flags)
		return(-1);
	if (rt1->redo_flags > rt2->redo_flags)
		return(1);
	if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
		if (rt1->redo_offset < rt2->redo_offset)
			return(-1);
		if (rt1->redo_offset > rt2->redo_offset)
			return(1);
	}
	return(0);
}

#if 0

/* hex-dump 'bytes' bytes of buf, 16 per line, indented by w columns */
static void
hammer_recover_debug_dump(int w, char *buf, int bytes)
{
	int i;

	for (i = 0; i < bytes; ++i) {
		if (i && (i & 15) == 0)
			kprintf("\n%*.*s", w, w, "");
		kprintf(" %02x", (unsigned char)buf[i]);
	}
	kprintf("\n");
}

#endif

/*
 * Flush recovered buffers from recovery operations.  The call to this
 * routine may be delayed if a read-only mount was made and then later
 * upgraded to read-write.  This routine is also called when unmounting
 * a read-only mount to clean out recovered (dirty) buffers which we
 * couldn't flush (because the mount is read-only).
 *
 * The volume header is always written last.
The UNDO FIFO will be forced 1417 * to zero-length by setting next_offset to first_offset. This leaves the 1418 * (now stale) UNDO information used to recover the disk available for 1419 * forensic analysis. 1420 * 1421 * final is typically 0 or 1. The volume header is only written if final 1422 * is 1. If final is -1 the recovered buffers are discarded instead of 1423 * written and root_volume can also be passed as NULL in that case. 1424 */ 1425 static int hammer_recover_flush_volume_callback(hammer_volume_t, void *); 1426 static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *); 1427 1428 void 1429 hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume, 1430 int final) 1431 { 1432 /* 1433 * Flush the buffers out asynchronously, wait for all the I/O to 1434 * complete, then do it again to destroy the buffer cache buffer 1435 * so it doesn't alias something later on. 1436 */ 1437 RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL, 1438 hammer_recover_flush_buffer_callback, &final); 1439 hammer_io_wait_all(hmp, "hmrrcw", 1); 1440 RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL, 1441 hammer_recover_flush_buffer_callback, &final); 1442 1443 /* 1444 * Flush all volume headers except the root volume. If final < 0 1445 * we discard all volume headers including the root volume. 1446 */ 1447 if (final >= 0) { 1448 RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL, 1449 hammer_recover_flush_volume_callback, root_volume); 1450 } else { 1451 RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL, 1452 hammer_recover_flush_volume_callback, NULL); 1453 } 1454 1455 /* 1456 * Finalize the root volume header. 1457 * 1458 * No interlock is needed, volume buffers are not 1459 * messed with by bioops. 
1460 */ 1461 if (root_volume && root_volume->io.recovered && final > 0) { 1462 hammer_io_wait_all(hmp, "hmrflx", 1); 1463 root_volume->io.recovered = 0; 1464 hammer_io_flush(&root_volume->io, 0); 1465 hammer_rel_volume(root_volume, 0); 1466 hammer_io_wait_all(hmp, "hmrfly", 1); 1467 } 1468 } 1469 1470 /* 1471 * Callback to flush volume headers. If discarding data will be NULL and 1472 * all volume headers (including the root volume) will be discarded. 1473 * Otherwise data is the root_volume and we flush all volume headers 1474 * EXCEPT the root_volume. 1475 * 1476 * Clear any I/O error or modified condition when discarding buffers to 1477 * clean up the reference count, otherwise the buffer may have extra refs 1478 * on it. 1479 */ 1480 static 1481 int 1482 hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data) 1483 { 1484 hammer_volume_t root_volume = data; 1485 1486 if (volume->io.recovered && volume != root_volume) { 1487 volume->io.recovered = 0; 1488 if (root_volume != NULL) { 1489 /* 1490 * No interlock is needed, volume buffers are not 1491 * messed with by bioops. 1492 */ 1493 hammer_io_flush(&volume->io, 0); 1494 } else { 1495 hammer_io_clear_error(&volume->io); 1496 hammer_io_clear_modify(&volume->io, 1); 1497 } 1498 hammer_rel_volume(volume, 0); 1499 } 1500 return(0); 1501 } 1502 1503 /* 1504 * Flush or discard recovered I/O buffers. 1505 * 1506 * Clear any I/O error or modified condition when discarding buffers to 1507 * clean up the reference count, otherwise the buffer may have extra refs 1508 * on it. 
1509 */ 1510 static 1511 int 1512 hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data) 1513 { 1514 int final = *(int *)data; 1515 int flush; 1516 1517 if (buffer->io.recovered) { 1518 buffer->io.recovered = 0; 1519 buffer->io.reclaim = 1; 1520 if (final < 0) { 1521 hammer_io_clear_error(&buffer->io); 1522 hammer_io_clear_modify(&buffer->io, 1); 1523 } else { 1524 hammer_io_write_interlock(&buffer->io); 1525 hammer_io_flush(&buffer->io, 0); 1526 hammer_io_done_interlock(&buffer->io); 1527 } 1528 hammer_rel_buffer(buffer, 0); 1529 } else { 1530 flush = hammer_ref_interlock(&buffer->io.lock); 1531 if (flush) 1532 atomic_add_int(&hammer_count_refedbufs, 1); 1533 1534 if (final < 0) { 1535 hammer_io_clear_error(&buffer->io); 1536 hammer_io_clear_modify(&buffer->io, 1); 1537 } 1538 KKASSERT(hammer_oneref(&buffer->io.lock)); 1539 buffer->io.reclaim = 1; 1540 hammer_rel_buffer(buffer, flush); 1541 } 1542 return(0); 1543 } 1544 1545