1 /* 2 * Copyright (c) 2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * UNDO ALGORITHM: 37 * 38 * The UNDO algorithm is trivial. The nominal UNDO range in the 39 * FIFO is determined by taking the first/next offset stored in 40 * the volume header. 
The next offset may not be correct since 41 * UNDO flushes are not required to flush the volume header, so 42 * the code also scans forward until it finds a discontinuous 43 * sequence number. 44 * 45 * The UNDOs are then scanned and executed in reverse order. These 46 * UNDOs are effectively just data restorations based on HAMMER offsets. 47 * 48 * REDO ALGORITHM: 49 * 50 * REDO records are laid down in the UNDO/REDO FIFO for nominal 51 * writes, truncations, and file extension ops. On a per-inode 52 * basis two types of REDO records are generated, REDO_WRITE 53 * and REDO_TRUNC. 54 * 55 * Essentially the recovery block will contain UNDO records backing 56 * out partial operations and REDO records to regenerate those partial 57 * operations guaranteed by the filesystem during recovery. 58 * 59 * REDO generation is optional, and can also be started and then 60 * later stopped due to excessive write()s inbetween fsyncs, or not 61 * started at all. Because of this the recovery code must determine 62 * when REDOs are valid and when they are not. Additional records are 63 * generated to help figure it out. 64 * 65 * The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated 66 * during a flush cycle indicating which records the flush cycle 67 * has synched meta-data for, and HAMMER_REDO_SYNC is generated in 68 * each flush cycle to indicate how far back in the UNDO/REDO FIFO 69 * the recovery code must go to find the earliest applicable REDO 70 * record. Applicable REDO records can be far outside the nominal 71 * UNDO recovery range, for example if a write() lays down a REDO but 72 * the related file is not flushed for several cycles. 73 * 74 * The SYNC reference is to a point prior to the nominal UNDO FIFO 75 * range, creating an extended REDO range which must be scanned. 
76 * 77 * Any REDO_WRITE/REDO_TRUNC encountered within the extended range 78 * which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records 79 * prior to the start of the nominal UNDO range are applicable. 80 * That is, any REDO_TERM_* records in the extended range but not in 81 * the nominal undo range will mask any redo operations for prior REDO 82 * records. This is necessary because once the TERM is laid down 83 * followup operations may make additional changes to the related 84 * records but not necessarily record them as REDOs (because REDOs are 85 * optional). 86 * 87 * REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range 88 * must be ignored since they represent meta-data flushes which are 89 * undone by the UNDOs in that nominal UNDO range by the recovery 90 * code. Only REDO_TERM_* records in the extended range but not 91 * in the nominal undo range are applicable. 92 * 93 * The REDO_SYNC record itself always exists in the nominal UNDO range 94 * (this is how the extended range is determined). For recovery 95 * purposes the most recent REDO_SYNC record is always used if several 96 * are found. 97 * 98 * CRASHES DURING UNDO/REDO 99 * 100 * A crash during the UNDO phase requires no additional effort. The 101 * UNDOs will simply be re-run again. The state of the UNDO/REDO fifo 102 * remains unchanged and has no re-crash issues. 103 * 104 * A crash during the REDO phase is more complex because the REDOs 105 * run normal filesystem ops and generate additional UNDO/REDO records. 106 * REDO is disabled during REDO recovery and any SYNC records generated 107 * by flushes during REDO recovery must continue to reference the 108 * original extended range. 109 * 110 * If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery 111 * may become impossible. This is detected when the start of the 112 * extended range fails to have monotonically increasing sequence 113 * numbers leading into the nominal undo range. 
 */

#include "hammer.h"

/*
 * Specify the way we want to handle stage2 errors.
 *
 * Following values are accepted:
 *
 * 0 - Run redo recovery normally and fail to mount if
 *     the operation fails (default).
 * 1 - Run redo recovery, but don't fail to mount if the
 *     operation fails.
 * 2 - Completely skip redo recovery (only for severe error
 *     conditions and/or debugging).
 */
static int hammer_skip_redo = 0;
TUNABLE_INT("vfs.hammer.skip_redo", &hammer_skip_redo);

/*
 * Each rterm entry has a list of fifo offsets indicating termination
 * points.  These are stripped as the scan progresses.
 */
typedef struct hammer_rterm_entry {
	struct hammer_rterm_entry *next;
	hammer_off_t		fifo_offset;	/* FIFO offset of a REDO_TERM_* record */
} *hammer_rterm_entry_t;

/*
 * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
 * TRUNC entries ignore the offset.
 */
typedef struct hammer_rterm {
	RB_ENTRY(hammer_rterm)	rb_node;
	int64_t			redo_objid;
	u_int32_t		redo_localization;
	u_int32_t		redo_flags;
	hammer_off_t		redo_offset;
	hammer_rterm_entry_t	term_list;	/* list of matching TERM fifo offsets */
} *hammer_rterm_t;

static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
struct hammer_rterm_rb_tree;
RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
			hammer_off_t end_off);
static int hammer_check_head_signature(hammer_fifo_head_t head,
			hammer_off_t beg_off);
static void hammer_recover_copy_undo(hammer_off_t undo_offset,
			char *src, char *dst, int bytes);
static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
			hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp);
static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
			hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp);
#if 0
static void hammer_recover_debug_dump(int w, char *buf, int bytes);
#endif
static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_fifo_undo_t undo);
static int hammer_recover_redo_rec(hammer_mount_t hmp,
			struct hammer_rterm_rb_tree *root,
			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static int hammer_recover_redo_run(hammer_mount_t hmp,
			struct hammer_rterm_rb_tree *root,
			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static void hammer_recover_redo_exec(hammer_mount_t hmp,
			hammer_fifo_redo_t redo);

RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

/*
 * Recover filesystem meta-data on mount.  This procedure figures out the
 * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
 * resynchronized by this procedure.
 *
 * This procedure is run near the beginning of the mount sequence, before
 * any B-Tree or high-level accesses are enabled, and is responsible for
 * restoring the meta-data to a consistent state.  High level HAMMER data
 * structures (such as the B-Tree) cannot be accessed here.
 *
 * Returns 0 on success or an errno (typically EIO) on failure, in which
 * case the mount should be aborted.
 *
 * NOTE: No information from the root volume has been cached in the
 *	 hammer_mount structure yet, so we need to access the root volume's
 *	 buffer directly.
 */
int
hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
{
	hammer_blockmap_t rootmap;
	hammer_buffer_t buffer;
	hammer_off_t scan_offset;
	hammer_off_t scan_offset_save;
	hammer_off_t bytes;
	hammer_fifo_any_t head;
	hammer_off_t first_offset;
	hammer_off_t last_offset;
	u_int32_t seqno;
	int error;
	int degenerate_case = 0;

	/*
	 * Examine the UNDO FIFO indices in the volume header.
	 */
	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	first_offset = rootmap->first_offset;
	last_offset = rootmap->next_offset;
	buffer = NULL;
	error = 0;

	hmp->recover_stage2_offset = 0;

	if (first_offset > rootmap->alloc_offset ||
	    last_offset > rootmap->alloc_offset) {
		kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
			"%016jx, %016jx limit %016jx\n",
			root_volume->ondisk->vol_name,
			(intmax_t)first_offset,
			(intmax_t)last_offset,
			(intmax_t)rootmap->alloc_offset);
		error = EIO;
		goto done;
	}

	/*
	 * In HAMMER version 4+ filesystems the volume header does NOT
	 * contain definitive UNDO FIFO state.  In particular, the
	 * rootmap->next_offset may not be indexed completely to the
	 * end of the active UNDO FIFO.
	 */
	if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
		/*
		 * To find the definitive range we must first scan backwards
		 * from first_offset to locate the first real record and
		 * extract the sequence number from it.  This record is not
		 * part of the active undo space.
		 */
		scan_offset = first_offset;
		seqno = 0;

		for (;;) {
			head = hammer_recover_scan_rev(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			/* skip PADs, they carry no sequence number */
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				seqno = head->head.hdr_seq;
				break;
			}
		}
		if (error) {
			kprintf("HAMMER(%s) recovery failure "
				"during seqno backscan\n",
				root_volume->ondisk->vol_name);
			goto done;
		}

		/*
		 * Scan forwards from first_offset and (seqno+1) looking
		 * for a sequence space discontinuity.  This denotes the
		 * end of the active FIFO area.
		 *
		 * NOTE: For the case where the FIFO is empty the very first
		 *	 record we find will be discontinuous.
		 *
		 * NOTE: Do not include trailing PADs in the scan range,
		 *	 and remember the returned scan_offset after a
		 *	 fwd iteration points to the end of the returned
		 *	 record.
		 */
		kprintf("HAMMER(%s) recovery check seqno=%08x\n",
			root_volume->ondisk->vol_name,
			seqno);

		scan_offset = first_offset;
		scan_offset_save = scan_offset;
		++seqno;
		hmp->recover_stage2_seqno = seqno;

		for (;;) {
			head = hammer_recover_scan_fwd(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				if (seqno != head->head.hdr_seq) {
					/*
					 * Discontinuity found, back up to
					 * the end of the last in-sequence
					 * record.
					 */
					scan_offset = scan_offset_save;
					break;
				}
				scan_offset_save = scan_offset;
				++seqno;
			}

#if 0
			/*
			 * If the forward scan is grossly ahead of last_offset
			 * then something is wrong.  last_offset is supposed
			 * to be flushed out
			 */
			if (last_offset >= scan_offset) {
				bytes = last_offset - scan_offset;
			} else {
				bytes = rootmap->alloc_offset - scan_offset +
					(last_offset & HAMMER_OFF_LONG_MASK);
			}
			if (bytes >
			    (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
			    4 / 5) {
				kprintf("HAMMER(%s) recovery forward scan is "
					"grossly beyond the last_offset in "
					"the volume header, this can't be "
					"right.\n",
					root_volume->ondisk->vol_name);
				error = EIO;
				break;
			}
#endif
		}

		/*
		 * Store the seqno.  This will be the next seqno we lay down
		 * when generating new UNDOs.
		 */
		hmp->undo_seqno = seqno;
		if (error) {
			kprintf("HAMMER(%s) recovery failure "
				"during seqno fwdscan\n",
				root_volume->ondisk->vol_name);
			goto done;
		}
		last_offset = scan_offset;
		kprintf("HAMMER(%s) recovery range %016jx-%016jx\n"
			"HAMMER(%s) recovery nexto %016jx endseqno=%08x\n",
			root_volume->ondisk->vol_name,
			(intmax_t)first_offset,
			(intmax_t)last_offset,
			root_volume->ondisk->vol_name,
			(intmax_t)rootmap->next_offset,
			seqno);
	}

	/*
	 * Calculate the size of the active portion of the FIFO.  If the
	 * FIFO is empty the filesystem is clean and no further action is
	 * needed.  The FIFO is circular, hence the wrap-around arithmetic
	 * in the else case.
	 */
	if (last_offset >= first_offset) {
		bytes = last_offset - first_offset;
	} else {
		bytes = rootmap->alloc_offset - first_offset +
			(last_offset & HAMMER_OFF_LONG_MASK);
	}
	if (bytes == 0) {
		degenerate_case = 1;
		error = 0;
		goto done;
	}

	kprintf("HAMMER(%s) recovery undo %016jx-%016jx (%jd bytes)%s\n",
		root_volume->ondisk->vol_name,
		(intmax_t)first_offset,
		(intmax_t)last_offset,
		(intmax_t)bytes,
		(hmp->ronly ? " (RO)" : "(RW)"));
	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
		kprintf("Undo size is absurd, unable to mount\n");
		error = EIO;
		goto done;
	}

	/*
	 * Scan the UNDOs backwards.
	 */
	scan_offset = last_offset;

	while ((int64_t)bytes > 0) {
		KKASSERT(scan_offset != first_offset);
		head = hammer_recover_scan_rev(hmp, root_volume,
					       &scan_offset, &error, &buffer);
		if (error)
			break;

		/*
		 * Normal UNDO
		 */
		error = hammer_recover_undo(hmp, root_volume, &head->undo);
		if (error) {
			kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
				root_volume->ondisk->vol_name,
				(intmax_t)scan_offset - head->head.hdr_size);
			break;
		}

		/*
		 * The first REDO_SYNC record encountered (scanning backwards)
		 * enables REDO processing.
		 */
		if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
		    head->redo.redo_flags == HAMMER_REDO_SYNC) {
			if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
				kprintf("HAMMER(%s) Ignoring extra REDO_SYNC "
					"records in UNDO/REDO FIFO.\n",
					root_volume->ondisk->vol_name
				);
			} else {
				hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
				hmp->recover_stage2_offset =
					head->redo.redo_offset;
				kprintf("HAMMER(%s) Found REDO_SYNC %016jx\n",
					root_volume->ondisk->vol_name,
					(intmax_t)head->redo.redo_offset);
			}
		}

		bytes -= head->head.hdr_size;

		/*
		 * If too many dirty buffers have built up we have to flush'm
		 * out.  As long as we do not flush out the volume header
		 * a crash here should not cause any problems.
		 *
		 * buffer must be released so the flush can assert that
		 * all buffers are idle.
		 */
		if (hammer_flusher_meta_limit(hmp)) {
			if (buffer) {
				hammer_rel_buffer(buffer, 0);
				buffer = NULL;
			}
			if (hmp->ronly == 0) {
				hammer_recover_flush_buffers(hmp, root_volume,
							     0);
				kprintf("HAMMER(%s) Continuing recovery\n",
					root_volume->ondisk->vol_name);
			} else {
				kprintf("HAMMER(%s) Recovery failure: "
					"Insufficient buffer cache to hold "
					"dirty buffers on read-only mount!\n",
					root_volume->ondisk->vol_name);
				error = EIO;
				break;
			}
		}
	}
	KKASSERT(error || bytes == 0);
done:
	if (buffer) {
		hammer_rel_buffer(buffer, 0);
		buffer = NULL;
	}

	/*
	 * After completely flushing all the recovered buffers the volume
	 * header will also be flushed.
	 */
	if (root_volume->io.recovered == 0) {
		hammer_ref_volume(root_volume);
		root_volume->io.recovered = 1;
	}

	/*
	 * Finish up flushing (or discarding) recovered buffers.  FIFO
	 * indices in the volume header are updated to the actual undo
	 * range but will not be collapsed until stage 2.
	 */
	if (error == 0) {
		hammer_modify_volume_noundo(NULL, root_volume);
		rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
		rootmap->first_offset = first_offset;
		rootmap->next_offset = last_offset;
		hammer_modify_volume_done(root_volume);
		if (hmp->ronly == 0)
			hammer_recover_flush_buffers(hmp, root_volume, 1);
	} else {
		hammer_recover_flush_buffers(hmp, root_volume, -1);
	}
	if (degenerate_case == 0) {
		kprintf("HAMMER(%s) recovery complete\n",
			root_volume->ondisk->vol_name);
	} else {
		kprintf("HAMMER(%s) mounted clean, no recovery needed\n",
			root_volume->ondisk->vol_name);
	}
	return (error);
}

/*
 * Execute redo operations
 *
 * This procedure is run at the end of the mount sequence, after the hammer
 * mount structure has been completely initialized but before the filesystem
 * goes live.  It can access standard cursors, the B-Tree, flush the
 * filesystem, and so forth.
 *
 * This code may only be called for read-write mounts or when a mount
 * switches from read-only to read-write.  vnodes may or may not be present.
 *
 * The stage1 code will have already calculated the correct FIFO range
 * for the nominal UNDO FIFO and stored it in the rootmap.  The extended
 * range for REDO is stored in hmp->recover_stage2_offset.
 */
int
hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
{
	hammer_blockmap_t rootmap;
	hammer_buffer_t buffer;
	hammer_off_t scan_offset;
	hammer_off_t oscan_offset;
	hammer_off_t bytes;
	hammer_off_t ext_bytes;
	hammer_fifo_any_t head;
	hammer_off_t first_offset;
	hammer_off_t last_offset;
	hammer_off_t ext_offset;
	struct hammer_rterm_rb_tree rterm_root;
	u_int32_t seqno;
	int error;
	int verbose = 0;
	int dorscan;

	/*
	 * Stage 2 can only be run on a RW mount, or when the mount is
	 * switched from RO to RW.
	 */
	KKASSERT(hmp->ronly == 0);
	RB_INIT(&rterm_root);

	if (hammer_skip_redo == 1)
		kprintf("HAMMER(%s) recovery redo marked as optional\n",
			root_volume->ondisk->vol_name);

	if (hammer_skip_redo == 2) {
		kprintf("HAMMER(%s) recovery redo skipped.\n",
			root_volume->ondisk->vol_name);
		return (0);
	}

	/*
	 * Examine the UNDO FIFO.  If it is empty the filesystem is clean
	 * and no action need be taken.
	 */
	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	first_offset = rootmap->first_offset;
	last_offset = rootmap->next_offset;
	if (first_offset == last_offset) {
		KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
		return(0);
	}

	/*
	 * Stage2 must only be run once, and will not be run at all
	 * if Stage1 did not find a REDO_SYNC record.
	 */
	error = 0;
	buffer = NULL;

	if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
		goto done;
	hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
	hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
	ext_offset = hmp->recover_stage2_offset;
	if (ext_offset == 0) {
		kprintf("HAMMER(%s) REDO stage specified but no REDO_SYNC "
			"offset, ignoring\n",
			root_volume->ondisk->vol_name);
		goto done;
	}

	/*
	 * Calculate nominal UNDO range (this is not yet the extended
	 * range).
	 */
	if (last_offset >= first_offset) {
		bytes = last_offset - first_offset;
	} else {
		bytes = rootmap->alloc_offset - first_offset +
			(last_offset & HAMMER_OFF_LONG_MASK);
	}
	kprintf("HAMMER(%s) recovery redo %016jx-%016jx (%jd bytes)%s\n",
		root_volume->ondisk->vol_name,
		(intmax_t)first_offset,
		(intmax_t)last_offset,
		(intmax_t)bytes,
		(hmp->ronly ? " (RO)" : "(RW)"));
	verbose = 1;
	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
		kprintf("Undo size is absurd, unable to mount\n");
		error = EIO;
		goto fatal;
	}

	/*
	 * Scan the REDOs backwards collecting REDO_TERM_* information.
	 * This information is only collected for the extended range,
	 * non-inclusive of any TERMs in the nominal UNDO range.
	 *
	 * If the stage2 extended range is inside the nominal undo range
	 * we have nothing to scan.
	 *
	 * ext_bytes is positive when ext_offset lies before the nominal
	 * range (dorscan=1) and negative when it lies inside it (dorscan=0).
	 * In both cases (bytes + ext_bytes) below ends up spanning
	 * ext_offset..last_offset for the forward pass.
	 *
	 * This must fit in memory!
	 */
	if (first_offset < last_offset) {
		/*
		 * [ first_offset........last_offset ]
		 */
		if (ext_offset < first_offset) {
			dorscan = 1;
			ext_bytes = first_offset - ext_offset;
		} else if (ext_offset > last_offset) {
			/* extended range wraps around the FIFO end */
			dorscan = 1;
			ext_bytes = (rootmap->alloc_offset - ext_offset) +
				    (first_offset & HAMMER_OFF_LONG_MASK);
		} else {
			ext_bytes = -(ext_offset - first_offset);
			dorscan = 0;
		}
	} else {
		/*
		 * [......last_offset first_offset.....]
		 */
		if (ext_offset < last_offset) {
			ext_bytes = -((rootmap->alloc_offset - first_offset) +
				      (ext_offset & HAMMER_OFF_LONG_MASK));
			dorscan = 0;
		} else if (ext_offset > first_offset) {
			ext_bytes = -(ext_offset - first_offset);
			dorscan = 0;
		} else {
			ext_bytes = first_offset - ext_offset;
			dorscan = 1;
		}
	}

	if (dorscan) {
		scan_offset = first_offset;
		kprintf("HAMMER(%s) Find extended redo %016jx, %jd extbytes\n",
			root_volume->ondisk->vol_name,
			(intmax_t)ext_offset,
			(intmax_t)ext_bytes);
		seqno = hmp->recover_stage2_seqno - 1;
		for (;;) {
			head = hammer_recover_scan_rev(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				/*
				 * Sequence numbers must decrease
				 * monotonically or the extended range
				 * has been overwritten (FIFO wrapped).
				 */
				if (head->head.hdr_seq != seqno) {
					error = ERANGE;
					break;
				}
				error = hammer_recover_redo_rec(
						hmp, &rterm_root,
						scan_offset, &head->redo);
				--seqno;
			}
			if (scan_offset == ext_offset)
				break;
		}
		if (error) {
			kprintf("HAMMER(%s) Find extended redo failed %d, "
				"unable to run REDO\n",
				root_volume->ondisk->vol_name,
				error);
			goto done;
		}
	} else {
		kprintf("HAMMER(%s) Embedded extended redo %016jx, "
			"%jd extbytes\n",
			root_volume->ondisk->vol_name,
			(intmax_t)ext_offset,
			(intmax_t)ext_bytes);
	}

	/*
	 * Scan the REDO forwards through the entire extended range.
	 * Anything with a previously recorded matching TERM is discarded.
	 */
	scan_offset = ext_offset;
	bytes += ext_bytes;

	/*
	 * NOTE: when doing a forward scan the returned scan_offset is
	 *	 for the record following the returned record, so we
	 *	 have to play a bit.
	 */
	while ((int64_t)bytes > 0) {
		KKASSERT(scan_offset != last_offset);

		oscan_offset = scan_offset;
		head = hammer_recover_scan_fwd(hmp, root_volume,
					       &scan_offset, &error, &buffer);
		if (error)
			break;

		error = hammer_recover_redo_run(hmp, &rterm_root,
						oscan_offset, &head->redo);
		if (error) {
			kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
				root_volume->ondisk->vol_name,
				(intmax_t)scan_offset - head->head.hdr_size);
			break;
		}
		bytes -= head->head.hdr_size;
	}
	KKASSERT(error || bytes == 0);

done:
	if (buffer) {
		hammer_rel_buffer(buffer, 0);
		buffer = NULL;
	}

	/*
	 * Cleanup rterm tree
	 */
	{
		hammer_rterm_t rterm;
		hammer_rterm_entry_t rte;

		while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
			RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
			while ((rte = rterm->term_list) != NULL) {
				rterm->term_list = rte->next;
				kfree(rte, hmp->m_misc);
			}
			kfree(rterm, hmp->m_misc);
		}
	}

	/*
	 * Finish up flushing (or discarding) recovered buffers by executing
	 * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
	 * case tests and forces the flush in order to update the FIFO indices.
	 *
	 * If a crash occurs during the flush the entire undo/redo will be
	 * re-run during recovery on the next mount.
	 */
	if (error == 0) {
		if (rootmap->first_offset != rootmap->next_offset)
			hmp->hflags |= HMNT_UNDO_DIRTY;
		hammer_flusher_sync(hmp);
	}
fatal:
	hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
	if (verbose) {
		kprintf("HAMMER(%s) End redo recovery\n",
			root_volume->ondisk->vol_name);
	}

	if (error && hammer_skip_redo == 1)
		kprintf("HAMMER(%s) recovery redo error %d, "
			" skipping.\n", root_volume->ondisk->vol_name,
			error);

	return (hammer_skip_redo ?
			0 : error);
}

/*
 * Scan backwards from *scan_offsetp, return the FIFO record prior to the
 * record at *scan_offsetp or NULL if an error occurred.
 *
 * On return *scan_offsetp will be the offset of the returned record.
 */
hammer_fifo_any_t
hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp)
{
	hammer_off_t scan_offset;
	hammer_blockmap_t rootmap;
	hammer_fifo_any_t head;
	hammer_fifo_tail_t tail;

	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	scan_offset = *scan_offsetp;

	if (hammer_debug_general & 0x0080)
		kprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
	/* wrap backwards past the start of the circular FIFO */
	if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
		scan_offset = rootmap->alloc_offset;
	if (scan_offset - sizeof(*tail) <
	    HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
		kprintf("HAMMER(%s) UNDO record at %016jx FIFO underflow\n",
			root_volume->ondisk->vol_name,
			(intmax_t)scan_offset);
		*errorp = EIO;
		return (NULL);
	}
	/* read the tail of the previous record to learn its size */
	tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
			    errorp, bufferp);
	if (*errorp) {
		kprintf("HAMMER(%s) Unable to read UNDO TAIL "
			"at %016jx\n",
			root_volume->ondisk->vol_name,
			(intmax_t)scan_offset - sizeof(*tail));
		return (NULL);
	}

	if (hammer_check_tail_signature(tail, scan_offset) != 0) {
		kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
			"at %016jx\n",
			root_volume->ondisk->vol_name,
			(intmax_t)scan_offset - sizeof(*tail));
		*errorp = EIO;
		return (NULL);
	}
	/* back up from the tail to the head of the same record */
	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
	*scan_offsetp = scan_offset - head->head.hdr_size;

	return (head);
}

/*
 * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
 * an error occurred.
838 * 839 * On return *scan_offsetp will be the offset of the record following 840 * the returned record. 841 */ 842 hammer_fifo_any_t 843 hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume, 844 hammer_off_t *scan_offsetp, 845 int *errorp, struct hammer_buffer **bufferp) 846 { 847 hammer_off_t scan_offset; 848 hammer_blockmap_t rootmap; 849 hammer_fifo_any_t head; 850 851 rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX]; 852 scan_offset = *scan_offsetp; 853 854 if (hammer_debug_general & 0x0080) 855 kprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset); 856 if (scan_offset == rootmap->alloc_offset) 857 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0); 858 859 head = hammer_bread(hmp, scan_offset, errorp, bufferp); 860 if (*errorp) { 861 kprintf("HAMMER(%s) Unable to read UNDO HEAD at %016jx\n", 862 root_volume->ondisk->vol_name, 863 (intmax_t)scan_offset); 864 return (NULL); 865 } 866 867 if (hammer_check_head_signature(&head->head, scan_offset) != 0) { 868 kprintf("HAMMER(%s) Illegal UNDO TAIL signature " 869 "at %016jx\n", 870 root_volume->ondisk->vol_name, 871 (intmax_t)scan_offset); 872 *errorp = EIO; 873 return (NULL); 874 } 875 scan_offset += head->head.hdr_size; 876 if (scan_offset == rootmap->alloc_offset) 877 scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0); 878 *scan_offsetp = scan_offset; 879 880 return (head); 881 } 882 883 /* 884 * Helper function for hammer_check_{head,tail}_signature(). Check stuff 885 * once the head and tail has been established. 886 * 887 * This function validates the entire FIFO record wrapper. 888 */ 889 static __inline 890 int 891 _hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail, 892 hammer_off_t beg_off) 893 { 894 hammer_off_t end_off; 895 u_int32_t crc; 896 int bytes; 897 898 /* 899 * Check signatures. The tail signature is allowed to be the 900 * head signature only for 8-byte PADs. 
901 */ 902 if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) { 903 kprintf("HAMMER: FIFO record bad head signature " 904 "%04x at %016jx\n", 905 head->hdr_signature, 906 (intmax_t)beg_off); 907 return(2); 908 } 909 if (head->hdr_size < HAMMER_HEAD_ALIGN || 910 (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) { 911 kprintf("HAMMER: FIFO record unaligned or bad size" 912 "%04x at %016jx\n", 913 head->hdr_size, 914 (intmax_t)beg_off); 915 return(2); 916 } 917 end_off = beg_off + head->hdr_size; 918 919 if (head->hdr_type != HAMMER_HEAD_TYPE_PAD || 920 (size_t)(end_off - beg_off) != sizeof(*tail)) { 921 if (head->hdr_type != tail->tail_type) { 922 kprintf("HAMMER: FIFO record head/tail type mismatch " 923 "%04x %04x at %016jx\n", 924 head->hdr_type, tail->tail_type, 925 (intmax_t)beg_off); 926 return(2); 927 } 928 if (head->hdr_size != tail->tail_size) { 929 kprintf("HAMMER: FIFO record head/tail size mismatch " 930 "%04x %04x at %016jx\n", 931 head->hdr_size, tail->tail_size, 932 (intmax_t)beg_off); 933 return(2); 934 } 935 if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) { 936 kprintf("HAMMER: FIFO record bad tail signature " 937 "%04x at %016jx\n", 938 tail->tail_signature, 939 (intmax_t)beg_off); 940 return(3); 941 } 942 } 943 944 /* 945 * Non-PAD records must have a CRC and must be sized at 946 * least large enough to fit the head and tail. 
947 */ 948 if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) { 949 crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^ 950 crc32(head + 1, head->hdr_size - sizeof(*head)); 951 if (head->hdr_crc != crc) { 952 kprintf("HAMMER: FIFO record CRC failed %08x %08x " 953 "at %016jx\n", 954 head->hdr_crc, crc, 955 (intmax_t)beg_off); 956 return(EIO); 957 } 958 if (head->hdr_size < sizeof(*head) + sizeof(*tail)) { 959 kprintf("HAMMER: FIFO record too small " 960 "%04x at %016jx\n", 961 head->hdr_size, 962 (intmax_t)beg_off); 963 return(EIO); 964 } 965 } 966 967 /* 968 * Check the tail 969 */ 970 bytes = head->hdr_size; 971 tail = (void *)((char *)head + bytes - sizeof(*tail)); 972 if (tail->tail_size != head->hdr_size) { 973 kprintf("HAMMER: Bad tail size %04x vs %04x at %016jx\n", 974 tail->tail_size, head->hdr_size, 975 (intmax_t)beg_off); 976 return(EIO); 977 } 978 if (tail->tail_type != head->hdr_type) { 979 kprintf("HAMMER: Bad tail type %04x vs %04x at %016jx\n", 980 tail->tail_type, head->hdr_type, 981 (intmax_t)beg_off); 982 return(EIO); 983 } 984 985 return(0); 986 } 987 988 /* 989 * Check that the FIFO record is in-bounds given the head and the 990 * hammer offset. 991 * 992 * Also checks that the head and tail structures agree with each other, 993 * but does not check beyond the signature, type, and size. 994 */ 995 static int 996 hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off) 997 { 998 hammer_fifo_tail_t tail; 999 hammer_off_t end_off; 1000 1001 /* 1002 * head overlaps buffer boundary. This could be a PAD so only 1003 * check the minimum PAD size here. 1004 */ 1005 if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64) 1006 return(1); 1007 1008 /* 1009 * Calculate the ending offset and make sure the record does 1010 * not cross a buffer boundary. 
1011 */ 1012 end_off = beg_off + head->hdr_size; 1013 if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64) 1014 return(1); 1015 tail = (void *)((char *)head + head->hdr_size - sizeof(*tail)); 1016 return (_hammer_check_signature(head, tail, beg_off)); 1017 } 1018 1019 /* 1020 * Check that the FIFO record is in-bounds given the tail and the 1021 * hammer offset. The offset is pointing at the ending boundary of the 1022 * record. 1023 * 1024 * Also checks that the head and tail structures agree with each other, 1025 * but does not check beyond the signature, type, and size. 1026 */ 1027 static int 1028 hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off) 1029 { 1030 hammer_fifo_head_t head; 1031 hammer_off_t beg_off; 1032 1033 /* 1034 * tail overlaps buffer boundary 1035 */ 1036 if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64) 1037 return(1); 1038 1039 /* 1040 * Calculate the begining offset and make sure the record does 1041 * not cross a buffer boundary. 1042 */ 1043 beg_off = end_off - tail->tail_size; 1044 if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64) 1045 return(1); 1046 head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size); 1047 return (_hammer_check_signature(head, tail, beg_off)); 1048 } 1049 1050 static int 1051 hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume, 1052 hammer_fifo_undo_t undo) 1053 { 1054 hammer_volume_t volume; 1055 hammer_buffer_t buffer; 1056 hammer_off_t buf_offset; 1057 int zone; 1058 int error; 1059 int vol_no; 1060 int bytes; 1061 u_int32_t offset; 1062 1063 /* 1064 * Only process UNDO records. Flag if we find other records to 1065 * optimize stage2 recovery. 1066 */ 1067 if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO) 1068 return(0); 1069 1070 /* 1071 * Validate the UNDO record. 
1072 */ 1073 bytes = undo->head.hdr_size - sizeof(*undo) - 1074 sizeof(struct hammer_fifo_tail); 1075 if (bytes < 0 || undo->undo_data_bytes < 0 || 1076 undo->undo_data_bytes > bytes) { 1077 kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n", 1078 undo->undo_data_bytes, bytes); 1079 return(EIO); 1080 } 1081 1082 bytes = undo->undo_data_bytes; 1083 1084 /* 1085 * The undo offset may only be a zone-1 or zone-2 offset. 1086 * 1087 * Currently we only support a zone-1 offset representing the 1088 * volume header. 1089 */ 1090 zone = HAMMER_ZONE_DECODE(undo->undo_offset); 1091 offset = undo->undo_offset & HAMMER_BUFMASK; 1092 1093 if (offset + bytes > HAMMER_BUFSIZE) { 1094 kprintf("HAMMER: Corrupt UNDO record, bad offset\n"); 1095 return (EIO); 1096 } 1097 1098 switch(zone) { 1099 case HAMMER_ZONE_RAW_VOLUME_INDEX: 1100 vol_no = HAMMER_VOL_DECODE(undo->undo_offset); 1101 volume = hammer_get_volume(hmp, vol_no, &error); 1102 if (volume == NULL) { 1103 kprintf("HAMMER: UNDO record, " 1104 "cannot access volume %d\n", vol_no); 1105 break; 1106 } 1107 hammer_modify_volume_noundo(NULL, volume); 1108 hammer_recover_copy_undo(undo->undo_offset, 1109 (char *)(undo + 1), 1110 (char *)volume->ondisk + offset, 1111 bytes); 1112 hammer_modify_volume_done(volume); 1113 1114 /* 1115 * Multiple modifications may be made to the same buffer. 1116 * Also, the volume header cannot be written out until 1117 * everything else has been flushed. This also 1118 * covers the read-only case by preventing the kernel from 1119 * flushing the buffer. 
1120 */ 1121 if (volume->io.recovered == 0) 1122 volume->io.recovered = 1; 1123 else 1124 hammer_rel_volume(volume, 0); 1125 break; 1126 case HAMMER_ZONE_RAW_BUFFER_INDEX: 1127 buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64; 1128 buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE, 1129 0, &error); 1130 if (buffer == NULL) { 1131 kprintf("HAMMER: UNDO record, " 1132 "cannot access buffer %016jx\n", 1133 (intmax_t)undo->undo_offset); 1134 break; 1135 } 1136 hammer_modify_buffer_noundo(NULL, buffer); 1137 hammer_recover_copy_undo(undo->undo_offset, 1138 (char *)(undo + 1), 1139 (char *)buffer->ondisk + offset, 1140 bytes); 1141 hammer_modify_buffer_done(buffer); 1142 1143 /* 1144 * Multiple modifications may be made to the same buffer, 1145 * improve performance by delaying the flush. This also 1146 * covers the read-only case by preventing the kernel from 1147 * flushing the buffer. 1148 */ 1149 if (buffer->io.recovered == 0) 1150 buffer->io.recovered = 1; 1151 else 1152 hammer_rel_buffer(buffer, 0); 1153 break; 1154 default: 1155 kprintf("HAMMER: Corrupt UNDO record\n"); 1156 error = EIO; 1157 } 1158 return (error); 1159 } 1160 1161 static void 1162 hammer_recover_copy_undo(hammer_off_t undo_offset, 1163 char *src, char *dst, int bytes) 1164 { 1165 if (hammer_debug_general & 0x0080) { 1166 kprintf("UNDO %016jx: %d\n", 1167 (intmax_t)undo_offset, bytes); 1168 } 1169 #if 0 1170 kprintf("UNDO %016jx:", (intmax_t)undo_offset); 1171 hammer_recover_debug_dump(22, dst, bytes); 1172 kprintf("%22s", "to:"); 1173 hammer_recover_debug_dump(22, src, bytes); 1174 #endif 1175 bcopy(src, dst, bytes); 1176 } 1177 1178 /* 1179 * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations 1180 * during the backwards scan of the extended UNDO/REDO FIFO. This scan 1181 * does not include the nominal UNDO range, just the extended range. 
1182 */ 1183 int 1184 hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root, 1185 hammer_off_t scan_offset, hammer_fifo_redo_t redo) 1186 { 1187 hammer_rterm_t rterm; 1188 hammer_rterm_t nrterm; 1189 hammer_rterm_entry_t rte; 1190 1191 if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO) 1192 return(0); 1193 if (redo->redo_flags != HAMMER_REDO_TERM_WRITE && 1194 redo->redo_flags != HAMMER_REDO_TERM_TRUNC) { 1195 return(0); 1196 } 1197 1198 nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO); 1199 nrterm->redo_objid = redo->redo_objid; 1200 nrterm->redo_localization = redo->redo_localization; 1201 nrterm->redo_flags = redo->redo_flags; 1202 nrterm->redo_offset = redo->redo_offset; 1203 1204 rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm); 1205 if (rterm) 1206 kfree(nrterm, hmp->m_misc); 1207 else 1208 rterm = nrterm; 1209 1210 if (bootverbose) { 1211 kprintf("record record %016jx objid %016jx " 1212 "offset %016jx flags %08x\n", 1213 (intmax_t)scan_offset, 1214 (intmax_t)redo->redo_objid, 1215 (intmax_t)redo->redo_offset, 1216 (int)redo->redo_flags); 1217 } 1218 1219 /* 1220 * Scan in reverse order, rte prepended, so the rte list will be 1221 * in forward order. 1222 */ 1223 rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO); 1224 rte->fifo_offset = scan_offset; 1225 rte->next = rterm->term_list; 1226 rterm->term_list = rte; 1227 1228 return(0); 1229 } 1230 1231 /* 1232 * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during 1233 * the forwards scan of the entire extended UNDO/REDO FIFO range. 1234 * 1235 * Records matching previously recorded TERMs have already been committed 1236 * and are ignored. 
1237 */ 1238 int 1239 hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root, 1240 hammer_off_t scan_offset, hammer_fifo_redo_t redo) 1241 { 1242 struct hammer_rterm rtval; 1243 hammer_rterm_t rterm; 1244 hammer_rterm_entry_t rte; 1245 1246 if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO) 1247 return(0); 1248 1249 switch(redo->redo_flags) { 1250 case HAMMER_REDO_WRITE: 1251 case HAMMER_REDO_TRUNC: 1252 /* 1253 * We hit a REDO request. The REDO request is only executed 1254 * if there is no matching TERM. 1255 */ 1256 bzero(&rtval, sizeof(rtval)); 1257 rtval.redo_objid = redo->redo_objid; 1258 rtval.redo_localization = redo->redo_localization; 1259 rtval.redo_offset = redo->redo_offset; 1260 rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ? 1261 HAMMER_REDO_TERM_WRITE : 1262 HAMMER_REDO_TERM_TRUNC; 1263 1264 rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval); 1265 if (rterm) { 1266 if (bootverbose) { 1267 kprintf("ignore record %016jx objid %016jx " 1268 "offset %016jx flags %08x\n", 1269 (intmax_t)scan_offset, 1270 (intmax_t)redo->redo_objid, 1271 (intmax_t)redo->redo_offset, 1272 (int)redo->redo_flags); 1273 } 1274 break; 1275 } 1276 if (bootverbose) { 1277 kprintf("run record %016jx objid %016jx " 1278 "offset %016jx flags %08x\n", 1279 (intmax_t)scan_offset, 1280 (intmax_t)redo->redo_objid, 1281 (intmax_t)redo->redo_offset, 1282 (int)redo->redo_flags); 1283 } 1284 1285 /* 1286 * Redo stage2 can access a live filesystem, acquire the 1287 * vnode. 1288 */ 1289 hammer_recover_redo_exec(hmp, redo); 1290 break; 1291 case HAMMER_REDO_TERM_WRITE: 1292 case HAMMER_REDO_TERM_TRUNC: 1293 /* 1294 * As we encounter TERMs in the forward scan we remove 1295 * them. Once the forward scan hits the nominal undo range 1296 * there will be no more recorded TERMs. 
1297 */ 1298 bzero(&rtval, sizeof(rtval)); 1299 rtval.redo_objid = redo->redo_objid; 1300 rtval.redo_localization = redo->redo_localization; 1301 rtval.redo_flags = redo->redo_flags; 1302 rtval.redo_offset = redo->redo_offset; 1303 1304 rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval); 1305 if (rterm) { 1306 if ((rte = rterm->term_list) != NULL) { 1307 KKASSERT(rte->fifo_offset == scan_offset); 1308 rterm->term_list = rte->next; 1309 kfree(rte, hmp->m_misc); 1310 } 1311 } 1312 break; 1313 } 1314 return(0); 1315 } 1316 1317 static void 1318 hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo) 1319 { 1320 struct hammer_transaction trans; 1321 struct vattr va; 1322 struct hammer_inode *ip; 1323 struct vnode *vp = NULL; 1324 int error; 1325 1326 hammer_start_transaction(&trans, hmp); 1327 1328 ip = hammer_get_inode(&trans, NULL, redo->redo_objid, 1329 HAMMER_MAX_TID, redo->redo_localization, 1330 0, &error); 1331 if (ip == NULL) { 1332 kprintf("unable to find objid %016jx:%08x\n", 1333 (intmax_t)redo->redo_objid, redo->redo_localization); 1334 goto done2; 1335 } 1336 error = hammer_get_vnode(ip, &vp); 1337 if (error) { 1338 kprintf("unable to acquire vnode for %016jx:%08x\n", 1339 (intmax_t)redo->redo_objid, redo->redo_localization); 1340 goto done1; 1341 } 1342 1343 switch(redo->redo_flags) { 1344 case HAMMER_REDO_WRITE: 1345 error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL); 1346 if (error) { 1347 kprintf("vn_rdwr open %016jx:%08x returned %d\n", 1348 (intmax_t)redo->redo_objid, 1349 redo->redo_localization, error); 1350 break; 1351 } 1352 vn_unlock(vp); 1353 error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1), 1354 redo->redo_data_bytes, 1355 redo->redo_offset, UIO_SYSSPACE, 1356 0, proc0.p_ucred, NULL); 1357 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1358 if (error) { 1359 kprintf("write %016jx:%08x returned %d\n", 1360 (intmax_t)redo->redo_objid, 1361 redo->redo_localization, error); 1362 } 1363 VOP_CLOSE(vp, FREAD|FWRITE, NULL); 1364 break; 
1365 case HAMMER_REDO_TRUNC: 1366 VATTR_NULL(&va); 1367 va.va_size = redo->redo_offset; 1368 error = VOP_SETATTR(vp, &va, proc0.p_ucred); 1369 if (error) { 1370 kprintf("setattr offset %016jx error %d\n", 1371 (intmax_t)redo->redo_offset, error); 1372 } 1373 break; 1374 } 1375 vput(vp); 1376 done1: 1377 hammer_rel_inode(ip, 0); 1378 done2: 1379 hammer_done_transaction(&trans); 1380 } 1381 1382 /* 1383 * RB tree compare function. Note that REDO_TERM_TRUNC ops ignore 1384 * the offset. 1385 * 1386 * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc. 1387 */ 1388 static int 1389 hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2) 1390 { 1391 if (rt1->redo_objid < rt2->redo_objid) 1392 return(-1); 1393 if (rt1->redo_objid > rt2->redo_objid) 1394 return(1); 1395 if (rt1->redo_localization < rt2->redo_localization) 1396 return(-1); 1397 if (rt1->redo_localization > rt2->redo_localization) 1398 return(1); 1399 if (rt1->redo_flags < rt2->redo_flags) 1400 return(-1); 1401 if (rt1->redo_flags > rt2->redo_flags) 1402 return(1); 1403 if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) { 1404 if (rt1->redo_offset < rt2->redo_offset) 1405 return(-1); 1406 if (rt1->redo_offset > rt2->redo_offset) 1407 return(1); 1408 } 1409 return(0); 1410 } 1411 1412 #if 0 1413 1414 static void 1415 hammer_recover_debug_dump(int w, char *buf, int bytes) 1416 { 1417 int i; 1418 1419 for (i = 0; i < bytes; ++i) { 1420 if (i && (i & 15) == 0) 1421 kprintf("\n%*.*s", w, w, ""); 1422 kprintf(" %02x", (unsigned char)buf[i]); 1423 } 1424 kprintf("\n"); 1425 } 1426 1427 #endif 1428 1429 /* 1430 * Flush recovered buffers from recovery operations. The call to this 1431 * routine may be delayed if a read-only mount was made and then later 1432 * upgraded to read-write. This routine is also called when unmounting 1433 * a read-only mount to clean out recovered (dirty) buffers which we 1434 * couldn't flush (because the mount is read-only). 1435 * 1436 * The volume header is always written last. 
The UNDO FIFO will be forced 1437 * to zero-length by setting next_offset to first_offset. This leaves the 1438 * (now stale) UNDO information used to recover the disk available for 1439 * forensic analysis. 1440 * 1441 * final is typically 0 or 1. The volume header is only written if final 1442 * is 1. If final is -1 the recovered buffers are discarded instead of 1443 * written and root_volume can also be passed as NULL in that case. 1444 */ 1445 static int hammer_recover_flush_volume_callback(hammer_volume_t, void *); 1446 static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *); 1447 1448 void 1449 hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume, 1450 int final) 1451 { 1452 /* 1453 * Flush the buffers out asynchronously, wait for all the I/O to 1454 * complete, then do it again to destroy the buffer cache buffer 1455 * so it doesn't alias something later on. 1456 */ 1457 RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL, 1458 hammer_recover_flush_buffer_callback, &final); 1459 hammer_io_wait_all(hmp, "hmrrcw", 1); 1460 RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL, 1461 hammer_recover_flush_buffer_callback, &final); 1462 1463 /* 1464 * Flush all volume headers except the root volume. If final < 0 1465 * we discard all volume headers including the root volume. 1466 */ 1467 if (final >= 0) { 1468 RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL, 1469 hammer_recover_flush_volume_callback, root_volume); 1470 } else { 1471 RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL, 1472 hammer_recover_flush_volume_callback, NULL); 1473 } 1474 1475 /* 1476 * Finalize the root volume header. 1477 * 1478 * No interlock is needed, volume buffers are not 1479 * messed with by bioops. 
1480 */ 1481 if (root_volume && root_volume->io.recovered && final > 0) { 1482 hammer_io_wait_all(hmp, "hmrflx", 1); 1483 root_volume->io.recovered = 0; 1484 hammer_io_flush(&root_volume->io, 0); 1485 hammer_rel_volume(root_volume, 0); 1486 hammer_io_wait_all(hmp, "hmrfly", 1); 1487 } 1488 } 1489 1490 /* 1491 * Callback to flush volume headers. If discarding data will be NULL and 1492 * all volume headers (including the root volume) will be discarded. 1493 * Otherwise data is the root_volume and we flush all volume headers 1494 * EXCEPT the root_volume. 1495 * 1496 * Clear any I/O error or modified condition when discarding buffers to 1497 * clean up the reference count, otherwise the buffer may have extra refs 1498 * on it. 1499 */ 1500 static 1501 int 1502 hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data) 1503 { 1504 hammer_volume_t root_volume = data; 1505 1506 if (volume->io.recovered && volume != root_volume) { 1507 volume->io.recovered = 0; 1508 if (root_volume != NULL) { 1509 /* 1510 * No interlock is needed, volume buffers are not 1511 * messed with by bioops. 1512 */ 1513 hammer_io_flush(&volume->io, 0); 1514 } else { 1515 hammer_io_clear_error(&volume->io); 1516 hammer_io_clear_modify(&volume->io, 1); 1517 } 1518 hammer_rel_volume(volume, 0); 1519 } 1520 return(0); 1521 } 1522 1523 /* 1524 * Flush or discard recovered I/O buffers. 1525 * 1526 * Clear any I/O error or modified condition when discarding buffers to 1527 * clean up the reference count, otherwise the buffer may have extra refs 1528 * on it. 
1529 */ 1530 static 1531 int 1532 hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data) 1533 { 1534 int final = *(int *)data; 1535 int flush; 1536 1537 if (buffer->io.recovered) { 1538 buffer->io.recovered = 0; 1539 buffer->io.reclaim = 1; 1540 if (final < 0) { 1541 hammer_io_clear_error(&buffer->io); 1542 hammer_io_clear_modify(&buffer->io, 1); 1543 } else { 1544 hammer_io_write_interlock(&buffer->io); 1545 hammer_io_flush(&buffer->io, 0); 1546 hammer_io_done_interlock(&buffer->io); 1547 } 1548 hammer_rel_buffer(buffer, 0); 1549 } else { 1550 flush = hammer_ref_interlock(&buffer->io.lock); 1551 if (flush) 1552 atomic_add_int(&hammer_count_refedbufs, 1); 1553 1554 if (final < 0) { 1555 hammer_io_clear_error(&buffer->io); 1556 hammer_io_clear_modify(&buffer->io, 1); 1557 } 1558 KKASSERT(hammer_oneref(&buffer->io.lock)); 1559 buffer->io.reclaim = 1; 1560 hammer_rel_buffer(buffer, flush); 1561 } 1562 return(0); 1563 } 1564 1565