/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * UNDO ALGORITHM:
 *
 * The UNDO algorithm is trivial.  The nominal UNDO range in the
 * FIFO is determined by taking the first/next offset stored in
 * the volume header.  The next offset may not be correct since
 * UNDO flushes are not required to flush the volume header, so
 * the code also scans forward until it finds a discontinuous
 * sequence number.
 *
 * The UNDOs are then scanned and executed in reverse order.  These
 * UNDOs are effectively just data restorations based on HAMMER offsets.
 *
 * REDO ALGORITHM:
 *
 * REDO records are laid down in the UNDO/REDO FIFO for nominal
 * writes, truncations, and file extension ops.  On a per-inode
 * basis two types of REDO records are generated, REDO_WRITE
 * and REDO_TRUNC.
 *
 * Essentially the recovery block will contain UNDO records backing
 * out partial operations and REDO records to regenerate those partial
 * operations guaranteed by the filesystem during recovery.
 *
 * REDO generation is optional, and can also be started and then
 * later stopped due to excessive write()s in between fsyncs, or not
 * started at all.  Because of this the recovery code must determine
 * when REDOs are valid and when they are not.  Additional records are
 * generated to help figure it out.
 *
 * The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
 * during a flush cycle indicating which records the flush cycle
 * has synched meta-data for, and HAMMER_REDO_SYNC is generated in
 * each flush cycle to indicate how far back in the UNDO/REDO FIFO
 * the recovery code must go to find the earliest applicable REDO
 * record.
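 *
 * For example (illustrative layout, offsets not to scale):
 *
 *	extended REDO range           nominal UNDO range
 *	|<------------------------->|<------------------------->|
 *	REDO_WRITE(ino) ........... first_offset .. REDO_SYNC .. next_offset
 *
 * Here the REDO_SYNC record inside the nominal range points back at
 * REDO_WRITE(ino), so REDO processing must begin there even though
 * UNDO processing is confined to the nominal range.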
 * Applicable REDO records can be far outside the nominal
 * UNDO recovery range, for example if a write() lays down a REDO but
 * the related file is not flushed for several cycles.
 *
 * The SYNC reference is to a point prior to the nominal UNDO FIFO
 * range, creating an extended REDO range which must be scanned.
 *
 * Any REDO_WRITE/REDO_TRUNC encountered within the extended range
 * which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
 * prior to the start of the nominal UNDO range are applicable.
 * That is, any REDO_TERM_* records in the extended range but not in
 * the nominal undo range will mask any redo operations for prior REDO
 * records.  This is necessary because once the TERM is laid down
 * followup operations may make additional changes to the related
 * records but not necessarily record them as REDOs (because REDOs are
 * optional).
 *
 * REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
 * must be ignored since they represent meta-data flushes which are
 * undone by the UNDOs in that nominal UNDO range by the recovery
 * code.  Only REDO_TERM_* records in the extended range but not
 * in the nominal undo range are applicable.
 *
 * The REDO_SYNC record itself always exists in the nominal UNDO range
 * (this is how the extended range is determined).  For recovery
 * purposes the most recent REDO_SYNC record is always used if several
 * are found.
 *
 * CRASHES DURING UNDO/REDO
 *
 * A crash during the UNDO phase requires no additional effort.  The
 * UNDOs will simply be re-run again.  The state of the UNDO/REDO fifo
 * remains unchanged and has no re-crash issues.
 *
 * A crash during the REDO phase is more complex because the REDOs
 * run normal filesystem ops and generate additional UNDO/REDO records.
 * REDO is disabled during REDO recovery and any SYNC records generated
 * by flushes during REDO recovery must continue to reference the
 * original extended range.
 *
 * If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
 * may become impossible.  This is detected when the start of the
 * extended range fails to have monotonically increasing sequence
 * numbers leading into the nominal undo range.
 */

#include "hammer.h"

/*
 * Each rterm entry has a list of fifo offsets indicating termination
 * points.  These are stripped as the scan progresses.
 */
typedef struct hammer_rterm_entry {
        struct hammer_rterm_entry *next;
        hammer_off_t fifo_offset;
} *hammer_rterm_entry_t;

/*
 * rterm entries sorted in the RB tree are indexed by objid,
 * localization, flags, and offset.  TRUNC entries ignore the offset.
 */
typedef struct hammer_rterm {
        RB_ENTRY(hammer_rterm) rb_node;
        int64_t redo_objid;
        u_int32_t redo_localization;
        u_int32_t redo_flags;
        hammer_off_t redo_offset;
        hammer_rterm_entry_t term_list;
} *hammer_rterm_t;

static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
struct hammer_rterm_rb_tree;
RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
                        hammer_off_t end_off);
static int hammer_check_head_signature(hammer_fifo_head_t head,
                        hammer_off_t beg_off);
static void hammer_recover_copy_undo(hammer_off_t undo_offset,
                        char *src, char *dst, int bytes);
static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
                        hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp);
static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
                        hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp);
#if 0
static void hammer_recover_debug_dump(int w, char *buf, int bytes);
#endif
static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
                        hammer_fifo_undo_t undo);
static int hammer_recover_redo_rec(hammer_mount_t hmp,
                        struct hammer_rterm_rb_tree *root,
                        hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static int hammer_recover_redo_run(hammer_mount_t hmp,
                        struct hammer_rterm_rb_tree *root,
                        hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static void hammer_recover_redo_exec(hammer_mount_t hmp,
                        hammer_fifo_redo_t redo);

RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
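#if 0
/*
 * Illustrative sketch (not compiled): how the forward REDO scan keys
 * into the rterm tree.  A REDO_WRITE record matches the TERM_WRITE
 * entry bearing the same objid/localization/offset; TRUNC terminators
 * compare equal regardless of offset (see hammer_rterm_rb_cmp()).
 */
static hammer_rterm_t
hammer_rterm_find_example(struct hammer_rterm_rb_tree *root,
                          hammer_fifo_redo_t redo)
{
        struct hammer_rterm key;

        bzero(&key, sizeof(key));
        key.redo_objid = redo->redo_objid;
        key.redo_localization = redo->redo_localization;
        key.redo_flags = HAMMER_REDO_TERM_WRITE;
        key.redo_offset = redo->redo_offset;
        return (RB_FIND(hammer_rterm_rb_tree, root, &key));
}
#endif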
/*
 * Recover filesystem meta-data on mount.  This procedure figures out the
 * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
 * resynchronized by this procedure.
 *
 * This procedure is run near the beginning of the mount sequence, before
 * any B-Tree or high-level accesses are enabled, and is responsible for
 * restoring the meta-data to a consistent state.  High level HAMMER data
 * structures (such as the B-Tree) cannot be accessed here.
 *
 * NOTE: No information from the root volume has been cached in the
 *       hammer_mount structure yet, so we need to access the root volume's
 *       buffer directly.
 */
int
hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
{
        hammer_blockmap_t rootmap;
        hammer_buffer_t buffer;
        hammer_off_t scan_offset;
        hammer_off_t scan_offset_save;
        hammer_off_t bytes;
        hammer_fifo_any_t head;
        hammer_off_t first_offset;
        hammer_off_t last_offset;
        u_int32_t seqno;
        int error;
        int degenerate_case = 0;

        /*
         * Examine the UNDO FIFO indices in the volume header.
         */
        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        first_offset = rootmap->first_offset;
        last_offset  = rootmap->next_offset;
        buffer = NULL;
        error = 0;

        hmp->recover_stage2_offset = 0;

        if (first_offset > rootmap->alloc_offset ||
            last_offset > rootmap->alloc_offset) {
                kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
                        "%016jx, %016jx limit %016jx\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)first_offset,
                        (intmax_t)last_offset,
                        (intmax_t)rootmap->alloc_offset);
                error = EIO;
                goto done;
        }
        /*
         * In HAMMER version 4+ filesystems the volume header does NOT
         * contain definitive UNDO FIFO state.  In particular, the
         * rootmap->next_offset may not be indexed completely to the
         * end of the active UNDO FIFO.
         */
        if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
                /*
                 * To find the definitive range we must first scan backwards
                 * from first_offset to locate the first real record and
                 * extract the sequence number from it.  This record is not
                 * part of the active undo space.
                 */
                scan_offset = first_offset;
                seqno = 0;

                for (;;) {
                        head = hammer_recover_scan_rev(hmp, root_volume,
                                                       &scan_offset,
                                                       &error, &buffer);
                        if (error)
                                break;
                        if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
                                seqno = head->head.hdr_seq;
                                break;
                        }
                }
                if (error) {
                        kprintf("HAMMER(%s) recovery failure "
                                "during seqno backscan\n",
                                root_volume->ondisk->vol_name);
                        goto done;
                }

                /*
                 * Scan forwards from first_offset and (seqno+1) looking
                 * for a sequence space discontinuity.  This denotes the
                 * end of the active FIFO area.
                 *
                 * NOTE: For the case where the FIFO is empty the very first
                 *       record we find will be discontinuous.
                 *
                 * NOTE: Do not include trailing PADs in the scan range,
                 *       and remember that the scan_offset returned by a
                 *       fwd iteration points to the end of the returned
                 *       record.
                 */
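                /*
                 * Example (illustrative seqnos): if the records ahead of
                 * first_offset carry seqnos 17, 18, 19 and the next
                 * record carries a stale seqno such as 7, the active
                 * FIFO ends after the record bearing 19, and 20 becomes
                 * the next seqno to be laid down.
                 */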
                kprintf("HAMMER(%s) recovery check seqno=%08x\n",
                        root_volume->ondisk->vol_name,
                        seqno);

                scan_offset = first_offset;
                scan_offset_save = scan_offset;
                ++seqno;
                hmp->recover_stage2_seqno = seqno;

                for (;;) {
                        head = hammer_recover_scan_fwd(hmp, root_volume,
                                                       &scan_offset,
                                                       &error, &buffer);
                        if (error)
                                break;
                        if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
                                if (seqno != head->head.hdr_seq) {
                                        scan_offset = scan_offset_save;
                                        break;
                                }
                                scan_offset_save = scan_offset;
                                ++seqno;
                        }

#if 0
                        /*
                         * If the forward scan is grossly ahead of last_offset
                         * then something is wrong.  last_offset is supposed
                         * to be flushed out.
                         */
                        if (last_offset >= scan_offset) {
                                bytes = last_offset - scan_offset;
                        } else {
                                bytes = rootmap->alloc_offset - scan_offset +
                                        (last_offset & HAMMER_OFF_LONG_MASK);
                        }
                        if (bytes >
                            (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
                            4 / 5) {
                                kprintf("HAMMER(%s) recovery forward scan is "
                                        "grossly beyond the last_offset in "
                                        "the volume header, this can't be "
                                        "right.\n",
                                        root_volume->ondisk->vol_name);
                                error = EIO;
                                break;
                        }
#endif
                }

                /*
                 * Store the seqno.  This will be the next seqno we lay down
                 * when generating new UNDOs.
                 */
                hmp->undo_seqno = seqno;
                if (error) {
                        kprintf("HAMMER(%s) recovery failure "
                                "during seqno fwdscan\n",
                                root_volume->ondisk->vol_name);
                        goto done;
                }
                last_offset = scan_offset;
                kprintf("HAMMER(%s) recovery range %016jx-%016jx\n"
                        "HAMMER(%s) recovery nexto %016jx endseqno=%08x\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)first_offset,
                        (intmax_t)last_offset,
                        root_volume->ondisk->vol_name,
                        (intmax_t)rootmap->next_offset,
                        seqno);
        }

        /*
         * Calculate the size of the active portion of the FIFO.  If the
         * FIFO is empty the filesystem is clean and no further action is
         * needed.
         */
        if (last_offset >= first_offset) {
                bytes = last_offset - first_offset;
        } else {
                bytes = rootmap->alloc_offset - first_offset +
                        (last_offset & HAMMER_OFF_LONG_MASK);
        }
        if (bytes == 0) {
                degenerate_case = 1;
                error = 0;
                goto done;
        }
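        /*
         * Worked example (illustrative numbers): in an UNDO zone where
         * alloc_offset encodes 16MB, a first_offset encoding 12MB and a
         * last_offset encoding 4MB mean the FIFO has wrapped, and the
         * active portion is (16MB - 12MB) + 4MB = 8MB.
         */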
        kprintf("HAMMER(%s) recovery undo %016jx-%016jx (%jd bytes)%s\n",
                root_volume->ondisk->vol_name,
                (intmax_t)first_offset,
                (intmax_t)last_offset,
                (intmax_t)bytes,
                (hmp->ronly ? " (RO)" : "(RW)"));
        if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
                kprintf("Undo size is absurd, unable to mount\n");
                error = EIO;
                goto done;
        }

        /*
         * Scan the UNDOs backwards.
         */
        scan_offset = last_offset;

        while ((int64_t)bytes > 0) {
                KKASSERT(scan_offset != first_offset);
                head = hammer_recover_scan_rev(hmp, root_volume,
                                               &scan_offset, &error, &buffer);
                if (error)
                        break;

                /*
                 * Normal UNDO
                 */
                error = hammer_recover_undo(hmp, root_volume, &head->undo);
                if (error) {
                        kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
                                root_volume->ondisk->vol_name,
                                (intmax_t)scan_offset - head->head.hdr_size);
                        break;
                }

                /*
                 * The first REDO_SYNC record encountered (scanning backwards)
                 * enables REDO processing.
                 */
                if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
                    head->redo.redo_flags == HAMMER_REDO_SYNC) {
                        if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
                                kprintf("HAMMER(%s) Ignoring extra REDO_SYNC "
                                        "records in UNDO/REDO FIFO.\n",
                                        root_volume->ondisk->vol_name);
                        } else {
                                hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
                                hmp->recover_stage2_offset =
                                        head->redo.redo_offset;
                                kprintf("HAMMER(%s) Found REDO_SYNC %016jx\n",
                                        root_volume->ondisk->vol_name,
                                        (intmax_t)head->redo.redo_offset);
                        }
                }

                bytes -= head->head.hdr_size;

                /*
                 * If too many dirty buffers have built up we have to flush
                 * them out.  As long as we do not flush out the volume
                 * header a crash here should not cause any problems.
                 *
                 * buffer must be released so the flush can assert that
                 * all buffers are idle.
                 */
                if (hammer_flusher_meta_limit(hmp)) {
                        if (buffer) {
                                hammer_rel_buffer(buffer, 0);
                                buffer = NULL;
                        }
                        if (hmp->ronly == 0) {
                                hammer_recover_flush_buffers(hmp, root_volume,
                                                             0);
                                kprintf("HAMMER(%s) Continuing recovery\n",
                                        root_volume->ondisk->vol_name);
                        } else {
                                kprintf("HAMMER(%s) Recovery failure: "
                                        "Insufficient buffer cache to hold "
                                        "dirty buffers on read-only mount!\n",
                                        root_volume->ondisk->vol_name);
                                error = EIO;
                                break;
                        }
                }
        }
        KKASSERT(error || bytes == 0);
done:
        if (buffer) {
                hammer_rel_buffer(buffer, 0);
                buffer = NULL;
        }

        /*
         * After completely flushing all the recovered buffers the volume
         * header will also be flushed.
         */
        if (root_volume->io.recovered == 0) {
                hammer_ref_volume(root_volume);
                root_volume->io.recovered = 1;
        }

        /*
         * Finish up flushing (or discarding) recovered buffers.  FIFO
         * indices in the volume header are updated to the actual undo
         * range but will not be collapsed until stage 2.
         */
        if (error == 0) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
                rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
                rootmap->first_offset = first_offset;
                rootmap->next_offset = last_offset;
                hammer_modify_volume_done(root_volume);
                if (hmp->ronly == 0)
                        hammer_recover_flush_buffers(hmp, root_volume, 1);
        } else {
                hammer_recover_flush_buffers(hmp, root_volume, -1);
        }
        if (degenerate_case == 0) {
                kprintf("HAMMER(%s) recovery complete\n",
                        root_volume->ondisk->vol_name);
        } else {
                kprintf("HAMMER(%s) mounted clean, no recovery needed\n",
                        root_volume->ondisk->vol_name);
        }
        return (error);
}
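#if 0
/*
 * Mount-time ordering sketch (illustrative, not compiled): stage1 runs
 * before any B-Tree access is enabled; stage2 runs once the hammer
 * mount is fully initialized and only for read-write mounts (or when
 * a read-only mount is upgraded to read-write).
 */
error = hammer_recover_stage1(hmp, root_volume);
if (error == 0 && hmp->ronly == 0)
        error = hammer_recover_stage2(hmp, root_volume);
#endif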
/*
 * Execute redo operations.
 *
 * This procedure is run at the end of the mount sequence, after the hammer
 * mount structure has been completely initialized but before the filesystem
 * goes live.  It can access standard cursors, the B-Tree, flush the
 * filesystem, and so forth.
 *
 * This code may only be called for read-write mounts or when a mount
 * switches from read-only to read-write.  vnodes may or may not be present.
 *
 * The stage1 code will have already calculated the correct FIFO range
 * for the nominal UNDO FIFO and stored it in the rootmap.  The extended
 * range for REDO is stored in hmp->recover_stage2_offset.
 */
int
hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
{
        hammer_blockmap_t rootmap;
        hammer_buffer_t buffer;
        hammer_off_t scan_offset;
        hammer_off_t oscan_offset;
        hammer_off_t bytes;
        hammer_off_t ext_bytes;
        hammer_fifo_any_t head;
        hammer_off_t first_offset;
        hammer_off_t last_offset;
        hammer_off_t ext_offset;
        struct hammer_rterm_rb_tree rterm_root;
        u_int32_t seqno;
        int error;
        int verbose = 0;
        int dorscan;

        /*
         * Stage 2 can only be run on a RW mount, or when the mount is
         * switched from RO to RW.
         */
        KKASSERT(hmp->ronly == 0);
        RB_INIT(&rterm_root);

        /*
         * Examine the UNDO FIFO.  If it is empty the filesystem is clean
         * and no action need be taken.
         */
        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        first_offset = rootmap->first_offset;
        last_offset  = rootmap->next_offset;
        if (first_offset == last_offset) {
                KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
                return(0);
        }

        /*
         * Stage2 must only be run once, and will not be run at all
         * if Stage1 did not find a REDO_SYNC record.
         */
        error = 0;
        buffer = NULL;

        if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
                goto done;
        hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
        hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
        ext_offset = hmp->recover_stage2_offset;
        if (ext_offset == 0) {
                kprintf("HAMMER(%s) REDO stage specified but no REDO_SYNC "
                        "offset, ignoring\n",
                        root_volume->ondisk->vol_name);
                goto done;
        }

        /*
         * Calculate the nominal UNDO range (this is not yet the extended
         * range).
         */
        if (last_offset >= first_offset) {
                bytes = last_offset - first_offset;
        } else {
                bytes = rootmap->alloc_offset - first_offset +
                        (last_offset & HAMMER_OFF_LONG_MASK);
        }
        kprintf("HAMMER(%s) recovery redo %016jx-%016jx (%jd bytes)%s\n",
                root_volume->ondisk->vol_name,
                (intmax_t)first_offset,
                (intmax_t)last_offset,
                (intmax_t)bytes,
                (hmp->ronly ? " (RO)" : "(RW)"));
        verbose = 1;
        if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
                kprintf("Undo size is absurd, unable to mount\n");
                error = EIO;
                goto fatal;
        }

        /*
         * Scan the REDOs backwards collecting REDO_TERM_* information.
         * This information is only collected for the extended range,
         * non-inclusive of any TERMs in the nominal UNDO range.
         *
         * If the stage2 extended range is inside the nominal undo range
         * we have nothing to scan.
         *
         * This must fit in memory!
         */
        if (first_offset < last_offset) {
                /*
                 * [ first_offset........last_offset ]
                 */
                if (ext_offset < first_offset) {
                        dorscan = 1;
                        ext_bytes = first_offset - ext_offset;
                } else if (ext_offset > last_offset) {
                        dorscan = 1;
                        ext_bytes = (rootmap->alloc_offset - ext_offset) +
                                    (first_offset & HAMMER_OFF_LONG_MASK);
                } else {
                        ext_bytes = -(ext_offset - first_offset);
                        dorscan = 0;
                }
        } else {
                /*
                 * [......last_offset first_offset.....]
                 */
                if (ext_offset < last_offset) {
                        ext_bytes = -((rootmap->alloc_offset - first_offset) +
                                      (ext_offset & HAMMER_OFF_LONG_MASK));
                        dorscan = 0;
                } else if (ext_offset > first_offset) {
                        ext_bytes = -(ext_offset - first_offset);
                        dorscan = 0;
                } else {
                        ext_bytes = first_offset - ext_offset;
                        dorscan = 1;
                }
        }
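        /*
         * Illustrative cases (offsets symbolic): in the unwrapped layout
         * [ first....last ], an ext_offset below first_offset yields a
         * positive ext_bytes and dorscan=1 (there is an extended range
         * to scan).  An ext_offset landing inside [first, last] yields
         * a non-positive ext_bytes and dorscan=0: the REDO_SYNC target
         * is already covered by the nominal range.
         */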
        if (dorscan) {
                scan_offset = first_offset;
                kprintf("HAMMER(%s) Find extended redo %016jx, %jd extbytes\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)ext_offset,
                        (intmax_t)ext_bytes);
                seqno = hmp->recover_stage2_seqno - 1;
                for (;;) {
                        head = hammer_recover_scan_rev(hmp, root_volume,
                                                       &scan_offset,
                                                       &error, &buffer);
                        if (error)
                                break;
                        if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
                                if (head->head.hdr_seq != seqno) {
                                        error = ERANGE;
                                        break;
                                }
                                error = hammer_recover_redo_rec(
                                                hmp, &rterm_root,
                                                scan_offset, &head->redo);
                                --seqno;
                        }
                        if (scan_offset == ext_offset)
                                break;
                }
                if (error) {
                        kprintf("HAMMER(%s) Find extended redo failed %d, "
                                "unable to run REDO\n",
                                root_volume->ondisk->vol_name,
                                error);
                        goto done;
                }
        } else {
                kprintf("HAMMER(%s) Embedded extended redo %016jx, "
                        "%jd extbytes\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)ext_offset,
                        (intmax_t)ext_bytes);
        }

        /*
         * Scan the REDOs forwards through the entire extended range.
         * Anything with a previously recorded matching TERM is discarded.
         */
        scan_offset = ext_offset;
        bytes += ext_bytes;

        /*
         * NOTE: When doing a forward scan the returned scan_offset is
         *       for the record following the returned record, so we
         *       must save the pre-scan offset to know where the
         *       returned record itself begins.
         */
        while ((int64_t)bytes > 0) {
                KKASSERT(scan_offset != last_offset);

                oscan_offset = scan_offset;
                head = hammer_recover_scan_fwd(hmp, root_volume,
                                               &scan_offset, &error, &buffer);
                if (error)
                        break;

                error = hammer_recover_redo_run(hmp, &rterm_root,
                                                oscan_offset, &head->redo);
                if (error) {
                        kprintf("HAMMER(%s) REDO record at %016jx failed\n",
                                root_volume->ondisk->vol_name,
                                (intmax_t)scan_offset - head->head.hdr_size);
                        break;
                }
                bytes -= head->head.hdr_size;
        }
        KKASSERT(error || bytes == 0);

done:
        if (buffer) {
                hammer_rel_buffer(buffer, 0);
                buffer = NULL;
        }

        /*
         * Clean up the rterm tree.
         */
        {
                hammer_rterm_t rterm;
                hammer_rterm_entry_t rte;

                while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
                        RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
                        while ((rte = rterm->term_list) != NULL) {
                                rterm->term_list = rte->next;
                                kfree(rte, hmp->m_misc);
                        }
                        kfree(rterm, hmp->m_misc);
                }
        }

        /*
         * Finish up flushing (or discarding) recovered buffers by executing
         * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
         * case tests and forces the flush in order to update the FIFO indices.
         *
         * If a crash occurs during the flush the entire undo/redo will be
         * re-run during recovery on the next mount.
         */
        if (error == 0) {
                if (rootmap->first_offset != rootmap->next_offset)
                        hmp->hflags |= HMNT_UNDO_DIRTY;
                hammer_flusher_sync(hmp);
        }
fatal:
        hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
        if (verbose) {
                kprintf("HAMMER(%s) End redo recovery\n",
                        root_volume->ondisk->vol_name);
        }
        return (error);
}
/*
 * Scan backwards from *scan_offsetp, return the FIFO record prior to the
 * record at *scan_offsetp, or NULL if an error occurred.
 *
 * On return *scan_offsetp will be the offset of the returned record.
 */
hammer_fifo_any_t
hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp)
{
        hammer_off_t scan_offset;
        hammer_blockmap_t rootmap;
        hammer_fifo_any_t head;
        hammer_fifo_tail_t tail;

        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        scan_offset = *scan_offsetp;

        if (hammer_debug_general & 0x0080)
                kprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
        if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
                scan_offset = rootmap->alloc_offset;
        if (scan_offset - sizeof(*tail) <
            HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
                kprintf("HAMMER(%s) UNDO record at %016jx FIFO underflow\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)scan_offset);
                *errorp = EIO;
                return (NULL);
        }
        tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
                            errorp, bufferp);
        if (*errorp) {
                kprintf("HAMMER(%s) Unable to read UNDO TAIL "
                        "at %016jx\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)scan_offset - sizeof(*tail));
                return (NULL);
        }

        if (hammer_check_tail_signature(tail, scan_offset) != 0) {
                kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
                        "at %016jx\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)scan_offset - sizeof(*tail));
                *errorp = EIO;
                return (NULL);
        }
        head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
        *scan_offsetp = scan_offset - head->head.hdr_size;

        return (head);
}
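#if 0
/*
 * Usage sketch (illustrative, not compiled): walking the active FIFO
 * backwards from next_offset towards first_offset, as stage1 does.
 */
scan_offset = rootmap->next_offset;
while (scan_offset != rootmap->first_offset) {
        head = hammer_recover_scan_rev(hmp, root_volume, &scan_offset,
                                       &error, &buffer);
        if (head == NULL)
                break;          /* *errorp was set */
        /* scan_offset now addresses the record just returned */
}
#endif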
/*
 * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
 * an error occurred.
 *
 * On return *scan_offsetp will be the offset of the record following
 * the returned record.
 */
hammer_fifo_any_t
hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
                        hammer_off_t *scan_offsetp,
                        int *errorp, struct hammer_buffer **bufferp)
{
        hammer_off_t scan_offset;
        hammer_blockmap_t rootmap;
        hammer_fifo_any_t head;

        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        scan_offset = *scan_offsetp;

        if (hammer_debug_general & 0x0080)
                kprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
        if (scan_offset == rootmap->alloc_offset)
                scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);

        head = hammer_bread(hmp, scan_offset, errorp, bufferp);
        if (*errorp) {
                kprintf("HAMMER(%s) Unable to read UNDO HEAD at %016jx\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)scan_offset);
                return (NULL);
        }

        if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
                kprintf("HAMMER(%s) Illegal UNDO HEAD signature "
                        "at %016jx\n",
                        root_volume->ondisk->vol_name,
                        (intmax_t)scan_offset);
                *errorp = EIO;
                return (NULL);
        }
        scan_offset += head->head.hdr_size;
        if (scan_offset == rootmap->alloc_offset)
                scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
        *scan_offsetp = scan_offset;

        return (head);
}
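/*
 * On-media FIFO record layout (illustrative):
 *
 *	+------------------+---------------------+------------------+
 *	| hammer_fifo_head |       payload       | hammer_fifo_tail |
 *	+------------------+---------------------+------------------+
 *	|<---------------- head.hdr_size (aligned) ---------------->|
 *
 * The tail mirrors the head's type and size so records can be parsed
 * from either direction, and no record crosses a buffer boundary.
 */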
/*
 * Helper function for hammer_check_{head,tail}_signature().  Check stuff
 * once the head and tail have been established.
 *
 * This function validates the entire FIFO record wrapper.
 */
static __inline
int
_hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
                        hammer_off_t beg_off)
{
        hammer_off_t end_off;
        u_int32_t crc;
        int bytes;

        /*
         * Check signatures.  The tail signature is allowed to be the
         * head signature only for 8-byte PADs.
         */
        if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
                kprintf("HAMMER: FIFO record bad head signature "
                        "%04x at %016jx\n",
                        head->hdr_signature,
                        (intmax_t)beg_off);
                return(2);
        }
        if (head->hdr_size < HAMMER_HEAD_ALIGN ||
            (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
                kprintf("HAMMER: FIFO record unaligned or bad size "
                        "%04x at %016jx\n",
                        head->hdr_size,
                        (intmax_t)beg_off);
                return(2);
        }
        end_off = beg_off + head->hdr_size;

        if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
            (size_t)(end_off - beg_off) != sizeof(*tail)) {
                if (head->hdr_type != tail->tail_type) {
                        kprintf("HAMMER: FIFO record head/tail type mismatch "
                                "%04x %04x at %016jx\n",
                                head->hdr_type, tail->tail_type,
                                (intmax_t)beg_off);
                        return(2);
                }
                if (head->hdr_size != tail->tail_size) {
                        kprintf("HAMMER: FIFO record head/tail size mismatch "
                                "%04x %04x at %016jx\n",
                                head->hdr_size, tail->tail_size,
                                (intmax_t)beg_off);
                        return(2);
                }
                if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
                        kprintf("HAMMER: FIFO record bad tail signature "
                                "%04x at %016jx\n",
                                tail->tail_signature,
                                (intmax_t)beg_off);
                        return(3);
                }
        }

        /*
         * Non-PAD records must have a CRC and must be sized at
         * least large enough to fit the head and tail.
         */
        if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
                crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
                      crc32(head + 1, head->hdr_size - sizeof(*head));
                if (head->hdr_crc != crc) {
                        kprintf("HAMMER: FIFO record CRC failed %08x %08x "
                                "at %016jx\n",
                                head->hdr_crc, crc,
                                (intmax_t)beg_off);
                        return(EIO);
                }
                if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
                        kprintf("HAMMER: FIFO record too small "
                                "%04x at %016jx\n",
                                head->hdr_size,
                                (intmax_t)beg_off);
                        return(EIO);
                }
        }

        /*
         * Check the tail.
         */
        bytes = head->hdr_size;
        tail = (void *)((char *)head + bytes - sizeof(*tail));
        if (tail->tail_size != head->hdr_size) {
                kprintf("HAMMER: Bad tail size %04x vs %04x at %016jx\n",
                        tail->tail_size, head->hdr_size,
                        (intmax_t)beg_off);
                return(EIO);
        }
        if (tail->tail_type != head->hdr_type) {
                kprintf("HAMMER: Bad tail type %04x vs %04x at %016jx\n",
                        tail->tail_type, head->hdr_type,
                        (intmax_t)beg_off);
                return(EIO);
        }

        return(0);
}

/*
 * Check that the FIFO record is in-bounds given the head and the
 * hammer offset.
 *
 * Also checks that the head and tail structures agree with each other,
 * but does not check beyond the signature, type, and size.
 */
static int
hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
{
        hammer_fifo_tail_t tail;
        hammer_off_t end_off;

        /*
         * head overlaps buffer boundary.  This could be a PAD so only
         * check the minimum PAD size here.
         */
        if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
                return(1);

        /*
         * Calculate the ending offset and make sure the record does
         * not cross a buffer boundary.
         */
        end_off = beg_off + head->hdr_size;
        if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
                return(1);
        tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
        return (_hammer_check_signature(head, tail, beg_off));
}

/*
 * Check that the FIFO record is in-bounds given the tail and the
 * hammer offset.  The offset is pointing at the ending boundary of the
 * record.
 *
 * Also checks that the head and tail structures agree with each other,
 * but does not check beyond the signature, type, and size.
 */
static int
hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
{
        hammer_fifo_head_t head;
        hammer_off_t beg_off;

        /*
         * tail overlaps buffer boundary.
         */
        if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
                return(1);

        /*
         * Calculate the beginning offset and make sure the record does
         * not cross a buffer boundary.
         */
        beg_off = end_off - tail->tail_size;
        if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
                return(1);
        head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
        return (_hammer_check_signature(head, tail, beg_off));
}
static int
hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
                    hammer_fifo_undo_t undo)
{
        hammer_volume_t volume;
        hammer_buffer_t buffer;
        hammer_off_t buf_offset;
        int zone;
        int error;
        int vol_no;
        int bytes;
        u_int32_t offset;

        /*
         * Only process UNDO records.  Flag if we find other records to
         * optimize stage2 recovery.
         */
        if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
                return(0);

        /*
         * Validate the UNDO record.
         */
        bytes = undo->head.hdr_size - sizeof(*undo) -
                sizeof(struct hammer_fifo_tail);
        if (bytes < 0 || undo->undo_data_bytes < 0 ||
            undo->undo_data_bytes > bytes) {
                kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
                        undo->undo_data_bytes, bytes);
                return(EIO);
        }

        bytes = undo->undo_data_bytes;

        /*
         * The undo offset may only be a zone-1 or zone-2 offset.
         *
         * Currently we only support a zone-1 offset representing the
         * volume header.
         */
        zone = HAMMER_ZONE_DECODE(undo->undo_offset);
        offset = undo->undo_offset & HAMMER_BUFMASK;

        if (offset + bytes > HAMMER_BUFSIZE) {
                kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
                return (EIO);
        }

        switch(zone) {
        case HAMMER_ZONE_RAW_VOLUME_INDEX:
                vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
                volume = hammer_get_volume(hmp, vol_no, &error);
                if (volume == NULL) {
                        kprintf("HAMMER: UNDO record, "
                                "cannot access volume %d\n", vol_no);
                        break;
                }
                hammer_modify_volume(NULL, volume, NULL, 0);
                hammer_recover_copy_undo(undo->undo_offset,
                                         (char *)(undo + 1),
                                         (char *)volume->ondisk + offset,
                                         bytes);
                hammer_modify_volume_done(volume);

                /*
                 * Multiple modifications may be made to the same buffer.
                 * Also, the volume header cannot be written out until
                 * everything else has been flushed.  This also
                 * covers the read-only case by preventing the kernel from
                 * flushing the buffer.
                 */
                if (volume->io.recovered == 0)
                        volume->io.recovered = 1;
                else
                        hammer_rel_volume(volume, 0);
                break;
        case HAMMER_ZONE_RAW_BUFFER_INDEX:
                buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
                buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
                                           0, &error);
                if (buffer == NULL) {
                        kprintf("HAMMER: UNDO record, "
                                "cannot access buffer %016jx\n",
                                (intmax_t)undo->undo_offset);
                        break;
                }
                hammer_modify_buffer(NULL, buffer, NULL, 0);
                hammer_recover_copy_undo(undo->undo_offset,
                                         (char *)(undo + 1),
                                         (char *)buffer->ondisk + offset,
                                         bytes);
                hammer_modify_buffer_done(buffer);

                /*
                 * Multiple modifications may be made to the same buffer,
                 * improve performance by delaying the flush.  This also
                 * covers the read-only case by preventing the kernel from
                 * flushing the buffer.
                 */
                if (buffer->io.recovered == 0)
                        buffer->io.recovered = 1;
                else
                        hammer_rel_buffer(buffer, 0);
                break;
        default:
                kprintf("HAMMER: Corrupt UNDO record\n");
                error = EIO;
        }
        return (error);
}

static void
hammer_recover_copy_undo(hammer_off_t undo_offset,
                         char *src, char *dst, int bytes)
{
        if (hammer_debug_general & 0x0080) {
                kprintf("UNDO %016jx: %d\n",
                        (intmax_t)undo_offset, bytes);
        }
#if 0
        kprintf("UNDO %016jx:", (intmax_t)undo_offset);
        hammer_recover_debug_dump(22, dst, bytes);
        kprintf("%22s", "to:");
        hammer_recover_debug_dump(22, src, bytes);
#endif
        bcopy(src, dst, bytes);
}
/*
 * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
 * during the backwards scan of the extended UNDO/REDO FIFO.  This scan
 * does not include the nominal UNDO range, just the extended range.
 */
int
hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
                        hammer_off_t scan_offset, hammer_fifo_redo_t redo)
{
        hammer_rterm_t rterm;
        hammer_rterm_t nrterm;
        hammer_rterm_entry_t rte;

        if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
                return(0);
        if (redo->redo_flags != HAMMER_REDO_TERM_WRITE &&
            redo->redo_flags != HAMMER_REDO_TERM_TRUNC) {
                return(0);
        }

        nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO);
        nrterm->redo_objid = redo->redo_objid;
        nrterm->redo_localization = redo->redo_localization;
        nrterm->redo_flags = redo->redo_flags;
        nrterm->redo_offset = redo->redo_offset;

        rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm);
        if (rterm)
                kfree(nrterm, hmp->m_misc);
        else
                rterm = nrterm;

        if (bootverbose) {
                kprintf("record record %016jx objid %016jx "
                        "offset %016jx flags %08x\n",
                        (intmax_t)scan_offset,
                        (intmax_t)redo->redo_objid,
                        (intmax_t)redo->redo_offset,
                        (int)redo->redo_flags);
        }

        /*
         * We scan in reverse order and prepend each rte, so the rte
         * list winds up in forward (FIFO) order.
         */
        rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO);
        rte->fifo_offset = scan_offset;
        rte->next = rterm->term_list;
        rterm->term_list = rte;

        return(0);
}
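/*
 * Example of TERM masking (illustrative): given an extended range
 * containing
 *
 *	REDO_WRITE(A) ... REDO_TERM_WRITE(A) ... REDO_WRITE(A) ... [nominal]
 *
 * the reverse scan records the TERM.  During the forward scan the
 * first REDO_WRITE is consumed against that TERM and skipped (its data
 * was already synced by a flush cycle), while the second REDO_WRITE,
 * having no later TERM prior to the nominal range, is re-executed.
 */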
/*
 * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during
 * the forwards scan of the entire extended UNDO/REDO FIFO range.
 *
 * Records matching previously recorded TERMs have already been committed
 * and are ignored.
 */
int
hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
                        hammer_off_t scan_offset, hammer_fifo_redo_t redo)
{
        struct hammer_rterm rtval;
        hammer_rterm_t rterm;
        hammer_rterm_entry_t rte;

        if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
                return(0);

        switch(redo->redo_flags) {
        case HAMMER_REDO_WRITE:
        case HAMMER_REDO_TRUNC:
                /*
                 * We hit a REDO request.  The REDO request is only executed
                 * if there is no matching TERM.
                 */
                bzero(&rtval, sizeof(rtval));
                rtval.redo_objid = redo->redo_objid;
                rtval.redo_localization = redo->redo_localization;
                rtval.redo_offset = redo->redo_offset;
                rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
                                   HAMMER_REDO_TERM_WRITE :
                                   HAMMER_REDO_TERM_TRUNC;

                rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
                if (rterm) {
                        if (bootverbose) {
                                kprintf("ignore record %016jx objid %016jx "
                                        "offset %016jx flags %08x\n",
                                        (intmax_t)scan_offset,
                                        (intmax_t)redo->redo_objid,
                                        (intmax_t)redo->redo_offset,
                                        (int)redo->redo_flags);
                        }
                        break;
                }
                if (bootverbose) {
                        kprintf("run record %016jx objid %016jx "
                                "offset %016jx flags %08x\n",
                                (intmax_t)scan_offset,
                                (intmax_t)redo->redo_objid,
                                (intmax_t)redo->redo_offset,
                                (int)redo->redo_flags);
                }

                /*
                 * Redo stage2 can access a live filesystem, acquire the
                 * vnode.
                 */
                hammer_recover_redo_exec(hmp, redo);
                break;
        case HAMMER_REDO_TERM_WRITE:
        case HAMMER_REDO_TERM_TRUNC:
                /*
                 * As we encounter TERMs in the forward scan we remove
                 * them.  Once the forward scan hits the nominal undo range
                 * there will be no more recorded TERMs.
                 */
                bzero(&rtval, sizeof(rtval));
                rtval.redo_objid = redo->redo_objid;
                rtval.redo_localization = redo->redo_localization;
                rtval.redo_flags = redo->redo_flags;
                rtval.redo_offset = redo->redo_offset;

                rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
                if (rterm) {
                        if ((rte = rterm->term_list) != NULL) {
                                KKASSERT(rte->fifo_offset == scan_offset);
                                rterm->term_list = rte->next;
                                kfree(rte, hmp->m_misc);
                        }
                }
                break;
        }
        return(0);
}

static void
hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
{
        struct hammer_transaction trans;
        struct vattr va;
        struct hammer_inode *ip;
        struct vnode *vp = NULL;
        int error;

        hammer_start_transaction(&trans, hmp);

        ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
                              HAMMER_MAX_TID, redo->redo_localization,
                              0, &error);
        if (ip == NULL) {
                kprintf("unable to find objid %016jx:%08x\n",
                        (intmax_t)redo->redo_objid, redo->redo_localization);
                goto done2;
        }
        error = hammer_get_vnode(ip, &vp);
        if (error) {
                kprintf("unable to acquire vnode for %016jx:%08x\n",
                        (intmax_t)redo->redo_objid, redo->redo_localization);
                goto done1;
        }

        switch(redo->redo_flags) {
        case HAMMER_REDO_WRITE:
                error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
                if (error) {
                        kprintf("vn_rdwr open %016jx:%08x returned %d\n",
                                (intmax_t)redo->redo_objid,
                                redo->redo_localization, error);
                        break;
                }
                vn_unlock(vp);
                error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
                                redo->redo_data_bytes,
                                redo->redo_offset, UIO_SYSSPACE,
                                0, proc0.p_ucred, NULL);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                if (error) {
                        kprintf("write %016jx:%08x returned %d\n",
                                (intmax_t)redo->redo_objid,
                                redo->redo_localization, error);
                }
                VOP_CLOSE(vp, FREAD|FWRITE);
                break;
        case HAMMER_REDO_TRUNC:
                VATTR_NULL(&va);
                va.va_size = redo->redo_offset;
                error = VOP_SETATTR(vp, &va, proc0.p_ucred);
                if (error) {
                        kprintf("setattr offset %016jx error %d\n",
                                (intmax_t)redo->redo_offset, error);
                }
                break;
        }
        vput(vp);
done1:
        hammer_rel_inode(ip, 0);
done2:
        hammer_done_transaction(&trans);
}

/*
 * RB tree compare function.  Note that REDO_TERM_TRUNC ops ignore
 * the offset.
 *
 * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc.
 */
static int
hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
{
        if (rt1->redo_objid < rt2->redo_objid)
                return(-1);
        if (rt1->redo_objid > rt2->redo_objid)
                return(1);
        if (rt1->redo_localization < rt2->redo_localization)
                return(-1);
        if (rt1->redo_localization > rt2->redo_localization)
                return(1);
        if (rt1->redo_flags < rt2->redo_flags)
                return(-1);
        if (rt1->redo_flags > rt2->redo_flags)
                return(1);
        if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
                if (rt1->redo_offset < rt2->redo_offset)
                        return(-1);
                if (rt1->redo_offset > rt2->redo_offset)
                        return(1);
        }
        return(0);
}

#if 0

static void
hammer_recover_debug_dump(int w, char *buf, int bytes)
{
        int i;

        for (i = 0; i < bytes; ++i) {
                if (i && (i & 15) == 0)
                        kprintf("\n%*.*s", w, w, "");
                kprintf(" %02x", (unsigned char)buf[i]);
        }
        kprintf("\n");
}

#endif

/*
 * Flush recovered buffers from recovery operations.  The call to this
 * routine may be delayed if a read-only mount was made and then later
 * upgraded to read-write.  This routine is also called when unmounting
 * a read-only mount to clean out recovered (dirty) buffers which we
 * couldn't flush (because the mount is read-only).
 *
 * The volume header is always written last.  The UNDO FIFO will be forced
 * to zero-length by setting next_offset to first_offset.  This leaves the
 * (now stale) UNDO information used to recover the disk available for
 * forensic analysis.
 *
 * final is typically 0 or 1.  The volume header is only written if final
 * is 1.  If final is -1 the recovered buffers are discarded instead of
 * written and root_volume can also be passed as NULL in that case.
 */
static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);

void
hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
                             int final)
{
        /*
         * Flush the buffers out asynchronously, wait for all the I/O to
         * complete, then do it again to destroy the buffer cache buffer
         * so it doesn't alias something later on.
         */
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);
        hammer_io_wait_all(hmp, "hmrrcw", 1);
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);

        /*
         * Flush all volume headers except the root volume.  If final < 0
         * we discard all volume headers including the root volume.
         */
        if (final >= 0) {
                RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                        hammer_recover_flush_volume_callback, root_volume);
        } else {
                RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                        hammer_recover_flush_volume_callback, NULL);
        }

        /*
         * Finalize the root volume header.
         *
         * No interlock is needed, volume buffers are not
         * messed with by bioops.
         */
        if (root_volume && root_volume->io.recovered && final > 0) {
                hammer_io_wait_all(hmp, "hmrflx", 1);
                root_volume->io.recovered = 0;
                hammer_io_flush(&root_volume->io, 0);
                hammer_rel_volume(root_volume, 0);
                hammer_io_wait_all(hmp, "hmrfly", 1);
        }
}
/*
 * Callback to flush volume headers.  If discarding, data will be NULL
 * and all volume headers (including the root volume) will be discarded.
 * Otherwise data is the root_volume and we flush all volume headers
 * EXCEPT the root_volume.
 *
 * Clear any I/O error or modified condition when discarding buffers to
 * clean up the reference count, otherwise the buffer may have extra refs
 * on it.
 */
static
int
hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
{
        hammer_volume_t root_volume = data;

        if (volume->io.recovered && volume != root_volume) {
                volume->io.recovered = 0;
                if (root_volume != NULL) {
                        /*
                         * No interlock is needed, volume buffers are not
                         * messed with by bioops.
                         */
                        hammer_io_flush(&volume->io, 0);
                } else {
                        hammer_io_clear_error(&volume->io);
                        hammer_io_clear_modify(&volume->io, 1);
                }
                hammer_rel_volume(volume, 0);
        }
        return(0);
}

/*
 * Flush or discard recovered I/O buffers.
 *
 * Clear any I/O error or modified condition when discarding buffers to
 * clean up the reference count, otherwise the buffer may have extra refs
 * on it.
 */
static
int
hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
{
        int final = *(int *)data;
        int flush;

        if (buffer->io.recovered) {
                buffer->io.recovered = 0;
                buffer->io.reclaim = 1;
                if (final < 0) {
                        hammer_io_clear_error(&buffer->io);
                        hammer_io_clear_modify(&buffer->io, 1);
                } else {
                        hammer_io_write_interlock(&buffer->io);
                        hammer_io_flush(&buffer->io, 0);
                        hammer_io_done_interlock(&buffer->io);
                }
                hammer_rel_buffer(buffer, 0);
        } else {
                flush = hammer_ref_interlock(&buffer->io.lock);
                if (flush)
                        ++hammer_count_refedbufs;

                if (final < 0) {
                        hammer_io_clear_error(&buffer->io);
                        hammer_io_clear_modify(&buffer->io, 1);
                }
                KKASSERT(hammer_oneref(&buffer->io.lock));
                buffer->io.reclaim = 1;
                hammer_rel_buffer(buffer, flush);
        }
        return(0);
}