1 /* $NetBSD: lfs_rfw.c,v 1.32 2015/10/03 08:27:55 dholland Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Konrad E. Schroder <perseant@hhhh.org>. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.32 2015/10/03 08:27:55 dholland Exp $"); 34 35 #if defined(_KERNEL_OPT) 36 #include "opt_quota.h" 37 #endif 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/namei.h> 42 #include <sys/proc.h> 43 #include <sys/kernel.h> 44 #include <sys/vnode.h> 45 #include <sys/mount.h> 46 #include <sys/kthread.h> 47 #include <sys/buf.h> 48 #include <sys/device.h> 49 #include <sys/mbuf.h> 50 #include <sys/file.h> 51 #include <sys/disklabel.h> 52 #include <sys/ioctl.h> 53 #include <sys/errno.h> 54 #include <sys/malloc.h> 55 #include <sys/pool.h> 56 #include <sys/socket.h> 57 #include <sys/syslog.h> 58 #include <uvm/uvm_extern.h> 59 #include <sys/sysctl.h> 60 #include <sys/conf.h> 61 #include <sys/kauth.h> 62 63 #include <miscfs/specfs/specdev.h> 64 65 #include <ufs/lfs/ulfs_quotacommon.h> 66 #include <ufs/lfs/ulfs_inode.h> 67 #include <ufs/lfs/ulfsmount.h> 68 #include <ufs/lfs/ulfs_extern.h> 69 70 #include <uvm/uvm.h> 71 #include <uvm/uvm_stat.h> 72 #include <uvm/uvm_pager.h> 73 #include <uvm/uvm_pdaemon.h> 74 75 #include <ufs/lfs/lfs.h> 76 #include <ufs/lfs/lfs_accessors.h> 77 #include <ufs/lfs/lfs_kernel.h> 78 #include <ufs/lfs/lfs_extern.h> 79 80 #include <miscfs/genfs/genfs.h> 81 #include <miscfs/genfs/genfs_node.h> 82 83 /* 84 * Roll-forward code. 85 */ 86 static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t, 87 kauth_cred_t, int, int *, struct lwp *); 88 89 extern int lfs_do_rfw; 90 91 /* 92 * Allocate a particular inode with a particular version number, freeing 93 * any previous versions of this inode that may have gone before. 94 * Used by the roll-forward code. 95 * 96 * XXX this function does not have appropriate locking to be used on a live fs; 97 * XXX but something similar could probably be used for an "undelete" call. 98 * 99 * Called with the Ifile inode locked. 100 */ 101 int 102 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l, 103 struct vnode **vpp) 104 { 105 struct vattr va; 106 struct vnode *vp; 107 struct inode *ip; 108 int error; 109 110 ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */ 111 112 /* 113 * First, just try a vget. If the version number is the one we want, 114 * we don't have to do anything else. If the version number is wrong, 115 * take appropriate action. 116 */ 117 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp); 118 if (error == 0) { 119 DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", ino, vp)); 120 121 *vpp = vp; 122 ip = VTOI(vp); 123 if (ip->i_gen == vers) 124 return 0; 125 else if (ip->i_gen < vers) { 126 lfs_truncate(vp, (off_t)0, 0, NOCRED); 127 ip->i_gen = vers; 128 lfs_dino_setgen(fs, ip->i_din, vers); 129 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 130 return 0; 131 } else { 132 DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n", 133 ino, vers, lfs_dino_getgen(fs, ip->i_din))); 134 vput(vp); 135 *vpp = NULLVP; 136 return EEXIST; 137 } 138 } 139 140 /* Not found, create as regular file. */ 141 vattr_null(&va); 142 va.va_type = VREG; 143 va.va_mode = 0; 144 va.va_fileid = ino; 145 va.va_gen = vers; 146 error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, &vp); 147 if (error) 148 return error; 149 error = vn_lock(vp, LK_EXCLUSIVE); 150 if (error) { 151 vrele(vp); 152 *vpp = NULLVP; 153 return error; 154 } 155 ip = VTOI(vp); 156 ip->i_nlink = 1; 157 lfs_dino_setnlink(fs, ip->i_din, 1); 158 *vpp = vp; 159 return 0; 160 } 161 162 /* 163 * Load the appropriate indirect block, and change the appropriate pointer. 164 * Mark the block dirty. Do segment and avail accounting. 165 */ 166 static int 167 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn, 168 daddr_t ndaddr, size_t size, struct lwp *l) 169 { 170 int error; 171 struct vnode *vp; 172 struct inode *ip; 173 #ifdef DEBUG 174 daddr_t odaddr; 175 struct indir a[ULFS_NIADDR]; 176 int num; 177 int i; 178 #endif /* DEBUG */ 179 struct buf *bp; 180 SEGUSE *sup; 181 182 KASSERT(lbn >= 0); /* no indirect blocks */ 183 184 if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp)) != 0) { 185 DLOG((DLOG_RF, "update_meta: ino %d: lfs_rf_valloc" 186 " returned %d\n", ino, error)); 187 return error; 188 } 189 190 if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)), size, 191 NOCRED, 0, &bp)) != 0) { 192 vput(vp); 193 return (error); 194 } 195 /* No need to write, the block is already on disk */ 196 if (bp->b_oflags & BO_DELWRI) { 197 LFS_UNLOCK_BUF(bp); 198 lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount)); 199 /* XXX should this wake up fs->lfs_availsleep? */ 200 } 201 brelse(bp, BC_INVAL); 202 203 /* 204 * Extend the file, if it is not large enough already. 205 * XXX this is not exactly right, we don't know how much of the 206 * XXX last block is actually used. We hope that an inode will 207 * XXX appear later to give the correct size. 208 */ 209 ip = VTOI(vp); 210 if (ip->i_size <= (lbn << lfs_sb_getbshift(fs))) { 211 u_int64_t newsize; 212 213 if (lbn < ULFS_NDADDR) { 214 newsize = (lbn << lfs_sb_getbshift(fs)) + 215 (size - lfs_sb_getfsize(fs)) + 1; 216 } else { 217 newsize = (lbn << lfs_sb_getbshift(fs)) + 1; 218 } 219 lfs_dino_setsize(fs, ip->i_din, newsize); 220 221 if (ip->i_size < newsize) { 222 ip->i_size = newsize; 223 /* 224 * tell vm our new size for the case the inode won't 225 * appear later. 226 */ 227 uvm_vnp_setsize(vp, newsize); 228 } 229 } 230 231 lfs_update_single(fs, NULL, vp, lbn, ndaddr, size); 232 233 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 234 sup->su_nbytes += size; 235 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp); 236 237 /* differences here should be due to UNWRITTEN indirect blocks. */ 238 KASSERT((lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR && 239 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)) || 240 ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din)); 241 242 #ifdef DEBUG 243 /* Now look again to make sure it worked */ 244 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL); 245 for (i = num; i > 0; i--) { 246 if (!a[i].in_exists) 247 panic("update_meta: absent %d lv indirect block", i); 248 } 249 if (LFS_DBTOFSB(fs, odaddr) != ndaddr) 250 DLOG((DLOG_RF, "update_meta: failed setting ino %d lbn %" 251 PRId64 " to %" PRId64 "\n", ino, lbn, ndaddr)); 252 #endif /* DEBUG */ 253 vput(vp); 254 return 0; 255 } 256 257 /* 258 * Copy some the fields of the dinode as needed by update_inoblk(). 259 */ 260 static void 261 update_inoblk_copy_dinode(struct lfs *fs, 262 union lfs_dinode *dstu, const union lfs_dinode *srcu) 263 { 264 if (fs->lfs_is64) { 265 struct lfs64_dinode *dst = &dstu->u_64; 266 const struct lfs64_dinode *src = &srcu->u_64; 267 unsigned i; 268 269 /* 270 * Copy everything but the block pointers and di_blocks. 271 * XXX what about di_extb? 272 */ 273 dst->di_mode = src->di_mode; 274 dst->di_nlink = src->di_nlink; 275 dst->di_uid = src->di_uid; 276 dst->di_gid = src->di_gid; 277 dst->di_blksize = src->di_blksize; 278 dst->di_size = src->di_size; 279 dst->di_atime = src->di_atime; 280 dst->di_mtime = src->di_mtime; 281 dst->di_ctime = src->di_ctime; 282 dst->di_birthtime = src->di_birthtime; 283 dst->di_mtimensec = src->di_mtimensec; 284 dst->di_atimensec = src->di_atimensec; 285 dst->di_ctimensec = src->di_ctimensec; 286 dst->di_birthnsec = src->di_birthnsec; 287 dst->di_gen = src->di_gen; 288 dst->di_kernflags = src->di_kernflags; 289 dst->di_flags = src->di_flags; 290 dst->di_extsize = src->di_extsize; 291 dst->di_modrev = src->di_modrev; 292 dst->di_inumber = src->di_inumber; 293 for (i = 0; i < __arraycount(src->di_spare); i++) { 294 dst->di_spare[i] = src->di_spare[i]; 295 } 296 } else { 297 struct lfs32_dinode *dst = &dstu->u_32; 298 const struct lfs32_dinode *src = &srcu->u_32; 299 300 /* Get mode, link count, size, and times */ 301 memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0])); 302 303 /* Then the rest, except di_blocks */ 304 dst->di_flags = src->di_flags; 305 dst->di_gen = src->di_gen; 306 dst->di_uid = src->di_uid; 307 dst->di_gid = src->di_gid; 308 dst->di_modrev = src->di_modrev; 309 } 310 } 311 312 static int 313 update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred, 314 struct lwp *l) 315 { 316 struct vnode *devvp, *vp; 317 struct inode *ip; 318 union lfs_dinode *dip; 319 struct buf *dbp, *ibp; 320 int error; 321 daddr_t daddr; 322 IFILE *ifp; 323 SEGUSE *sup; 324 unsigned i, num; 325 326 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 327 328 /* 329 * Get the inode, update times and perms. 330 * DO NOT update disk blocks, we do that separately. 331 */ 332 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs), 333 0, &dbp); 334 if (error) { 335 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error)); 336 return error; 337 } 338 num = LFS_INOPB(fs); 339 for (i = num; i-- > 0; ) { 340 dip = DINO_IN_BLOCK(fs, dbp->b_data, i); 341 if (lfs_dino_getinumber(fs, dip) > LFS_IFILE_INUM) { 342 error = lfs_rf_valloc(fs, lfs_dino_getinumber(fs, dip), 343 lfs_dino_getgen(fs, dip), 344 l, &vp); 345 if (error) { 346 DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc" 347 " returned %d\n", error)); 348 continue; 349 } 350 ip = VTOI(vp); 351 if (lfs_dino_getsize(fs, dip) != ip->i_size) 352 lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0, 353 NOCRED); 354 update_inoblk_copy_dinode(fs, ip->i_din, dip); 355 356 ip->i_flags = lfs_dino_getflags(fs, dip); 357 ip->i_gen = lfs_dino_getgen(fs, dip); 358 ip->i_uid = lfs_dino_getuid(fs, dip); 359 ip->i_gid = lfs_dino_getgid(fs, dip); 360 361 ip->i_mode = lfs_dino_getmode(fs, dip); 362 ip->i_nlink = lfs_dino_getnlink(fs, dip); 363 ip->i_size = lfs_dino_getsize(fs, dip); 364 365 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE); 366 367 /* Re-initialize to get type right */ 368 ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, 369 &vp); 370 vput(vp); 371 372 /* Record change in location */ 373 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp); 374 daddr = lfs_if_getdaddr(fs, ifp); 375 lfs_if_setdaddr(fs, ifp, LFS_DBTOFSB(fs, dbp->b_blkno)); 376 error = LFS_BWRITE_LOG(ibp); /* Ifile */ 377 /* And do segment accounting */ 378 if (lfs_dtosn(fs, daddr) != lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno))) { 379 if (daddr > 0) { 380 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, daddr), 381 ibp); 382 sup->su_nbytes -= DINOSIZE(fs); 383 LFS_WRITESEGENTRY(sup, fs, 384 lfs_dtosn(fs, daddr), 385 ibp); 386 } 387 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno)), 388 ibp); 389 sup->su_nbytes += DINOSIZE(fs); 390 LFS_WRITESEGENTRY(sup, fs, 391 lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno)), 392 ibp); 393 } 394 } 395 } 396 brelse(dbp, BC_AGE); 397 398 return 0; 399 } 400 401 #define CHECK_CKSUM 0x0001 /* Check the checksum to make sure it's valid */ 402 #define CHECK_UPDATE 0x0002 /* Update Ifile for new data blocks / inodes */ 403 404 static daddr_t 405 check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial, 406 kauth_cred_t cred, int flags, int *pseg_flags, struct lwp *l) 407 { 408 struct vnode *devvp; 409 struct buf *bp, *dbp; 410 int error, nblocks = 0, ninos, i, j; /* XXX: gcc */ 411 SEGSUM *ssp; 412 u_long *dp = NULL, *datap = NULL; /* XXX u_int32_t */ 413 daddr_t oldoffset; 414 IINFO *iip; 415 FINFO *fip; 416 SEGUSE *sup; 417 size_t size; 418 uint32_t datasum, foundsum; 419 420 devvp = VTOI(fs->lfs_ivnode)->i_devvp; 421 /* 422 * If the segment has a superblock and we're at the top 423 * of the segment, skip the superblock. 424 */ 425 if (lfs_sntod(fs, lfs_dtosn(fs, offset)) == offset) { 426 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 427 if (sup->su_flags & SEGUSE_SUPERBLOCK) 428 offset += lfs_btofsb(fs, LFS_SBPAD); 429 brelse(bp, 0); 430 } 431 432 /* Read in the segment summary */ 433 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs), 434 0, &bp); 435 if (error) 436 return -1; 437 438 /* Check summary checksum */ 439 ssp = (SEGSUM *)bp->b_data; 440 if (flags & CHECK_CKSUM) { 441 size_t sumstart; 442 443 sumstart = lfs_ss_getsumstart(fs); 444 if (lfs_ss_getsumsum(fs, ssp) != 445 cksum((char *)ssp + sumstart, 446 lfs_sb_getsumsize(fs) - sumstart)) { 447 DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", offset)); 448 offset = -1; 449 goto err1; 450 } 451 if (lfs_ss_getnfinfo(fs, ssp) == 0 && 452 lfs_ss_getninos(fs, ssp) == 0) { 453 DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", offset)); 454 offset = -1; 455 goto err1; 456 } 457 if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) { 458 DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset)); 459 offset = -1; 460 goto err1; 461 } 462 } 463 if (lfs_sb_getversion(fs) > 1) { 464 if (lfs_ss_getserial(fs, ssp) != nextserial) { 465 DLOG((DLOG_RF, "Unexpected serial number at 0x%" PRIx64 466 "\n", offset)); 467 offset = -1; 468 goto err1; 469 } 470 if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) { 471 DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%" 472 PRIx64 "\n", lfs_ss_getident(fs, ssp), 473 lfs_sb_getident(fs), offset)); 474 offset = -1; 475 goto err1; 476 } 477 } 478 if (pseg_flags) 479 *pseg_flags = lfs_ss_getflags(fs, ssp); 480 oldoffset = offset; 481 offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs)); 482 483 ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs)); 484 iip = SEGSUM_IINFOSTART(fs, bp->b_data); 485 if (flags & CHECK_CKSUM) { 486 /* Count blocks */ 487 nblocks = 0; 488 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)bp->b_data); 489 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp); ++i) { 490 nblocks += lfs_fi_getnblocks(fs, fip); 491 if (lfs_fi_getnblocks(fs, fip) <= 0) 492 break; 493 fip = NEXT_FINFO(fs, fip); 494 } 495 nblocks += ninos; 496 /* Create the sum array */ 497 datap = dp = malloc(nblocks * sizeof(u_long), 498 M_SEGMENT, M_WAITOK); 499 } 500 501 /* Handle individual blocks */ 502 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)bp->b_data); 503 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) { 504 /* Inode block? */ 505 if (ninos && lfs_ii_getblock(fs, iip) == offset) { 506 if (flags & CHECK_CKSUM) { 507 /* Read in the head and add to the buffer */ 508 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getbsize(fs), 509 0, &dbp); 510 if (error) { 511 offset = -1; 512 goto err2; 513 } 514 /* XXX this can't be right, on-disk u_long? */ 515 (*dp++) = ((u_long *)(dbp->b_data))[0]; 516 brelse(dbp, BC_AGE); 517 } 518 if (flags & CHECK_UPDATE) { 519 if ((error = update_inoblk(fs, offset, cred, l)) 520 != 0) { 521 offset = -1; 522 goto err2; 523 } 524 } 525 offset += lfs_btofsb(fs, lfs_sb_getibsize(fs)); 526 iip = NEXTLOWER_IINFO(fs, iip); 527 --ninos; 528 --i; /* compensate for ++i in loop header */ 529 continue; 530 } 531 size = lfs_sb_getbsize(fs); 532 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) { 533 if (j == lfs_fi_getnblocks(fs, fip) - 1) 534 size = lfs_fi_getlastlength(fs, fip); 535 if (flags & CHECK_CKSUM) { 536 error = bread(devvp, LFS_FSBTODB(fs, offset), size, 537 0, &dbp); 538 if (error) { 539 offset = -1; 540 goto err2; 541 } 542 (*dp++) = ((u_long *)(dbp->b_data))[0]; 543 brelse(dbp, BC_AGE); 544 } 545 /* Account for and update any direct blocks */ 546 if ((flags & CHECK_UPDATE) && 547 lfs_fi_getino(fs, fip) > LFS_IFILE_INUM && 548 lfs_fi_getblock(fs, fip, j) >= 0) { 549 update_meta(fs, lfs_fi_getino(fs, fip), 550 lfs_fi_getversion(fs, fip), 551 lfs_fi_getblock(fs, fip, j), 552 offset, size, l); 553 } 554 offset += lfs_btofsb(fs, size); 555 } 556 fip = NEXT_FINFO(fs, fip); 557 } 558 /* Checksum the array, compare */ 559 datasum = lfs_ss_getdatasum(fs, ssp); 560 foundsum = cksum(datap, nblocks * sizeof(u_long)); 561 if ((flags & CHECK_CKSUM) && datasum != foundsum) { 562 DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64 563 " (wanted %x got %x)\n", 564 offset, datasum, foundsum)); 565 offset = -1; 566 goto err2; 567 } 568 569 /* If we're at the end of the segment, move to the next */ 570 if (lfs_dtosn(fs, offset + lfs_btofsb(fs, lfs_sb_getsumsize(fs) + lfs_sb_getbsize(fs))) != 571 lfs_dtosn(fs, offset)) { 572 if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs, ssp))) { 573 offset = -1; 574 goto err2; 575 } 576 offset = lfs_ss_getnext(fs, ssp); 577 DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64 578 " -> segment %d\n", offset, lfs_dtosn(fs,offset))); 579 } 580 581 if (flags & CHECK_UPDATE) { 582 lfs_sb_subavail(fs, offset - oldoffset); 583 /* Don't clog the buffer queue */ 584 mutex_enter(&lfs_lock); 585 if (locked_queue_count > LFS_MAX_BUFS || 586 locked_queue_bytes > LFS_MAX_BYTES) { 587 lfs_flush(fs, SEGM_CKP, 0); 588 } 589 mutex_exit(&lfs_lock); 590 } 591 592 err2: 593 if (flags & CHECK_CKSUM) 594 free(datap, M_SEGMENT); 595 err1: 596 brelse(bp, BC_AGE); 597 598 /* XXX should we update the serial number even for bad psegs? */ 599 if ((flags & CHECK_UPDATE) && offset > 0 && lfs_sb_getversion(fs) > 1) 600 lfs_sb_setserial(fs, nextserial); 601 return offset; 602 } 603 604 void 605 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l) 606 { 607 int flags, dirty; 608 daddr_t offset, oldoffset, lastgoodpseg; 609 int sn, curseg, do_rollforward; 610 struct proc *p; 611 kauth_cred_t cred; 612 SEGUSE *sup; 613 struct buf *bp; 614 615 p = l ? l->l_proc : NULL; 616 cred = p ? p->p_cred : NOCRED; 617 618 /* 619 * Roll forward. 620 * 621 * We don't roll forward for v1 filesystems, because 622 * of the danger that the clock was turned back between the last 623 * checkpoint and crash. This would roll forward garbage. 624 * 625 * v2 filesystems don't have this problem because they use a 626 * monotonically increasing serial number instead of a timestamp. 627 */ 628 do_rollforward = (!(lfs_sb_getpflags(fs) & LFS_PF_CLEAN) && 629 lfs_do_rfw && lfs_sb_getversion(fs) > 1 && p != NULL); 630 if (do_rollforward) { 631 u_int64_t nextserial; 632 /* 633 * Phase I: Find the address of the last good partial 634 * segment that was written after the checkpoint. Mark 635 * the segments in question dirty, so they won't be 636 * reallocated. 637 */ 638 lastgoodpseg = oldoffset = offset = lfs_sb_getoffset(fs); 639 flags = 0x0; 640 DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%" 641 PRIx64 "\n", offset)); 642 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 643 if (!(sup->su_flags & SEGUSE_DIRTY)) 644 lfs_sb_subnclean(fs, 1); 645 sup->su_flags |= SEGUSE_DIRTY; 646 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp); 647 nextserial = lfs_sb_getserial(fs) + 1; 648 while ((offset = check_segsum(fs, offset, nextserial, 649 cred, CHECK_CKSUM, &flags, l)) > 0) { 650 nextserial++; 651 if (lfs_sntod(fs, oldoffset) != lfs_sntod(fs, offset)) { 652 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, oldoffset), 653 bp); 654 if (!(sup->su_flags & SEGUSE_DIRTY)) 655 lfs_sb_subnclean(fs, 1); 656 sup->su_flags |= SEGUSE_DIRTY; 657 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, oldoffset), 658 bp); 659 } 660 661 DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%" 662 PRIx64 "\n", offset)); 663 if (flags & SS_DIROP) { 664 DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%" 665 PRIx64 "\n", oldoffset)); 666 if (!(flags & SS_CONT)) { 667 DLOG((DLOG_RF, "lfs_mountfs: dirops end " 668 "at 0x%" PRIx64 "\n", oldoffset)); 669 } 670 } 671 if (!(flags & SS_CONT)) 672 lastgoodpseg = offset; 673 oldoffset = offset; 674 } 675 if (flags & SS_CONT) { 676 DLOG((DLOG_RF, "LFS roll forward: warning: incomplete " 677 "dirops discarded\n")); 678 } 679 DLOG((DLOG_RF, "LFS roll forward phase 1: completed: " 680 "lastgoodpseg=0x%" PRIx64 "\n", lastgoodpseg)); 681 oldoffset = lfs_sb_getoffset(fs); 682 if (lfs_sb_getoffset(fs) != lastgoodpseg) { 683 /* Don't overwrite what we're trying to preserve */ 684 offset = lfs_sb_getoffset(fs); 685 lfs_sb_setoffset(fs, lastgoodpseg); 686 lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, lfs_sb_getoffset(fs)))); 687 for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) { 688 sn = (sn + 1) % lfs_sb_getnseg(fs); 689 if (sn == curseg) 690 panic("lfs_mountfs: no clean segments"); 691 LFS_SEGENTRY(sup, fs, sn, bp); 692 dirty = (sup->su_flags & SEGUSE_DIRTY); 693 brelse(bp, 0); 694 if (!dirty) 695 break; 696 } 697 lfs_sb_setnextseg(fs, lfs_sntod(fs, sn)); 698 699 /* 700 * Phase II: Roll forward from the first superblock. 701 */ 702 while (offset != lastgoodpseg) { 703 DLOG((DLOG_RF, "LFS roll forward phase 2: 0x%" 704 PRIx64 "\n", offset)); 705 offset = check_segsum(fs, offset, 706 lfs_sb_getserial(fs) + 1, cred, CHECK_UPDATE, 707 NULL, l); 708 } 709 710 /* 711 * Finish: flush our changes to disk. 712 */ 713 lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); 714 DLOG((DLOG_RF, "lfs_mountfs: roll forward ", 715 "recovered %jd blocks\n", 716 (intmax_t)(lastgoodpseg - oldoffset))); 717 } 718 DLOG((DLOG_RF, "LFS roll forward complete\n")); 719 } 720 } 721