1 /* $NetBSD: ffs_snapshot.c,v 1.131 2013/10/19 19:28:13 martin Exp $ */ 2 3 /* 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick@mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 * 37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 38 */ 39 40 #include <sys/cdefs.h> 41 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.131 2013/10/19 19:28:13 martin Exp $"); 42 43 #if defined(_KERNEL_OPT) 44 #include "opt_ffs.h" 45 #include "opt_quota.h" 46 #endif 47 48 #include <sys/param.h> 49 #include <sys/kernel.h> 50 #include <sys/systm.h> 51 #include <sys/conf.h> 52 #include <sys/buf.h> 53 #include <sys/proc.h> 54 #include <sys/namei.h> 55 #include <sys/sched.h> 56 #include <sys/stat.h> 57 #include <sys/malloc.h> 58 #include <sys/mount.h> 59 #include <sys/resource.h> 60 #include <sys/resourcevar.h> 61 #include <sys/vnode.h> 62 #include <sys/kauth.h> 63 #include <sys/fstrans.h> 64 #include <sys/wapbl.h> 65 66 #include <miscfs/specfs/specdev.h> 67 68 #include <ufs/ufs/quota.h> 69 #include <ufs/ufs/ufsmount.h> 70 #include <ufs/ufs/inode.h> 71 #include <ufs/ufs/ufs_extern.h> 72 #include <ufs/ufs/ufs_bswap.h> 73 #include <ufs/ufs/ufs_wapbl.h> 74 75 #include <ufs/ffs/fs.h> 76 #include <ufs/ffs/ffs_extern.h> 77 78 #include <uvm/uvm.h> 79 80 struct snap_info { 81 kmutex_t si_lock; /* Lock this snapinfo */ 82 kmutex_t si_snaplock; /* Snapshot vnode common lock */ 83 lwp_t *si_owner; /* Sanplock owner */ 84 TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */ 85 daddr_t *si_snapblklist; /* Snapshot block hints list */ 86 uint32_t si_gen; /* Incremented on change */ 87 }; 88 89 #if !defined(FFS_NO_SNAPSHOT) 90 typedef int (*acctfunc_t) 91 (struct vnode *, void *, int, int, struct fs *, daddr_t, int); 92 93 static int snapshot_setup(struct mount *, struct vnode *); 94 static int snapshot_copyfs(struct mount *, struct vnode *, void **); 95 static int snapshot_expunge(struct mount *, struct vnode *, 96 struct fs *, daddr_t *, daddr_t **); 97 static int snapshot_expunge_snap(struct mount *, struct vnode *, 98 struct fs *, daddr_t); 99 static int snapshot_writefs(struct 
    mount *, struct vnode *, void *);
static int cgaccount(struct vnode *, int, int *);
static int cgaccount1(int, struct vnode *, void *, int);
static int expunge(struct vnode *, struct inode *, struct fs *,
    acctfunc_t, int);
static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
    daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
static int fullacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int snapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int mapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */

static int ffs_copyonwrite(void *, struct buf *, bool);
static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
static int rwfsblk(struct vnode *, int, void *, daddr_t);
static int syncsnap(struct vnode *);
static int wrsnapblk(struct vnode *, void *, daddr_t);
#if !defined(FFS_NO_SNAPSHOT)
static int blocks_in_journal(struct fs *);
#endif

static inline bool is_active_snapshot(struct snap_info *, struct inode *);
static inline daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, daddr_t);
static inline daddr_t ib_get(struct inode *, int);
static inline daddr_t idb_get(struct inode *, void *, int);
static inline void idb_assign(struct inode *, void *, int, daddr_t);

#ifdef DEBUG
static int snapdebug = 0;
#endif

/*
 * Allocate and initialize the per-mount snapshot state hung off
 * ump->um_snapinfo.  Returns 0 on success.
 */
int
ffs_snapshot_init(struct ufsmount *ump)
{
	struct snap_info *si;

	/*
	 * NOTE(review): kmem_alloc() with KM_SLEEP sleeps until the
	 * allocation succeeds, so the NULL check below should be
	 * unreachable -- kept as defensive code; verify against kmem(9).
	 */
	si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
	if (si == NULL)
		return ENOMEM;

	TAILQ_INIT(&si->si_snapshots);
	mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
	si->si_owner = NULL;
	si->si_gen = 0;
	si->si_snapblklist = NULL;

	return 0;
}

void
ffs_snapshot_fini(struct ufsmount *ump)
{
	struct snap_info *si;

	/*
	 * Tear down the per-mount snapshot state allocated by
	 * ffs_snapshot_init().  All snapshots must already be gone and the
	 * preliminary block list released (both asserted below).
	 */
	si = ump->um_snapinfo;
	ump->um_snapinfo = NULL;

	KASSERT(TAILQ_EMPTY(&si->si_snapshots));
	mutex_destroy(&si->si_lock);
	mutex_destroy(&si->si_snaplock);
	KASSERT(si->si_snapblklist == NULL);
	kmem_free(si, sizeof(*si));
}

/*
 * Create a snapshot file and initialize it for the filesystem.
 * Vnode is locked on entry and return.
 */
int
ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
	return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
	bool suspended = false;
	int error, redo = 0, snaploc;
	void *sbbuf = NULL;
	daddr_t *snaplist = NULL, snaplistsize = 0;
	struct buf *bp, *nbp;
	struct fs *copy_fs = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
	struct timespec ts;
	struct timeval starttime;
#ifdef DEBUG
	struct timeval endtime;
#endif
	struct vnode *devvp = ip->i_devvp;

	/*
	 * If the vnode already is a snapshot, return.
	 */
	if ((VTOI(vp)->i_flags & SF_SNAPSHOT)) {
		if ((VTOI(vp)->i_flags & SF_SNAPINVAL))
			return EINVAL;
		if (ctime) {
			ctime->tv_sec = DIP(VTOI(vp), mtime);
			ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
		}
		return 0;
	}
	/*
	 * Check for free snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Prepare the vnode to become a snapshot.
	 */
	error = snapshot_setup(mp, vp);
	if (error)
		goto out;

	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	error = cgaccount(vp, 1, NULL);
	if (error)
		goto out;

	/*
	 * snapshot is now valid
	 */
	ip->i_flags &= ~SF_SNAPINVAL;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;

	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	VOP_UNLOCK(vp);
	/*
	 * All allocations are done, so we can now suspend the filesystem.
	 */
	error = vfs_suspend(vp->v_mount, 0);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error)
		goto out;
	suspended = true;
	getmicrotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	error = cgaccount(vp, 2, &redo);
	if (error)
		goto out;
	/*
	 * Create a copy of the superblock and its summary information.
	 */
	error = snapshot_copyfs(mp, vp, &sbbuf);
	copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
	if (error)
		goto out;
	/*
	 * Expunge unlinked files from our view.
	 */
	error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
	if (error)
		goto out;
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	if (ip->i_nlink > 0)
		fs->fs_snapinum[snaploc] = ip->i_number;

	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip))
		panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
	if (TAILQ_FIRST(&si->si_snapshots) == ip) {
		/*
		 * If this is the first snapshot on this filesystem, put the
		 * preliminary list in place and establish the cow handler.
		 */
		si->si_snapblklist = snaplist;
		fscow_establish(mp, ffs_copyonwrite, devvp);
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	vp->v_vflag |= VV_SYSTEM;
	/*
	 * Set the mtime to the time the snapshot has been taken.
	 */
	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
	if (ctime)
		*ctime = ts;
	DIP_ASSIGN(ip, mtime, ts.tv_sec);
	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Copy allocation information from all snapshots and then
	 * expunge them from our view.
	 */
	error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
	if (error)
		goto out;
	/*
	 * Write the superblock and its summary information to the snapshot.
	 */
	error = snapshot_writefs(mp, vp, sbbuf);
	if (error)
		goto out;
	/*
	 * We're nearly done, ensure that the snapshot is completely on disk.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	/*
	 * Invalidate and free all pages on the snapshot vnode.
	 * We will read and write through the buffercache.
	 */
	mutex_enter(vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0,
	    PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
	if (error)
		goto out;
	/*
	 * Invalidate short ( < fs_bsize ) buffers. We will always read
	 * full size buffers later.
	 */
	mutex_enter(&bufcache_lock);
	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_bcount == fs->fs_bsize)
			continue;
		error = bbusy(bp, false, 0, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH) {
				/* Lost to another thread; rescan the list. */
				nbp = LIST_FIRST(&vp->v_cleanblkhd);
				continue;
			}
			break;
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}
	mutex_exit(&bufcache_lock);

	/*
	 * Common cleanup: free the superblock copy, the cg active map and
	 * (on error) the preliminary block list, then resume the filesystem
	 * if we suspended it above.  On success the snapshot keeps an extra
	 * reference via vref().
	 */
out:
	if (sbbuf != NULL) {
		free(copy_fs->fs_csp, M_UFSMNT);
		free(sbbuf, M_UFSMNT);
	}
	if (fs->fs_active != NULL) {
		free(fs->fs_active, M_DEVBUF);
		fs->fs_active = NULL;
	}

	mutex_enter(&si->si_lock);
	if (snaplist != NULL) {
		if (si->si_snapblklist == snaplist)
			si->si_snapblklist = NULL;
		free(snaplist, M_UFSMNT);
	}
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
	} else {
		/*
		 * As this is the newest list, it is the most inclusive, so
		 * should replace the previous list.
		 */
		si->si_snapblklist = ip->i_snapblklist;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	if (suspended) {
		VOP_UNLOCK(vp);
		vfs_resume(vp->v_mount);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
		getmicrotime(&endtime);
		timersub(&endtime, &starttime, &endtime);
		printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
		    mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
#endif
	}
	if (error) {
		/* Undo the allocations: truncate the failed snapshot away. */
		if (!UFS_WAPBL_BEGIN(mp)) {
			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
			UFS_WAPBL_END(mp);
		}
	} else if (ip->i_nlink > 0)
		vref(vp);
	return (error);
}

/*
 * Prepare vnode to become a snapshot.
 */
static int
snapshot_setup(struct mount *mp, struct vnode *vp)
{
	int error, n, len, loc, cg;
	daddr_t blkno, numblks;
	struct buf *ibp, *nbp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	/* Flush the journal every wbreak allocations (0 disables). */
	const int wbreak = blocks_in_journal(fs)/8;
	struct inode *ip = VTOI(vp);

	/*
	 * Check mount, exclusive reference and owner.
	 */
	if (vp->v_mount != mp)
		return EXDEV;
	if (vp->v_usecount != 1 || vp->v_writecount != 0)
		return EBUSY;
	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
	    0, mp, vp, NULL);
	if (error)
		return EACCES;

	if (vp->v_size != 0) {
		error = ffs_truncate(vp, 0, 0, NOCRED);
		if (error)
			return error;
	}

	/* Change inode to snapshot type file. */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
#if defined(QUOTA) || defined(QUOTA2)
	/* snapshot inodes are not accounted in quotas */
	chkiq(ip, -1, l->l_cred, 0);
#endif
	ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	ffs_update(vp, NULL, NULL, UPDATE_WAIT);
	UFS_WAPBL_END(mp);

	KASSERT(ip->i_flags & SF_SNAPSHOT);
	/*
	 * Write an empty list of preallocated blocks to the end of
	 * the snapshot to set size to at least that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	blkno = 1;
	blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp,
	    (void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	if (error)
		return error;
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			goto out;
		brelse(ibp, 0);
		/* Periodically cycle the journal transaction so it
		 * cannot overflow while we preallocate. */
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, l->l_cred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, l->l_cred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Create a copy of the superblock and its summary information.
 * It is up to the caller to free copyfs and copy_fs->fs_csp.
 */
static int
snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
{
	int error, i, len, loc, size;
	void *space;
	int32_t *lp;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct vnode *devvp = VTOI(vp)->i_devvp;

	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	*sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	loc = ffs_blkoff(fs, fs->fs_sblockloc);
	if (loc > 0)
		memset(*sbbuf, 0, loc);
	copyfs = (struct fs *)((char *)(*sbbuf) + loc);
	memcpy(copyfs, fs, fs->fs_sbsize);
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	/* Zero the tail of the superblock area past the actual sb size. */
	if (fs->fs_sbsize < size)
		memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
		    size - fs->fs_sbsize);
	size = ffs_blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc(size, M_UFSMNT, M_WAITOK);
	copyfs->fs_csp = space;
	memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	/*
	 * The summary info may not end on a block boundary; read the
	 * remaining fragments of the last block straight from the device.
	 */
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc),
		    len, l->l_cred, 0, &bp)) != 0) {
			free(copyfs->fs_csp, M_UFSMNT);
			free(*sbbuf, M_UFSMNT);
			*sbbuf = NULL;
			return error;
		}
		memcpy(space, bp->b_data, (u_int)len);
		space = (char *)space + len;
		brelse(bp, BC_INVAL | BC_NOCACHE);
	}
	if (fs->fs_contigsumsize > 0) {
		copyfs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/* The snapshot's view of the fs must not claim an active journal. */
	if (mp->mnt_wapbl)
		copyfs->fs_flags &= ~FS_DOWAPBL;
	return 0;
}

/*
 * We must check for active files that have been unlinked (e.g., with a zero
 * link count). We have to expunge all trace of these files from the snapshot
 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
 * Note that we skip unlinked snapshot files as they will be handled separately.
 * Calculate the snapshot list size and create a preliminary list.
 */
static int
snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
    daddr_t *snaplistsize, daddr_t **snaplist)
{
	int cg, error = 0, len, loc;
	daddr_t blkno, *blkp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *xp;
	struct lwp *l = curlwp;
	struct vattr vat;
	struct vnode *logvp = NULL, *mvp = NULL, *xvp;

	*snaplist = NULL;
	/*
	 * Get the log inode if any.
	 */
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		error = VFS_VGET(mp,
		    fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
		if (error)
			goto out;
	}
	/*
	 * Allocate a marker vnode.
	 */
	mvp = vnalloc(mp);
	/*
	 * We also calculate the needed size for the snapshot list.
	 */
	*snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	mutex_enter(&mntvnode_lock);
	/*
	 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
	 * and vclean() can be called indirectly
	 */
	for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) {
		vmark(mvp, xvp);
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp || vismarker(xvp))
			continue;
		mutex_enter(xvp->v_interlock);
		if ((xvp->v_iflag & VI_XLOCK) ||
		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    VTOI(xvp) == NULL ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
			mutex_exit(xvp->v_interlock);
			continue;
		}
		mutex_exit(&mntvnode_lock);
		/*
		 * XXXAD should increase vnode ref count to prevent it
		 * disappearing or being recycled.
		 */
		mutex_exit(xvp->v_interlock);
#ifdef DEBUG
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
#endif
		xp = VTOI(xvp);
		if (xvp != logvp) {
			/* Skip files that are still linked or already free. */
			if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 &&
			    vat.va_nlink > 0) {
				mutex_enter(&mntvnode_lock);
				continue;
			}
			if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
				mutex_enter(&mntvnode_lock);
				continue;
			}
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < UFS_NDADDR) {
			len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size));
			if (len > 0 && len < fs->fs_bsize) {
				error = UFS_WAPBL_BEGIN(mp);
				if (error) {
					(void)vunmark(mvp);
					goto out;
				}
				ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
				    len, xp->i_number);
				/* Remember the fragment so it can be put
				 * back after the expunge below. */
				blkno = db_get(xp, loc);
				db_assign(xp, loc, 0);
				UFS_WAPBL_END(mp);
			}
		}
		*snaplistsize += 1;
		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
		if (blkno)
			db_assign(xp, loc, blkno);
		if (!error) {
			error = UFS_WAPBL_BEGIN(mp);
			if (!error) {
				error = ffs_freefile_snap(copy_fs, vp,
				    xp->i_number, xp->i_mode);
				UFS_WAPBL_END(mp);
			}
		}
		if (error) {
			(void)vunmark(mvp);
			goto out;
		}
		mutex_enter(&mntvnode_lock);
	}
	mutex_exit(&mntvnode_lock);
	/*
	 * Create a preliminary list of preallocated snapshot blocks.
	 */
	*snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
	blkp = &(*snaplist)[1];
	*blkp++ = ffs_lblkno(fs, fs->fs_sblockloc);
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	/* Cylinder group blocks below the summary area... */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno)
			break;
		*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
	}
	/* ...then the summary blocks, then the remaining cg blocks. */
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++)
		*blkp++ = blkno + loc;
	for (; cg < fs->fs_ncg; cg++)
		*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
	(*snaplist)[0] = blkp - &(*snaplist)[0];

out:
	if (mvp != NULL)
		vnfree(mvp);
	if (logvp != NULL)
		vput(logvp);
	if (error && *snaplist != NULL) {
		free(*snaplist, M_UFSMNT);
		*snaplist = NULL;
	}

	return error;
}

/*
 * Copy allocation information from all the snapshots in this snapshot and
 * then expunge them from its view. Also, collect the list of allocated
 * blocks in i_snapblklist.
 */
static int
snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
    struct fs *copy_fs, daddr_t snaplistsize)
{
	int error = 0, i;
	daddr_t numblks, *snaplist = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp), *xp;
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;

	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
		if (xp != ip) {
			error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
			if (error)
				break;
		}
		/* Unlinked snapshots are additionally freed in our view. */
		if (xp->i_nlink != 0)
			continue;
		error = UFS_WAPBL_BEGIN(mp);
		if (error)
			break;
		error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
		UFS_WAPBL_END(mp);
		if (error)
			break;
	}
	if (error)
		goto out;
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snaplist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
	if (error)
		goto out;
	if (snaplistsize < ip->i_snapblklist - snaplist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snaplist;
	snaplist[0] = snaplistsize;
	ip->i_snapblklist = &snaplist[0];
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 * Swap to on-disk byte order for the write, then swap back so the
	 * in-core list stays in host order.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
	    snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
out:
	if (error && snaplist != NULL) {
		free(snaplist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	return error;
}

/*
 * Write the superblock and its summary information to the snapshot.
 * Make sure, the first UFS_NDADDR blocks get copied to the snapshot.
 */
static int
snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
{
	int error, len, loc;
	void *space;
	daddr_t blkno;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;

	copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));

	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copyfs->fs_csp;
#ifdef FFS_EI
	/* Swap the copies in place to on-disk byte order if needed. */
	if (UFS_FSNEEDSWAP(fs)) {
		ffs_sb_swap(copyfs, copyfs);
		ffs_csum_swap(space, space, fs->fs_cssize);
	}
#endif
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred,
		    B_MODIFY, &bp);
		if (error) {
			break;
		}
		memcpy(bp->b_data, space, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(bp);
	}
	if (error)
		goto out;
	error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc),
	    fs->fs_bsize, l->l_cred, B_MODIFY, &bp);
	if (error) {
		goto out;
	} else {
		memcpy(bp->b_data, sbbuf, fs->fs_bsize);
		bawrite(bp);
	}
	/*
	 * Copy the first UFS_NDADDR blocks to the snapshot so
	 * ffs_copyonwrite() and ffs_snapblkfree() will always work on
	 * indirect blocks.
	 */
	for (loc = 0; loc < UFS_NDADDR; loc++) {
		if (db_get(ip, loc) != 0)
			continue;
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc),
		    fs->fs_bsize, l->l_cred, 0, &bp);
		if (error)
			break;
		error = rwfsblk(vp, B_READ, bp->b_data, loc);
		if (error) {
			brelse(bp, 0);
			break;
		}
		bawrite(bp);
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Copy all cylinder group maps.
 */
static int
cgaccount(struct vnode *vp, int passno, int *redo)
{
	int cg, error = 0;
	struct buf *nbp;
	struct fs *fs = VTOI(vp)->i_fs;

	/*
	 * Pass 1 copies every cylinder group map; pass 2 re-copies only
	 * those that changed since pass 1 (tracked via fs_active).  If
	 * redo is non-NULL it returns the number of groups copied.
	 */
	if (redo != NULL)
		*redo = 0;
	if (passno == 1)
		fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
		    M_DEVBUF, M_WAITOK | M_ZERO);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (passno == 2 && ACTIVECG_ISSET(fs, cg))
			continue;

		if (redo != NULL)
			*redo += 1;
		/* One journal transaction per cylinder group. */
		error = UFS_WAPBL_BEGIN(vp->v_mount);
		if (error)
			return error;
		error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, curlwp->l_cred, 0, &nbp);
		if (error) {
			UFS_WAPBL_END(vp->v_mount);
			break;
		}
		error = cgaccount1(cg, vp, nbp->b_data, passno);
		bawrite(nbp);
		UFS_WAPBL_END(vp->v_mount);
		if (error)
			break;
	}
	return error;
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
951 */ 952 static int 953 cgaccount1(int cg, struct vnode *vp, void *data, int passno) 954 { 955 struct buf *bp, *ibp; 956 struct inode *ip; 957 struct cg *cgp; 958 struct fs *fs; 959 struct lwp *l = curlwp; 960 daddr_t base, numblks; 961 int error, len, loc, ns __unused, indiroff; 962 963 ip = VTOI(vp); 964 fs = ip->i_fs; 965 ns = UFS_FSNEEDSWAP(fs); 966 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 967 (int)fs->fs_cgsize, l->l_cred, 0, &bp); 968 if (error) { 969 return (error); 970 } 971 cgp = (struct cg *)bp->b_data; 972 if (!cg_chkmagic(cgp, ns)) { 973 brelse(bp, 0); 974 return (EIO); 975 } 976 ACTIVECG_SET(fs, cg); 977 978 memcpy(data, bp->b_data, fs->fs_cgsize); 979 brelse(bp, 0); 980 if (fs->fs_cgsize < fs->fs_bsize) 981 memset((char *)data + fs->fs_cgsize, 0, 982 fs->fs_bsize - fs->fs_cgsize); 983 numblks = howmany(fs->fs_size, fs->fs_frag); 984 len = howmany(fs->fs_fpg, fs->fs_frag); 985 base = cg * fs->fs_fpg / fs->fs_frag; 986 if (base + len >= numblks) 987 len = numblks - base - 1; 988 loc = 0; 989 if (base < UFS_NDADDR) { 990 for ( ; loc < UFS_NDADDR; loc++) { 991 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 992 db_assign(ip, loc, BLK_NOCOPY); 993 else if (db_get(ip, loc) == BLK_NOCOPY) { 994 if (passno == 2) 995 db_assign(ip, loc, 0); 996 else if (passno == 1) 997 panic("ffs_snapshot: lost direct block"); 998 } 999 } 1000 } 1001 if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)), 1002 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 1003 return (error); 1004 indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs); 1005 for ( ; loc < len; loc++, indiroff++) { 1006 if (indiroff >= FFS_NINDIR(fs)) { 1007 bawrite(ibp); 1008 if ((error = ffs_balloc(vp, 1009 ffs_lblktosize(fs, (off_t)(base + loc)), 1010 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 1011 return (error); 1012 indiroff = 0; 1013 } 1014 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 1015 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 1016 else if 
(idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { 1017 if (passno == 2) 1018 idb_assign(ip, ibp->b_data, indiroff, 0); 1019 else if (passno == 1) 1020 panic("ffs_snapshot: lost indirect block"); 1021 } 1022 } 1023 bdwrite(ibp); 1024 return (0); 1025 } 1026 1027 /* 1028 * Before expunging a snapshot inode, note all the 1029 * blocks that it claims with BLK_SNAP so that fsck will 1030 * be able to account for those blocks properly and so 1031 * that this snapshot knows that it need not copy them 1032 * if the other snapshot holding them is freed. 1033 */ 1034 static int 1035 expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 1036 acctfunc_t acctfunc, int expungetype) 1037 { 1038 int i, error, ns __unused; 1039 daddr_t lbn, rlbn; 1040 daddr_t len, blkno, numblks, blksperindir; 1041 struct ufs1_dinode *dip1; 1042 struct ufs2_dinode *dip2; 1043 struct lwp *l = curlwp; 1044 void *bap; 1045 struct buf *bp; 1046 struct mount *mp; 1047 1048 ns = UFS_FSNEEDSWAP(fs); 1049 mp = snapvp->v_mount; 1050 1051 error = UFS_WAPBL_BEGIN(mp); 1052 if (error) 1053 return error; 1054 /* 1055 * Prepare to expunge the inode. If its inode block has not 1056 * yet been copied, then allocate and fill the copy. 1057 */ 1058 lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1059 error = snapblkaddr(snapvp, lbn, &blkno); 1060 if (error) 1061 return error; 1062 if (blkno != 0) { 1063 error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred, 1064 B_MODIFY, &bp); 1065 } else { 1066 error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn), 1067 fs->fs_bsize, l->l_cred, 0, &bp); 1068 if (! error) 1069 error = rwfsblk(snapvp, B_READ, bp->b_data, lbn); 1070 } 1071 if (error) { 1072 UFS_WAPBL_END(mp); 1073 return error; 1074 } 1075 /* 1076 * Set a snapshot inode to be a zero length file, regular files 1077 * or unlinked snapshots to be completely unallocated. 
1078 */ 1079 if (fs->fs_magic == FS_UFS1_MAGIC) { 1080 dip1 = (struct ufs1_dinode *)bp->b_data + 1081 ino_to_fsbo(fs, cancelip->i_number); 1082 if (cancelip->i_flags & SF_SNAPSHOT) { 1083 dip1->di_flags = 1084 ufs_rw32(ufs_rw32(dip1->di_flags, ns) | 1085 SF_SNAPINVAL, ns); 1086 } 1087 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) 1088 dip1->di_mode = 0; 1089 dip1->di_size = 0; 1090 dip1->di_blocks = 0; 1091 memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t)); 1092 } else { 1093 dip2 = (struct ufs2_dinode *)bp->b_data + 1094 ino_to_fsbo(fs, cancelip->i_number); 1095 if (cancelip->i_flags & SF_SNAPSHOT) { 1096 dip2->di_flags = 1097 ufs_rw32(ufs_rw32(dip2->di_flags, ns) | 1098 SF_SNAPINVAL, ns); 1099 } 1100 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) 1101 dip2->di_mode = 0; 1102 dip2->di_size = 0; 1103 dip2->di_blocks = 0; 1104 memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t)); 1105 } 1106 bdwrite(bp); 1107 UFS_WAPBL_END(mp); 1108 /* 1109 * Now go through and expunge all the blocks in the file 1110 * using the function requested. 
1111 */ 1112 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1113 if (fs->fs_magic == FS_UFS1_MAGIC) 1114 bap = &cancelip->i_ffs1_db[0]; 1115 else 1116 bap = &cancelip->i_ffs2_db[0]; 1117 error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype); 1118 if (error) 1119 return (error); 1120 if (fs->fs_magic == FS_UFS1_MAGIC) 1121 bap = &cancelip->i_ffs1_ib[0]; 1122 else 1123 bap = &cancelip->i_ffs2_ib[0]; 1124 error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype); 1125 if (error) 1126 return (error); 1127 blksperindir = 1; 1128 lbn = -UFS_NDADDR; 1129 len = numblks - UFS_NDADDR; 1130 rlbn = UFS_NDADDR; 1131 for (i = 0; len > 0 && i < UFS_NIADDR; i++) { 1132 error = indiracct(snapvp, ITOV(cancelip), i, 1133 ib_get(cancelip, i), lbn, rlbn, len, 1134 blksperindir, fs, acctfunc, expungetype); 1135 if (error) 1136 return (error); 1137 blksperindir *= FFS_NINDIR(fs); 1138 lbn -= blksperindir + 1; 1139 len -= blksperindir; 1140 rlbn += blksperindir; 1141 } 1142 return (0); 1143 } 1144 1145 /* 1146 * Descend an indirect block chain for vnode cancelvp accounting for all 1147 * its indirect blocks in snapvp. 1148 */ 1149 static int 1150 indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level, 1151 daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks, 1152 daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype) 1153 { 1154 int error, num, i; 1155 daddr_t subblksperindir; 1156 struct indir indirs[UFS_NIADDR + 2]; 1157 daddr_t last; 1158 void *bap; 1159 struct buf *bp; 1160 1161 if (blkno == 0) { 1162 if (expungetype == BLK_NOCOPY) 1163 return (0); 1164 panic("indiracct: missing indir"); 1165 } 1166 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1167 return (error); 1168 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1169 panic("indiracct: botched params"); 1170 /* 1171 * We have to expand bread here since it will deadlock looking 1172 * up the block number for any blocks that are not in the cache. 
1173 */ 1174 error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize, 1175 false, &bp); 1176 if (error) 1177 return error; 1178 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = 1179 rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) { 1180 brelse(bp, 0); 1181 return (error); 1182 } 1183 /* 1184 * Account for the block pointers in this indirect block. 1185 */ 1186 last = howmany(remblks, blksperindir); 1187 if (last > FFS_NINDIR(fs)) 1188 last = FFS_NINDIR(fs); 1189 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO); 1190 memcpy((void *)bap, bp->b_data, fs->fs_bsize); 1191 brelse(bp, 0); 1192 error = (*acctfunc)(snapvp, bap, 0, last, 1193 fs, level == 0 ? rlbn : -1, expungetype); 1194 if (error || level == 0) 1195 goto out; 1196 /* 1197 * Account for the block pointers in each of the indirect blocks 1198 * in the levels below us. 1199 */ 1200 subblksperindir = blksperindir / FFS_NINDIR(fs); 1201 for (lbn++, level--, i = 0; i < last; i++) { 1202 error = indiracct(snapvp, cancelvp, level, 1203 idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks, 1204 subblksperindir, fs, acctfunc, expungetype); 1205 if (error) 1206 goto out; 1207 rlbn += blksperindir; 1208 lbn -= blksperindir; 1209 remblks -= blksperindir; 1210 } 1211 out: 1212 free(bap, M_DEVBUF); 1213 return (error); 1214 } 1215 1216 /* 1217 * Do both snap accounting and map accounting. 1218 */ 1219 static int 1220 fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1221 struct fs *fs, daddr_t lblkno, 1222 int exptype /* BLK_SNAP or BLK_NOCOPY */) 1223 { 1224 int error; 1225 1226 if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype))) 1227 return (error); 1228 return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)); 1229 } 1230 1231 /* 1232 * Identify a set of blocks allocated in a snapshot inode. 
 */
static int
snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct mount *mp = vp->v_mount;
	daddr_t blkno;
	daddr_t lbn;
	struct buf *ibp;
	int error, n;
	/*
	 * Commit the journal every wbreak blocks to bound transaction
	 * size; zero disables the chunking (no journal).
	 */
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	/* Walk the pointer range [oldblkp, lastblkp) in bap. */
	for ( n = 0; oldblkp < lastblkp; oldblkp++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/*
		 * Look up the snapshot's own mapping for this block.
		 */
		lbn = ffs_fragstoblks(fs, blkno);
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
			if (error)
				break;
			blkno = idb_get(ip, ibp->b_data,
			    (lbn - UFS_NDADDR) % FFS_NINDIR(fs));
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= UFS_NDADDR)
				brelse(ibp, 0);
		} else {
			/* Mark the block with the requested expunge type. */
			if (blkno != 0)
				panic("snapacct: bad block");
			if (lbn < UFS_NDADDR)
				db_assign(ip, lbn, expungetype);
			else {
				idb_assign(ip, ibp->b_data,
				    (lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype);
				bdwrite(ibp);
			}
		}
		/* Periodically cycle the journal transaction. */
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno, int expungetype)
{
	daddr_t blkno;
	struct inode *ip;
	struct mount *mp = vp->v_mount;
	ino_t inum;
	int acctit, error, n;
	/* Journal chunking interval; see snapacct() above. */
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	ip = VTOI(vp);
	inum = ip->i_number;
	/* lblkno == -1 means the range carries no logical block numbers. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Record this block in the preallocated hints list. */
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = ffs_blkstofrags(fs, lblkno);
		ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
		/* Periodically cycle the journal transaction. */
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return (0);
}

/*
 * Number of blocks that fit into the journal or zero if not logging.
 */
static int
blocks_in_journal(struct fs *fs)
{
	off_t bpj;

	if ((fs->fs_flags & FS_DOWAPBL) == 0)
		return 0;
	bpj = 1;
	if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
		switch (fs->fs_journal_location) {
		case UFS_WAPBL_JOURNALLOC_END_PARTITION:
			bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
			    fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
			break;
		case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
			bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
			    fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
			break;
		}
	}
	bpj /= fs->fs_bsize;
	/* Never return zero for a journalled file system. */
	return (bpj > 0 ? bpj : 1);
}
#endif /* !defined(FFS_NO_SNAPSHOT) */

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(struct vnode *vp)
{
	struct inode *xp, *ip = VTOI(vp);
	struct mount *mp = spec_node_getmountedfs(ip->i_devvp);
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	/* Drop the extra reference held for the name if still active. */
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the remaining entries down over the removed slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct buf *ibp;
	struct snap_info *si;
	struct lwp *l = curlwp;
	daddr_t numblks, blkno, dblk;
	int error, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 *
	 * Lock order: si_snaplock before si_lock.
	 */
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip)) {
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
		} else {
			/* Last snapshot: stop copy-on-write entirely. */
			si->si_snapblklist = 0;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
		}
		if (ip->i_snapblklist != NULL) {
			free(ip->i_snapblklist, M_UFSMNT);
			ip->i_snapblklist = NULL;
		}
	} else {
		mutex_exit(&si->si_lock);
		mutex_exit(&si->si_snaplock);
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 * First the direct block pointers ...
	 */
	for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == ffs_blkstofrags(fs, blkno) &&
		    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		    ip->i_number))) {
			/* Claimed block was taken over by another snapshot. */
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	/* ... then the pointers held in each indirect block. */
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > FFS_NINDIR(fs))
			last = FFS_NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == ffs_blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
		/*
		 * NOTE(review): cycles the journal transaction per indirect
		 * block to bound its size — assumes the caller holds an open
		 * WAPBL transaction; confirm against callers.
		 */
		UFS_WAPBL_END(mp);
		error = UFS_WAPBL_BEGIN(mp);
		KASSERT(error == 0);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
#if defined(QUOTA) || defined(QUOTA2)
	chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
	chkiq(ip, 1, l->l_cred, FORCE);
#endif
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots.
It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
    long size, ino_t inum)
{
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn;
	daddr_t blkno;
	uint32_t gen;
	int indiroff = 0, error = 0, claimedblk = 0;

	si = VFSTOUFS(mp)->um_snapinfo;
	lbn = ffs_fragstoblks(fs, bno);
	/* Lock order: si_snaplock before si_lock. */
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	si->si_owner = curlwp;

retry:
	/* si_gen changes whenever the snapshot list does; restart then. */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			/* si_lock must be dropped around ffs_balloc(). */
			mutex_exit(&si->si_lock);
			error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen) {
				brelse(ibp, 0);
				goto retry;
			}
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < UFS_NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				mutex_exit(&si->si_lock);
				/* Linked snapshots are written synchronously. */
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= UFS_NDADDR)
				brelse(ibp, 0);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
				    "from inum %llu\n",
				    "Grabonremove: snapino",
				    (unsigned long long)ip->i_number,
				    lbn, (unsigned long long)inum);
#endif
			mutex_exit(&si->si_lock);
			if (lbn < UFS_NDADDR) {
				db_assign(ip, lbn, bno);
			} else {
				idb_assign(ip, ibp->b_data, indiroff, bno);
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
			}
			DIP_ADD(ip, blocks, btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (ip->i_nlink > 0 && mp->mnt_wapbl)
				error = syncsnap(vp);
			else
				error = 0;
			mutex_enter(&si->si_lock);
			si->si_owner = NULL;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
			/* Non-zero return means the block was claimed. */
			return (error == 0);
		}
		if (lbn >= UFS_NDADDR)
			brelse(ibp, 0);
#ifdef DEBUG
		if (snapdebug)
			printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
			    "Copyonremove: snapino ",
			    (unsigned long long)ip->i_number,
			    lbn, "for inum", (unsigned long long)inum, size);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	si->si_owner = NULL;
	mutex_exit(&si->si_lock);
	mutex_exit(&si->si_snaplock);
	if (saved_data)
		free(saved_data, M_UFSMNT);
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct snap_info *si;
	daddr_t snaplistsize, *snapblklist;
	int i, error, ns __unused, snaploc, loc;

	/*
	 * No persistent snapshots on apple ufs file systems.
	 */
	if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
		return;

	si = VFSTOUFS(mp)->um_snapinfo;
	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * XXX The following needs to be set before ffs_truncate or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	mutex_enter(&si->si_lock);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
		    SF_SNAPSHOT) {
			/* Not a valid snapshot: drop it from the superblock
			 * list, keeping the list dense, and retry this slot. */
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}

		/*
		 * Read the block hints list. Use an empty list on
		 * read errors.
		 */
		error = vn_rdwr(UIO_READ, vp,
		    (void *)&snaplistsize, sizeof(snaplistsize),
		    ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
		    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
		    l->l_cred, NULL, NULL);
		if (error) {
			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
			snaplistsize = 1;
		} else
			snaplistsize = ufs_rw64(snaplistsize, ns);
		snapblklist = malloc(
		    snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
		if (error)
			snapblklist[0] = 1;
		else {
			error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
			    snaplistsize * sizeof(daddr_t),
			    ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
			    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
			    l->l_cred, NULL, NULL);
			for (i = 0; i < snaplistsize; i++)
				snapblklist[i] = ufs_rw64(snapblklist[i], ns);
			if (error) {
				printf("ffs_snapshot_mount: read_2 failed %d\n",
				    error);
				snapblklist[0] = 1;
			}
		}
		ip->i_snapblklist = &snapblklist[0];

		/*
		 * Link it onto the active snapshot list.
		 */
		if (is_active_snapshot(si, ip))
			panic("ffs_snapshot_mount: %"PRIu64" already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VOP_UNLOCK(vp);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL) {
		mutex_exit(&si->si_lock);
		return;
	}
	/*
	 * Attach the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	xp = TAILQ_LAST(&si->si_snapshots, inodelst);
	si->si_snapblklist = xp->i_snapblklist;
	fscow_establish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct inode *xp;
	struct vnode *vp = NULL;
	struct snap_info *si;

	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
		vp = ITOV(xp);
		TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
		if (xp->i_snapblklist == si->si_snapblklist)
			si->si_snapblklist = NULL;
		free(xp->i_snapblklist, M_UFSMNT);
		if (xp->i_nlink > 0) {
			/* Drop the extra reference; si_lock released around
			 * vrele() since it may take other locks. */
			si->si_gen++;
			mutex_exit(&si->si_lock);
			vrele(vp);
			mutex_enter(&si->si_lock);
		}
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
	if (vp)
		fscow_disestablish(mp, ffs_copyonwrite, devvp);
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is after the file system,
	 * in the journal or in the preallocated list.
	 * By doing these checks we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno));
	if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		off_t blk_off, log_start, log_end;

		log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
		log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
		blk_off = dbtob(bp->b_blkno);
		if (blk_off >= log_start && blk_off < log_end) {
			mutex_exit(&si->si_lock);
			return 0;
		}
	}
	/*
	 * Binary search the sorted preallocated block list; element 0
	 * holds the list length, entries start at index 1.
	 */
	snapblklist = si->si_snapblklist;
	upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	if (si->si_owner != curlwp) {
		/* Take the snapshot lock, observing lock order. */
		if (!mutex_tryenter(&si->si_snaplock)) {
			mutex_exit(&si->si_lock);
			mutex_enter(&si->si_snaplock);
			mutex_enter(&si->si_lock);
		}
		si->si_owner = curlwp;
		snapshot_locked = 1;
	}
	/* The caller's buffer can serve as the copy source directly. */
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	/* si_gen changes whenever the snapshot list does; restart then. */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			blkno = 0;	/* XXX: GCC */
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero mapping: already copied or marked don't-care. */
		if (blkno != 0)
			continue;

		/* The pagedaemon must not sleep on allocation here. */
		if (curlwp == uvm.pagedaemon_lwp) {
			error = ENOMEM;
			break;
		}
		/* Only one level of recursion allowed. */
		KASSERT(snapshot_locked);
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (snapshot_locked) {
		si->si_owner = NULL;
		mutex_exit(&si->si_lock);
		mutex_exit(&si->si_snaplock);
	} else
		mutex_exit(&si->si_lock);
	/* Only free the buffer if we allocated it ourselves. */
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	return error;
}

/*
 * Read from a snapshot.
 */
int
ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
	struct buf *bp;
	daddr_t lbn, nextlbn;
	off_t fsbytes, bytesinfile;
	long size, xfersize, blkoffset;
	int error;

	fstrans_start(vp->v_mount, FSTRANS_SHARED);
	mutex_enter(&si->si_snaplock);

	if (ioflag & IO_ALTSEMANTICS)
		fsbytes = ip->i_size;
	else
		fsbytes = ffs_lfragtosize(fs, fs->fs_size);
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		bytesinfile = fsbytes - uio->uio_offset;
		if (bytesinfile <= 0)
			break;
		lbn = ffs_lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;
		size = fs->fs_bsize;
		blkoffset = ffs_blkoff(fs, uio->uio_offset);
		xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
		    bytesinfile);

		if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) {
			/* Last block: shrink read to the fragment boundary. */
			if (ffs_lblktosize(fs, lbn) + size > fsbytes)
				size = ffs_fragroundup(fs,
				    fsbytes - ffs_lblktosize(fs, lbn));
			error = bread(vp, lbn, size, NOCRED, 0, &bp);
		} else {
			/* Read ahead the next block. */
			int nextsize = fs->fs_bsize;
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
		}
		if (error)
			break;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < blkoffset + xfersize) {
			xfersize = size - blkoffset;
			if (xfersize <= 0)
				break;
		}
		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
		if (error)
			break;
		brelse(bp, BC_AGE);
	}
	if (bp != NULL)
		brelse(bp, BC_AGE);

	mutex_exit(&si->si_snaplock);
	fstrans_done(vp->v_mount);
	return error;
}

/*
 * Lookup a snapshots data block address.
 * Simpler than UFS_BALLOC() as we know all metadata is already allocated
 * and safe even for the pagedaemon where we cannot bread().
 */
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
	struct indir indirs[UFS_NIADDR + 2];
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error, num;

	KASSERT(lbn >= 0);

	if (lbn < UFS_NDADDR) {
		*res = db_get(ip, lbn);
		return 0;
	}
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return error;
	if (curlwp == uvm.pagedaemon_lwp) {
		/*
		 * The pagedaemon cannot bread(); only consult an already
		 * cached and valid indirect block.
		 */
		mutex_enter(&bufcache_lock);
		bp = incore(vp, indirs[num-1].in_lbn);
		if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
			*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
			error = 0;
		} else
			error = ENOMEM;
		mutex_exit(&bufcache_lock);
		return error;
	}
	error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
	if (error == 0) {
		*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
		brelse(bp, 0);
	}

	return error;
}

/*
 * Read or write the specified block of the filesystem vp resides on
 * from or to the disk bypassing the buffer cache.
 */
static int
rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
{
	int error;
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *nbp;

	/*
	 * Build a private I/O buffer for one file system block and hand
	 * it directly to the device strategy routine.
	 */
	nbp = getiobuf(NULL, true);
	nbp->b_flags = flags;
	nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
	nbp->b_error = 0;
	nbp->b_data = data;
	nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn));
	nbp->b_proc = NULL;
	nbp->b_dev = ip->i_devvp->v_rdev;
	SET(nbp->b_cflags, BC_BUSY);	/* mark buffer busy */

	bdev_strategy(nbp);

	/* Wait for the transfer to complete before reusing the buffer. */
	error = biowait(nbp);

	putiobuf(nbp);

	return error;
}

/*
 * Write all dirty buffers to disk and invalidate them.
 *
 * Walks vp's dirty buffer list, writing each block synchronously via
 * rwfsblk() (bypassing the cache) and then invalidating the cached copy.
 * Returns 0 on success or the first error encountered.
 */
static int
syncsnap(struct vnode *vp)
{
	int error;
	buf_t *bp;
	struct fs *fs = VTOI(vp)->i_fs;

	mutex_enter(&bufcache_lock);
	while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
		error = bbusy(bp, false, 0, NULL);
		if (error == EPASSTHROUGH)
			/* Buffer changed identity while waiting; retry. */
			continue;
		else if (error != 0) {
			mutex_exit(&bufcache_lock);
			return error;
		}
		/* Snapshot files are expected to use full-sized blocks. */
		KASSERT(bp->b_bcount == fs->fs_bsize);
		/* Drop the lock across the synchronous raw write. */
		mutex_exit(&bufcache_lock);
		error = rwfsblk(vp, B_WRITE, bp->b_data,
		    ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)));
		brelse(bp, BC_INVAL | BC_VFLUSH);
		if (error)
			return error;
		mutex_enter(&bufcache_lock);
	}
	mutex_exit(&bufcache_lock);

	return 0;
}

/*
 * Write the specified block to a snapshot.
 *
 * Allocates (or finds) the block at logical block lbn of snapshot vnode
 * vp with ffs_balloc() and copies one file system block from data into it.
 */
static int
wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error;

	/*
	 * Linked snapshots (i_nlink > 0) are written synchronously;
	 * unlinked ones asynchronously — presumably because an unlinked
	 * snapshot does not survive a crash anyway (NOTE(review): confirm).
	 */
	error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize,
	    FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
	if (error)
		return error;
	memcpy(bp->b_data, data, fs->fs_bsize);
	if (ip->i_nlink > 0)
		error = bwrite(bp);
	else
		bawrite(bp);

	return error;
}

/*
 * Check if this inode is present on the active snapshot list.
 * Must be called with snapinfo locked.
 */
static inline bool
is_active_snapshot(struct snap_info *si, struct inode *ip)
{
	struct inode *xp;

	KASSERT(mutex_owned(&si->si_lock));

	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			return true;
	return false;
}

/*
 * Get/Put direct block from inode or buffer containing disk addresses. Take
 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go
 * into a global include.
 *
 * UFS1 stores 32-bit disk addresses, UFS2 64-bit ones; ufs_rw32()/ufs_rw64()
 * byte-swap as needed for the on-disk endianness of this mount.
 */

/* Fetch direct block pointer 'loc' from the inode. */
static inline daddr_t
db_get(struct inode *ip, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
}

/* Store 'val' as direct block pointer 'loc' in the inode. */
static inline void
db_assign(struct inode *ip, int loc, daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}

/* Fetch indirect block pointer 'loc' from the inode. */
static inline daddr_t
ib_get(struct inode *ip, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
}

/* Fetch entry 'loc' from the indirect block held in buffer 'bf'. */
static inline daddr_t
idb_get(struct inode *ip, void *bf, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
}

/* Store 'val' as entry 'loc' of the indirect block held in buffer 'bf'. */
static inline void
idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}