1 /* $NetBSD: ffs_snapshot.c,v 1.102 2010/12/20 00:25:47 matt Exp $ */ 2 3 /* 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick@mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 * 37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 38 */ 39 40 #include <sys/cdefs.h> 41 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.102 2010/12/20 00:25:47 matt Exp $"); 42 43 #if defined(_KERNEL_OPT) 44 #include "opt_ffs.h" 45 #endif 46 47 #include <sys/param.h> 48 #include <sys/kernel.h> 49 #include <sys/systm.h> 50 #include <sys/conf.h> 51 #include <sys/buf.h> 52 #include <sys/proc.h> 53 #include <sys/namei.h> 54 #include <sys/sched.h> 55 #include <sys/stat.h> 56 #include <sys/malloc.h> 57 #include <sys/mount.h> 58 #include <sys/resource.h> 59 #include <sys/resourcevar.h> 60 #include <sys/vnode.h> 61 #include <sys/kauth.h> 62 #include <sys/fstrans.h> 63 #include <sys/wapbl.h> 64 65 #include <miscfs/specfs/specdev.h> 66 67 #include <ufs/ufs/quota.h> 68 #include <ufs/ufs/ufsmount.h> 69 #include <ufs/ufs/inode.h> 70 #include <ufs/ufs/ufs_extern.h> 71 #include <ufs/ufs/ufs_bswap.h> 72 #include <ufs/ufs/ufs_wapbl.h> 73 74 #include <ufs/ffs/fs.h> 75 #include <ufs/ffs/ffs_extern.h> 76 77 #include <uvm/uvm.h> 78 79 struct snap_info { 80 kmutex_t si_lock; /* Lock this snapinfo */ 81 kmutex_t si_snaplock; /* Snapshot vnode common lock */ 82 TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */ 83 daddr_t *si_snapblklist; /* Snapshot block hints list */ 84 uint32_t si_gen; /* Incremented on change */ 85 }; 86 87 #if !defined(FFS_NO_SNAPSHOT) 88 typedef int (*acctfunc_t) 89 (struct vnode *, void *, int, int, struct fs *, daddr_t, int); 90 91 static int snapshot_setup(struct mount *, struct vnode *); 92 static int snapshot_copyfs(struct mount *, struct vnode *, void **); 93 static int snapshot_expunge(struct mount *, struct vnode *, 94 struct fs *, daddr_t *, daddr_t **); 95 static int snapshot_expunge_snap(struct mount *, struct vnode *, 96 struct fs *, daddr_t); 97 static int snapshot_writefs(struct mount *, struct vnode *, void *); 98 static int cgaccount(struct vnode 
*, int, int *); 99 static int cgaccount1(int, struct vnode *, void *, int); 100 static int expunge(struct vnode *, struct inode *, struct fs *, 101 acctfunc_t, int); 102 static int indiracct(struct vnode *, struct vnode *, int, daddr_t, 103 daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int); 104 static int fullacct(struct vnode *, void *, int, int, struct fs *, 105 daddr_t, int); 106 static int snapacct(struct vnode *, void *, int, int, struct fs *, 107 daddr_t, int); 108 static int mapacct(struct vnode *, void *, int, int, struct fs *, 109 daddr_t, int); 110 #endif /* !defined(FFS_NO_SNAPSHOT) */ 111 112 static int ffs_copyonwrite(void *, struct buf *, bool); 113 static int snapblkaddr(struct vnode *, daddr_t, daddr_t *); 114 static int rwfsblk(struct vnode *, int, void *, daddr_t); 115 static int syncsnap(struct vnode *); 116 static int wrsnapblk(struct vnode *, void *, daddr_t); 117 118 static inline bool is_active_snapshot(struct snap_info *, struct inode *); 119 static inline daddr_t db_get(struct inode *, int); 120 static inline void db_assign(struct inode *, int, daddr_t); 121 static inline daddr_t ib_get(struct inode *, int); 122 static inline void ib_assign(struct inode *, int, daddr_t); 123 static inline daddr_t idb_get(struct inode *, void *, int); 124 static inline void idb_assign(struct inode *, void *, int, daddr_t); 125 126 #ifdef DEBUG 127 static int snapdebug = 0; 128 #endif 129 130 int 131 ffs_snapshot_init(struct ufsmount *ump) 132 { 133 struct snap_info *si; 134 135 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP); 136 if (si == NULL) 137 return ENOMEM; 138 139 TAILQ_INIT(&si->si_snapshots); 140 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE); 141 mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE); 142 si->si_gen = 0; 143 si->si_snapblklist = NULL; 144 145 return 0; 146 } 147 148 void 149 ffs_snapshot_fini(struct ufsmount *ump) 150 { 151 struct snap_info *si; 152 153 si = ump->um_snapinfo; 154 ump->um_snapinfo = 
NULL; 155 156 KASSERT(TAILQ_EMPTY(&si->si_snapshots)); 157 mutex_destroy(&si->si_lock); 158 mutex_destroy(&si->si_snaplock); 159 KASSERT(si->si_snapblklist == NULL); 160 kmem_free(si, sizeof(*si)); 161 } 162 163 /* 164 * Create a snapshot file and initialize it for the filesystem. 165 * Vnode is locked on entry and return. 166 */ 167 int 168 ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime) 169 { 170 #if defined(FFS_NO_SNAPSHOT) 171 return EOPNOTSUPP; 172 } 173 #else /* defined(FFS_NO_SNAPSHOT) */ 174 bool suspended = false; 175 bool snapshot_locked = false; 176 int error, redo = 0, snaploc; 177 void *sbbuf = NULL; 178 daddr_t *snaplist = NULL, snaplistsize = 0; 179 struct buf *bp, *nbp; 180 struct fs *copy_fs = NULL; 181 struct fs *fs = VFSTOUFS(mp)->um_fs; 182 struct inode *ip = VTOI(vp); 183 struct lwp *l = curlwp; 184 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; 185 struct timespec ts; 186 struct timeval starttime; 187 #ifdef DEBUG 188 struct timeval endtime; 189 #endif 190 struct vnode *devvp = ip->i_devvp; 191 192 /* 193 * If the vnode already is a snapshot, return. 194 */ 195 if (VTOI(vp)->i_flags & SF_SNAPSHOT) { 196 if (ctime) { 197 ctime->tv_sec = DIP(VTOI(vp), mtime); 198 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 199 } 200 return 0; 201 } 202 /* 203 * Check for free snapshot slot in the superblock. 204 */ 205 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 206 if (fs->fs_snapinum[snaploc] == 0) 207 break; 208 if (snaploc == FSMAXSNAP) 209 return (ENOSPC); 210 /* 211 * Prepare the vnode to become a snapshot. 212 */ 213 error = snapshot_setup(mp, vp); 214 if (error) 215 goto out; 216 /* 217 * Change inode to snapshot type file. 218 */ 219 ip->i_flags |= SF_SNAPSHOT; 220 DIP_ASSIGN(ip, flags, ip->i_flags); 221 ip->i_flag |= IN_CHANGE | IN_UPDATE; 222 /* 223 * Copy all the cylinder group maps. 
Although the 224 * filesystem is still active, we hope that only a few 225 * cylinder groups will change between now and when we 226 * suspend operations. Thus, we will be able to quickly 227 * touch up the few cylinder groups that changed during 228 * the suspension period. 229 */ 230 error = cgaccount(vp, 1, NULL); 231 if (error) 232 goto out; 233 /* 234 * Ensure that the snapshot is completely on disk. 235 * Since we have marked it as a snapshot it is safe to 236 * unlock it as no process will be allowed to write to it. 237 */ 238 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); 239 if (error) 240 goto out; 241 VOP_UNLOCK(vp); 242 /* 243 * All allocations are done, so we can now suspend the filesystem. 244 */ 245 error = vfs_suspend(vp->v_mount, 0); 246 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 247 if (error) 248 goto out; 249 suspended = true; 250 getmicrotime(&starttime); 251 /* 252 * First, copy all the cylinder group maps that have changed. 253 */ 254 error = cgaccount(vp, 2, &redo); 255 if (error) 256 goto out; 257 /* 258 * Create a copy of the superblock and its summary information. 259 */ 260 error = snapshot_copyfs(mp, vp, &sbbuf); 261 copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); 262 if (error) 263 goto out; 264 /* 265 * Expunge unlinked files from our view. 266 */ 267 error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist); 268 if (error) 269 goto out; 270 /* 271 * Acquire the snapshot lock. 272 */ 273 mutex_enter(&si->si_snaplock); 274 snapshot_locked = true; 275 /* 276 * Record snapshot inode. Since this is the newest snapshot, 277 * it must be placed at the end of the list. 
278 */ 279 fs->fs_snapinum[snaploc] = ip->i_number; 280 281 mutex_enter(&si->si_lock); 282 vref(vp); 283 if (is_active_snapshot(si, ip)) 284 panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number); 285 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 286 if (TAILQ_FIRST(&si->si_snapshots) == ip) { 287 /* 288 * If this is the first snapshot on this filesystem, put the 289 * preliminary list in place and establish the cow handler. 290 */ 291 si->si_snapblklist = snaplist; 292 fscow_establish(mp, ffs_copyonwrite, devvp); 293 } 294 si->si_gen++; 295 mutex_exit(&si->si_lock); 296 297 vp->v_vflag |= VV_SYSTEM; 298 /* 299 * Set the mtime to the time the snapshot has been taken. 300 */ 301 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 302 if (ctime) 303 *ctime = ts; 304 DIP_ASSIGN(ip, mtime, ts.tv_sec); 305 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 306 ip->i_flag |= IN_CHANGE | IN_UPDATE; 307 /* 308 * Copy allocation information from all snapshots and then 309 * expunge them from our view. 310 */ 311 error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize); 312 if (error) 313 goto out; 314 /* 315 * Write the superblock and its summary information to the snapshot. 316 */ 317 error = snapshot_writefs(mp, vp, sbbuf); 318 if (error) 319 goto out; 320 /* 321 * We're nearly done, ensure that the snapshot is completely on disk. 322 */ 323 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); 324 if (error) 325 goto out; 326 /* 327 * Invalidate and free all pages on the snapshot vnode. 328 * We will read and write through the buffercache. 329 */ 330 mutex_enter(&vp->v_interlock); 331 error = VOP_PUTPAGES(vp, 0, 0, 332 PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE); 333 if (error) 334 goto out; 335 /* 336 * Invalidate short ( < fs_bsize ) buffers. We will always read 337 * full size buffers later. 
338 */ 339 mutex_enter(&bufcache_lock); 340 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); 341 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 342 nbp = LIST_NEXT(bp, b_vnbufs); 343 KASSERT((bp->b_cflags & BC_BUSY) == 0); 344 if (bp->b_bcount < fs->fs_bsize) { 345 bp->b_cflags |= BC_BUSY; 346 brelsel(bp, BC_INVAL | BC_VFLUSH); 347 } 348 } 349 mutex_exit(&bufcache_lock); 350 351 out: 352 if (sbbuf != NULL) { 353 free(copy_fs->fs_csp, M_UFSMNT); 354 free(sbbuf, M_UFSMNT); 355 } 356 if (fs->fs_active != NULL) { 357 free(fs->fs_active, M_DEVBUF); 358 fs->fs_active = NULL; 359 } 360 361 mutex_enter(&si->si_lock); 362 if (snaplist != NULL) { 363 if (si->si_snapblklist == snaplist) 364 si->si_snapblklist = NULL; 365 free(snaplist, M_UFSMNT); 366 } 367 if (error) { 368 fs->fs_snapinum[snaploc] = 0; 369 } else { 370 /* 371 * As this is the newest list, it is the most inclusive, so 372 * should replace the previous list. 373 */ 374 si->si_snapblklist = ip->i_snapblklist; 375 } 376 si->si_gen++; 377 mutex_exit(&si->si_lock); 378 379 if (snapshot_locked) 380 mutex_exit(&si->si_snaplock); 381 if (suspended) { 382 vfs_resume(vp->v_mount); 383 #ifdef DEBUG 384 getmicrotime(&endtime); 385 timersub(&endtime, &starttime, &endtime); 386 printf("%s: suspended %lld.%03d sec, redo %d of %d\n", 387 mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec, 388 endtime.tv_usec / 1000, redo, fs->fs_ncg); 389 #endif 390 } 391 if (error) { 392 if (!UFS_WAPBL_BEGIN(mp)) { 393 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); 394 UFS_WAPBL_END(mp); 395 } 396 } 397 return (error); 398 } 399 400 /* 401 * Prepare vnode to become a snapshot. 402 */ 403 static int 404 snapshot_setup(struct mount *mp, struct vnode *vp) 405 { 406 int error, i, len, loc; 407 daddr_t blkno, numblks; 408 struct buf *ibp, *nbp; 409 struct fs *fs = VFSTOUFS(mp)->um_fs; 410 struct lwp *l = curlwp; 411 412 /* 413 * Check mount, exclusive reference and owner. 
414 */ 415 if (vp->v_mount != mp) 416 return EXDEV; 417 if (vp->v_usecount != 1 || vp->v_writecount != 0) 418 return EBUSY; 419 if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 420 NULL) != 0 && 421 VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred)) 422 return EACCES; 423 424 if (vp->v_size != 0) { 425 error = ffs_truncate(vp, 0, 0, NOCRED); 426 if (error) 427 return error; 428 } 429 /* 430 * Write an empty list of preallocated blocks to the end of 431 * the snapshot to set size to at least that of the filesystem. 432 */ 433 numblks = howmany(fs->fs_size, fs->fs_frag); 434 blkno = 1; 435 blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs)); 436 error = vn_rdwr(UIO_WRITE, vp, 437 (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks), 438 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 439 if (error) 440 return error; 441 /* 442 * Preallocate critical data structures so that we can copy 443 * them in without further allocation after we suspend all 444 * operations on the filesystem. We would like to just release 445 * the allocated buffers without writing them since they will 446 * be filled in below once we are ready to go, but this upsets 447 * the soft update code, so we go ahead and write the new buffers. 448 * 449 * Allocate all indirect blocks and mark all of them as not 450 * needing to be copied. 451 */ 452 error = UFS_WAPBL_BEGIN(mp); 453 if (error) 454 return error; 455 for (blkno = NDADDR, i = 0; blkno < numblks; blkno += NINDIR(fs)) { 456 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 457 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 458 if (error) 459 goto out; 460 brelse(ibp, 0); 461 if ((++i % 16) == 0) { 462 UFS_WAPBL_END(mp); 463 error = UFS_WAPBL_BEGIN(mp); 464 if (error) 465 return error; 466 } 467 } 468 /* 469 * Allocate copies for the superblock and its summary information. 
470 */ 471 error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred, 472 0, &nbp); 473 if (error) 474 goto out; 475 bawrite(nbp); 476 blkno = fragstoblks(fs, fs->fs_csaddr); 477 len = howmany(fs->fs_cssize, fs->fs_bsize); 478 for (loc = 0; loc < len; loc++) { 479 error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)), 480 fs->fs_bsize, l->l_cred, 0, &nbp); 481 if (error) 482 goto out; 483 bawrite(nbp); 484 } 485 486 out: 487 UFS_WAPBL_END(mp); 488 return error; 489 } 490 491 /* 492 * Create a copy of the superblock and its summary information. 493 * It is up to the caller to free copyfs and copy_fs->fs_csp. 494 */ 495 static int 496 snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf) 497 { 498 int error, i, len, loc, size; 499 void *space; 500 int32_t *lp; 501 struct buf *bp; 502 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; 503 struct lwp *l = curlwp; 504 struct vnode *devvp = VTOI(vp)->i_devvp; 505 506 /* 507 * Grab a copy of the superblock and its summary information. 508 * We delay writing it until the suspension is released below. 509 */ 510 *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 511 loc = blkoff(fs, fs->fs_sblockloc); 512 if (loc > 0) 513 memset(*sbbuf, 0, loc); 514 copyfs = (struct fs *)((char *)(*sbbuf) + loc); 515 memcpy(copyfs, fs, fs->fs_sbsize); 516 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 517 if (fs->fs_sbsize < size) 518 memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0, 519 size - fs->fs_sbsize); 520 size = blkroundup(fs, fs->fs_cssize); 521 if (fs->fs_contigsumsize > 0) 522 size += fs->fs_ncg * sizeof(int32_t); 523 space = malloc(size, M_UFSMNT, M_WAITOK); 524 copyfs->fs_csp = space; 525 memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize); 526 space = (char *)space + fs->fs_cssize; 527 loc = howmany(fs->fs_cssize, fs->fs_fsize); 528 i = fs->fs_frag - loc % fs->fs_frag; 529 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 530 if (len > 0) { 531 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 532 len, l->l_cred, 0, &bp)) != 0) { 533 brelse(bp, 0); 534 free(copyfs->fs_csp, M_UFSMNT); 535 free(*sbbuf, M_UFSMNT); 536 *sbbuf = NULL; 537 return error; 538 } 539 memcpy(space, bp->b_data, (u_int)len); 540 space = (char *)space + len; 541 brelse(bp, BC_INVAL | BC_NOCACHE); 542 } 543 if (fs->fs_contigsumsize > 0) { 544 copyfs->fs_maxcluster = lp = space; 545 for (i = 0; i < fs->fs_ncg; i++) 546 *lp++ = fs->fs_contigsumsize; 547 } 548 if (mp->mnt_wapbl) 549 copyfs->fs_flags &= ~FS_DOWAPBL; 550 return 0; 551 } 552 553 /* 554 * We must check for active files that have been unlinked (e.g., with a zero 555 * link count). We have to expunge all trace of these files from the snapshot 556 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped. 557 * Note that we skip unlinked snapshot files as they will be handled separately. 558 * Calculate the snapshot list size and create a preliminary list. 559 */ 560 static int 561 snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs, 562 daddr_t *snaplistsize, daddr_t **snaplist) 563 { 564 bool has_wapbl = false; 565 int cg, error, len, loc; 566 daddr_t blkno, *blkp; 567 struct fs *fs = VFSTOUFS(mp)->um_fs; 568 struct inode *xp; 569 struct lwp *l = curlwp; 570 struct vattr vat; 571 struct vnode *logvp = NULL, *mvp = NULL, *xvp; 572 573 *snaplist = NULL; 574 /* 575 * Get the log inode if any. 576 */ 577 if ((fs->fs_flags & FS_DOWAPBL) && 578 fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { 579 error = VFS_VGET(mp, 580 fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp); 581 if (error) 582 goto out; 583 } 584 /* 585 * Allocate a marker vnode. 586 */ 587 if ((mvp = vnalloc(mp)) == NULL) { 588 error = ENOMEM; 589 goto out; 590 } 591 /* 592 * We also calculate the needed size for the snapshot list. 
593 */ 594 *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 595 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 596 error = UFS_WAPBL_BEGIN(mp); 597 if (error) 598 goto out; 599 has_wapbl = true; 600 mutex_enter(&mntvnode_lock); 601 /* 602 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() 603 * and vclean() can be called indirectly 604 */ 605 for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) { 606 vmark(mvp, xvp); 607 /* 608 * Make sure this vnode wasn't reclaimed in getnewvnode(). 609 * Start over if it has (it won't be on the list anymore). 610 */ 611 if (xvp->v_mount != mp || vismarker(xvp)) 612 continue; 613 mutex_enter(&xvp->v_interlock); 614 if ((xvp->v_iflag & VI_XLOCK) || 615 xvp->v_usecount == 0 || xvp->v_type == VNON || 616 VTOI(xvp) == NULL || 617 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 618 mutex_exit(&xvp->v_interlock); 619 continue; 620 } 621 mutex_exit(&mntvnode_lock); 622 /* 623 * XXXAD should increase vnode ref count to prevent it 624 * disappearing or being recycled. 625 */ 626 mutex_exit(&xvp->v_interlock); 627 #ifdef DEBUG 628 if (snapdebug) 629 vprint("ffs_snapshot: busy vnode", xvp); 630 #endif 631 xp = VTOI(xvp); 632 if (xvp != logvp) { 633 if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 && 634 vat.va_nlink > 0) { 635 mutex_enter(&mntvnode_lock); 636 continue; 637 } 638 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 639 mutex_enter(&mntvnode_lock); 640 continue; 641 } 642 } 643 /* 644 * If there is a fragment, clear it here. 
645 */ 646 blkno = 0; 647 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 648 if (loc < NDADDR) { 649 len = fragroundup(fs, blkoff(fs, xp->i_size)); 650 if (len > 0 && len < fs->fs_bsize) { 651 ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc), 652 len, xp->i_number); 653 blkno = db_get(xp, loc); 654 db_assign(xp, loc, 0); 655 } 656 } 657 *snaplistsize += 1; 658 error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY); 659 if (blkno) 660 db_assign(xp, loc, blkno); 661 if (!error) 662 error = ffs_freefile_snap(copy_fs, vp, xp->i_number, 663 xp->i_mode); 664 if (error) { 665 (void)vunmark(mvp); 666 goto out; 667 } 668 mutex_enter(&mntvnode_lock); 669 } 670 mutex_exit(&mntvnode_lock); 671 /* 672 * Create a preliminary list of preallocated snapshot blocks. 673 */ 674 *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 675 blkp = &(*snaplist)[1]; 676 *blkp++ = lblkno(fs, fs->fs_sblockloc); 677 blkno = fragstoblks(fs, fs->fs_csaddr); 678 for (cg = 0; cg < fs->fs_ncg; cg++) { 679 if (fragstoblks(fs, cgtod(fs, cg)) > blkno) 680 break; 681 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 682 } 683 len = howmany(fs->fs_cssize, fs->fs_bsize); 684 for (loc = 0; loc < len; loc++) 685 *blkp++ = blkno + loc; 686 for (; cg < fs->fs_ncg; cg++) 687 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 688 (*snaplist)[0] = blkp - &(*snaplist)[0]; 689 690 out: 691 if (has_wapbl) 692 UFS_WAPBL_END(mp); 693 if (mvp != NULL) 694 vnfree(mvp); 695 if (logvp != NULL) 696 vput(logvp); 697 if (error && *snaplist != NULL) { 698 free(*snaplist, M_UFSMNT); 699 *snaplist = NULL; 700 } 701 702 return error; 703 } 704 705 /* 706 * Copy allocation information from all the snapshots in this snapshot and 707 * then expunge them from its view. Also, collect the list of allocated 708 * blocks in i_snapblklist. 
709 */ 710 static int 711 snapshot_expunge_snap(struct mount *mp, struct vnode *vp, 712 struct fs *copy_fs, daddr_t snaplistsize) 713 { 714 int error, i; 715 daddr_t numblks, *snaplist = NULL; 716 struct fs *fs = VFSTOUFS(mp)->um_fs; 717 struct inode *ip = VTOI(vp), *xp; 718 struct lwp *l = curlwp; 719 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; 720 721 error = UFS_WAPBL_BEGIN(mp); 722 if (error) 723 return error; 724 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) { 725 if (xp == ip) 726 break; 727 error = expunge(vp, xp, fs, snapacct, BLK_SNAP); 728 if (error) 729 break; 730 if (xp->i_nlink != 0) 731 continue; 732 error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode); 733 if (error) 734 break; 735 } 736 if (error) 737 goto out; 738 /* 739 * Allocate space for the full list of preallocated snapshot blocks. 740 */ 741 snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 742 ip->i_snapblklist = &snaplist[1]; 743 /* 744 * Expunge the blocks used by the snapshots from the set of 745 * blocks marked as used in the snapshot bitmaps. Also, collect 746 * the list of allocated blocks in i_snapblklist. 747 */ 748 error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP); 749 if (error) 750 goto out; 751 if (snaplistsize < ip->i_snapblklist - snaplist) 752 panic("ffs_snapshot: list too small"); 753 snaplistsize = ip->i_snapblklist - snaplist; 754 snaplist[0] = snaplistsize; 755 ip->i_snapblklist = &snaplist[0]; 756 /* 757 * Write out the list of allocated blocks to the end of the snapshot. 
758 */ 759 numblks = howmany(fs->fs_size, fs->fs_frag); 760 for (i = 0; i < snaplistsize; i++) 761 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); 762 error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist, 763 snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks), 764 UIO_SYSSPACE, IO_NODELOCKED | IO_JOURNALLOCKED | IO_UNIT, 765 l->l_cred, NULL, NULL); 766 for (i = 0; i < snaplistsize; i++) 767 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); 768 out: 769 UFS_WAPBL_END(mp); 770 if (error && snaplist != NULL) { 771 free(snaplist, M_UFSMNT); 772 ip->i_snapblklist = NULL; 773 } 774 return error; 775 } 776 777 /* 778 * Write the superblock and its summary information to the snapshot. 779 * Make sure, the first NDADDR blocks get copied to the snapshot. 780 */ 781 static int 782 snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf) 783 { 784 int error, len, loc; 785 void *space; 786 daddr_t blkno; 787 struct buf *bp; 788 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; 789 struct inode *ip = VTOI(vp); 790 struct lwp *l = curlwp; 791 792 copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc)); 793 794 /* 795 * Write the superblock and its summary information 796 * to the snapshot. 
797 */ 798 blkno = fragstoblks(fs, fs->fs_csaddr); 799 len = howmany(fs->fs_cssize, fs->fs_bsize); 800 space = copyfs->fs_csp; 801 #ifdef FFS_EI 802 if (UFS_FSNEEDSWAP(fs)) { 803 ffs_sb_swap(copyfs, copyfs); 804 ffs_csum_swap(space, space, fs->fs_cssize); 805 } 806 #endif 807 error = UFS_WAPBL_BEGIN(mp); 808 if (error) 809 return error; 810 for (loc = 0; loc < len; loc++) { 811 error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred, 812 B_MODIFY, &bp); 813 if (error) { 814 brelse(bp, 0); 815 break; 816 } 817 memcpy(bp->b_data, space, fs->fs_bsize); 818 space = (char *)space + fs->fs_bsize; 819 bawrite(bp); 820 } 821 if (error) 822 goto out; 823 error = bread(vp, lblkno(fs, fs->fs_sblockloc), 824 fs->fs_bsize, l->l_cred, B_MODIFY, &bp); 825 if (error) { 826 brelse(bp, 0); 827 goto out; 828 } else { 829 memcpy(bp->b_data, sbbuf, fs->fs_bsize); 830 bawrite(bp); 831 } 832 /* 833 * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite() 834 * and ffs_snapblkfree() will always work on indirect blocks. 835 */ 836 for (loc = 0; loc < NDADDR; loc++) { 837 if (db_get(ip, loc) != 0) 838 continue; 839 error = ffs_balloc(vp, lblktosize(fs, (off_t)loc), 840 fs->fs_bsize, l->l_cred, 0, &bp); 841 if (error) 842 break; 843 error = rwfsblk(vp, B_READ, bp->b_data, loc); 844 if (error) { 845 brelse(bp, 0); 846 break; 847 } 848 bawrite(bp); 849 } 850 851 out: 852 UFS_WAPBL_END(mp); 853 return error; 854 } 855 856 /* 857 * Copy all cylinder group maps. 
858 */ 859 static int 860 cgaccount(struct vnode *vp, int passno, int *redo) 861 { 862 int cg, error; 863 struct buf *nbp; 864 struct fs *fs = VTOI(vp)->i_fs; 865 866 error = UFS_WAPBL_BEGIN(vp->v_mount); 867 if (error) 868 return error; 869 if (redo != NULL) 870 *redo = 0; 871 if (passno == 1) 872 fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY), 873 M_DEVBUF, M_WAITOK | M_ZERO); 874 for (cg = 0; cg < fs->fs_ncg; cg++) { 875 if (passno == 2 && ACTIVECG_ISSET(fs, cg)) 876 continue; 877 if (redo != NULL) 878 *redo += 1; 879 error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 880 fs->fs_bsize, curlwp->l_cred, 0, &nbp); 881 if (error) 882 break; 883 error = cgaccount1(cg, vp, nbp->b_data, passno); 884 bawrite(nbp); 885 if (error) 886 break; 887 } 888 UFS_WAPBL_END(vp->v_mount); 889 return error; 890 } 891 892 /* 893 * Copy a cylinder group map. All the unallocated blocks are marked 894 * BLK_NOCOPY so that the snapshot knows that it need not copy them 895 * if they are later written. If passno is one, then this is a first 896 * pass, so only setting needs to be done. If passno is 2, then this 897 * is a revision to a previous pass which must be undone as the 898 * replacement pass is done. 
899 */ 900 static int 901 cgaccount1(int cg, struct vnode *vp, void *data, int passno) 902 { 903 struct buf *bp, *ibp; 904 struct inode *ip; 905 struct cg *cgp; 906 struct fs *fs; 907 struct lwp *l = curlwp; 908 daddr_t base, numblks; 909 int error, len, loc, ns, indiroff; 910 911 ip = VTOI(vp); 912 fs = ip->i_fs; 913 ns = UFS_FSNEEDSWAP(fs); 914 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 915 (int)fs->fs_cgsize, l->l_cred, 0, &bp); 916 if (error) { 917 brelse(bp, 0); 918 return (error); 919 } 920 cgp = (struct cg *)bp->b_data; 921 if (!cg_chkmagic(cgp, ns)) { 922 brelse(bp, 0); 923 return (EIO); 924 } 925 ACTIVECG_SET(fs, cg); 926 927 memcpy(data, bp->b_data, fs->fs_cgsize); 928 brelse(bp, 0); 929 if (fs->fs_cgsize < fs->fs_bsize) 930 memset((char *)data + fs->fs_cgsize, 0, 931 fs->fs_bsize - fs->fs_cgsize); 932 numblks = howmany(fs->fs_size, fs->fs_frag); 933 len = howmany(fs->fs_fpg, fs->fs_frag); 934 base = cg * fs->fs_fpg / fs->fs_frag; 935 if (base + len >= numblks) 936 len = numblks - base - 1; 937 loc = 0; 938 if (base < NDADDR) { 939 for ( ; loc < NDADDR; loc++) { 940 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 941 db_assign(ip, loc, BLK_NOCOPY); 942 else if (db_get(ip, loc) == BLK_NOCOPY) { 943 if (passno == 2) 944 db_assign(ip, loc, 0); 945 else if (passno == 1) 946 panic("ffs_snapshot: lost direct block"); 947 } 948 } 949 } 950 if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)), 951 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 952 return (error); 953 indiroff = (base + loc - NDADDR) % NINDIR(fs); 954 for ( ; loc < len; loc++, indiroff++) { 955 if (indiroff >= NINDIR(fs)) { 956 bawrite(ibp); 957 if ((error = ffs_balloc(vp, 958 lblktosize(fs, (off_t)(base + loc)), 959 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 960 return (error); 961 indiroff = 0; 962 } 963 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 964 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 965 else if (idb_get(ip, ibp->b_data, indiroff) == 
BLK_NOCOPY) { 966 if (passno == 2) 967 idb_assign(ip, ibp->b_data, indiroff, 0); 968 else if (passno == 1) 969 panic("ffs_snapshot: lost indirect block"); 970 } 971 } 972 bdwrite(ibp); 973 return (0); 974 } 975 976 /* 977 * Before expunging a snapshot inode, note all the 978 * blocks that it claims with BLK_SNAP so that fsck will 979 * be able to account for those blocks properly and so 980 * that this snapshot knows that it need not copy them 981 * if the other snapshot holding them is freed. 982 */ 983 static int 984 expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 985 acctfunc_t acctfunc, int expungetype) 986 { 987 int i, error, ns; 988 daddr_t lbn, rlbn; 989 daddr_t len, blkno, numblks, blksperindir; 990 struct ufs1_dinode *dip1; 991 struct ufs2_dinode *dip2; 992 struct lwp *l = curlwp; 993 void *bap; 994 struct buf *bp; 995 996 ns = UFS_FSNEEDSWAP(fs); 997 /* 998 * Prepare to expunge the inode. If its inode block has not 999 * yet been copied, then allocate and fill the copy. 1000 */ 1001 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1002 error = snapblkaddr(snapvp, lbn, &blkno); 1003 if (error) 1004 return error; 1005 if (blkno != 0) { 1006 error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred, 1007 B_MODIFY, &bp); 1008 } else { 1009 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), 1010 fs->fs_bsize, l->l_cred, 0, &bp); 1011 if (! error) 1012 error = rwfsblk(snapvp, B_READ, bp->b_data, lbn); 1013 } 1014 if (error) 1015 return error; 1016 /* 1017 * Set a snapshot inode to be a zero length file, regular files 1018 * or unlinked snapshots to be completely unallocated. 
1019 */ 1020 if (fs->fs_magic == FS_UFS1_MAGIC) { 1021 dip1 = (struct ufs1_dinode *)bp->b_data + 1022 ino_to_fsbo(fs, cancelip->i_number); 1023 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) 1024 dip1->di_mode = 0; 1025 dip1->di_size = 0; 1026 dip1->di_blocks = 0; 1027 dip1->di_flags = 1028 ufs_rw32(ufs_rw32(dip1->di_flags, ns) & ~SF_SNAPSHOT, ns); 1029 memset(&dip1->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int32_t)); 1030 } else { 1031 dip2 = (struct ufs2_dinode *)bp->b_data + 1032 ino_to_fsbo(fs, cancelip->i_number); 1033 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) 1034 dip2->di_mode = 0; 1035 dip2->di_size = 0; 1036 dip2->di_blocks = 0; 1037 dip2->di_flags = 1038 ufs_rw32(ufs_rw32(dip2->di_flags, ns) & ~SF_SNAPSHOT, ns); 1039 memset(&dip2->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int64_t)); 1040 } 1041 bdwrite(bp); 1042 /* 1043 * Now go through and expunge all the blocks in the file 1044 * using the function requested. 1045 */ 1046 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1047 if (fs->fs_magic == FS_UFS1_MAGIC) 1048 bap = &cancelip->i_ffs1_db[0]; 1049 else 1050 bap = &cancelip->i_ffs2_db[0]; 1051 if ((error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype))) 1052 return (error); 1053 if (fs->fs_magic == FS_UFS1_MAGIC) 1054 bap = &cancelip->i_ffs1_ib[0]; 1055 else 1056 bap = &cancelip->i_ffs2_ib[0]; 1057 if ((error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype))) 1058 return (error); 1059 blksperindir = 1; 1060 lbn = -NDADDR; 1061 len = numblks - NDADDR; 1062 rlbn = NDADDR; 1063 for (i = 0; len > 0 && i < NIADDR; i++) { 1064 error = indiracct(snapvp, ITOV(cancelip), i, 1065 ib_get(cancelip, i), lbn, rlbn, len, 1066 blksperindir, fs, acctfunc, expungetype); 1067 if (error) 1068 return (error); 1069 blksperindir *= NINDIR(fs); 1070 lbn -= blksperindir + 1; 1071 len -= blksperindir; 1072 rlbn += blksperindir; 1073 } 1074 return (0); 1075 } 1076 1077 /* 1078 * Descend an indirect block chain for vnode cancelvp 
accounting for all 1079 * its indirect blocks in snapvp. 1080 */ 1081 static int 1082 indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level, 1083 daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks, 1084 daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype) 1085 { 1086 int error, num, i; 1087 daddr_t subblksperindir; 1088 struct indir indirs[NIADDR + 2]; 1089 daddr_t last; 1090 void *bap; 1091 struct buf *bp; 1092 1093 if (blkno == 0) { 1094 if (expungetype == BLK_NOCOPY) 1095 return (0); 1096 panic("indiracct: missing indir"); 1097 } 1098 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1099 return (error); 1100 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1101 panic("indiracct: botched params"); 1102 /* 1103 * We have to expand bread here since it will deadlock looking 1104 * up the block number for any blocks that are not in the cache. 1105 */ 1106 error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize, 1107 false, &bp); 1108 if (error) 1109 return error; 1110 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = 1111 rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) { 1112 brelse(bp, 0); 1113 return (error); 1114 } 1115 /* 1116 * Account for the block pointers in this indirect block. 1117 */ 1118 last = howmany(remblks, blksperindir); 1119 if (last > NINDIR(fs)) 1120 last = NINDIR(fs); 1121 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO); 1122 memcpy((void *)bap, bp->b_data, fs->fs_bsize); 1123 brelse(bp, 0); 1124 error = (*acctfunc)(snapvp, bap, 0, last, 1125 fs, level == 0 ? rlbn : -1, expungetype); 1126 if (error || level == 0) 1127 goto out; 1128 /* 1129 * Account for the block pointers in each of the indirect blocks 1130 * in the levels below us. 
1131 */ 1132 subblksperindir = blksperindir / NINDIR(fs); 1133 for (lbn++, level--, i = 0; i < last; i++) { 1134 error = indiracct(snapvp, cancelvp, level, 1135 idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks, 1136 subblksperindir, fs, acctfunc, expungetype); 1137 if (error) 1138 goto out; 1139 rlbn += blksperindir; 1140 lbn -= blksperindir; 1141 remblks -= blksperindir; 1142 } 1143 out: 1144 free(bap, M_DEVBUF); 1145 return (error); 1146 } 1147 1148 /* 1149 * Do both snap accounting and map accounting. 1150 */ 1151 static int 1152 fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1153 struct fs *fs, daddr_t lblkno, 1154 int exptype /* BLK_SNAP or BLK_NOCOPY */) 1155 { 1156 int error; 1157 1158 if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype))) 1159 return (error); 1160 return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)); 1161 } 1162 1163 /* 1164 * Identify a set of blocks allocated in a snapshot inode. 1165 */ 1166 static int 1167 snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1168 struct fs *fs, daddr_t lblkno, 1169 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 1170 { 1171 struct inode *ip = VTOI(vp); 1172 struct lwp *l = curlwp; 1173 daddr_t blkno; 1174 daddr_t lbn; 1175 struct buf *ibp; 1176 int error; 1177 1178 for ( ; oldblkp < lastblkp; oldblkp++) { 1179 blkno = idb_get(ip, bap, oldblkp); 1180 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1181 continue; 1182 lbn = fragstoblks(fs, blkno); 1183 if (lbn < NDADDR) { 1184 blkno = db_get(ip, lbn); 1185 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1186 } else { 1187 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1188 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 1189 if (error) 1190 return (error); 1191 blkno = idb_get(ip, ibp->b_data, 1192 (lbn - NDADDR) % NINDIR(fs)); 1193 } 1194 /* 1195 * If we are expunging a snapshot vnode and we 1196 * find a block marked BLK_NOCOPY, then it is 1197 * one that has been allocated to this snapshot 
after 1198 * we took our current snapshot and can be ignored. 1199 */ 1200 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1201 if (lbn >= NDADDR) 1202 brelse(ibp, 0); 1203 } else { 1204 if (blkno != 0) 1205 panic("snapacct: bad block"); 1206 if (lbn < NDADDR) 1207 db_assign(ip, lbn, expungetype); 1208 else { 1209 idb_assign(ip, ibp->b_data, 1210 (lbn - NDADDR) % NINDIR(fs), expungetype); 1211 bdwrite(ibp); 1212 } 1213 } 1214 } 1215 return (0); 1216 } 1217 1218 /* 1219 * Account for a set of blocks allocated in a snapshot inode. 1220 */ 1221 static int 1222 mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1223 struct fs *fs, daddr_t lblkno, int expungetype) 1224 { 1225 daddr_t blkno; 1226 struct inode *ip; 1227 ino_t inum; 1228 int acctit; 1229 1230 ip = VTOI(vp); 1231 inum = ip->i_number; 1232 if (lblkno == -1) 1233 acctit = 0; 1234 else 1235 acctit = 1; 1236 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1237 blkno = idb_get(ip, bap, oldblkp); 1238 if (blkno == 0 || blkno == BLK_NOCOPY) 1239 continue; 1240 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1241 *ip->i_snapblklist++ = lblkno; 1242 if (blkno == BLK_SNAP) 1243 blkno = blkstofrags(fs, lblkno); 1244 ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum); 1245 } 1246 return (0); 1247 } 1248 #endif /* defined(FFS_NO_SNAPSHOT) */ 1249 1250 /* 1251 * Decrement extra reference on snapshot when last name is removed. 1252 * It will not be freed until the last open reference goes away. 1253 */ 1254 void 1255 ffs_snapgone(struct inode *ip) 1256 { 1257 struct mount *mp = ip->i_devvp->v_specmountpoint; 1258 struct inode *xp; 1259 struct fs *fs; 1260 struct snap_info *si; 1261 int snaploc; 1262 1263 si = VFSTOUFS(mp)->um_snapinfo; 1264 1265 /* 1266 * Find snapshot in incore list. 
1267 */ 1268 mutex_enter(&si->si_lock); 1269 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) 1270 if (xp == ip) 1271 break; 1272 mutex_exit(&si->si_lock); 1273 #ifdef DEBUG 1274 if (snapdebug && xp == NULL) 1275 printf("ffs_snapgone: lost snapshot vnode %llu\n", 1276 (unsigned long long)ip->i_number); 1277 #endif 1278 /* 1279 * Delete snapshot inode from superblock. Keep list dense. 1280 */ 1281 mutex_enter(&si->si_lock); 1282 fs = ip->i_fs; 1283 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 1284 if (fs->fs_snapinum[snaploc] == ip->i_number) 1285 break; 1286 if (snaploc < FSMAXSNAP) { 1287 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 1288 if (fs->fs_snapinum[snaploc] == 0) 1289 break; 1290 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 1291 } 1292 fs->fs_snapinum[snaploc - 1] = 0; 1293 } 1294 si->si_gen++; 1295 mutex_exit(&si->si_lock); 1296 } 1297 1298 /* 1299 * Prepare a snapshot file for being removed. 1300 */ 1301 void 1302 ffs_snapremove(struct vnode *vp) 1303 { 1304 struct inode *ip = VTOI(vp), *xp; 1305 struct vnode *devvp = ip->i_devvp; 1306 struct fs *fs = ip->i_fs; 1307 struct mount *mp = devvp->v_specmountpoint; 1308 struct buf *ibp; 1309 struct snap_info *si; 1310 struct lwp *l = curlwp; 1311 daddr_t numblks, blkno, dblk; 1312 int error, loc, last; 1313 1314 si = VFSTOUFS(mp)->um_snapinfo; 1315 /* 1316 * If active, delete from incore list (this snapshot may 1317 * already have been in the process of being deleted, so 1318 * would not have been active). 1319 * 1320 * Clear copy-on-write flag if last snapshot. 1321 */ 1322 mutex_enter(&si->si_lock); 1323 if (is_active_snapshot(si, ip)) { 1324 TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap); 1325 if (TAILQ_FIRST(&si->si_snapshots) != 0) { 1326 /* Roll back the list of preallocated blocks. 
*/ 1327 xp = TAILQ_LAST(&si->si_snapshots, inodelst); 1328 si->si_snapblklist = xp->i_snapblklist; 1329 si->si_gen++; 1330 mutex_exit(&si->si_lock); 1331 } else { 1332 si->si_snapblklist = 0; 1333 si->si_gen++; 1334 mutex_exit(&si->si_lock); 1335 fscow_disestablish(mp, ffs_copyonwrite, devvp); 1336 } 1337 if (ip->i_snapblklist != NULL) { 1338 free(ip->i_snapblklist, M_UFSMNT); 1339 ip->i_snapblklist = NULL; 1340 } 1341 vrele(vp); 1342 } else 1343 mutex_exit(&si->si_lock); 1344 /* 1345 * Clear all BLK_NOCOPY fields. Pass any block claims to other 1346 * snapshots that want them (see ffs_snapblkfree below). 1347 */ 1348 for (blkno = 1; blkno < NDADDR; blkno++) { 1349 dblk = db_get(ip, blkno); 1350 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1351 db_assign(ip, blkno, 0); 1352 else if ((dblk == blkstofrags(fs, blkno) && 1353 ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, 1354 ip->i_number))) { 1355 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1356 db_assign(ip, blkno, 0); 1357 } 1358 } 1359 numblks = howmany(ip->i_size, fs->fs_bsize); 1360 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 1361 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 1362 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 1363 if (error) 1364 continue; 1365 if (fs->fs_size - blkno > NINDIR(fs)) 1366 last = NINDIR(fs); 1367 else 1368 last = fs->fs_size - blkno; 1369 for (loc = 0; loc < last; loc++) { 1370 dblk = idb_get(ip, ibp->b_data, loc); 1371 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1372 idb_assign(ip, ibp->b_data, loc, 0); 1373 else if (dblk == blkstofrags(fs, blkno) && 1374 ffs_snapblkfree(fs, ip->i_devvp, dblk, 1375 fs->fs_bsize, ip->i_number)) { 1376 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1377 idb_assign(ip, ibp->b_data, loc, 0); 1378 } 1379 } 1380 bawrite(ibp); 1381 } 1382 /* 1383 * Clear snapshot flag and drop reference. 
1384 */ 1385 ip->i_flags &= ~SF_SNAPSHOT; 1386 DIP_ASSIGN(ip, flags, ip->i_flags); 1387 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1388 } 1389 1390 /* 1391 * Notification that a block is being freed. Return zero if the free 1392 * should be allowed to proceed. Return non-zero if the snapshot file 1393 * wants to claim the block. The block will be claimed if it is an 1394 * uncopied part of one of the snapshots. It will be freed if it is 1395 * either a BLK_NOCOPY or has already been copied in all of the snapshots. 1396 * If a fragment is being freed, then all snapshots that care about 1397 * it must make a copy since a snapshot file can only claim full sized 1398 * blocks. Note that if more than one snapshot file maps the block, 1399 * we can pick one at random to claim it. Since none of the snapshots 1400 * can change, we are assurred that they will all see the same unmodified 1401 * image. When deleting a snapshot file (see ffs_snapremove above), we 1402 * must push any of these claimed blocks to one of the other snapshots 1403 * that maps it. These claimed blocks are easily identified as they will 1404 * have a block number equal to their logical block number within the 1405 * snapshot. A copied block can never have this property because they 1406 * must always have been allocated from a BLK_NOCOPY location. 
1407 */ 1408 int 1409 ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, 1410 long size, ino_t inum) 1411 { 1412 struct mount *mp = devvp->v_specmountpoint; 1413 struct buf *ibp; 1414 struct inode *ip; 1415 struct vnode *vp = NULL; 1416 struct snap_info *si; 1417 void *saved_data = NULL; 1418 daddr_t lbn; 1419 daddr_t blkno; 1420 uint32_t gen; 1421 int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 1422 1423 si = VFSTOUFS(mp)->um_snapinfo; 1424 lbn = fragstoblks(fs, bno); 1425 mutex_enter(&si->si_lock); 1426 retry: 1427 gen = si->si_gen; 1428 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { 1429 vp = ITOV(ip); 1430 if (snapshot_locked == 0) { 1431 if (!mutex_tryenter(&si->si_snaplock)) { 1432 mutex_exit(&si->si_lock); 1433 mutex_enter(&si->si_snaplock); 1434 mutex_enter(&si->si_lock); 1435 } 1436 snapshot_locked = 1; 1437 if (gen != si->si_gen) 1438 goto retry; 1439 } 1440 /* 1441 * Lookup block being written. 1442 */ 1443 if (lbn < NDADDR) { 1444 blkno = db_get(ip, lbn); 1445 } else { 1446 mutex_exit(&si->si_lock); 1447 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1448 fs->fs_bsize, FSCRED, B_METAONLY, &ibp); 1449 if (error) { 1450 mutex_enter(&si->si_lock); 1451 break; 1452 } 1453 indiroff = (lbn - NDADDR) % NINDIR(fs); 1454 blkno = idb_get(ip, ibp->b_data, indiroff); 1455 mutex_enter(&si->si_lock); 1456 if (gen != si->si_gen) { 1457 brelse(ibp, 0); 1458 goto retry; 1459 } 1460 } 1461 /* 1462 * Check to see if block needs to be copied. 1463 */ 1464 if (blkno == 0) { 1465 /* 1466 * A block that we map is being freed. If it has not 1467 * been claimed yet, we will claim or copy it (below). 1468 */ 1469 claimedblk = 1; 1470 } else if (blkno == BLK_SNAP) { 1471 /* 1472 * No previous snapshot claimed the block, 1473 * so it will be freed and become a BLK_NOCOPY 1474 * (don't care) for us. 
1475 */ 1476 if (claimedblk) 1477 panic("snapblkfree: inconsistent block type"); 1478 if (lbn < NDADDR) { 1479 db_assign(ip, lbn, BLK_NOCOPY); 1480 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1481 } else { 1482 idb_assign(ip, ibp->b_data, indiroff, 1483 BLK_NOCOPY); 1484 mutex_exit(&si->si_lock); 1485 if (ip->i_nlink > 0) 1486 bwrite(ibp); 1487 else 1488 bdwrite(ibp); 1489 mutex_enter(&si->si_lock); 1490 if (gen != si->si_gen) 1491 goto retry; 1492 } 1493 continue; 1494 } else /* BLK_NOCOPY or default */ { 1495 /* 1496 * If the snapshot has already copied the block 1497 * (default), or does not care about the block, 1498 * it is not needed. 1499 */ 1500 if (lbn >= NDADDR) 1501 brelse(ibp, 0); 1502 continue; 1503 } 1504 /* 1505 * If this is a full size block, we will just grab it 1506 * and assign it to the snapshot inode. Otherwise we 1507 * will proceed to copy it. See explanation for this 1508 * routine as to why only a single snapshot needs to 1509 * claim this block. 1510 */ 1511 if (size == fs->fs_bsize) { 1512 #ifdef DEBUG 1513 if (snapdebug) 1514 printf("%s %llu lbn %" PRId64 1515 "from inum %llu\n", 1516 "Grabonremove: snapino", 1517 (unsigned long long)ip->i_number, 1518 lbn, (unsigned long long)inum); 1519 #endif 1520 mutex_exit(&si->si_lock); 1521 if (lbn < NDADDR) { 1522 db_assign(ip, lbn, bno); 1523 } else { 1524 idb_assign(ip, ibp->b_data, indiroff, bno); 1525 if (ip->i_nlink > 0) 1526 bwrite(ibp); 1527 else 1528 bdwrite(ibp); 1529 } 1530 DIP_ADD(ip, blocks, btodb(size)); 1531 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1532 if (ip->i_nlink > 0 && mp->mnt_wapbl) 1533 error = syncsnap(vp); 1534 else 1535 error = 0; 1536 mutex_exit(&si->si_snaplock); 1537 return (error == 0); 1538 } 1539 if (lbn >= NDADDR) 1540 brelse(ibp, 0); 1541 #ifdef DEBUG 1542 if (snapdebug) 1543 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n", 1544 "Copyonremove: snapino ", 1545 (unsigned long long)ip->i_number, 1546 lbn, "for inum", (unsigned long long)inum, size); 1547 #endif 1548 /* 1549 
* If we have already read the old block contents, then 1550 * simply copy them to the new block. Note that we need 1551 * to synchronously write snapshots that have not been 1552 * unlinked, and hence will be visible after a crash, 1553 * to ensure their integrity. 1554 */ 1555 mutex_exit(&si->si_lock); 1556 if (saved_data == NULL) { 1557 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1558 error = rwfsblk(vp, B_READ, saved_data, lbn); 1559 if (error) { 1560 free(saved_data, M_UFSMNT); 1561 saved_data = NULL; 1562 mutex_enter(&si->si_lock); 1563 break; 1564 } 1565 } 1566 error = wrsnapblk(vp, saved_data, lbn); 1567 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl) 1568 error = syncsnap(vp); 1569 mutex_enter(&si->si_lock); 1570 if (error) 1571 break; 1572 if (gen != si->si_gen) 1573 goto retry; 1574 } 1575 mutex_exit(&si->si_lock); 1576 if (saved_data) 1577 free(saved_data, M_UFSMNT); 1578 /* 1579 * If we have been unable to allocate a block in which to do 1580 * the copy, then return non-zero so that the fragment will 1581 * not be freed. Although space will be lost, the snapshot 1582 * will stay consistent. 1583 */ 1584 if (snapshot_locked) 1585 mutex_exit(&si->si_snaplock); 1586 return (error); 1587 } 1588 1589 /* 1590 * Associate snapshot files when mounting. 1591 */ 1592 void 1593 ffs_snapshot_mount(struct mount *mp) 1594 { 1595 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1596 struct fs *fs = VFSTOUFS(mp)->um_fs; 1597 struct lwp *l = curlwp; 1598 struct vnode *vp; 1599 struct inode *ip, *xp; 1600 struct snap_info *si; 1601 daddr_t snaplistsize, *snapblklist; 1602 int i, error, ns, snaploc, loc; 1603 1604 /* 1605 * No persistent snapshots on apple ufs file systems. 1606 */ 1607 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp))) 1608 return; 1609 1610 si = VFSTOUFS(mp)->um_snapinfo; 1611 ns = UFS_FSNEEDSWAP(fs); 1612 /* 1613 * XXX The following needs to be set before ffs_truncate or 1614 * VOP_READ can be called. 
1615 */ 1616 mp->mnt_stat.f_iosize = fs->fs_bsize; 1617 /* 1618 * Process each snapshot listed in the superblock. 1619 */ 1620 vp = NULL; 1621 mutex_enter(&si->si_lock); 1622 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1623 if (fs->fs_snapinum[snaploc] == 0) 1624 break; 1625 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1626 &vp)) != 0) { 1627 printf("ffs_snapshot_mount: vget failed %d\n", error); 1628 continue; 1629 } 1630 ip = VTOI(vp); 1631 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1632 printf("ffs_snapshot_mount: non-snapshot inode %d\n", 1633 fs->fs_snapinum[snaploc]); 1634 vput(vp); 1635 vp = NULL; 1636 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1637 if (fs->fs_snapinum[loc] == 0) 1638 break; 1639 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1640 } 1641 fs->fs_snapinum[loc - 1] = 0; 1642 snaploc--; 1643 continue; 1644 } 1645 1646 /* 1647 * Read the block hints list. Use an empty list on 1648 * read errors. 1649 */ 1650 error = vn_rdwr(UIO_READ, vp, 1651 (void *)&snaplistsize, sizeof(snaplistsize), 1652 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1653 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, 1654 l->l_cred, NULL, NULL); 1655 if (error) { 1656 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1657 snaplistsize = 1; 1658 } else 1659 snaplistsize = ufs_rw64(snaplistsize, ns); 1660 snapblklist = malloc( 1661 snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 1662 if (error) 1663 snapblklist[0] = 1; 1664 else { 1665 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist, 1666 snaplistsize * sizeof(daddr_t), 1667 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1668 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, 1669 l->l_cred, NULL, NULL); 1670 for (i = 0; i < snaplistsize; i++) 1671 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 1672 if (error) { 1673 printf("ffs_snapshot_mount: read_2 failed %d\n", 1674 error); 1675 snapblklist[0] = 1; 1676 } 1677 } 1678 ip->i_snapblklist = &snapblklist[0]; 1679 1680 /* 1681 * 
Link it onto the active snapshot list. 1682 */ 1683 if (is_active_snapshot(si, ip)) 1684 panic("ffs_snapshot_mount: %"PRIu64" already on list", 1685 ip->i_number); 1686 else 1687 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 1688 vp->v_vflag |= VV_SYSTEM; 1689 VOP_UNLOCK(vp); 1690 } 1691 /* 1692 * No usable snapshots found. 1693 */ 1694 if (vp == NULL) { 1695 mutex_exit(&si->si_lock); 1696 return; 1697 } 1698 /* 1699 * Attach the block hints list. We always want to 1700 * use the list from the newest snapshot. 1701 */ 1702 xp = TAILQ_LAST(&si->si_snapshots, inodelst); 1703 si->si_snapblklist = xp->i_snapblklist; 1704 fscow_establish(mp, ffs_copyonwrite, devvp); 1705 si->si_gen++; 1706 mutex_exit(&si->si_lock); 1707 } 1708 1709 /* 1710 * Disassociate snapshot files when unmounting. 1711 */ 1712 void 1713 ffs_snapshot_unmount(struct mount *mp) 1714 { 1715 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1716 struct inode *ip; 1717 struct snap_info *si; 1718 bool list_empty = true; 1719 1720 si = VFSTOUFS(mp)->um_snapinfo; 1721 mutex_enter(&si->si_lock); 1722 while ((ip = TAILQ_FIRST(&si->si_snapshots)) != 0) { 1723 list_empty = false; 1724 TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap); 1725 if (ip->i_snapblklist == si->si_snapblklist) 1726 si->si_snapblklist = NULL; 1727 free(ip->i_snapblklist, M_UFSMNT); 1728 si->si_gen++; 1729 mutex_exit(&si->si_lock); 1730 vrele(ITOV(ip)); 1731 mutex_enter(&si->si_lock); 1732 } 1733 mutex_exit(&si->si_lock); 1734 if (! list_empty) 1735 fscow_disestablish(mp, ffs_copyonwrite, devvp); 1736 } 1737 1738 /* 1739 * Check for need to copy block that is about to be written, 1740 * copying the block if necessary. 
*/
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = devvp->v_specmountpoint;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is after the file system or
	 * in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Binary search of the hints list.  Element 0 is used as the
	 * upper bound of the search (entries 1 .. snapblklist[0] - 1).
	 */
	snapblklist = si->si_snapblklist;
	upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	/* The loop above only exits with lower <= upper on a match. */
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	/* A valid, full-block buffer can serve as the copy source itself. */
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	/*
	 * si_gen is sampled here and compared again each time si_lock
	 * has been dropped and retaken; a mismatch restarts the scan.
	 */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Any non-zero entry means no copy is needed here. */
		if (blkno != 0)
			continue;

		/* The pagedaemon gets ENOMEM instead of a copy. */
		if (curlwp == uvm.pagedaemon_lwp) {
			error = ENOMEM;
			break;
		}

		if (snapshot_locked == 0) {
			/*
			 * Take si_snaplock without sleeping while
			 * si_lock is held: on contention drop si_lock,
			 * block on si_snaplock, then retake si_lock.
			 */
			if (!mutex_tryenter(&si->si_snaplock)) {
				mutex_exit(&si->si_lock);
				mutex_enter(&si->si_snaplock);
				mutex_enter(&si->si_lock);
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;

			/* Check again if block still needs to be copied */
			if (lbn < NDADDR) {
				blkno = db_get(ip, lbn);
			} else {
				mutex_exit(&si->si_lock);
				if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
					mutex_enter(&si->si_lock);
					break;
				}
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}

			if (blkno != 0)
				continue;
		}
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	mutex_exit(&si->si_lock);
	/* Only free saved_data if it was allocated here, not borrowed. */
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return error;
}

/*
 * Read from a snapshot.
1927 */ 1928 int 1929 ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag) 1930 { 1931 struct inode *ip = VTOI(vp); 1932 struct fs *fs = ip->i_fs; 1933 struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo; 1934 struct buf *bp; 1935 daddr_t lbn, nextlbn; 1936 off_t fsbytes, bytesinfile; 1937 long size, xfersize, blkoffset; 1938 int error; 1939 1940 fstrans_start(vp->v_mount, FSTRANS_SHARED); 1941 mutex_enter(&si->si_snaplock); 1942 1943 if (ioflag & IO_ALTSEMANTICS) 1944 fsbytes = ip->i_size; 1945 else 1946 fsbytes = lfragtosize(fs, fs->fs_size); 1947 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 1948 bytesinfile = fsbytes - uio->uio_offset; 1949 if (bytesinfile <= 0) 1950 break; 1951 lbn = lblkno(fs, uio->uio_offset); 1952 nextlbn = lbn + 1; 1953 size = fs->fs_bsize; 1954 blkoffset = blkoff(fs, uio->uio_offset); 1955 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), 1956 bytesinfile); 1957 1958 if (lblktosize(fs, nextlbn + 1) >= fsbytes) { 1959 if (lblktosize(fs, lbn) + size > fsbytes) 1960 size = fragroundup(fs, 1961 fsbytes - lblktosize(fs, lbn)); 1962 error = bread(vp, lbn, size, NOCRED, 0, &bp); 1963 } else { 1964 int nextsize = fs->fs_bsize; 1965 error = breadn(vp, lbn, 1966 size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp); 1967 } 1968 if (error) 1969 break; 1970 1971 /* 1972 * We should only get non-zero b_resid when an I/O error 1973 * has occurred, which should cause us to break above. 1974 * However, if the short read did not cause an error, 1975 * then we want to ensure that we do not uiomove bad 1976 * or uninitialized data. 
1977 */ 1978 size -= bp->b_resid; 1979 if (size < blkoffset + xfersize) { 1980 xfersize = size - blkoffset; 1981 if (xfersize <= 0) 1982 break; 1983 } 1984 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); 1985 if (error) 1986 break; 1987 brelse(bp, BC_AGE); 1988 } 1989 if (bp != NULL) 1990 brelse(bp, BC_AGE); 1991 1992 mutex_exit(&si->si_snaplock); 1993 fstrans_done(vp->v_mount); 1994 return error; 1995 } 1996 1997 /* 1998 * Lookup a snapshots data block address. 1999 * Simpler than UFS_BALLOC() as we know all metadata is already allocated 2000 * and safe even for the pagedaemon where we cannot bread(). 2001 */ 2002 static int 2003 snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res) 2004 { 2005 struct indir indirs[NIADDR + 2]; 2006 struct inode *ip = VTOI(vp); 2007 struct fs *fs = ip->i_fs; 2008 struct buf *bp; 2009 int error, num; 2010 2011 KASSERT(lbn >= 0); 2012 2013 if (lbn < NDADDR) { 2014 *res = db_get(ip, lbn); 2015 return 0; 2016 } 2017 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) 2018 return error; 2019 if (curlwp == uvm.pagedaemon_lwp) { 2020 mutex_enter(&bufcache_lock); 2021 bp = incore(vp, indirs[num-1].in_lbn); 2022 if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) { 2023 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); 2024 error = 0; 2025 } else 2026 error = ENOMEM; 2027 mutex_exit(&bufcache_lock); 2028 return error; 2029 } 2030 error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp); 2031 if (error == 0) 2032 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); 2033 brelse(bp, 0); 2034 2035 return error; 2036 } 2037 2038 /* 2039 * Read or write the specified block of the filesystem vp resides on 2040 * from or to the disk bypassing the buffer cache. 
2041 */ 2042 static int 2043 rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn) 2044 { 2045 int error; 2046 struct inode *ip = VTOI(vp); 2047 struct fs *fs = ip->i_fs; 2048 struct buf *nbp; 2049 2050 nbp = getiobuf(NULL, true); 2051 nbp->b_flags = flags; 2052 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; 2053 nbp->b_error = 0; 2054 nbp->b_data = data; 2055 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn)); 2056 nbp->b_proc = NULL; 2057 nbp->b_dev = ip->i_devvp->v_rdev; 2058 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */ 2059 2060 bdev_strategy(nbp); 2061 2062 error = biowait(nbp); 2063 2064 putiobuf(nbp); 2065 2066 return error; 2067 } 2068 2069 /* 2070 * Write all dirty buffers to disk and invalidate them. 2071 */ 2072 static int 2073 syncsnap(struct vnode *vp) 2074 { 2075 int error; 2076 buf_t *bp; 2077 struct fs *fs = VTOI(vp)->i_fs; 2078 2079 mutex_enter(&bufcache_lock); 2080 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) { 2081 error = bbusy(bp, false, 0, NULL); 2082 if (error == EPASSTHROUGH) 2083 continue; 2084 else if (error != 0) { 2085 mutex_exit(&bufcache_lock); 2086 return error; 2087 } 2088 KASSERT(bp->b_bcount == fs->fs_bsize); 2089 mutex_exit(&bufcache_lock); 2090 error = rwfsblk(vp, B_WRITE, bp->b_data, 2091 fragstoblks(fs, dbtofsb(fs, bp->b_blkno))); 2092 brelse(bp, BC_INVAL | BC_VFLUSH); 2093 if (error) 2094 return error; 2095 mutex_enter(&bufcache_lock); 2096 } 2097 mutex_exit(&bufcache_lock); 2098 2099 return 0; 2100 } 2101 2102 /* 2103 * Write the specified block to a snapshot. 2104 */ 2105 static int 2106 wrsnapblk(struct vnode *vp, void *data, daddr_t lbn) 2107 { 2108 struct inode *ip = VTOI(vp); 2109 struct fs *fs = ip->i_fs; 2110 struct buf *bp; 2111 int error; 2112 2113 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, 2114 FSCRED, (ip->i_nlink > 0 ? 
B_SYNC : 0), &bp); 2115 if (error) 2116 return error; 2117 memcpy(bp->b_data, data, fs->fs_bsize); 2118 if (ip->i_nlink > 0) 2119 error = bwrite(bp); 2120 else 2121 bawrite(bp); 2122 2123 return error; 2124 } 2125 2126 /* 2127 * Check if this inode is present on the active snapshot list. 2128 * Must be called with snapinfo locked. 2129 */ 2130 static inline bool 2131 is_active_snapshot(struct snap_info *si, struct inode *ip) 2132 { 2133 struct inode *xp; 2134 2135 KASSERT(mutex_owned(&si->si_lock)); 2136 2137 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) 2138 if (xp == ip) 2139 return true; 2140 return false; 2141 } 2142 2143 /* 2144 * Get/Put direct block from inode or buffer containing disk addresses. Take 2145 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2146 * into a global include. 2147 */ 2148 static inline daddr_t 2149 db_get(struct inode *ip, int loc) 2150 { 2151 if (ip->i_ump->um_fstype == UFS1) 2152 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); 2153 else 2154 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); 2155 } 2156 2157 static inline void 2158 db_assign(struct inode *ip, int loc, daddr_t val) 2159 { 2160 if (ip->i_ump->um_fstype == UFS1) 2161 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2162 else 2163 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2164 } 2165 2166 static inline daddr_t 2167 ib_get(struct inode *ip, int loc) 2168 { 2169 if (ip->i_ump->um_fstype == UFS1) 2170 return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip)); 2171 else 2172 return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip)); 2173 } 2174 2175 static inline void 2176 ib_assign(struct inode *ip, int loc, daddr_t val) 2177 { 2178 if (ip->i_ump->um_fstype == UFS1) 2179 ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2180 else 2181 ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2182 } 2183 2184 static inline daddr_t 2185 idb_get(struct inode *ip, void *bf, int loc) 2186 { 2187 if 
(ip->i_ump->um_fstype == UFS1) 2188 return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); 2189 else 2190 return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); 2191 } 2192 2193 static inline void 2194 idb_assign(struct inode *ip, void *bf, int loc, daddr_t val) 2195 { 2196 if (ip->i_ump->um_fstype == UFS1) 2197 ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2198 else 2199 ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2200 } 2201