1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. 24 * All rights reserved. 25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
28 */ 29 30 /* Portions Copyright 2010 Robert Milkowski */ 31 32 #include <sys/types.h> 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/kernel.h> 36 #include <sys/sysmacros.h> 37 #include <sys/kmem.h> 38 #include <sys/acl.h> 39 #include <sys/vnode.h> 40 #include <sys/vfs.h> 41 #include <sys/mntent.h> 42 #include <sys/mount.h> 43 #include <sys/cmn_err.h> 44 #include <sys/zfs_znode.h> 45 #include <sys/zfs_vnops.h> 46 #include <sys/zfs_dir.h> 47 #include <sys/zil.h> 48 #include <sys/fs/zfs.h> 49 #include <sys/dmu.h> 50 #include <sys/dsl_prop.h> 51 #include <sys/dsl_dataset.h> 52 #include <sys/dsl_deleg.h> 53 #include <sys/spa.h> 54 #include <sys/zap.h> 55 #include <sys/sa.h> 56 #include <sys/sa_impl.h> 57 #include <sys/policy.h> 58 #include <sys/atomic.h> 59 #include <sys/zfs_ioctl.h> 60 #include <sys/zfs_ctldir.h> 61 #include <sys/zfs_fuid.h> 62 #include <sys/sunddi.h> 63 #include <sys/dmu_objset.h> 64 #include <sys/dsl_dir.h> 65 #include <sys/jail.h> 66 #include <sys/osd.h> 67 #include <ufs/ufs/quota.h> 68 #include <sys/zfs_quota.h> 69 70 #include "zfs_comutil.h" 71 72 #ifndef MNTK_VMSETSIZE_BUG 73 #define MNTK_VMSETSIZE_BUG 0 74 #endif 75 #ifndef MNTK_NOMSYNC 76 #define MNTK_NOMSYNC 8 77 #endif 78 79 struct mtx zfs_debug_mtx; 80 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); 81 82 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); 83 84 int zfs_super_owner; 85 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, 86 "File system owners can perform privileged operation on file systems"); 87 88 int zfs_debug_level; 89 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, 90 "Debug level"); 91 92 struct zfs_jailparam { 93 int mount_snapshot; 94 }; 95 96 static struct zfs_jailparam zfs_jailparam0 = { 97 .mount_snapshot = 0, 98 }; 99 100 static int zfs_jailparam_slot; 101 102 SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters"); 103 SYSCTL_JAIL_PARAM(_zfs, 
mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I", 104 "Allow mounting snapshots in the .zfs directory for unjailed datasets"); 105 106 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); 107 static int zfs_version_acl = ZFS_ACL_VERSION; 108 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, 109 "ZFS_ACL_VERSION"); 110 static int zfs_version_spa = SPA_VERSION; 111 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, 112 "SPA_VERSION"); 113 static int zfs_version_zpl = ZPL_VERSION; 114 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, 115 "ZPL_VERSION"); 116 117 #if __FreeBSD_version >= 1400018 118 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, 119 bool *mp_busy); 120 #else 121 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); 122 #endif 123 static int zfs_mount(vfs_t *vfsp); 124 static int zfs_umount(vfs_t *vfsp, int fflag); 125 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); 126 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); 127 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); 128 static int zfs_sync(vfs_t *vfsp, int waitfor); 129 #if __FreeBSD_version >= 1300098 130 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, 131 struct ucred **credanonp, int *numsecflavors, int *secflavors); 132 #else 133 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, 134 struct ucred **credanonp, int *numsecflavors, int **secflavors); 135 #endif 136 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); 137 static void zfs_freevfs(vfs_t *vfsp); 138 139 struct vfsops zfs_vfsops = { 140 .vfs_mount = zfs_mount, 141 .vfs_unmount = zfs_umount, 142 #if __FreeBSD_version >= 1300049 143 .vfs_root = vfs_cache_root, 144 .vfs_cachedroot = zfs_root, 145 #else 146 .vfs_root = zfs_root, 147 #endif 148 .vfs_statfs = zfs_statfs, 149 .vfs_vget = 
zfs_vget,
	.vfs_sync =		zfs_sync,
	.vfs_checkexp =		zfs_checkexp,
	.vfs_fhtovp =		zfs_fhtovp,
	.vfs_quotactl =		zfs_quotactl,
};

#ifdef VFCF_CROSS_COPY_FILE_RANGE
VFS_SET(zfs_vfsops, zfs,
	VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE);
#else
VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL);
#endif

/*
 * We need to keep a count of active fs's.
 * This is necessary to prevent our module
 * from being unloaded after a umount -f
 */
static uint32_t zfs_active_fs_count = 0;

/*
 * Compute the effective value of a property that can be temporarily
 * overridden by mount options (atime, devices, exec, setuid, readonly,
 * xattr, nbmand).
 *
 * On entry *val holds the dataset's stored value; if a mount option
 * overrides it, *val is replaced with the override and setpoint (when
 * non-NULL) is set to "temporary".  Returns ENOENT when the dataset is
 * not mounted or the property has no mount-option override; otherwise 0
 * or an error from dmu_objset_from_ds()/getzfsvfs_impl().
 */
int
zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
    char *setpoint)
{
	int error;
	zfsvfs_t *zfvp;
	vfs_t *vfsp;
	objset_t *os;
	uint64_t tmp = *val;

	error = dmu_objset_from_ds(ds, &os);
	if (error != 0)
		return (error);

	/*
	 * NOTE(review): getzfsvfs_impl() appears to return with the mount
	 * busied — every exit path below pairs with vfs_unbusy(); confirm
	 * against its definition.
	 */
	error = getzfsvfs_impl(os, &zfvp);
	if (error != 0)
		return (error);
	if (zfvp == NULL)
		return (ENOENT);
	vfsp = zfvp->z_vfs;
	switch (zfs_prop) {
	case ZFS_PROP_ATIME:
		if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_DEVICES:
		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_EXEC:
		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_SETUID:
		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_READONLY:
		if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_XATTR:
		if (zfvp->z_flags & ZSB_XATTR)
			tmp = zfvp->z_xattr;
		break;
	case ZFS_PROP_NBMAND:
		if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
			tmp = 1;
		break;
	default:
		vfs_unbusy(vfsp);
		return (ENOENT);
	}

	vfs_unbusy(vfsp);
	if (tmp != *val) {
		if (setpoint)
			(void) strcpy(setpoint, "temporary");
		*val = tmp;
	}
	return (0);
}

/*
 * Fill in a FreeBSD struct dqblk64 for the given user or group id from the
 * dataset's {user,group}quota and {user,group}used ZAP objects.  The ZAP key
 * is the id formatted in hex.  Returns ENOENT when no quota object exists
 * (or during ZIL replay), otherwise 0 or a zap_lookup() error.
 */
static int
zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
{
	int error = 0;
	char buf[32];
	uint64_t usedobj, quotaobj;
	uint64_t quota, used = 0;
	timespec_t now;

	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;

	if (quotaobj == 0 || zfsvfs->z_replay) {
		error = ENOENT;
		goto done;
	}
	(void) sprintf(buf, "%llx", (longlong_t)id);
	if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
	    buf, sizeof (quota), 1, &quota)) != 0) {
		dprintf("%s(%d): quotaobj lookup failed\n",
		    __FUNCTION__, __LINE__);
		goto done;
	}
	/*
	 * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit".
	 * So we set them to be the same.
	 */
	dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
	/* A missing "used" entry simply means zero blocks are charged. */
	error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
	if (error && error != ENOENT) {
		dprintf("%s(%d): usedobj failed; %d\n",
		    __FUNCTION__, __LINE__, error);
		goto done;
	}
	dqp->dqb_curblocks = btodb(used);
	/* ZFS has no inode (object-count) limits to report here. */
	dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
	vfs_timestamp(&now);
	/*
	 * Setting this to 0 causes FreeBSD quota(8) to print
	 * the number of days since the epoch, which isn't
	 * particularly useful.
	 */
	dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
done:
	return (error);
}

/*
 * VFS quotactl entry point.  Translates the BSD quotactl(2) sub-commands
 * onto ZFS user/group quota properties.  Quotas cannot be turned on/off on
 * ZFS, so Q_QUOTAON is a no-op and Q_QUOTAOFF returns ENOTSUP.  On
 * pre-1400018 kernels the caller expects us to vfs_unbusy() the mount for
 * the QUOTAON/QUOTAOFF commands; newer kernels pass mp_busy instead.
 */
static int
#if __FreeBSD_version >= 1400018
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
#else
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
#endif
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	struct thread *td;
	int cmd, type, error = 0;
	int bitsize;
	zfs_userquota_prop_t quota_type;
	struct dqblk64 dqblk = { 0 };

	td = curthread;
	cmd = cmds >> SUBCMDSHIFT;
	type = cmds & SUBCMDMASK;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);
	/* id == -1 means "the calling thread's own id". */
	if (id == -1) {
		switch (type) {
		case USRQUOTA:
			id = td->td_ucred->cr_ruid;
			break;
		case GRPQUOTA:
			id = td->td_ucred->cr_rgid;
			break;
		default:
			error = EINVAL;
#if __FreeBSD_version < 1400018
			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
				vfs_unbusy(vfsp);
#endif
			goto done;
		}
	}
	/*
	 * Map BSD type to:
	 * ZFS_PROP_USERUSED,
	 * ZFS_PROP_USERQUOTA,
	 * ZFS_PROP_GROUPUSED,
	 * ZFS_PROP_GROUPQUOTA
	 */
	switch (cmd) {
	case Q_SETQUOTA:
	case Q_SETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERQUOTA;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPQUOTA;
		else
			error = EINVAL;
		break;
	case Q_GETQUOTA:
	case Q_GETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERUSED;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPUSED;
		else
			error = EINVAL;
		break;
	}

	/*
	 * Depending on the cmd, we may need to get
	 * the ruid and domain (see fuidstr_to_sid?),
	 * the fuid (how?), or other information.
	 * Create fuid using zfs_fuid_create(zfsvfs, id,
	 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
	 * I think I can use just the id?
	 *
	 * Look at zfs_id_overquota() to look up a quota.
	 * zap_lookup(something, quotaobj, fuidstring,
	 * sizeof (long long), 1, &quota)
	 *
	 * See zfs_set_userquota() to set a quota.
	 */
	if ((uint32_t)type >= MAXQUOTAS) {
		error = EINVAL;
		goto done;
	}

	switch (cmd) {
	case Q_GETQUOTASIZE:
		bitsize = 64;
		error = copyout(&bitsize, arg, sizeof (int));
		break;
	case Q_QUOTAON:
		// As far as I can tell, you can't turn quotas on or off on zfs
		error = 0;
#if __FreeBSD_version < 1400018
		vfs_unbusy(vfsp);
#endif
		break;
	case Q_QUOTAOFF:
		error = ENOTSUP;
#if __FreeBSD_version < 1400018
		vfs_unbusy(vfsp);
#endif
		break;
	case Q_SETQUOTA:
		error = copyin(arg, &dqblk, sizeof (dqblk));
		if (error == 0)
			error = zfs_set_userquota(zfsvfs, quota_type,
			    "", id, dbtob(dqblk.dqb_bhardlimit));
		break;
	case Q_GETQUOTA:
		error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
		if (error == 0)
			error = copyout(&dqblk, arg, sizeof (dqblk));
		break;
	default:
		error = EINVAL;
		break;
	}
done:
	zfs_exit(zfsvfs, FTAG);
	return (error);
}


/* True when the mount carries the VFS_RDONLY flag. */
boolean_t
zfs_is_readonly(zfsvfs_t *zfsvfs)
{
	return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
}

/*
 * VFS sync entry point.  Commits the filesystem's ZIL (or all pools when
 * vfsp is NULL, as for sync(8)); never writes during panic, and ignores
 * MNT_LAZY syncher passes since ZFS commits async data on its own schedule.
 */
static int
zfs_sync(vfs_t *vfsp, int waitfor)
{

	/*
	 * Data integrity is job one. We don't want a compromised kernel
	 * writing to the storage pool, so we never sync during panic.
	 */
	if (panicstr)
		return (0);

	/*
	 * Ignore the system syncher. ZFS already commits async data
	 * at zfs_txg_timeout intervals.
	 */
	if (waitfor == MNT_LAZY)
		return (0);

	if (vfsp != NULL) {
		/*
		 * Sync a specific filesystem.
		 */
		zfsvfs_t *zfsvfs = vfsp->vfs_data;
		dsl_pool_t *dp;
		int error;

		if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
			return (error);
		dp = dmu_objset_pool(zfsvfs->z_os);

		/*
		 * If the system is shutting down, then skip any
		 * filesystems which may exist on a suspended pool.
		 */
		if (rebooting && spa_suspended(dp->dp_spa)) {
			zfs_exit(zfsvfs, FTAG);
			return (0);
		}

		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, 0);

		zfs_exit(zfsvfs, FTAG);
	} else {
		/*
		 * Sync all ZFS filesystems. This is what happens when you
		 * run sync(8). Unlike other filesystems, ZFS honors the
		 * request by waiting for all pools to commit all dirty data.
		 */
		spa_sync_allpools();
	}

	return (0);
}

/*
 * Property callback: keep z_atime, the MNT_NOATIME flag, and the
 * atime/noatime mount options in sync with the dataset property.
 */
static void
atime_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == TRUE) {
		zfsvfs->z_atime = TRUE;
		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
	} else {
		zfsvfs->z_atime = FALSE;
		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
	}
}

/*
 * Property callback: track the xattr property (off / dir-based / SA-based)
 * in z_flags and z_xattr_sa.
 */
static void
xattr_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == ZFS_XATTR_OFF) {
		zfsvfs->z_flags &= ~ZSB_XATTR;
	} else {
		zfsvfs->z_flags |= ZSB_XATTR;

		if (newval == ZFS_XATTR_SA)
			zfsvfs->z_xattr_sa = B_TRUE;
		else
			zfsvfs->z_xattr_sa = B_FALSE;
	}
}

/*
 * Property callback: recordsize changed; propagate to the max block size
 * and the statfs f_iosize the VFS reports.
 */
static void
blksz_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;
	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
	ASSERT(ISP2(newval));

	zfsvfs->z_max_blksz = newval;
	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
}
/*
 * Property callback: toggle VFS_RDONLY and the ro/rw mount options to match
 * the dataset's effective readonly value.
 */
static void
readonly_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval) {
		/* XXX locking on vfs_flag? */
		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
	} else {
		/* XXX locking on vfs_flag? */
		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
	}
}

/*
 * Property callback: toggle VFS_NOSETUID and the setuid/nosetuid mount
 * options.
 */
static void
setuid_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
	}
}

/*
 * Property callback: toggle VFS_NOEXEC and the exec/noexec mount options.
 */
static void
exec_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
	}
}

/*
 * The nbmand mount option can be changed at mount time.
 * We can't allow it to be toggled on live file systems or incorrect
 * behavior may be seen from cifs clients
 *
 * This property isn't registered via dsl_prop_register(), but this callback
 * will be called when a file system is first mounted
 */
static void
nbmand_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;
	if (newval == FALSE) {
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
	} else {
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
	}
}

/* Property callback: snapdir visibility (.zfs directory). */
static void
snapdir_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_show_ctldir = newval;
}

/* Property callback: aclmode. */
static void
acl_mode_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_mode = newval;
}

/* Property callback: aclinherit. */
static void
acl_inherit_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_inherit = newval;
}

/* Property callback: acltype. */
static void
acl_type_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_type = newval;
}

/*
 * Register dsl property callbacks for every mount-overridable property of
 * this mount's dataset, then replay any temporary mount-option overrides
 * that were present before registration clobbered them.  Returns 0 on
 * success; on failure all registered callbacks are unregistered again.
 */
static int
zfs_register_callbacks(vfs_t *vfsp)
{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	uint64_t nbmand;
	boolean_t readonly = B_FALSE;
	boolean_t do_readonly = B_FALSE;
	boolean_t setuid = B_FALSE;
	boolean_t do_setuid = B_FALSE;
	boolean_t exec = B_FALSE;
	boolean_t do_exec = B_FALSE;
	boolean_t xattr = B_FALSE;
	boolean_t atime = B_FALSE;
	boolean_t do_atime = B_FALSE;
	boolean_t do_xattr = B_FALSE;
	int error = 0;

	ASSERT3P(vfsp, !=, NULL);
	zfsvfs = vfsp->vfs_data;
	ASSERT3P(zfsvfs, !=, NULL);
	os = zfsvfs->z_os;

	/*
	 * This function can be called for a snapshot when we update snapshot's
	 * mount point, which isn't really supported.
	 */
	if (dmu_objset_is_snapshot(os))
		return (EOPNOTSUPP);

	/*
	 * The act of registering our callbacks will destroy any mount
	 * options we may have. In order to enable temporary overrides
	 * of mount options, we stash away the current values and
	 * restore them after we register the callbacks.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
	    !spa_writeable(dmu_objset_spa(os))) {
		readonly = B_TRUE;
		do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
		readonly = B_FALSE;
		do_readonly = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
		setuid = B_FALSE;
		do_setuid = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
		setuid = B_TRUE;
		do_setuid = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
		exec = B_FALSE;
		do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
		exec = B_TRUE;
		do_exec = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
		do_xattr = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
		atime = B_FALSE;
		do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
		atime = B_TRUE;
		do_atime = B_TRUE;
	}

	/*
	 * We need to enter pool configuration here, so that we can use
	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
	 * dsl_prop_get_integer() can not be used, because it has to acquire
	 * spa_namespace_lock and we can not do that because we already hold
	 * z_teardown_lock. The problem is that spa_write_cachefile() is called
	 * with spa_namespace_lock held and the function calls ZFS vnode
	 * operations to write the cache file and thus z_teardown_lock is
	 * acquired after spa_namespace_lock.
	 */
	ds = dmu_objset_ds(os);
	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);

	/*
	 * nbmand is a special property. It can only be changed at
	 * mount time.
	 *
	 * This is weird, but it is documented to only be changeable
	 * at mount time.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		nbmand = B_FALSE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
		nbmand = B_TRUE;
	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		return (error);
	}

	/*
	 * Register property callbacks.
	 *
	 * It would probably be fine to just check for i/o error from
	 * the first prop_register(), but I guess I like to go
	 * overboard...
	 */
	error = dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
	error = error ?
error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
	    zfsvfs);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
	if (error)
		goto unregister;

	/*
	 * Invoke our callbacks to restore temporary mount options.
	 */
	if (do_readonly)
		readonly_changed_cb(zfsvfs, readonly);
	if (do_setuid)
		setuid_changed_cb(zfsvfs, setuid);
	if (do_exec)
		exec_changed_cb(zfsvfs, exec);
	if (do_xattr)
		xattr_changed_cb(zfsvfs, xattr);
	if (do_atime)
		atime_changed_cb(zfsvfs, atime);

	nbmand_changed_cb(zfsvfs, nbmand);

	return (0);

unregister:
	dsl_prop_unregister_all(ds, zfsvfs);
	return (error);
}

/*
 * Associate this zfsvfs with the given objset, which must be owned.
 * This will cache a bunch of on-disk state from the objset in the
 * zfsvfs.
 */
static int
zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
{
	int error;
	uint64_t val;

	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
	zfsvfs->z_os = os;

	/* Refuse ZPL versions newer than this pool version can support. */
	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
	if (error != 0)
		return (error);
	if (zfsvfs->z_version >
	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
		(void) printf("Can't mount a version %lld file system "
		    "on a version %lld pool\n. Pool must be upgraded to mount "
		    "this file system.", (u_longlong_t)zfsvfs->z_version,
		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
		return (SET_ERROR(ENOTSUP));
	}
	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_norm = (int)val;

	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_utf8 = (val != 0);

	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_case = (uint_t)val;

	error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
	if (error != 0)
		return (error);
	zfsvfs->z_acl_type = (uint_t)val;

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
	    zfsvfs->z_case == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);

	uint64_t sa_obj = 0;
	if (zfsvfs->z_use_sa) {
		/* should either have both of these objects or none */
		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
		    &sa_obj);
		if (error != 0)
			return (error);

		error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
		if (error == 0 && val == ZFS_XATTR_SA)
			zfsvfs->z_xattr_sa = B_TRUE;
	}

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);
	if (error != 0)
		return (error);

	if (zfsvfs->z_version >= ZPL_VERSION_SA)
		sa_register_update_callback(os, zfs_sa_upgrade);

	/*
	 * Cache the master-node object numbers: root directory, unlinked
	 * set, the various quota objects (each optional — ENOENT means
	 * "not present" and the cached object number stays 0), the FUID
	 * tables, and the shares directory.
	 */
	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
	    &zfsvfs->z_root);
	if (error != 0)
		return (error);
	ASSERT3U(zfsvfs->z_root, !=, 0);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
	    &zfsvfs->z_unlinkedobj);
	if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
	    8, 1, &zfsvfs->z_userquota_obj);
	if (error == ENOENT)
		zfsvfs->z_userquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
	    8, 1, &zfsvfs->z_groupquota_obj);
	if (error == ENOENT)
		zfsvfs->z_groupquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
	    8, 1, &zfsvfs->z_projectquota_obj);
	if (error == ENOENT)
		zfsvfs->z_projectquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
	    8, 1, &zfsvfs->z_userobjquota_obj);
	if (error == ENOENT)
		zfsvfs->z_userobjquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
	    8, 1, &zfsvfs->z_groupobjquota_obj);
	if (error == ENOENT)
		zfsvfs->z_groupobjquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ,
	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
	    8, 1, &zfsvfs->z_projectobjquota_obj);
	if (error == ENOENT)
		zfsvfs->z_projectobjquota_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
	    &zfsvfs->z_fuid_obj);
	if (error == ENOENT)
		zfsvfs->z_fuid_obj = 0;
	else if (error != 0)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
	    &zfsvfs->z_shares_dir);
	if (error == ENOENT)
		zfsvfs->z_shares_dir = 0;
	else if (error != 0)
		return (error);

	/*
	 * Only use the name cache if we are looking for a
	 * name on a file system that does not require normalization
	 * or case folding. We can also look there if we happen to be
	 * on a non-normalizing, mixed sensitivity file system IF we
	 * are looking for the exact name (which is always the case on
	 * FreeBSD).
	 */
	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));

	return (0);
}

/* Shared taskqueue for deferred zfsvfs work (e.g. unlinked-set draining). */
taskq_t *zfsvfs_taskq;

/* Task wrapper: drain the unlinked set of the given zfsvfs. */
static void
zfsvfs_task_unlinked_drain(void *context, int pending __unused)
{

	zfs_unlinked_drain((zfsvfs_t *)context);
}

/*
 * Allocate a zfsvfs, own the named objset (read-only for snapshots or when
 * requested), and initialize the zfsvfs from it via zfsvfs_create_impl().
 * On success *zfvp holds the new zfsvfs.
 */
int
zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
{
	objset_t *os;
	zfsvfs_t *zfsvfs;
	int error;
	boolean_t ro = (readonly || (strchr(osname, '@') != NULL));

	/*
	 * XXX: Fix struct statfs so this isn't necessary!
	 *
	 * The 'osname' is used as the filesystem's special node, which means
	 * it must fit in statfs.f_mntfromname, or else it can't be
	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
	 * 'zfs unmount' to think it's not mounted when it is.
 */
	if (strlen(osname) >= MNAMELEN)
		return (SET_ERROR(ENAMETOOLONG));

	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);

	error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
	    &os);
	if (error != 0) {
		kmem_free(zfsvfs, sizeof (zfsvfs_t));
		return (error);
	}

	/* zfsvfs_create_impl() disowns os and frees zfsvfs on failure. */
	error = zfsvfs_create_impl(zfvp, zfsvfs, os);

	return (error);
}


/*
 * Initialize the locks, lists and per-mount state of a freshly allocated
 * zfsvfs and bind it to the (already owned) objset.  On failure the objset
 * is disowned and the zfsvfs freed; on success *zfvp is set.
 */
int
zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
{
	int error;

	zfsvfs->z_vfs = NULL;
	zfsvfs->z_parent = zfsvfs;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));
	TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
	    zfsvfs_task_unlinked_drain, zfsvfs);
	ZFS_TEARDOWN_INIT(zfsvfs);
	ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	error = zfsvfs_init(zfsvfs, os);
	if (error != 0) {
		dmu_objset_disown(os, B_TRUE, zfsvfs);
		*zfvp = NULL;
		kmem_free(zfsvfs, sizeof (zfsvfs_t));
		return (error);
	}

	*zfvp = zfsvfs;
	return (0);
}

/*
 * Finish bringing a zfsvfs online: register property callbacks, open the
 * ZIL, drain the unlinked set and replay the intent log (when 'mounting'),
 * and publish the zfsvfs as the objset's user pointer.  When not mounting
 * (online recv) only the ZIL is reopened.
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
		return (SET_ERROR(EROFS));

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
		error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
		if (error)
			return (error);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0) {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		} else {
			dsl_dir_t *dd;
			zap_stats_t zs;

			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
			    &zs) == 0) {
				dataset_kstats_update_nunlinks_kstat(
				    &zfsvfs->z_kstat, zs.zs_num_entries);
				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
				    "num_entries in unlinked set: %llu",
				    (u_longlong_t)zs.zs_num_entries);
			}

			zfs_unlinked_drain(zfsvfs);
			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
			dd->dd_activity_cancelled = B_FALSE;
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain(). (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.) This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg. This would write a "create
		 * object N" record to the intent log. Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk. So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/* Name cache is off while replaying. */
				boolean_t use_nc = zfsvfs->z_use_namecache;
				zfsvfs->z_use_namecache = B_FALSE;
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
				zfsvfs->z_use_namecache = use_nc;
			}
		}

		/* restore readonly bit */
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	} else {
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}

/*
 * Tear down and free a zfsvfs.  All znodes must already be gone
 * (z_nr_znodes == 0); destroys every lock/list created by
 * zfsvfs_create_impl().
 */
void
zfsvfs_free(zfsvfs_t *zfsvfs)
{
	int i;

	zfs_fuid_destroy(zfsvfs);

	mutex_destroy(&zfsvfs->z_znodes_lock);
	mutex_destroy(&zfsvfs->z_lock);
	ASSERT3U(zfsvfs->z_nr_znodes, ==, 0);
	list_destroy(&zfsvfs->z_all_znodes);
	ZFS_TEARDOWN_DESTROY(zfsvfs);
	ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
	rw_destroy(&zfsvfs->z_fuid_lock);
	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
	dataset_kstats_destroy(&zfsvfs->z_kstat);
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}

/* Recompute the FUID/SA capability flags from the current ZPL version. */
static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
{
	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
}

/*
 * Mount the named dataset on the given vfs: create the zfsvfs, size the
 * statfs iosize from the recordsize property, and set the mount flags.
 * (Definition continues beyond this view.)
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;

	ASSERT3P(vfsp, !=, NULL);
	ASSERT3P(osname, !=, NULL);

	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

	if ((error = dsl_prop_get_integer(osname,
	    "recordsize", &recordsize, NULL)))
		goto out;
	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
	/*
	 * This can cause a loss of coherence between ARC and page cache
	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
	 */
	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
vfsp->mnt_kern_flag |= MNTK_NOMSYNC; 1207 vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; 1208 1209 #if defined(_KERNEL) && !defined(KMEM_DEBUG) 1210 vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; 1211 #endif 1212 /* 1213 * The fsid is 64 bits, composed of an 8-bit fs type, which 1214 * separates our fsid from any other filesystem types, and a 1215 * 56-bit objset unique ID. The objset unique ID is unique to 1216 * all objsets open on this system, provided by unique_create(). 1217 * The 8-bit fs type must be put in the low bits of fsid[1] 1218 * because that's where other Solaris filesystems put it. 1219 */ 1220 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1221 ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0); 1222 vfsp->vfs_fsid.val[0] = fsid_guid; 1223 vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) | 1224 (vfsp->mnt_vfc->vfc_typenum & 0xFF); 1225 1226 /* 1227 * Set features for file system. 1228 */ 1229 zfs_set_fuid_feature(zfsvfs); 1230 1231 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1232 uint64_t pval; 1233 1234 atime_changed_cb(zfsvfs, B_FALSE); 1235 readonly_changed_cb(zfsvfs, B_TRUE); 1236 if ((error = dsl_prop_get_integer(osname, 1237 "xattr", &pval, NULL))) 1238 goto out; 1239 xattr_changed_cb(zfsvfs, pval); 1240 if ((error = dsl_prop_get_integer(osname, 1241 "acltype", &pval, NULL))) 1242 goto out; 1243 acl_type_changed_cb(zfsvfs, pval); 1244 zfsvfs->z_issnap = B_TRUE; 1245 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1246 1247 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1248 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1249 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1250 } else { 1251 if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) 1252 goto out; 1253 } 1254 1255 vfs_mountedfrom(vfsp, osname); 1256 1257 if (!zfsvfs->z_issnap) 1258 zfsctl_create(zfsvfs); 1259 out: 1260 if (error) { 1261 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); 1262 zfsvfs_free(zfsvfs); 1263 } else { 1264 atomic_inc_32(&zfs_active_fs_count); 1265 } 1266 1267 return (error); 1268 } 1269 1270 
/*
 * Drop all dsl property callbacks registered for this zfsvfs.
 * Snapshots never register callbacks, so this is a no-op for them.
 */
static void
zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
{
	objset_t *os = zfsvfs->z_os;

	if (!dmu_objset_is_snapshot(os))
		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
}

/*
 * Copy the pool component of 'osname' (the text before the first '/',
 * or the whole name if there is no '/') into 'poolname'.  The caller's
 * buffer must hold at least MAXNAMELEN bytes; returns ENAMETOOLONG if
 * the pool name would not fit.
 */
static int
getpoolname(const char *osname, char *poolname)
{
	char *p;

	p = strchr(osname, '/');
	if (p == NULL) {
		if (strlen(osname) >= MAXNAMELEN)
			return (ENAMETOOLONG);
		/* Length already validated above, so strcpy is bounded. */
		(void) strcpy(poolname, osname);
	} else {
		if (p - osname >= MAXNAMELEN)
			return (ENAMETOOLONG);
		(void) strlcpy(poolname, osname, p - osname + 1);
	}
	return (0);
}

/*
 * A leading '!' on the dataset name requests a checkpoint rewind on
 * root-pool import.  Strip it in place and report it via *checkpointrewind.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{

	if (name[0] == '!') {
		*checkpointrewind = true;
		/* strlen(name) bytes moves the NUL terminator too. */
		memmove(name, name + 1, strlen(name));
	} else {
		*checkpointrewind = false;
	}
}

/*
 * VFS_MOUNT entry point: validate privileges and zone visibility for the
 * "from" dataset, handle remount and initial-root-mount cases, then hand
 * off to zfs_domount().
 */
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t *td = curthread;
	vnode_t *mvp = vfsp->mnt_vnodecovered;
	cred_t *cr = td->td_ucred;
	char *osname;
	int error = 0;
	int canwrite;
	bool checkpointrewind, isctlsnap = false;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (SET_ERROR(EINVAL));

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	fetch_osname_options(osname, &checkpointrewind);
	/* True when mounting a snapshot under a .zfs control directory. */
	isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) &&
	    strchr(osname, '@') != NULL);

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error && isctlsnap) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	} else if (error) {
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK1(mvp);
				goto out;
			}

			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				VOP_UNLOCK1(mvp);
				goto out;
			}
			VOP_UNLOCK1(mvp);
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		boolean_t mount_snapshot = B_FALSE;

		/*
		 * Snapshots may be mounted in .zfs for unjailed datasets
		 * if allowed by the jail param zfs.mount_snapshot.
		 */
		if (isctlsnap) {
			struct prison *pr;
			struct zfs_jailparam *zjp;

			pr = curthread->td_ucred->cr_prison;
			mtx_lock(&pr->pr_mtx);
			zjp = osd_jail_get(pr, zfs_jailparam_slot);
			mtx_unlock(&pr->pr_mtx);
			if (zjp && zjp->mount_snapshot)
				mount_snapshot = B_TRUE;
		}
		if (!mount_snapshot) {
			error = SET_ERROR(EPERM);
			goto out;
		}
	}

	vfsp->vfs_flag |= MNT_NFS4ACLS;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * Refresh mount options with z_teardown_lock blocking I/O while
		 * the filesystem is in an inconsistent state.
		 * The lock also serializes this code with filesystem
		 * manipulations between entry to zfs_suspend_fs() and return
		 * from zfs_resume_fs().
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfs_unregister_callbacks(zfsvfs);
		error = zfs_register_callbacks(vfsp);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
		char pname[MAXNAMELEN];

		error = getpoolname(osname, pname);
		if (error == 0)
			error = spa_import_rootpool(pname, checkpointrewind);
		if (error)
			goto out;
	}
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

out:
	return (error);
}

/*
 * VFS_STATFS entry point: fill *statp from the objset's space accounting.
 */
static int
zfs_statfs(vfs_t *vfsp, struct statfs *statp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	uint64_t refdbytes, availbytes, usedobjs, availobjs;
	int error;

	statp->f_version = STATFS_VERSION;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	dmu_objset_space(zfsvfs->z_os,
	    &refdbytes, &availbytes, &usedobjs, &availobjs);

	/*
	 * The underlying storage pool actually uses multiple block sizes.
	 * We report the fragsize as the smallest block size we support,
	 * and we report our blocksize as the filesystem's maximum blocksize.
	 */
	statp->f_bsize = SPA_MINBLOCKSIZE;
	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;

	/*
	 * The following report "total" blocks of various kinds in the
	 * file system, but reported in terms of f_frsize - the
	 * "fragment" size.
	 */

	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
	statp->f_bfree = availbytes / statp->f_bsize;
	statp->f_bavail = statp->f_bfree; /* no root reservation */

	/*
	 * statvfs() should really be called statufs(), because it assumes
	 * static metadata.  ZFS doesn't preallocate files, so the best
	 * we can do is report the max that could possibly fit in f_files,
	 * and that minus the number actually used in f_ffree.
	 * For f_ffree, report the smaller of the number of object available
	 * and the number of blocks (each object will take at least a block).
	 */
	statp->f_ffree = MIN(availobjs, statp->f_bfree);
	statp->f_files = statp->f_ffree + usedobjs;

	/*
	 * We're a zfs filesystem.
	 */
	strlcpy(statp->f_fstypename, "zfs",
	    sizeof (statp->f_fstypename));

	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
	    sizeof (statp->f_mntfromname));
	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
	    sizeof (statp->f_mntonname));

	statp->f_namemax = MAXNAMELEN - 1;

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

/*
 * VFS_ROOT entry point: return the root vnode, locked per 'flags'.
 * The vnode is locked only after dropping the teardown lock, so a lock
 * failure releases the reference and NULLs *vpp.
 */
static int
zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *rootzp;
	int error;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
	if (error == 0)
		*vpp = ZTOV(rootzp);

	zfs_exit(zfsvfs, FTAG);

	if (error == 0) {
		error = vn_lock(*vpp, flags);
		if (error != 0) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}
	}
	return (error);
}

/*
 * Teardown the zfsvfs::z_os.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t	*zp;
	dsl_dir_t *dd;

	/*
	 * If someone has not already unmounted this file system,
	 * drain the zrele_taskq to ensure all active references to the
	 * zfsvfs_t have been handled only then can it be safely destroyed.
	 */
	if (zfsvfs->z_os) {
		/*
		 * If we're unmounting we have to wait for the list to
		 * drain completely.
		 *
		 * If we're not unmounting there's no guarantee the list
		 * will drain completely, but zreles run from the taskq
		 * may add the parents of dir-based xattrs to the taskq
		 * so we want to wait for these.
		 *
		 * We can safely read z_nr_znodes without locking because the
		 * VFS has already blocked operations which add to the
		 * z_all_znodes list and thus increment z_nr_znodes.
		 */
		int round = 0;
		while (zfsvfs->z_nr_znodes > 0) {
			taskq_wait_outstanding(dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)), 0);
			/* For suspend (not unmount), wait at most twice. */
			if (++round > 1 && !unmounting)
				break;
		}
	}
	ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
#ifdef FREEBSD_NAMECACHE
#if __FreeBSD_version >= 1300117
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#else
		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
#endif
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		if (zp->z_sa_hdl != NULL) {
			zfs_znode_dmu_fini(zp);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data
	 */
	if (!zfs_is_readonly(zfsvfs))
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);
	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
	dsl_dir_cancel_waiters(dd);

	return (0);
}

/*
 * VFS_UNMOUNT entry point: unmount .zfs snapshots, flush vnodes, cancel
 * the unlinked-drain task, tear down the zfsvfs, and release the objset.
 */
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		/* Delegated 'mount' permission also allows unmount. */
		if (dsl_deleg_access((char *)vfsp->vfs_resource,
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE). This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
	/* Cancel or wait out the pending unlinked-drain task. */
	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
		taskqueue_drain(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);

	VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	zfs_freevfs(vfsp);

	return (0);
}

/*
 * VFS_VGET entry point: resolve an inode number to a locked vnode,
 * rejecting virtual .zfs entries and unlinked znodes.
 */
static int
zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *zp;
	int err;

	/*
	 * zfs_zget() can't operate on virtual entries like .zfs/ or
	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
	 * This will make NFS to switch to LOOKUP instead of using VGET.
	 */
	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
		return (EOPNOTSUPP);

	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
		return (err);
	err = zfs_zget(zfsvfs, ino, &zp);
	if (err == 0 && zp->z_unlinked) {
		vrele(ZTOV(zp));
		err = EINVAL;
	}
	if (err == 0)
		*vpp = ZTOV(zp);
	zfs_exit(zfsvfs, FTAG);
	if (err == 0) {
		err = vn_lock(*vpp, flags);
		if (err != 0)
			vrele(*vpp);
	}
	if (err != 0)
		*vpp = NULL;
	return (err);
}

/*
 * VFS_CHECKEXP entry point (signature differs across FreeBSD versions).
 */
static int
#if __FreeBSD_version >= 1300098
zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int *secflavors)
#else
zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors)
#endif
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	/*
	 * If this is regular file system vfsp is the same as
	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
	 * zfsvfs->z_parent->z_vfs represents parent file system
	 * which we have to use here, because only this file system
	 * has mnt_export configured.
	 */
	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
	    credanonp, numsecflavors, secflavors));
}

_Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
	"struct fid bigger than SHORT_FID_LEN");
_Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
	"struct fid bigger than LONG_FID_LEN");

/*
 * VFS_FHTOVP entry point: translate an NFS file handle (short or long
 * zfid) into a locked vnode.  Long fids may redirect into a snapshot's
 * own zfsvfs via zfsctl_lookup_objset(); zero-generation fids address
 * the virtual .zfs tree.
 */
static int
zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
{
	struct componentname cn;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *zp;
	vnode_t *dvp;
	uint64_t object = 0;
	uint64_t fid_gen = 0;
	uint64_t setgen = 0;
	uint64_t gen_mask;
	uint64_t zp_gen;
	int i, err;

	*vpp = NULL;

	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
		return (err);

	/*
	 * On FreeBSD we can get snapshot's mount point or its parent file
	 * system mount point depending if snapshot is already mounted or not.
	 */
	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
		zfid_long_t *zlfid = (zfid_long_t *)fidp;
		uint64_t objsetid = 0;

		/* Reassemble little-endian byte arrays into integers. */
		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

		zfs_exit(zfsvfs, FTAG);

		/* May replace zfsvfs with the snapshot's own zfsvfs. */
		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
		if (err)
			return (SET_ERROR(EINVAL));
		if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
			return (err);
	}

	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
		zfid_short_t *zfid = (zfid_short_t *)fidp;

		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
	} else {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
		zfs_exit(zfsvfs, FTAG);
		dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
		    (u_longlong_t)fid_gen, (u_longlong_t)setgen);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
	 * directory tree. If the object == zfsvfs->z_shares_dir, then
	 * we are in the .zfs/shares directory tree.
	 */
	if ((fid_gen == 0 &&
	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
		zfs_exit(zfsvfs, FTAG);
		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
		if (object == ZFSCTL_INO_SNAPDIR) {
			cn.cn_nameptr = "snapshot";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN | LOCKLEAF;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else if (object == zfsvfs->z_shares_dir) {
			/*
			 * XXX This branch must not be taken,
			 * if it is, then the lookup below will
			 * explode.
			 */
			cn.cn_nameptr = "shares";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else {
			*vpp = dvp;
		}
		return (err);
	}

	/* 'i' is left at sizeof (zf_gen) by the loop above. */
	gen_mask = -1ULL >> (64 - 8 * i);

	dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
	    (u_longlong_t)fid_gen,
	    (u_longlong_t)gen_mask);
	if ((err = zfs_zget(zfsvfs, object, &zp))) {
		zfs_exit(zfsvfs, FTAG);
		return (err);
	}
	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (uint64_t));
	zp_gen = zp_gen & gen_mask;
	if (zp_gen == 0)
		zp_gen = 1;
	if (zp->z_unlinked || zp_gen != fid_gen) {
		dprintf("znode gen (%llu) != fid gen (%llu)\n",
		    (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
		vrele(ZTOV(zp));
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	*vpp = ZTOV(zp);
	zfs_exit(zfsvfs, FTAG);
	err = vn_lock(*vpp, flags);
	if (err == 0)
		vnode_create_vobject(*vpp, zp->z_size, curthread);
	else
		*vpp = NULL;
	return (err);
}

/*
 * Block out VOPs and close zfsvfs_t::z_os
 *
 * Note, if successful, then we return with the
'z_teardown_lock' and
 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
 * dataset and objset intact so that they can be atomically handed off during
 * a subsequent rollback or recv operation and the resume thereafter.
 */
int
zfs_suspend_fs(zfsvfs_t *zfsvfs)
{
	int error;

	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
		return (error);

	return (0);
}

/*
 * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
 * is an invariant across any of the operations that can be performed while the
 * filesystem was suspended.  Whether it succeeded or failed, the preconditions
 * are the same: the relevant objset and associated dataset are owned by
 * zfsvfs, held, and long held on entry.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	ds->ds_dir->dd_activity_cancelled = B_FALSE;
	VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs. If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via zfs_enter_verify_zp
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}

/*
 * Free the zfsvfs attached to this vfs_t and drop the active-fs count.
 * Called from zfs_umount() after the objset has been released.
 */
static void
zfs_freevfs(vfs_t *vfsp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	zfsvfs_free(zfsvfs);

	atomic_dec_32(&zfs_active_fs_count);
}

#ifdef __i386__
static int desiredvnodes_backup;
#include <sys/vmmeter.h>


#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#endif

static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int newdesiredvnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * We calculate newdesiredvnodes the same way it is done in
	 * vntblinit(). If it is equal to desiredvnodes, it means that
	 * it wasn't tuned by the administrator and we can tune it down.
	 */
	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (newdesiredvnodes == desiredvnodes)
		desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}

/* Undo zfs_vnodes_adjust() at module unload. */
static void
zfs_vnodes_adjust_back(void)
{

#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}

/*
 * Module-load initialization: .zfs ctldir, znode cache, vnode tuning,
 * DMU objset type registration, and the per-module taskq.
 */
void
zfs_init(void)
{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	/*
	 * Reduce number of vnodes. Originally number of vnodes is calculated
	 * with UFS inode in mind. We reduce it here, because it's too big for
	 * ZFS/i386.
	 */
	zfs_vnodes_adjust();

	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);

	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
}

/* Module-unload teardown, in reverse order of zfs_init(). */
void
zfs_fini(void)
{
	taskq_destroy(zfsvfs_taskq);
	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
}

/* Nonzero while any ZFS filesystem is mounted. */
int
zfs_busy(void)
{
	return (zfs_active_fs_count != 0);
}

/*
 * Release VOPs and unmount a suspended filesystem.
 */
int
zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just hold and rele it to update the
	 * objset_t, as the one we had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);
	zfsvfs->z_os = os;

	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	/*
	 * Try to force unmount this file system.
	 */
	(void) zfs_umount(zfsvfs->z_vfs, 0);
	zfsvfs->z_unmounted = B_TRUE;
	return (0);
}

/*
 * Upgrade the on-disk ZPL version of a mounted filesystem to 'newvers'.
 * Downgrades and versions beyond what the pool supports are rejected.
 */
int
zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
{
	int error;
	objset_t *os = zfsvfs->z_os;
	dmu_tx_t *tx;

	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
		return (SET_ERROR(EINVAL));

	if (newvers < zfsvfs->z_version)
		return (SET_ERROR(EINVAL));

	if (zfs_spa_version_map(newvers) >
	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
		return (SET_ERROR(ENOTSUP));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		/* First SA-capable version: also create the SA master node. */
		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
		    ZFS_SA_ATTRS);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
	    8, 1, &newvers, tx);

	if (error) {
		dmu_tx_commit(tx);
		return (error);
	}

	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		uint64_t sa_obj;

		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
		    SPA_VERSION_SA);
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);

		error = zap_add(os, MASTER_NODE_OBJ,
		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT0(error);

VERIFY0(sa_set_sa_object(os, sa_obj)); 2203 sa_register_update_callback(os, zfs_sa_upgrade); 2204 } 2205 2206 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, 2207 "from %ju to %ju", (uintmax_t)zfsvfs->z_version, 2208 (uintmax_t)newvers); 2209 dmu_tx_commit(tx); 2210 2211 zfsvfs->z_version = newvers; 2212 os->os_version = newvers; 2213 2214 zfs_set_fuid_feature(zfsvfs); 2215 2216 return (0); 2217 } 2218 2219 /* 2220 * Read a property stored within the master node. 2221 */ 2222 int 2223 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 2224 { 2225 uint64_t *cached_copy = NULL; 2226 2227 /* 2228 * Figure out where in the objset_t the cached copy would live, if it 2229 * is available for the requested property. 2230 */ 2231 if (os != NULL) { 2232 switch (prop) { 2233 case ZFS_PROP_VERSION: 2234 cached_copy = &os->os_version; 2235 break; 2236 case ZFS_PROP_NORMALIZE: 2237 cached_copy = &os->os_normalization; 2238 break; 2239 case ZFS_PROP_UTF8ONLY: 2240 cached_copy = &os->os_utf8only; 2241 break; 2242 case ZFS_PROP_CASE: 2243 cached_copy = &os->os_casesensitivity; 2244 break; 2245 default: 2246 break; 2247 } 2248 } 2249 if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { 2250 *value = *cached_copy; 2251 return (0); 2252 } 2253 2254 /* 2255 * If the property wasn't cached, look up the file system's value for 2256 * the property. For the version property, we look up a slightly 2257 * different string. 
2258 */ 2259 const char *pname; 2260 int error = ENOENT; 2261 if (prop == ZFS_PROP_VERSION) { 2262 pname = ZPL_VERSION_STR; 2263 } else { 2264 pname = zfs_prop_to_name(prop); 2265 } 2266 2267 if (os != NULL) { 2268 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); 2269 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 2270 } 2271 2272 if (error == ENOENT) { 2273 /* No value set, use the default value */ 2274 switch (prop) { 2275 case ZFS_PROP_VERSION: 2276 *value = ZPL_VERSION; 2277 break; 2278 case ZFS_PROP_NORMALIZE: 2279 case ZFS_PROP_UTF8ONLY: 2280 *value = 0; 2281 break; 2282 case ZFS_PROP_CASE: 2283 *value = ZFS_CASE_SENSITIVE; 2284 break; 2285 case ZFS_PROP_ACLTYPE: 2286 *value = ZFS_ACLTYPE_NFSV4; 2287 break; 2288 default: 2289 return (error); 2290 } 2291 error = 0; 2292 } 2293 2294 /* 2295 * If one of the methods for getting the property value above worked, 2296 * copy it into the objset_t's cache. 2297 */ 2298 if (error == 0 && cached_copy != NULL) { 2299 *cached_copy = *value; 2300 } 2301 2302 return (error); 2303 } 2304 2305 /* 2306 * Return true if the corresponding vfs's unmounted flag is set. 2307 * Otherwise return false. 2308 * If this function returns true we know VFS unmount has been initiated. 
2309 */ 2310 boolean_t 2311 zfs_get_vfs_flag_unmounted(objset_t *os) 2312 { 2313 zfsvfs_t *zfvp; 2314 boolean_t unmounted = B_FALSE; 2315 2316 ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS); 2317 2318 mutex_enter(&os->os_user_ptr_lock); 2319 zfvp = dmu_objset_get_user(os); 2320 if (zfvp != NULL && zfvp->z_vfs != NULL && 2321 (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) 2322 unmounted = B_TRUE; 2323 mutex_exit(&os->os_user_ptr_lock); 2324 2325 return (unmounted); 2326 } 2327 2328 #ifdef _KERNEL 2329 void 2330 zfsvfs_update_fromname(const char *oldname, const char *newname) 2331 { 2332 char tmpbuf[MAXPATHLEN]; 2333 struct mount *mp; 2334 char *fromname; 2335 size_t oldlen; 2336 2337 oldlen = strlen(oldname); 2338 2339 mtx_lock(&mountlist_mtx); 2340 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 2341 fromname = mp->mnt_stat.f_mntfromname; 2342 if (strcmp(fromname, oldname) == 0) { 2343 (void) strlcpy(fromname, newname, 2344 sizeof (mp->mnt_stat.f_mntfromname)); 2345 continue; 2346 } 2347 if (strncmp(fromname, oldname, oldlen) == 0 && 2348 (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { 2349 (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", 2350 newname, fromname + oldlen); 2351 (void) strlcpy(fromname, tmpbuf, 2352 sizeof (mp->mnt_stat.f_mntfromname)); 2353 continue; 2354 } 2355 } 2356 mtx_unlock(&mountlist_mtx); 2357 } 2358 #endif 2359 2360 /* 2361 * Find a prison with ZFS info. 2362 * Return the ZFS info and the (locked) prison. 
 */
/*
 * Implementation note: walks from spr toward the root.  prison0 always
 * answers with the statically allocated zfs_jailparam0; any other
 * prison answers only if it has data in the ZFS OSD slot.  Returns
 * with the found prison's pr_mtx held and *prp pointing at it.
 */
static struct zfs_jailparam *
zfs_jailparam_find(struct prison *spr, struct prison **prp)
{
	struct prison *pr;
	struct zfs_jailparam *zjp;

	for (pr = spr; ; pr = pr->pr_parent) {
		mtx_lock(&pr->pr_mtx);
		if (pr == &prison0) {
			zjp = &zfs_jailparam0;
			break;
		}
		zjp = osd_jail_get(pr, zfs_jailparam_slot);
		if (zjp != NULL)
			break;
		/* Not here; drop the lock and try the parent. */
		mtx_unlock(&pr->pr_mtx);
	}
	*prp = pr;

	return (zjp);
}

/*
 * Ensure a prison has its own ZFS info.  If zjpp is non-null, point it to the
 * ZFS info and lock the prison (i.e. return with pr->pr_mtx held).
 */
static void
zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
{
	struct prison *ppr;
	struct zfs_jailparam *zjp, *nzjp;
	void **rsv;

	/* If this prison already has ZFS info, return that. */
	zjp = zfs_jailparam_find(pr, &ppr);
	if (ppr == pr)
		goto done;

	/*
	 * Allocate a new info record.  Then check again, in case something
	 * changed during the allocation.
	 */
	mtx_unlock(&ppr->pr_mtx);
	nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
	/* Pre-reserve OSD space so the insert below cannot fail. */
	rsv = osd_reserve(zfs_jailparam_slot);
	zjp = zfs_jailparam_find(pr, &ppr);
	if (ppr == pr) {
		/* Lost the race: someone installed a record meanwhile. */
		free(nzjp, M_PRISON);
		osd_free_reserved(rsv);
		goto done;
	}
	/* Inherit the initial values from the ancestor. */
	mtx_lock(&pr->pr_mtx);
	(void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
	(void) memcpy(nzjp, zjp, sizeof (*zjp));
	zjp = nzjp;
	/* ppr's lock protected the values just copied; done with it. */
	mtx_unlock(&ppr->pr_mtx);
done:
	/* With zjpp set, pr's pr_mtx is intentionally left held. */
	if (zjpp != NULL)
		*zjpp = zjp;
	else
		mtx_unlock(&pr->pr_mtx);
}

/*
 * Jail OSD methods for ZFS VFS info.
 */
/*
 * PR_METHOD_CREATE: unless the new prison asked for "zfs=inherit",
 * give it its own parameter record (seeded from its parent by
 * zfs_jailparam_alloc()).
 */
static int
zfs_jailparam_create(void *obj, void *data)
{
	struct prison *pr = obj;
	struct vfsoptlist *opts = data;
	int jsys;

	if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
	    jsys == JAIL_SYS_INHERIT)
		return (0);
	/*
	 * Inherit a prison's initial values from its parent
	 * (different from JAIL_SYS_INHERIT which also inherits changes).
	 */
	zfs_jailparam_alloc(pr, NULL);
	return (0);
}

/*
 * PR_METHOD_GET: report whether the prison has its own ZFS info
 * ("zfs=new") or inherits it, plus its zfs.mount_snapshot value.
 * ENOENT from vfs_setopt only means the caller did not ask for that
 * option, so it is not treated as an error.
 */
static int
zfs_jailparam_get(void *obj, void *data)
{
	struct prison *ppr, *pr = obj;
	struct vfsoptlist *opts = data;
	struct zfs_jailparam *zjp;
	int jsys, error;

	zjp = zfs_jailparam_find(pr, &ppr);
	jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
	error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
	if (error != 0 && error != ENOENT)
		goto done;
	if (jsys == JAIL_SYS_NEW) {
		error = vfs_setopt(opts, "zfs.mount_snapshot",
		    &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
		if (error != 0 && error != ENOENT)
			goto done;
	} else {
		/*
		 * If this prison is inheriting its ZFS info, report
		 * empty/zero parameters.
		 */
		static int mount_snapshot = 0;

		error = vfs_setopt(opts, "zfs.mount_snapshot",
		    &mount_snapshot, sizeof (mount_snapshot));
		if (error != 0 && error != ENOENT)
			goto done;
	}
	error = 0;
done:
	/* zfs_jailparam_find() returned with ppr's pr_mtx held. */
	mtx_unlock(&ppr->pr_mtx);
	return (error);
}

/*
 * PR_METHOD_SET: apply the "zfs" / "zfs.mount_snapshot" parameters.
 * -1 serves as a "not specified" sentinel for both options; naming
 * zfs.mount_snapshot at all implies "zfs=new".
 */
static int
zfs_jailparam_set(void *obj, void *data)
{
	struct prison *pr = obj;
	struct prison *ppr;
	struct vfsoptlist *opts = data;
	int error, jsys, mount_snapshot;

	/* Set the parameters, which should be correct. */
	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
	if (error == ENOENT)
		jsys = -1;
	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
	    sizeof (mount_snapshot));
	if (error == ENOENT)
		mount_snapshot = -1;
	else
		jsys = JAIL_SYS_NEW;
	switch (jsys) {
	case JAIL_SYS_NEW:
	{
		/* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
		struct zfs_jailparam *zjp;

		/*
		 * A child jail cannot have more permissions than its parent
		 */
		if (pr->pr_parent != &prison0) {
			zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
			mtx_unlock(&ppr->pr_mtx);
			if (zjp->mount_snapshot < mount_snapshot) {
				return (EPERM);
			}
		}
		/* zfs_jailparam_alloc() returns with pr->pr_mtx held. */
		zfs_jailparam_alloc(pr, &zjp);
		if (mount_snapshot != -1)
			zjp->mount_snapshot = mount_snapshot;
		mtx_unlock(&pr->pr_mtx);
		break;
	}
	case JAIL_SYS_INHERIT:
		/* "zfs=inherit": inherit the parent's ZFS info. */
		mtx_lock(&pr->pr_mtx);
		osd_jail_del(pr, zfs_jailparam_slot);
		mtx_unlock(&pr->pr_mtx);
		break;
	case -1:
		/*
		 * If the setting being changed is not ZFS related
		 * then do nothing.
		 */
		break;
	}

	return (0);
}

/*
 * PR_METHOD_CHECK: validate parameter values before they are applied.
 * ENOENT means the option was simply not supplied, which is fine.
 */
static int
zfs_jailparam_check(void *obj __unused, void *data)
{
	struct vfsoptlist *opts = data;
	int error, jsys, mount_snapshot;

	/* Check that the parameters are correct. */
	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
	if (error != ENOENT) {
		if (error != 0)
			return (error);
		if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
			return (EINVAL);
	}
	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
	    sizeof (mount_snapshot));
	if (error != ENOENT) {
		if (error != 0)
			return (error);
		if (mount_snapshot != 0 && mount_snapshot != 1)
			return (EINVAL);
	}
	return (0);
}

/*
 * OSD destructor: free a prison's ZFS parameter record.
 */
static void
zfs_jailparam_destroy(void *data)
{

	free(data, M_PRISON);
}

/*
 * Register the jail OSD slot and seed every pre-existing prison with
 * its own parameter record.
 */
static void
zfs_jailparam_sysinit(void *arg __unused)
{
	struct prison *pr;
	osd_method_t methods[PR_MAXMETHOD] = {
		[PR_METHOD_CREATE] = zfs_jailparam_create,
		[PR_METHOD_GET] = zfs_jailparam_get,
		[PR_METHOD_SET] = zfs_jailparam_set,
		[PR_METHOD_CHECK] = zfs_jailparam_check,
	};

	zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
	/* Copy the defaults to any existing prisons. */
	sx_slock(&allprison_lock);
	TAILQ_FOREACH(pr, &allprison, pr_list)
		zfs_jailparam_alloc(pr, NULL);
	sx_sunlock(&allprison_lock);
}

/*
 * Deregister the jail OSD slot; per osd(9), deregistration invokes the
 * destructor for any remaining per-prison records.
 */
static void
zfs_jailparam_sysuninit(void *arg __unused)
{

	osd_jail_deregister(zfs_jailparam_slot);
}

SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
    zfs_jailparam_sysinit, NULL);
SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
    zfs_jailparam_sysuninit, NULL);