1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24  * All rights reserved.
25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
28  */
29 
30 /* Portions Copyright 2010 Robert Milkowski */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/sysmacros.h>
37 #include <sys/kmem.h>
38 #include <sys/acl.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/mntent.h>
42 #include <sys/mount.h>
43 #include <sys/cmn_err.h>
44 #include <sys/zfs_znode.h>
45 #include <sys/zfs_vnops.h>
46 #include <sys/zfs_dir.h>
47 #include <sys/zil.h>
48 #include <sys/fs/zfs.h>
49 #include <sys/dmu.h>
50 #include <sys/dsl_prop.h>
51 #include <sys/dsl_dataset.h>
52 #include <sys/dsl_deleg.h>
53 #include <sys/spa.h>
54 #include <sys/zap.h>
55 #include <sys/sa.h>
56 #include <sys/sa_impl.h>
57 #include <sys/policy.h>
58 #include <sys/atomic.h>
59 #include <sys/zfs_ioctl.h>
60 #include <sys/zfs_ctldir.h>
61 #include <sys/zfs_fuid.h>
62 #include <sys/sunddi.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/dsl_dir.h>
65 #include <sys/jail.h>
66 #include <sys/osd.h>
67 #include <ufs/ufs/quota.h>
68 #include <sys/zfs_quota.h>
69 
70 #include "zfs_comutil.h"
71 
/*
 * Fallback definitions for mount kernel flags, used only when the headers
 * included above do not already define them, so the rest of the file can
 * reference the names unconditionally.
 */
#ifndef	MNTK_VMSETSIZE_BUG
#define	MNTK_VMSETSIZE_BUG	0
#endif
#ifndef	MNTK_NOMSYNC
#define	MNTK_NOMSYNC	8
#endif
78 
/* Default mutex named "zfs_debug", set up through MTX_SYSINIT. */
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);

/* Root of the vfs.zfs sysctl subtree. */
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

/* vfs.zfs.super_owner: read-write integer knob (see description string). */
int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
	"File system owners can perform privileged operation on file systems");

/* vfs.zfs.debug: read-write integer that is also settable as a tunable. */
int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
	"Debug level");

/* Per-jail ZFS parameters. */
struct zfs_jailparam {
	int mount_snapshot;
};

/* Default parameter set: snapshot mounting disabled. */
static struct zfs_jailparam zfs_jailparam0 = {
	.mount_snapshot = 0,
};

/* NOTE(review): presumably the osd(9) slot holding zfs_jailparam -- confirm. */
static int zfs_jailparam_slot;

SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters");
SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I",
	"Allow mounting snapshots in the .zfs directory for unjailed datasets");

/* Read-only vfs.zfs.version.* nodes exporting compile-time versions. */
SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
	"ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
	"SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
	"ZPL_VERSION");
116 
/*
 * Forward declarations for the VFS operations wired into zfs_vfsops below.
 * The prototypes of zfs_quotactl() and zfs_checkexp() depend on
 * __FreeBSD_version because the corresponding VFS method signatures differ
 * between kernel versions.
 */
#if __FreeBSD_version >= 1400018
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
    bool *mp_busy);
#else
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
#endif
static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
#if __FreeBSD_version >= 1300098
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int *secflavors);
#else
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors);
#endif
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_freevfs(vfs_t *vfsp);
138 
/*
 * VFS operations vector for ZFS.  On __FreeBSD_version >= 1300049 the root
 * lookup goes through vfs_cache_root() with zfs_root() installed as the
 * vfs_cachedroot method; older kernels call zfs_root() directly.
 */
struct vfsops zfs_vfsops = {
	.vfs_mount =		zfs_mount,
	.vfs_unmount =		zfs_umount,
#if __FreeBSD_version >= 1300049
	.vfs_root =		vfs_cache_root,
	.vfs_cachedroot = zfs_root,
#else
	.vfs_root =		zfs_root,
#endif
	.vfs_statfs =		zfs_statfs,
	.vfs_vget =		zfs_vget,
	.vfs_sync =		zfs_sync,
	.vfs_checkexp =		zfs_checkexp,
	.vfs_fhtovp =		zfs_fhtovp,
	.vfs_quotactl =		zfs_quotactl,
};

/*
 * Register the "zfs" filesystem type.  VFCF_CROSS_COPY_FILE_RANGE is added
 * to the flags only when the kernel headers define it.
 */
#ifdef VFCF_CROSS_COPY_FILE_RANGE
VFS_SET(zfs_vfsops, zfs,
    VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE);
#else
VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL);
#endif
162 
/*
 * Count of active ZFS file systems.
 * We need to keep this count to prevent our module from being unloaded
 * after a umount -f.
 */
static uint32_t	zfs_active_fs_count = 0;
169 
170 int
171 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
172     char *setpoint)
173 {
174 	int error;
175 	zfsvfs_t *zfvp;
176 	vfs_t *vfsp;
177 	objset_t *os;
178 	uint64_t tmp = *val;
179 
180 	error = dmu_objset_from_ds(ds, &os);
181 	if (error != 0)
182 		return (error);
183 
184 	error = getzfsvfs_impl(os, &zfvp);
185 	if (error != 0)
186 		return (error);
187 	if (zfvp == NULL)
188 		return (ENOENT);
189 	vfsp = zfvp->z_vfs;
190 	switch (zfs_prop) {
191 	case ZFS_PROP_ATIME:
192 		if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
193 			tmp = 0;
194 		if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
195 			tmp = 1;
196 		break;
197 	case ZFS_PROP_DEVICES:
198 		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
199 			tmp = 0;
200 		if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
201 			tmp = 1;
202 		break;
203 	case ZFS_PROP_EXEC:
204 		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
205 			tmp = 0;
206 		if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
207 			tmp = 1;
208 		break;
209 	case ZFS_PROP_SETUID:
210 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
211 			tmp = 0;
212 		if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
213 			tmp = 1;
214 		break;
215 	case ZFS_PROP_READONLY:
216 		if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
217 			tmp = 0;
218 		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
219 			tmp = 1;
220 		break;
221 	case ZFS_PROP_XATTR:
222 		if (zfvp->z_flags & ZSB_XATTR)
223 			tmp = zfvp->z_xattr;
224 		break;
225 	case ZFS_PROP_NBMAND:
226 		if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
227 			tmp = 0;
228 		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
229 			tmp = 1;
230 		break;
231 	default:
232 		vfs_unbusy(vfsp);
233 		return (ENOENT);
234 	}
235 
236 	vfs_unbusy(vfsp);
237 	if (tmp != *val) {
238 		if (setpoint)
239 			(void) strcpy(setpoint, "temporary");
240 		*val = tmp;
241 	}
242 	return (0);
243 }
244 
245 static int
246 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
247 {
248 	int error = 0;
249 	char buf[32];
250 	uint64_t usedobj, quotaobj;
251 	uint64_t quota, used = 0;
252 	timespec_t now;
253 
254 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
255 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
256 
257 	if (quotaobj == 0 || zfsvfs->z_replay) {
258 		error = ENOENT;
259 		goto done;
260 	}
261 	(void) sprintf(buf, "%llx", (longlong_t)id);
262 	if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
263 	    buf, sizeof (quota), 1, &quota)) != 0) {
264 		dprintf("%s(%d): quotaobj lookup failed\n",
265 		    __FUNCTION__, __LINE__);
266 		goto done;
267 	}
268 	/*
269 	 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
270 	 * So we set them to be the same.
271 	 */
272 	dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
273 	error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
274 	if (error && error != ENOENT) {
275 		dprintf("%s(%d):  usedobj failed; %d\n",
276 		    __FUNCTION__, __LINE__, error);
277 		goto done;
278 	}
279 	dqp->dqb_curblocks = btodb(used);
280 	dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
281 	vfs_timestamp(&now);
282 	/*
283 	 * Setting this to 0 causes FreeBSD quota(8) to print
284 	 * the number of days since the epoch, which isn't
285 	 * particularly useful.
286 	 */
287 	dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
288 done:
289 	return (error);
290 }
291 
/*
 * VFS quotactl method: services BSD quota sub-commands against the ZFS
 * user/group quota state.
 *
 * 'cmds' packs the sub-command (cmds >> SUBCMDSHIFT) and the quota type
 * (cmds & SUBCMDMASK).  An 'id' of -1 is replaced by the calling thread's
 * real uid or real gid, depending on the type.  'arg' is a user-space
 * pointer used with copyin()/copyout().
 *
 * Returns 0 on success or an errno value: EINVAL for an unsupported type or
 * sub-command, ENOTSUP for Q_QUOTAOFF, or whatever zfs_enter(), copyin(),
 * copyout(), zfs_set_userquota() or zfs_getquota() report.
 *
 * With __FreeBSD_version < 1400018 the Q_QUOTAON/Q_QUOTAOFF paths call
 * vfs_unbusy() on the mount themselves.
 * NOTE(review): the zfs_enter() failure return and the MAXQUOTAS range
 * check leave without unbusying for those sub-commands -- confirm whether
 * that is intended on those kernels.
 */
static int
#if __FreeBSD_version >= 1400018
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
#else
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
#endif
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	struct thread *td;
	int cmd, type, error = 0;
	int bitsize;
	zfs_userquota_prop_t quota_type;
	struct dqblk64 dqblk = { 0 };

	td = curthread;
	/* Split the packed argument into sub-command and quota type. */
	cmd = cmds >> SUBCMDSHIFT;
	type = cmds & SUBCMDMASK;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);
	/* id == -1 means "the caller": substitute its real uid/gid. */
	if (id == -1) {
		switch (type) {
		case USRQUOTA:
			id = td->td_ucred->cr_ruid;
			break;
		case GRPQUOTA:
			id = td->td_ucred->cr_rgid;
			break;
		default:
			error = EINVAL;
#if __FreeBSD_version < 1400018
			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
				vfs_unbusy(vfsp);
#endif
			goto done;
		}
	}
	/*
	 * Map BSD type to:
	 * ZFS_PROP_USERUSED,
	 * ZFS_PROP_USERQUOTA,
	 * ZFS_PROP_GROUPUSED,
	 * ZFS_PROP_GROUPQUOTA
	 *
	 * quota_type is only consumed by the Q_SETQUOTA case further down.
	 */
	switch (cmd) {
	case Q_SETQUOTA:
	case Q_SETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERQUOTA;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPQUOTA;
		else
			error = EINVAL;
		break;
	case Q_GETQUOTA:
	case Q_GETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERUSED;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPUSED;
		else
			error = EINVAL;
		break;
	}

	/*
	 * Depending on the cmd, we may need to get
	 * the ruid and domain (see fuidstr_to_sid?),
	 * the fuid (how?), or other information.
	 * Create fuid using zfs_fuid_create(zfsvfs, id,
	 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
	 * I think I can use just the id?
	 *
	 * Look at zfs_id_overquota() to look up a quota.
	 * zap_lookup(something, quotaobj, fuidstring,
	 *     sizeof (long long), 1, &quota)
	 *
	 * See zfs_set_userquota() to set a quota.
	 */
	/* Reject any quota type outside [0, MAXQUOTAS). */
	if ((uint32_t)type >= MAXQUOTAS) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Dispatch the sub-command.  The *32 variants are not handled here
	 * and end up in the default case (EINVAL).
	 */
	switch (cmd) {
	case Q_GETQUOTASIZE:
		/* Report the width of the quota fields in bits. */
		bitsize = 64;
		error = copyout(&bitsize, arg, sizeof (int));
		break;
	case Q_QUOTAON:
		// As far as I can tell, you can't turn quotas on or off on zfs
		error = 0;
#if __FreeBSD_version < 1400018
		vfs_unbusy(vfsp);
#endif
		break;
	case Q_QUOTAOFF:
		error = ENOTSUP;
#if __FreeBSD_version < 1400018
		vfs_unbusy(vfsp);
#endif
		break;
	case Q_SETQUOTA:
		/* Only the block hard limit is applied, converted to bytes. */
		error = copyin(arg, &dqblk, sizeof (dqblk));
		if (error == 0)
			error = zfs_set_userquota(zfsvfs, quota_type,
			    "", id, dbtob(dqblk.dqb_bhardlimit));
		break;
	case Q_GETQUOTA:
		error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
		if (error == 0)
			error = copyout(&dqblk, arg, sizeof (dqblk));
		break;
	default:
		error = EINVAL;
		break;
	}
done:
	zfs_exit(zfsvfs, FTAG);
	return (error);
}
413 
414 
415 boolean_t
416 zfs_is_readonly(zfsvfs_t *zfsvfs)
417 {
418 	return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
419 }
420 
421 static int
422 zfs_sync(vfs_t *vfsp, int waitfor)
423 {
424 
425 	/*
426 	 * Data integrity is job one.  We don't want a compromised kernel
427 	 * writing to the storage pool, so we never sync during panic.
428 	 */
429 	if (panicstr)
430 		return (0);
431 
432 	/*
433 	 * Ignore the system syncher.  ZFS already commits async data
434 	 * at zfs_txg_timeout intervals.
435 	 */
436 	if (waitfor == MNT_LAZY)
437 		return (0);
438 
439 	if (vfsp != NULL) {
440 		/*
441 		 * Sync a specific filesystem.
442 		 */
443 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
444 		dsl_pool_t *dp;
445 		int error;
446 
447 		if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
448 			return (error);
449 		dp = dmu_objset_pool(zfsvfs->z_os);
450 
451 		/*
452 		 * If the system is shutting down, then skip any
453 		 * filesystems which may exist on a suspended pool.
454 		 */
455 		if (rebooting && spa_suspended(dp->dp_spa)) {
456 			zfs_exit(zfsvfs, FTAG);
457 			return (0);
458 		}
459 
460 		if (zfsvfs->z_log != NULL)
461 			zil_commit(zfsvfs->z_log, 0);
462 
463 		zfs_exit(zfsvfs, FTAG);
464 	} else {
465 		/*
466 		 * Sync all ZFS filesystems.  This is what happens when you
467 		 * run sync(8).  Unlike other filesystems, ZFS honors the
468 		 * request by waiting for all pools to commit all dirty data.
469 		 */
470 		spa_sync_allpools();
471 	}
472 
473 	return (0);
474 }
475 
476 static void
477 atime_changed_cb(void *arg, uint64_t newval)
478 {
479 	zfsvfs_t *zfsvfs = arg;
480 
481 	if (newval == TRUE) {
482 		zfsvfs->z_atime = TRUE;
483 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
484 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
485 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
486 	} else {
487 		zfsvfs->z_atime = FALSE;
488 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
489 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
490 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
491 	}
492 }
493 
494 static void
495 xattr_changed_cb(void *arg, uint64_t newval)
496 {
497 	zfsvfs_t *zfsvfs = arg;
498 
499 	if (newval == ZFS_XATTR_OFF) {
500 		zfsvfs->z_flags &= ~ZSB_XATTR;
501 	} else {
502 		zfsvfs->z_flags |= ZSB_XATTR;
503 
504 		if (newval == ZFS_XATTR_SA)
505 			zfsvfs->z_xattr_sa = B_TRUE;
506 		else
507 			zfsvfs->z_xattr_sa = B_FALSE;
508 	}
509 }
510 
511 static void
512 blksz_changed_cb(void *arg, uint64_t newval)
513 {
514 	zfsvfs_t *zfsvfs = arg;
515 	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
516 	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
517 	ASSERT(ISP2(newval));
518 
519 	zfsvfs->z_max_blksz = newval;
520 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
521 }
522 
523 static void
524 readonly_changed_cb(void *arg, uint64_t newval)
525 {
526 	zfsvfs_t *zfsvfs = arg;
527 
528 	if (newval) {
529 		/* XXX locking on vfs_flag? */
530 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
531 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
532 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
533 	} else {
534 		/* XXX locking on vfs_flag? */
535 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
536 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
537 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
538 	}
539 }
540 
541 static void
542 setuid_changed_cb(void *arg, uint64_t newval)
543 {
544 	zfsvfs_t *zfsvfs = arg;
545 
546 	if (newval == FALSE) {
547 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
548 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
549 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
550 	} else {
551 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
552 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
553 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
554 	}
555 }
556 
557 static void
558 exec_changed_cb(void *arg, uint64_t newval)
559 {
560 	zfsvfs_t *zfsvfs = arg;
561 
562 	if (newval == FALSE) {
563 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
564 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
565 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
566 	} else {
567 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
568 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
569 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
570 	}
571 }
572 
573 /*
574  * The nbmand mount option can be changed at mount time.
575  * We can't allow it to be toggled on live file systems or incorrect
576  * behavior may be seen from cifs clients
577  *
578  * This property isn't registered via dsl_prop_register(), but this callback
579  * will be called when a file system is first mounted
580  */
581 static void
582 nbmand_changed_cb(void *arg, uint64_t newval)
583 {
584 	zfsvfs_t *zfsvfs = arg;
585 	if (newval == FALSE) {
586 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
587 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
588 	} else {
589 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
590 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
591 	}
592 }
593 
594 static void
595 snapdir_changed_cb(void *arg, uint64_t newval)
596 {
597 	zfsvfs_t *zfsvfs = arg;
598 
599 	zfsvfs->z_show_ctldir = newval;
600 }
601 
602 static void
603 acl_mode_changed_cb(void *arg, uint64_t newval)
604 {
605 	zfsvfs_t *zfsvfs = arg;
606 
607 	zfsvfs->z_acl_mode = newval;
608 }
609 
610 static void
611 acl_inherit_changed_cb(void *arg, uint64_t newval)
612 {
613 	zfsvfs_t *zfsvfs = arg;
614 
615 	zfsvfs->z_acl_inherit = newval;
616 }
617 
618 static void
619 acl_type_changed_cb(void *arg, uint64_t newval)
620 {
621 	zfsvfs_t *zfsvfs = arg;
622 
623 	zfsvfs->z_acl_type = newval;
624 }
625 
/*
 * Register the dataset property callbacks for the filesystem mounted at
 * 'vfsp' and then re-apply any temporary overrides given as mount options.
 *
 * Returns 0 on success, EOPNOTSUPP for a snapshot objset, the error from
 * dsl_prop_get_int_ds() when "nbmand" cannot be read, or the first error
 * from dsl_prop_register() (in which case every callback registered for
 * this zfsvfs is unregistered again).
 */
static int
zfs_register_callbacks(vfs_t *vfsp)
{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	uint64_t nbmand;
	/*
	 * For each overridable property: the value requested by the mount
	 * options and a do_* flag saying whether an override was requested.
	 */
	boolean_t readonly = B_FALSE;
	boolean_t do_readonly = B_FALSE;
	boolean_t setuid = B_FALSE;
	boolean_t do_setuid = B_FALSE;
	boolean_t exec = B_FALSE;
	boolean_t do_exec = B_FALSE;
	/*
	 * NOTE(review): 'xattr' is declared boolean_t but is assigned
	 * ZFS_XATTR_* values below -- confirm boolean_t is wide enough for
	 * all of them on every supported build.
	 */
	boolean_t xattr = B_FALSE;
	boolean_t atime = B_FALSE;
	boolean_t do_atime = B_FALSE;
	boolean_t do_xattr = B_FALSE;
	int error = 0;

	ASSERT3P(vfsp, !=, NULL);
	zfsvfs = vfsp->vfs_data;
	ASSERT3P(zfsvfs, !=, NULL);
	os = zfsvfs->z_os;

	/*
	 * This function can be called for a snapshot when we update snapshot's
	 * mount point, which isn't really supported.
	 */
	if (dmu_objset_is_snapshot(os))
		return (EOPNOTSUPP);

	/*
	 * The act of registering our callbacks will destroy any mount
	 * options we may have.  In order to enable temporary overrides
	 * of mount options, we stash away the current values and
	 * restore them after we register the callbacks.
	 */
	/* A non-writeable pool forces read-only regardless of options. */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
	    !spa_writeable(dmu_objset_spa(os))) {
		readonly = B_TRUE;
		do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
		readonly = B_FALSE;
		do_readonly = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
		setuid = B_FALSE;
		do_setuid = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
		setuid = B_TRUE;
		do_setuid = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
		exec = B_FALSE;
		do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
		exec = B_TRUE;
		do_exec = B_TRUE;
	}
	/* The xattr override is also recorded in zfsvfs->z_xattr. */
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
		do_xattr = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
		atime = B_FALSE;
		do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
		atime = B_TRUE;
		do_atime = B_TRUE;
	}

	/*
	 * We need to enter pool configuration here, so that we can use
	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
	 * dsl_prop_get_integer() can not be used, because it has to acquire
	 * spa_namespace_lock and we can not do that because we already hold
	 * z_teardown_lock.  The problem is that spa_write_cachefile() is called
	 * with spa_namespace_lock held and the function calls ZFS vnode
	 * operations to write the cache file and thus z_teardown_lock is
	 * acquired after spa_namespace_lock.
	 */
	ds = dmu_objset_ds(os);
	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);

	/*
	 * nbmand is a special property.  It can only be changed at
	 * mount time.
	 *
	 * This is weird, but it is documented to only be changeable
	 * at mount time.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		nbmand = B_FALSE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
		nbmand = B_TRUE;
	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		return (error);
	}

	/*
	 * Register property callbacks.
	 *
	 * It would probably be fine to just check for i/o error from
	 * the first prop_register(), but I guess I like to go
	 * overboard...
	 *
	 * Each registration is skipped once an earlier one has failed.
	 */
	error = dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
	    zfsvfs);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
	if (error)
		goto unregister;

	/*
	 * Invoke our callbacks to restore temporary mount options.
	 */
	if (do_readonly)
		readonly_changed_cb(zfsvfs, readonly);
	if (do_setuid)
		setuid_changed_cb(zfsvfs, setuid);
	if (do_exec)
		exec_changed_cb(zfsvfs, exec);
	if (do_xattr)
		xattr_changed_cb(zfsvfs, xattr);
	if (do_atime)
		atime_changed_cb(zfsvfs, atime);

	/* nbmand has no registered callback, so apply it by hand. */
	nbmand_changed_cb(zfsvfs, nbmand);

	return (0);

unregister:
	/* Drop whatever callbacks were registered before the failure. */
	dsl_prop_unregister_all(ds, zfsvfs);
	return (error);
}
789 
790 /*
791  * Associate this zfsvfs with the given objset, which must be owned.
792  * This will cache a bunch of on-disk state from the objset in the
793  * zfsvfs.
794  */
795 static int
796 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
797 {
798 	int error;
799 	uint64_t val;
800 
801 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
802 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
803 	zfsvfs->z_os = os;
804 
805 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
806 	if (error != 0)
807 		return (error);
808 	if (zfsvfs->z_version >
809 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
810 		(void) printf("Can't mount a version %lld file system "
811 		    "on a version %lld pool\n. Pool must be upgraded to mount "
812 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
813 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
814 		return (SET_ERROR(ENOTSUP));
815 	}
816 	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
817 	if (error != 0)
818 		return (error);
819 	zfsvfs->z_norm = (int)val;
820 
821 	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
822 	if (error != 0)
823 		return (error);
824 	zfsvfs->z_utf8 = (val != 0);
825 
826 	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
827 	if (error != 0)
828 		return (error);
829 	zfsvfs->z_case = (uint_t)val;
830 
831 	error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
832 	if (error != 0)
833 		return (error);
834 	zfsvfs->z_acl_type = (uint_t)val;
835 
836 	/*
837 	 * Fold case on file systems that are always or sometimes case
838 	 * insensitive.
839 	 */
840 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
841 	    zfsvfs->z_case == ZFS_CASE_MIXED)
842 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
843 
844 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
845 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
846 
847 	uint64_t sa_obj = 0;
848 	if (zfsvfs->z_use_sa) {
849 		/* should either have both of these objects or none */
850 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
851 		    &sa_obj);
852 		if (error != 0)
853 			return (error);
854 
855 		error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
856 		if (error == 0 && val == ZFS_XATTR_SA)
857 			zfsvfs->z_xattr_sa = B_TRUE;
858 	}
859 
860 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
861 	    &zfsvfs->z_attr_table);
862 	if (error != 0)
863 		return (error);
864 
865 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
866 		sa_register_update_callback(os, zfs_sa_upgrade);
867 
868 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
869 	    &zfsvfs->z_root);
870 	if (error != 0)
871 		return (error);
872 	ASSERT3U(zfsvfs->z_root, !=, 0);
873 
874 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
875 	    &zfsvfs->z_unlinkedobj);
876 	if (error != 0)
877 		return (error);
878 
879 	error = zap_lookup(os, MASTER_NODE_OBJ,
880 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
881 	    8, 1, &zfsvfs->z_userquota_obj);
882 	if (error == ENOENT)
883 		zfsvfs->z_userquota_obj = 0;
884 	else if (error != 0)
885 		return (error);
886 
887 	error = zap_lookup(os, MASTER_NODE_OBJ,
888 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
889 	    8, 1, &zfsvfs->z_groupquota_obj);
890 	if (error == ENOENT)
891 		zfsvfs->z_groupquota_obj = 0;
892 	else if (error != 0)
893 		return (error);
894 
895 	error = zap_lookup(os, MASTER_NODE_OBJ,
896 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
897 	    8, 1, &zfsvfs->z_projectquota_obj);
898 	if (error == ENOENT)
899 		zfsvfs->z_projectquota_obj = 0;
900 	else if (error != 0)
901 		return (error);
902 
903 	error = zap_lookup(os, MASTER_NODE_OBJ,
904 	    zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
905 	    8, 1, &zfsvfs->z_userobjquota_obj);
906 	if (error == ENOENT)
907 		zfsvfs->z_userobjquota_obj = 0;
908 	else if (error != 0)
909 		return (error);
910 
911 	error = zap_lookup(os, MASTER_NODE_OBJ,
912 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
913 	    8, 1, &zfsvfs->z_groupobjquota_obj);
914 	if (error == ENOENT)
915 		zfsvfs->z_groupobjquota_obj = 0;
916 	else if (error != 0)
917 		return (error);
918 
919 	error = zap_lookup(os, MASTER_NODE_OBJ,
920 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
921 	    8, 1, &zfsvfs->z_projectobjquota_obj);
922 	if (error == ENOENT)
923 		zfsvfs->z_projectobjquota_obj = 0;
924 	else if (error != 0)
925 		return (error);
926 
927 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
928 	    &zfsvfs->z_fuid_obj);
929 	if (error == ENOENT)
930 		zfsvfs->z_fuid_obj = 0;
931 	else if (error != 0)
932 		return (error);
933 
934 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
935 	    &zfsvfs->z_shares_dir);
936 	if (error == ENOENT)
937 		zfsvfs->z_shares_dir = 0;
938 	else if (error != 0)
939 		return (error);
940 
941 	/*
942 	 * Only use the name cache if we are looking for a
943 	 * name on a file system that does not require normalization
944 	 * or case folding.  We can also look there if we happen to be
945 	 * on a non-normalizing, mixed sensitivity file system IF we
946 	 * are looking for the exact name (which is always the case on
947 	 * FreeBSD).
948 	 */
949 	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
950 	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
951 	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
952 
953 	return (0);
954 }
955 
956 taskq_t *zfsvfs_taskq;
957 
958 static void
959 zfsvfs_task_unlinked_drain(void *context, int pending __unused)
960 {
961 
962 	zfs_unlinked_drain((zfsvfs_t *)context);
963 }
964 
965 int
966 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
967 {
968 	objset_t *os;
969 	zfsvfs_t *zfsvfs;
970 	int error;
971 	boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
972 
973 	/*
974 	 * XXX: Fix struct statfs so this isn't necessary!
975 	 *
976 	 * The 'osname' is used as the filesystem's special node, which means
977 	 * it must fit in statfs.f_mntfromname, or else it can't be
978 	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
979 	 * 'zfs unmount' to think it's not mounted when it is.
980 	 */
981 	if (strlen(osname) >= MNAMELEN)
982 		return (SET_ERROR(ENAMETOOLONG));
983 
984 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
985 
986 	error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
987 	    &os);
988 	if (error != 0) {
989 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
990 		return (error);
991 	}
992 
993 	error = zfsvfs_create_impl(zfvp, zfsvfs, os);
994 
995 	return (error);
996 }
997 
998 
999 int
1000 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
1001 {
1002 	int error;
1003 
1004 	zfsvfs->z_vfs = NULL;
1005 	zfsvfs->z_parent = zfsvfs;
1006 
1007 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1008 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1009 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1010 	    offsetof(znode_t, z_link_node));
1011 	TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
1012 	    zfsvfs_task_unlinked_drain, zfsvfs);
1013 	ZFS_TEARDOWN_INIT(zfsvfs);
1014 	ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
1015 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1016 	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1017 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1018 
1019 	error = zfsvfs_init(zfsvfs, os);
1020 	if (error != 0) {
1021 		dmu_objset_disown(os, B_TRUE, zfsvfs);
1022 		*zfvp = NULL;
1023 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
1024 		return (error);
1025 	}
1026 
1027 	*zfvp = zfsvfs;
1028 	return (0);
1029 }
1030 
/*
 * Bring a zfsvfs into service once it has been attached to a vfs.
 *
 * 'mounting' is B_TRUE on the mount path and B_FALSE when the filesystem is
 * being brought back without a mount (the comment below gives online recv
 * as the example).  In both cases the property callbacks are registered,
 * the ZIL is opened and the objset's user pointer is set to this zfsvfs.
 * When mounting, the dataset kstats are created first, the unlinked set is
 * drained on read-write mounts, and the intent log is replayed (or
 * destroyed when zil_replay_disable is set) if the pool is writeable.
 *
 * Returns 0 on success, EROFS for a read-write mount of an objset with an
 * incompatible encryption version, or the error returned by
 * zfs_register_callbacks() or dataset_kstats_create().
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
		return (SET_ERROR(EROFS));

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		/* The kstats must not exist yet on a first-time mount. */
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
		error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
		if (error)
			return (error);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0) {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		} else {
			dsl_dir_t *dd;
			zap_stats_t zs;

			/*
			 * Seed the nunlinks kstat with the current size of
			 * the unlinked set; a failed zap_get_stats() is
			 * simply skipped.
			 */
			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
			    &zs) == 0) {
				dataset_kstats_update_nunlinks_kstat(
				    &zfsvfs->z_kstat, zs.zs_num_entries);
				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
				    "num_entries in unlinked set: %llu",
				    (u_longlong_t)zs.zs_num_entries);
			}

			zfs_unlinked_drain(zfsvfs);
			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
			dd->dd_activity_cancelled = B_FALSE;
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain().  (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.)  This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg.  This would write a "create
		 * object N" record to the intent log.  Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk.  So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/*
				 * The namecache is switched off for the
				 * duration of the replay and the previous
				 * setting is put back afterwards.
				 */
				boolean_t use_nc = zfsvfs->z_use_namecache;
				zfsvfs->z_use_namecache = B_FALSE;
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
				zfsvfs->z_use_namecache = use_nc;
			}
		}

		/* restore readonly bit */
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	} else {
		/* Not mounting: the kstats are expected to exist already. */
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}
1147 
1148 void
1149 zfsvfs_free(zfsvfs_t *zfsvfs)
1150 {
1151 	int i;
1152 
1153 	zfs_fuid_destroy(zfsvfs);
1154 
1155 	mutex_destroy(&zfsvfs->z_znodes_lock);
1156 	mutex_destroy(&zfsvfs->z_lock);
1157 	list_destroy(&zfsvfs->z_all_znodes);
1158 	ZFS_TEARDOWN_DESTROY(zfsvfs);
1159 	ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
1160 	rw_destroy(&zfsvfs->z_fuid_lock);
1161 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1162 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1163 	dataset_kstats_destroy(&zfsvfs->z_kstat);
1164 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
1165 }
1166 
1167 static void
1168 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1169 {
1170 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1171 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1172 }
1173 
/*
 * Do the actual work of mounting dataset 'osname' on 'vfsp'.
 *
 * Creates the zfsvfs (owning the objset read-only when MNT_RDONLY is set),
 * wires it to the vfs, fills in block sizes, mount flags and the fsid, and
 * then either applies the fixed snapshot settings or runs zfsvfs_setup()
 * for a regular filesystem.  Non-snapshot mounts also get their '.zfs'
 * control directory created.
 *
 * Returns 0 on success, in which case zfs_active_fs_count has been
 * incremented.  On failure after the zfsvfs was created, the objset is
 * disowned and the zfsvfs freed before the error is returned.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;

	ASSERT3P(vfsp, !=, NULL);
	ASSERT3P(osname, !=, NULL);

	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

	/* The dataset's recordsize becomes the advertised I/O size. */
	if ((error = dsl_prop_get_integer(osname,
	    "recordsize", &recordsize, NULL)))
		goto out;
	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
	/*
	 * This can cause a loss of coherence between ARC and page cache
	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
	 */
	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;

#if defined(_KERNEL) && !defined(KMEM_DEBUG)
	vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID.  The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);

	/*
	 * Set features for file system.
	 */
	zfs_set_fuid_feature(zfsvfs);

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		/*
		 * Snapshots skip zfsvfs_setup(): atime is forced off and
		 * readonly forced on, xattr and acltype are read once from
		 * the dataset properties, and the objset user pointer is
		 * set here directly.
		 */
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if ((error = dsl_prop_get_integer(osname,
		    "xattr", &pval, NULL)))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		if ((error = dsl_prop_get_integer(osname,
		    "acltype", &pval, NULL)))
			goto out;
		acl_type_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
	} else {
		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
			goto out;
	}

	vfs_mountedfrom(vfsp, osname);

	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	/* Shared exit: undo ownership on failure, count the mount on success. */
	if (error) {
		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
		zfsvfs_free(zfsvfs);
	} else {
		atomic_inc_32(&zfs_active_fs_count);
	}

	return (error);
}
1268 
1269 static void
1270 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1271 {
1272 	objset_t *os = zfsvfs->z_os;
1273 
1274 	if (!dmu_objset_is_snapshot(os))
1275 		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1276 }
1277 
1278 static int
1279 getpoolname(const char *osname, char *poolname)
1280 {
1281 	char *p;
1282 
1283 	p = strchr(osname, '/');
1284 	if (p == NULL) {
1285 		if (strlen(osname) >= MAXNAMELEN)
1286 			return (ENAMETOOLONG);
1287 		(void) strcpy(poolname, osname);
1288 	} else {
1289 		if (p - osname >= MAXNAMELEN)
1290 			return (ENAMETOOLONG);
1291 		(void) strlcpy(poolname, osname, p - osname + 1);
1292 	}
1293 	return (0);
1294 }
1295 
/*
 * Strip the optional leading '!' from a dataset name, in place.
 *
 * *checkpointrewind is set to true when the marker was present (and has
 * been removed from 'name'), false otherwise.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
	bool has_marker = (name[0] == '!');

	*checkpointrewind = has_marker;
	if (!has_marker)
		return;

	/*
	 * strlen(name) bytes starting at name + 1 is the rest of the string
	 * plus its terminating NUL, so the result stays terminated.
	 */
	memmove(name, name + 1, strlen(name));
}
1307 
/*
 * VFS mount entry point.
 *
 * Reads the dataset name from the "from" mount option, strips any leading
 * '!' (checkpoint rewind request), performs the privilege, delegation and
 * jail visibility checks, and then either refreshes the property callbacks
 * (remount), or imports the root pool if this is the initial root mount
 * and hands off to zfs_domount().
 *
 * Returns 0 on success; EINVAL if the "from" option is missing; EPERM if
 * the dataset is not visible/writable from the current jail and the
 * snapshot exception does not apply; otherwise the error from whichever
 * check or helper failed.
 */
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t	*td = curthread;
	vnode_t		*mvp = vfsp->mnt_vnodecovered;
	cred_t		*cr = td->td_ucred;
	char		*osname;
	int		error = 0;
	int		canwrite;
	bool		checkpointrewind, isctlsnap = false;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (SET_ERROR(EINVAL));

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	fetch_osname_options(osname, &checkpointrewind);
	/* A snapshot name being mounted on top of a '.zfs' control node. */
	isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) &&
	    strchr(osname, '@') != NULL);

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error && isctlsnap) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	} else if (error) {
		/*
		 * Every 'goto out' in this branch returns the error from
		 * secpolicy_fs_mount() above.
		 */
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t		vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK1(mvp);
				goto out;
			}

			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				VOP_UNLOCK1(mvp);
				goto out;
			}
			VOP_UNLOCK1(mvp);
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		boolean_t mount_snapshot = B_FALSE;

		/*
		 * Snapshots may be mounted in .zfs for unjailed datasets
		 * if allowed by the jail param zfs.mount_snapshot.
		 */
		if (isctlsnap) {
			struct prison *pr;
			struct zfs_jailparam *zjp;

			pr = curthread->td_ucred->cr_prison;
			mtx_lock(&pr->pr_mtx);
			zjp = osd_jail_get(pr, zfs_jailparam_slot);
			mtx_unlock(&pr->pr_mtx);
			if (zjp && zjp->mount_snapshot)
				mount_snapshot = B_TRUE;
		}
		if (!mount_snapshot) {
			error = SET_ERROR(EPERM);
			goto out;
		}
	}

	vfsp->vfs_flag |= MNT_NFS4ACLS;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * Refresh mount options with z_teardown_lock blocking I/O while
		 * the filesystem is in an inconsistent state.
		 * The lock also serializes this code with filesystem
		 * manipulations between entry to zfs_suspend_fs() and return
		 * from zfs_resume_fs().
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfs_unregister_callbacks(zfsvfs);
		error = zfs_register_callbacks(vfsp);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
		char pname[MAXNAMELEN];

		error = getpoolname(osname, pname);
		if (error == 0)
			error = spa_import_rootpool(pname, checkpointrewind);
		if (error)
			goto out;
	}
	/* zfs_domount() runs with Giant dropped; it is picked up again after. */
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

out:
	return (error);
}
1445 
1446 static int
1447 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1448 {
1449 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1450 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
1451 	int error;
1452 
1453 	statp->f_version = STATFS_VERSION;
1454 
1455 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1456 		return (error);
1457 
1458 	dmu_objset_space(zfsvfs->z_os,
1459 	    &refdbytes, &availbytes, &usedobjs, &availobjs);
1460 
1461 	/*
1462 	 * The underlying storage pool actually uses multiple block sizes.
1463 	 * We report the fragsize as the smallest block size we support,
1464 	 * and we report our blocksize as the filesystem's maximum blocksize.
1465 	 */
1466 	statp->f_bsize = SPA_MINBLOCKSIZE;
1467 	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1468 
1469 	/*
1470 	 * The following report "total" blocks of various kinds in the
1471 	 * file system, but reported in terms of f_frsize - the
1472 	 * "fragment" size.
1473 	 */
1474 
1475 	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1476 	statp->f_bfree = availbytes / statp->f_bsize;
1477 	statp->f_bavail = statp->f_bfree; /* no root reservation */
1478 
1479 	/*
1480 	 * statvfs() should really be called statufs(), because it assumes
1481 	 * static metadata.  ZFS doesn't preallocate files, so the best
1482 	 * we can do is report the max that could possibly fit in f_files,
1483 	 * and that minus the number actually used in f_ffree.
1484 	 * For f_ffree, report the smaller of the number of object available
1485 	 * and the number of blocks (each object will take at least a block).
1486 	 */
1487 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
1488 	statp->f_files = statp->f_ffree + usedobjs;
1489 
1490 	/*
1491 	 * We're a zfs filesystem.
1492 	 */
1493 	strlcpy(statp->f_fstypename, "zfs",
1494 	    sizeof (statp->f_fstypename));
1495 
1496 	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1497 	    sizeof (statp->f_mntfromname));
1498 	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1499 	    sizeof (statp->f_mntonname));
1500 
1501 	statp->f_namemax = MAXNAMELEN - 1;
1502 
1503 	zfs_exit(zfsvfs, FTAG);
1504 	return (0);
1505 }
1506 
1507 static int
1508 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1509 {
1510 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1511 	znode_t *rootzp;
1512 	int error;
1513 
1514 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1515 		return (error);
1516 
1517 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1518 	if (error == 0)
1519 		*vpp = ZTOV(rootzp);
1520 
1521 	zfs_exit(zfsvfs, FTAG);
1522 
1523 	if (error == 0) {
1524 		error = vn_lock(*vpp, flags);
1525 		if (error != 0) {
1526 			VN_RELE(*vpp);
1527 			*vpp = NULL;
1528 		}
1529 	}
1530 	return (error);
1531 }
1532 
/*
 * Teardown the zfsvfs::z_os.
 *
 * 'unmounting' is B_TRUE when called from zfs_umount() and B_FALSE when
 * called from zfs_suspend_fs().
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.
 *
 * Returns 0 on success.  Returns EIO (with both locks released) when not
 * unmounting and the filesystem has already been unmounted or has no
 * objset.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t	*zp;
	dsl_dir_t *dd;

	/*
	 * If someone has not already unmounted this file system,
	 * drain the zrele_taskq to ensure all active references to the
	 * zfsvfs_t have been handled only then can it be safely destroyed.
	 */
	if (zfsvfs->z_os) {
		/*
		 * If we're unmounting we have to wait for the list to
		 * drain completely.
		 *
		 * If we're not unmounting there's no guarantee the list
		 * will drain completely, but zreles run from the taskq
		 * may add the parents of dir-based xattrs to the taskq
		 * so we want to wait for these.
		 *
		 * We can safely check z_all_znodes for being empty because the
		 * VFS has already blocked operations which add to it.
		 */
		int round = 0;
		while (!list_is_empty(&zfsvfs->z_all_znodes)) {
			taskq_wait_outstanding(dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)), 0);
			/* When not unmounting, give up after two passes. */
			if (++round > 1 && !unmounting)
				break;
		}
	}
	ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
#ifdef FREEBSD_NAMECACHE
#if __FreeBSD_version >= 1300117
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#else
		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
#endif
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		if (zp->z_sa_hdl != NULL) {
			zfs_znode_dmu_fini(zp);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data
	 */
	/* Read-only filesystems skip the wait for the txg to sync. */
	if (!zfs_is_readonly(zfsvfs))
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);
	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
	dsl_dir_cancel_waiters(dd);

	return (0);
}
1662 
/*
 * VFS unmount entry point.
 *
 * Checks unmount privilege (falling back to the delegated "mount"
 * permission), unmounts any snapshots mounted under '.zfs', flushes the
 * vnodes, cancels or drains the unlinked-drain task, tears the zfsvfs
 * down, disowns the objset, destroys the '.zfs' node and finally frees the
 * zfsvfs through zfs_freevfs().
 *
 * 'fflag' may contain MS_FORCE, in which case the filesystem is marked
 * unmounted up front and vnodes are flushed with FORCECLOSE.
 *
 * Returns 0 on success, or the error from the privilege check,
 * zfsctl_umount_snapshots() or vflush().
 */
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		/*
		 * No unmount privilege: allow it only if the caller has the
		 * delegated mount permission; otherwise return the
		 * secpolicy error.
		 */
		if (dsl_deleg_access((char *)vfsp->vfs_resource,
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE). This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
	/*
	 * Keep cancelling the unlinked-drain task; whenever the cancel
	 * reports failure, drain the task and try again.
	 */
	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
		taskqueue_drain(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);

	VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	zfs_freevfs(vfsp);

	return (0);
}
1740 
1741 static int
1742 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1743 {
1744 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1745 	znode_t		*zp;
1746 	int 		err;
1747 
1748 	/*
1749 	 * zfs_zget() can't operate on virtual entries like .zfs/ or
1750 	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1751 	 * This will make NFS to switch to LOOKUP instead of using VGET.
1752 	 */
1753 	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
1754 	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
1755 		return (EOPNOTSUPP);
1756 
1757 	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1758 		return (err);
1759 	err = zfs_zget(zfsvfs, ino, &zp);
1760 	if (err == 0 && zp->z_unlinked) {
1761 		vrele(ZTOV(zp));
1762 		err = EINVAL;
1763 	}
1764 	if (err == 0)
1765 		*vpp = ZTOV(zp);
1766 	zfs_exit(zfsvfs, FTAG);
1767 	if (err == 0) {
1768 		err = vn_lock(*vpp, flags);
1769 		if (err != 0)
1770 			vrele(*vpp);
1771 	}
1772 	if (err != 0)
1773 		*vpp = NULL;
1774 	return (err);
1775 }
1776 
1777 static int
1778 #if __FreeBSD_version >= 1300098
1779 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1780     struct ucred **credanonp, int *numsecflavors, int *secflavors)
1781 #else
1782 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
1783     struct ucred **credanonp, int *numsecflavors, int **secflavors)
1784 #endif
1785 {
1786 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1787 
1788 	/*
1789 	 * If this is regular file system vfsp is the same as
1790 	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1791 	 * zfsvfs->z_parent->z_vfs represents parent file system
1792 	 * which we have to use here, because only this file system
1793 	 * has mnt_export configured.
1794 	 */
1795 	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1796 	    credanonp, numsecflavors, secflavors));
1797 }
1798 
1799 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
1800 	"struct fid bigger than SHORT_FID_LEN");
1801 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
1802 	"struct fid bigger than LONG_FID_LEN");
1803 
1804 static int
1805 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
1806 {
1807 	struct componentname cn;
1808 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1809 	znode_t		*zp;
1810 	vnode_t		*dvp;
1811 	uint64_t	object = 0;
1812 	uint64_t	fid_gen = 0;
1813 	uint64_t	setgen = 0;
1814 	uint64_t	gen_mask;
1815 	uint64_t	zp_gen;
1816 	int 		i, err;
1817 
1818 	*vpp = NULL;
1819 
1820 	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1821 		return (err);
1822 
1823 	/*
1824 	 * On FreeBSD we can get snapshot's mount point or its parent file
1825 	 * system mount point depending if snapshot is already mounted or not.
1826 	 */
1827 	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1828 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
1829 		uint64_t	objsetid = 0;
1830 
1831 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1832 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1833 
1834 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1835 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1836 
1837 		zfs_exit(zfsvfs, FTAG);
1838 
1839 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1840 		if (err)
1841 			return (SET_ERROR(EINVAL));
1842 		if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1843 			return (err);
1844 	}
1845 
1846 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1847 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
1848 
1849 		for (i = 0; i < sizeof (zfid->zf_object); i++)
1850 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1851 
1852 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
1853 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1854 	} else {
1855 		zfs_exit(zfsvfs, FTAG);
1856 		return (SET_ERROR(EINVAL));
1857 	}
1858 
1859 	if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
1860 		zfs_exit(zfsvfs, FTAG);
1861 		dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
1862 		    (u_longlong_t)fid_gen, (u_longlong_t)setgen);
1863 		return (SET_ERROR(EINVAL));
1864 	}
1865 
1866 	/*
1867 	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1868 	 * directory tree. If the object == zfsvfs->z_shares_dir, then
1869 	 * we are in the .zfs/shares directory tree.
1870 	 */
1871 	if ((fid_gen == 0 &&
1872 	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
1873 	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
1874 		zfs_exit(zfsvfs, FTAG);
1875 		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
1876 		if (object == ZFSCTL_INO_SNAPDIR) {
1877 			cn.cn_nameptr = "snapshot";
1878 			cn.cn_namelen = strlen(cn.cn_nameptr);
1879 			cn.cn_nameiop = LOOKUP;
1880 			cn.cn_flags = ISLASTCN | LOCKLEAF;
1881 			cn.cn_lkflags = flags;
1882 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1883 			vput(dvp);
1884 		} else if (object == zfsvfs->z_shares_dir) {
1885 			/*
1886 			 * XXX This branch must not be taken,
1887 			 * if it is, then the lookup below will
1888 			 * explode.
1889 			 */
1890 			cn.cn_nameptr = "shares";
1891 			cn.cn_namelen = strlen(cn.cn_nameptr);
1892 			cn.cn_nameiop = LOOKUP;
1893 			cn.cn_flags = ISLASTCN;
1894 			cn.cn_lkflags = flags;
1895 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1896 			vput(dvp);
1897 		} else {
1898 			*vpp = dvp;
1899 		}
1900 		return (err);
1901 	}
1902 
1903 	gen_mask = -1ULL >> (64 - 8 * i);
1904 
1905 	dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
1906 	    (u_longlong_t)fid_gen,
1907 	    (u_longlong_t)gen_mask);
1908 	if ((err = zfs_zget(zfsvfs, object, &zp))) {
1909 		zfs_exit(zfsvfs, FTAG);
1910 		return (err);
1911 	}
1912 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1913 	    sizeof (uint64_t));
1914 	zp_gen = zp_gen & gen_mask;
1915 	if (zp_gen == 0)
1916 		zp_gen = 1;
1917 	if (zp->z_unlinked || zp_gen != fid_gen) {
1918 		dprintf("znode gen (%llu) != fid gen (%llu)\n",
1919 		    (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
1920 		vrele(ZTOV(zp));
1921 		zfs_exit(zfsvfs, FTAG);
1922 		return (SET_ERROR(EINVAL));
1923 	}
1924 
1925 	*vpp = ZTOV(zp);
1926 	zfs_exit(zfsvfs, FTAG);
1927 	err = vn_lock(*vpp, flags);
1928 	if (err == 0)
1929 		vnode_create_vobject(*vpp, zp->z_size, curthread);
1930 	else
1931 		*vpp = NULL;
1932 	return (err);
1933 }
1934 
1935 /*
1936  * Block out VOPs and close zfsvfs_t::z_os
1937  *
1938  * Note, if successful, then we return with the 'z_teardown_lock' and
1939  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
1940  * dataset and objset intact so that they can be atomically handed off during
1941  * a subsequent rollback or recv operation and the resume thereafter.
1942  */
1943 int
1944 zfs_suspend_fs(zfsvfs_t *zfsvfs)
1945 {
1946 	int error;
1947 
1948 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1949 		return (error);
1950 
1951 	return (0);
1952 }
1953 
/*
 * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
 * is an invariant across any of the operations that can be performed while the
 * filesystem was suspended.  Whether it succeeded or failed, the preconditions
 * are the same: the relevant objset and associated dataset are owned by
 * zfsvfs, held, and long held on entry.
 *
 * Must be called with both teardown locks write-held (asserted below); both
 * are released before returning, on success and on failure alike.
 *
 * Returns 0 on success or the error from zfsvfs_init().  In the error case
 * a forced unmount of the filesystem is attempted before returning.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	ds->ds_dir->dd_activity_cancelled = B_FALSE;
	/* B_FALSE: not mounting, so zfsvfs_setup() only reopens the ZIL. */
	VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs.  If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via zfs_enter_verify_zp
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}
2021 
2022 static void
2023 zfs_freevfs(vfs_t *vfsp)
2024 {
2025 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2026 
2027 	zfsvfs_free(zfsvfs);
2028 
2029 	atomic_dec_32(&zfs_active_fs_count);
2030 }
2031 
#ifdef __i386__
/*
 * Value of desiredvnodes saved by zfs_vnodes_adjust() and written back by
 * zfs_vnodes_adjust_back().  Only used on i386.
 */
static int desiredvnodes_backup;
#include <sys/vmmeter.h>


#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#endif
2042 
/*
 * On i386, remember the current desiredvnodes and, if it still equals the
 * system's computed default, lower it to three quarters of that default.
 * A no-op on every other architecture.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int untuned_default;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * Same formula as vntblinit().  A match with the live value means
	 * the administrator has not tuned it, so it is safe to tune down;
	 * anything else is left alone.
	 */
	untuned_default = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (untuned_default != desiredvnodes)
		return;

	desiredvnodes = (3 * untuned_default) / 4;
#endif
}
2063 
/*
 * On i386, put desiredvnodes back to the value saved in
 * desiredvnodes_backup.  A no-op on every other architecture.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
2072 
#if __FreeBSD_version >= 1300139
/*
 * zfs_vnlru_lock is held exclusively around the vnlru_free_vfsops() call in
 * zfs_prune_task(); zfs_vnlru_marker is the marker vnode passed to that call.
 * Both are set up in zfs_init() and torn down in zfs_fini().
 */
static struct sx zfs_vnlru_lock;
static struct vnode *zfs_vnlru_marker;
#endif
/* Handle returned by arc_add_prune_callback(); removed again in zfs_fini(). */
static arc_prune_t *zfs_prune;
2078 
2079 static void
2080 zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
2081 {
2082 	if (nr_to_scan > INT_MAX)
2083 		nr_to_scan = INT_MAX;
2084 #if __FreeBSD_version >= 1300139
2085 	sx_xlock(&zfs_vnlru_lock);
2086 	vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
2087 	sx_xunlock(&zfs_vnlru_lock);
2088 #else
2089 	vnlru_free(nr_to_scan, &zfs_vfsops);
2090 #endif
2091 }
2092 
2093 void
2094 zfs_init(void)
2095 {
2096 
2097 	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
2098 
2099 	/*
2100 	 * Initialize .zfs directory structures
2101 	 */
2102 	zfsctl_init();
2103 
2104 	/*
2105 	 * Initialize znode cache, vnode ops, etc...
2106 	 */
2107 	zfs_znode_init();
2108 
2109 	/*
2110 	 * Reduce number of vnodes. Originally number of vnodes is calculated
2111 	 * with UFS inode in mind. We reduce it here, because it's too big for
2112 	 * ZFS/i386.
2113 	 */
2114 	zfs_vnodes_adjust();
2115 
2116 	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
2117 
2118 	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
2119 
2120 #if __FreeBSD_version >= 1300139
2121 	zfs_vnlru_marker = vnlru_alloc_marker();
2122 	sx_init(&zfs_vnlru_lock, "zfs vnlru lock");
2123 #endif
2124 	zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL);
2125 }
2126 
/*
 * Tear down what zfs_init() set up: remove the ARC prune callback, release
 * the vnlru marker and lock, destroy the zfsvfs taskq, shut down the .zfs
 * and znode layers, and restore the vnode limit.
 */
void
zfs_fini(void)
{
	/*
	 * Remove the prune callback first; zfs_prune_task() uses the marker
	 * and lock that are released right below.
	 */
	arc_remove_prune_callback(zfs_prune);
#if __FreeBSD_version >= 1300139
	vnlru_free_marker(zfs_vnlru_marker);
	sx_destroy(&zfs_vnlru_lock);
#endif

	taskq_destroy(zfsvfs_taskq);
	zfsctl_fini();
	zfs_znode_fini();
	/* Puts back the desiredvnodes value saved by zfs_vnodes_adjust(). */
	zfs_vnodes_adjust_back();
}
2141 
2142 int
2143 zfs_busy(void)
2144 {
2145 	return (zfs_active_fs_count != 0);
2146 }
2147 
2148 /*
2149  * Release VOPs and unmount a suspended filesystem.
2150  */
2151 int
2152 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2153 {
2154 	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
2155 	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
2156 
2157 	/*
2158 	 * We already own this, so just hold and rele it to update the
2159 	 * objset_t, as the one we had before may have been evicted.
2160 	 */
2161 	objset_t *os;
2162 	VERIFY3P(ds->ds_owner, ==, zfsvfs);
2163 	VERIFY(dsl_dataset_long_held(ds));
2164 	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2165 	dsl_pool_config_enter(dp, FTAG);
2166 	VERIFY0(dmu_objset_from_ds(ds, &os));
2167 	dsl_pool_config_exit(dp, FTAG);
2168 	zfsvfs->z_os = os;
2169 
2170 	/* release the VOPs */
2171 	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
2172 	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
2173 
2174 	/*
2175 	 * Try to force unmount this file system.
2176 	 */
2177 	(void) zfs_umount(zfsvfs->z_vfs, 0);
2178 	zfsvfs->z_unmounted = B_TRUE;
2179 	return (0);
2180 }
2181 
2182 int
2183 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2184 {
2185 	int error;
2186 	objset_t *os = zfsvfs->z_os;
2187 	dmu_tx_t *tx;
2188 
2189 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2190 		return (SET_ERROR(EINVAL));
2191 
2192 	if (newvers < zfsvfs->z_version)
2193 		return (SET_ERROR(EINVAL));
2194 
2195 	if (zfs_spa_version_map(newvers) >
2196 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
2197 		return (SET_ERROR(ENOTSUP));
2198 
2199 	tx = dmu_tx_create(os);
2200 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2201 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2202 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2203 		    ZFS_SA_ATTRS);
2204 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2205 	}
2206 	error = dmu_tx_assign(tx, TXG_WAIT);
2207 	if (error) {
2208 		dmu_tx_abort(tx);
2209 		return (error);
2210 	}
2211 
2212 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2213 	    8, 1, &newvers, tx);
2214 
2215 	if (error) {
2216 		dmu_tx_commit(tx);
2217 		return (error);
2218 	}
2219 
2220 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2221 		uint64_t sa_obj;
2222 
2223 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2224 		    SPA_VERSION_SA);
2225 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2226 		    DMU_OT_NONE, 0, tx);
2227 
2228 		error = zap_add(os, MASTER_NODE_OBJ,
2229 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2230 		ASSERT0(error);
2231 
2232 		VERIFY0(sa_set_sa_object(os, sa_obj));
2233 		sa_register_update_callback(os, zfs_sa_upgrade);
2234 	}
2235 
2236 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2237 	    "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2238 	    (uintmax_t)newvers);
2239 	dmu_tx_commit(tx);
2240 
2241 	zfsvfs->z_version = newvers;
2242 	os->os_version = newvers;
2243 
2244 	zfs_set_fuid_feature(zfsvfs);
2245 
2246 	return (0);
2247 }
2248 
2249 /*
2250  * Return true if the corresponding vfs's unmounted flag is set.
2251  * Otherwise return false.
2252  * If this function returns true we know VFS unmount has been initiated.
2253  */
2254 boolean_t
2255 zfs_get_vfs_flag_unmounted(objset_t *os)
2256 {
2257 	zfsvfs_t *zfvp;
2258 	boolean_t unmounted = B_FALSE;
2259 
2260 	ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
2261 
2262 	mutex_enter(&os->os_user_ptr_lock);
2263 	zfvp = dmu_objset_get_user(os);
2264 	if (zfvp != NULL && zfvp->z_vfs != NULL &&
2265 	    (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2266 		unmounted = B_TRUE;
2267 	mutex_exit(&os->os_user_ptr_lock);
2268 
2269 	return (unmounted);
2270 }
2271 
#ifdef _KERNEL
/*
 * Rewrite the f_mntfromname of mounts after a rename from oldname to
 * newname.  A mount whose source is exactly oldname gets newname; a mount
 * whose source starts with oldname followed by '/' or '@' gets that prefix
 * replaced by newname.  The mount list is walked under mountlist_mtx.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char renamed[MAXPATHLEN];
	struct mount *mp;
	size_t prefixlen;

	prefixlen = strlen(oldname);

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		char *from = mp->mnt_stat.f_mntfromname;

		if (strcmp(from, oldname) == 0) {
			/* Exact match: the renamed object itself. */
			(void) strlcpy(from, newname,
			    sizeof (mp->mnt_stat.f_mntfromname));
		} else if (strncmp(from, oldname, prefixlen) == 0 &&
		    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
			/*
			 * Prefix match: build the new name in a scratch
			 * buffer first, since the suffix still lives in
			 * the destination.
			 */
			(void) snprintf(renamed, sizeof (renamed), "%s%s",
			    newname, from + prefixlen);
			(void) strlcpy(from, renamed,
			    sizeof (mp->mnt_stat.f_mntfromname));
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif
2303 
2304 /*
2305  * Find a prison with ZFS info.
2306  * Return the ZFS info and the (locked) prison.
2307  */
2308 static struct zfs_jailparam *
2309 zfs_jailparam_find(struct prison *spr, struct prison **prp)
2310 {
2311 	struct prison *pr;
2312 	struct zfs_jailparam *zjp;
2313 
2314 	for (pr = spr; ; pr = pr->pr_parent) {
2315 		mtx_lock(&pr->pr_mtx);
2316 		if (pr == &prison0) {
2317 			zjp = &zfs_jailparam0;
2318 			break;
2319 		}
2320 		zjp = osd_jail_get(pr, zfs_jailparam_slot);
2321 		if (zjp != NULL)
2322 			break;
2323 		mtx_unlock(&pr->pr_mtx);
2324 	}
2325 	*prp = pr;
2326 
2327 	return (zjp);
2328 }
2329 
2330 /*
2331  * Ensure a prison has its own ZFS info.  If zjpp is non-null, point it to the
2332  * ZFS info and lock the prison.
2333  */
2334 static void
2335 zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
2336 {
2337 	struct prison *ppr;
2338 	struct zfs_jailparam *zjp, *nzjp;
2339 	void **rsv;
2340 
2341 	/* If this prison already has ZFS info, return that. */
2342 	zjp = zfs_jailparam_find(pr, &ppr);
2343 	if (ppr == pr)
2344 		goto done;
2345 
2346 	/*
2347 	 * Allocate a new info record.  Then check again, in case something
2348 	 * changed during the allocation.
2349 	 */
2350 	mtx_unlock(&ppr->pr_mtx);
2351 	nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
2352 	rsv = osd_reserve(zfs_jailparam_slot);
2353 	zjp = zfs_jailparam_find(pr, &ppr);
2354 	if (ppr == pr) {
2355 		free(nzjp, M_PRISON);
2356 		osd_free_reserved(rsv);
2357 		goto done;
2358 	}
2359 	/* Inherit the initial values from the ancestor. */
2360 	mtx_lock(&pr->pr_mtx);
2361 	(void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
2362 	(void) memcpy(nzjp, zjp, sizeof (*zjp));
2363 	zjp = nzjp;
2364 	mtx_unlock(&ppr->pr_mtx);
2365 done:
2366 	if (zjpp != NULL)
2367 		*zjpp = zjp;
2368 	else
2369 		mtx_unlock(&pr->pr_mtx);
2370 }
2371 
2372 /*
2373  * Jail OSD methods for ZFS VFS info.
2374  */
2375 static int
2376 zfs_jailparam_create(void *obj, void *data)
2377 {
2378 	struct prison *pr = obj;
2379 	struct vfsoptlist *opts = data;
2380 	int jsys;
2381 
2382 	if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
2383 	    jsys == JAIL_SYS_INHERIT)
2384 		return (0);
2385 	/*
2386 	 * Inherit a prison's initial values from its parent
2387 	 * (different from JAIL_SYS_INHERIT which also inherits changes).
2388 	 */
2389 	zfs_jailparam_alloc(pr, NULL);
2390 	return (0);
2391 }
2392 
2393 static int
2394 zfs_jailparam_get(void *obj, void *data)
2395 {
2396 	struct prison *ppr, *pr = obj;
2397 	struct vfsoptlist *opts = data;
2398 	struct zfs_jailparam *zjp;
2399 	int jsys, error;
2400 
2401 	zjp = zfs_jailparam_find(pr, &ppr);
2402 	jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
2403 	error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
2404 	if (error != 0 && error != ENOENT)
2405 		goto done;
2406 	if (jsys == JAIL_SYS_NEW) {
2407 		error = vfs_setopt(opts, "zfs.mount_snapshot",
2408 		    &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
2409 		if (error != 0 && error != ENOENT)
2410 			goto done;
2411 	} else {
2412 		/*
2413 		 * If this prison is inheriting its ZFS info, report
2414 		 * empty/zero parameters.
2415 		 */
2416 		static int mount_snapshot = 0;
2417 
2418 		error = vfs_setopt(opts, "zfs.mount_snapshot",
2419 		    &mount_snapshot, sizeof (mount_snapshot));
2420 		if (error != 0 && error != ENOENT)
2421 			goto done;
2422 	}
2423 	error = 0;
2424 done:
2425 	mtx_unlock(&ppr->pr_mtx);
2426 	return (error);
2427 }
2428 
2429 static int
2430 zfs_jailparam_set(void *obj, void *data)
2431 {
2432 	struct prison *pr = obj;
2433 	struct prison *ppr;
2434 	struct vfsoptlist *opts = data;
2435 	int error, jsys, mount_snapshot;
2436 
2437 	/* Set the parameters, which should be correct. */
2438 	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2439 	if (error == ENOENT)
2440 		jsys = -1;
2441 	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2442 	    sizeof (mount_snapshot));
2443 	if (error == ENOENT)
2444 		mount_snapshot = -1;
2445 	else
2446 		jsys = JAIL_SYS_NEW;
2447 	switch (jsys) {
2448 	case JAIL_SYS_NEW:
2449 	{
2450 		/* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
2451 		struct zfs_jailparam *zjp;
2452 
2453 		/*
2454 		 * A child jail cannot have more permissions than its parent
2455 		 */
2456 		if (pr->pr_parent != &prison0) {
2457 			zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
2458 			mtx_unlock(&ppr->pr_mtx);
2459 			if (zjp->mount_snapshot < mount_snapshot) {
2460 				return (EPERM);
2461 			}
2462 		}
2463 		zfs_jailparam_alloc(pr, &zjp);
2464 		if (mount_snapshot != -1)
2465 			zjp->mount_snapshot = mount_snapshot;
2466 		mtx_unlock(&pr->pr_mtx);
2467 		break;
2468 	}
2469 	case JAIL_SYS_INHERIT:
2470 		/* "zfs=inherit": inherit the parent's ZFS info. */
2471 		mtx_lock(&pr->pr_mtx);
2472 		osd_jail_del(pr, zfs_jailparam_slot);
2473 		mtx_unlock(&pr->pr_mtx);
2474 		break;
2475 	case -1:
2476 		/*
2477 		 * If the setting being changed is not ZFS related
2478 		 * then do nothing.
2479 		 */
2480 		break;
2481 	}
2482 
2483 	return (0);
2484 }
2485 
2486 static int
2487 zfs_jailparam_check(void *obj __unused, void *data)
2488 {
2489 	struct vfsoptlist *opts = data;
2490 	int error, jsys, mount_snapshot;
2491 
2492 	/* Check that the parameters are correct. */
2493 	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2494 	if (error != ENOENT) {
2495 		if (error != 0)
2496 			return (error);
2497 		if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
2498 			return (EINVAL);
2499 	}
2500 	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2501 	    sizeof (mount_snapshot));
2502 	if (error != ENOENT) {
2503 		if (error != 0)
2504 			return (error);
2505 		if (mount_snapshot != 0 && mount_snapshot != 1)
2506 			return (EINVAL);
2507 	}
2508 	return (0);
2509 }
2510 
2511 static void
2512 zfs_jailparam_destroy(void *data)
2513 {
2514 
2515 	free(data, M_PRISON);
2516 }
2517 
2518 static void
2519 zfs_jailparam_sysinit(void *arg __unused)
2520 {
2521 	struct prison *pr;
2522 	osd_method_t  methods[PR_MAXMETHOD] = {
2523 		[PR_METHOD_CREATE] = zfs_jailparam_create,
2524 		[PR_METHOD_GET] = zfs_jailparam_get,
2525 		[PR_METHOD_SET] = zfs_jailparam_set,
2526 		[PR_METHOD_CHECK] = zfs_jailparam_check,
2527 	};
2528 
2529 	zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
2530 	/* Copy the defaults to any existing prisons. */
2531 	sx_slock(&allprison_lock);
2532 	TAILQ_FOREACH(pr, &allprison, pr_list)
2533 		zfs_jailparam_alloc(pr, NULL);
2534 	sx_sunlock(&allprison_lock);
2535 }
2536 
2537 static void
2538 zfs_jailparam_sysuninit(void *arg __unused)
2539 {
2540 
2541 	osd_jail_deregister(zfs_jailparam_slot);
2542 }
2543 
/*
 * Hook the jail parameter setup and teardown routines into the
 * SI_SUB_DRIVERS stage with SI_ORDER_ANY.
 */
SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
	zfs_jailparam_sysinit, NULL);
SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
	zfs_jailparam_sysuninit, NULL);
2548