1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24  * All rights reserved.
25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
28  */
29 
30 /* Portions Copyright 2010 Robert Milkowski */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/sysmacros.h>
37 #include <sys/kmem.h>
38 #include <sys/acl.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/mntent.h>
42 #include <sys/mount.h>
43 #include <sys/cmn_err.h>
44 #include <sys/zfs_znode.h>
45 #include <sys/zfs_vnops.h>
46 #include <sys/zfs_dir.h>
47 #include <sys/zil.h>
48 #include <sys/fs/zfs.h>
49 #include <sys/dmu.h>
50 #include <sys/dsl_prop.h>
51 #include <sys/dsl_dataset.h>
52 #include <sys/dsl_deleg.h>
53 #include <sys/spa.h>
54 #include <sys/zap.h>
55 #include <sys/sa.h>
56 #include <sys/sa_impl.h>
57 #include <sys/policy.h>
58 #include <sys/atomic.h>
59 #include <sys/zfs_ioctl.h>
60 #include <sys/zfs_ctldir.h>
61 #include <sys/zfs_fuid.h>
62 #include <sys/sunddi.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/dsl_dir.h>
65 #include <sys/jail.h>
66 #include <sys/osd.h>
67 #include <ufs/ufs/quota.h>
68 #include <sys/zfs_quota.h>
69 
70 #include "zfs_comutil.h"
71 
72 #ifndef	MNTK_VMSETSIZE_BUG
73 #define	MNTK_VMSETSIZE_BUG	0
74 #endif
75 #ifndef	MNTK_NOMSYNC
76 #define	MNTK_NOMSYNC	8
77 #endif
78 
79 struct mtx zfs_debug_mtx;
80 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
81 
82 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
83 
84 int zfs_super_owner;
85 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
86 	"File system owners can perform privileged operation on file systems");
87 
88 int zfs_debug_level;
89 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
90 	"Debug level");
91 
92 int zfs_bclone_enabled = 1;
93 SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN,
94 	&zfs_bclone_enabled, 0, "Enable block cloning");
95 
96 struct zfs_jailparam {
97 	int mount_snapshot;
98 };
99 
100 static struct zfs_jailparam zfs_jailparam0 = {
101 	.mount_snapshot = 0,
102 };
103 
104 static int zfs_jailparam_slot;
105 
106 SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters");
107 SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I",
108 	"Allow mounting snapshots in the .zfs directory for unjailed datasets");
109 
110 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
111 static int zfs_version_acl = ZFS_ACL_VERSION;
112 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
113 	"ZFS_ACL_VERSION");
114 static int zfs_version_spa = SPA_VERSION;
115 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
116 	"SPA_VERSION");
117 static int zfs_version_zpl = ZPL_VERSION;
118 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
119 	"ZPL_VERSION");
120 
121 #if __FreeBSD_version >= 1400018
122 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
123     bool *mp_busy);
124 #else
125 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
126 #endif
127 static int zfs_mount(vfs_t *vfsp);
128 static int zfs_umount(vfs_t *vfsp, int fflag);
129 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
130 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
131 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
132 static int zfs_sync(vfs_t *vfsp, int waitfor);
133 #if __FreeBSD_version >= 1300098
134 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
135     struct ucred **credanonp, int *numsecflavors, int *secflavors);
136 #else
137 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
138     struct ucred **credanonp, int *numsecflavors, int **secflavors);
139 #endif
140 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
141 static void zfs_freevfs(vfs_t *vfsp);
142 
143 struct vfsops zfs_vfsops = {
144 	.vfs_mount =		zfs_mount,
145 	.vfs_unmount =		zfs_umount,
146 #if __FreeBSD_version >= 1300049
147 	.vfs_root =		vfs_cache_root,
148 	.vfs_cachedroot = zfs_root,
149 #else
150 	.vfs_root =		zfs_root,
151 #endif
152 	.vfs_statfs =		zfs_statfs,
153 	.vfs_vget =		zfs_vget,
154 	.vfs_sync =		zfs_sync,
155 	.vfs_checkexp =		zfs_checkexp,
156 	.vfs_fhtovp =		zfs_fhtovp,
157 	.vfs_quotactl =		zfs_quotactl,
158 };
159 
160 #ifdef VFCF_CROSS_COPY_FILE_RANGE
161 VFS_SET(zfs_vfsops, zfs,
162     VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE);
163 #else
164 VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL);
165 #endif
166 
167 /*
168  * We need to keep a count of active fs's.
169  * This is necessary to prevent our module
170  * from being unloaded after a umount -f
171  */
172 static uint32_t	zfs_active_fs_count = 0;
173 
int
zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
    char *setpoint)
{
	/*
	 * Check whether a dataset property is temporarily overridden by a
	 * mount option on the mounted filesystem.  On entry *val holds the
	 * property's current value; if a matching mount option is set, *val
	 * is replaced with the override and, when setpoint is non-NULL, it
	 * is set to the string "temporary".  Returns 0 on success or an
	 * errno-style error (ENOENT when the dataset is not mounted or the
	 * property has no corresponding mount option).
	 */
	int error;
	zfsvfs_t *zfvp;
	vfs_t *vfsp;
	objset_t *os;
	uint64_t tmp = *val;

	error = dmu_objset_from_ds(ds, &os);
	if (error != 0)
		return (error);

	/*
	 * On success getzfsvfs_impl() apparently leaves the vfs busied —
	 * every exit path below pairs with vfs_unbusy().  The two early
	 * returns here precede a successful lookup and so do not unbusy.
	 */
	error = getzfsvfs_impl(os, &zfvp);
	if (error != 0)
		return (error);
	if (zfvp == NULL)
		return (ENOENT);
	vfsp = zfvp->z_vfs;
	/* For each property, a "no" option forces 0 and a "yes" forces 1. */
	switch (zfs_prop) {
	case ZFS_PROP_ATIME:
		if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_DEVICES:
		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_EXEC:
		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_SETUID:
		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_READONLY:
		if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
			tmp = 1;
		break;
	case ZFS_PROP_XATTR:
		/* xattr is tracked as cached state, not as a mount option. */
		if (zfvp->z_flags & ZSB_XATTR)
			tmp = zfvp->z_xattr;
		break;
	case ZFS_PROP_NBMAND:
		if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
			tmp = 0;
		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
			tmp = 1;
		break;
	default:
		vfs_unbusy(vfsp);
		return (ENOENT);
	}

	vfs_unbusy(vfsp);
	/* Only report a setpoint when the mount option actually differs. */
	if (tmp != *val) {
		if (setpoint)
			(void) strcpy(setpoint, "temporary");
		*val = tmp;
	}
	return (0);
}
248 
/*
 * Fill in a FreeBSD dqblk64 for the given user or group id by reading
 * the dataset's {user,group}quota and {user,group}used ZAP objects.
 * Returns 0 on success, ENOENT when quotas are not in use (no quota
 * object, or the ZIL is being replayed), or another errno on failure.
 */
static int
zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
{
	int error = 0;
	char buf[32];
	uint64_t usedobj, quotaobj;
	uint64_t quota, used = 0;
	timespec_t now;

	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;

	if (quotaobj == 0 || zfsvfs->z_replay) {
		error = ENOENT;
		goto done;
	}
	/* Quota ZAP entries are keyed by the id rendered in hex. */
	(void) sprintf(buf, "%llx", (longlong_t)id);
	if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
	    buf, sizeof (quota), 1, &quota)) != 0) {
		dprintf("%s(%d): quotaobj lookup failed\n",
		    __FUNCTION__, __LINE__);
		goto done;
	}
	/*
	 * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit".
	 * So we set them to be the same.
	 */
	dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
	/* A missing "used" entry just means zero blocks consumed. */
	error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
	if (error && error != ENOENT) {
		dprintf("%s(%d):  usedobj failed; %d\n",
		    __FUNCTION__, __LINE__, error);
		goto done;
	}
	dqp->dqb_curblocks = btodb(used);
	/* ZFS has no inode (object-count) limits to report here. */
	dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
	vfs_timestamp(&now);
	/*
	 * Setting this to 0 causes FreeBSD quota(8) to print
	 * the number of days since the epoch, which isn't
	 * particularly useful.
	 */
	dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
done:
	return (error);
}
295 
/*
 * VFS quotactl(2) entry point.  Translates the BSD quota command and
 * type into the corresponding ZFS user/group quota operations.
 *
 * Note on busy handling: before FreeBSD 1400018 the VFS layer expected
 * Q_QUOTAON/Q_QUOTAOFF handlers to vfs_unbusy() the mount themselves;
 * newer kernels pass mp_busy and manage it in the caller, hence the
 * version-conditional vfs_unbusy() calls below.
 */
static int
#if __FreeBSD_version >= 1400018
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
#else
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
#endif
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	struct thread *td;
	int cmd, type, error = 0;
	int bitsize;
	zfs_userquota_prop_t quota_type;
	struct dqblk64 dqblk = { 0 };

	td = curthread;
	/* The command word packs the command and the quota type. */
	cmd = cmds >> SUBCMDSHIFT;
	type = cmds & SUBCMDMASK;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);
	/* id == -1 means "the calling thread's own uid/gid". */
	if (id == -1) {
		switch (type) {
		case USRQUOTA:
			id = td->td_ucred->cr_ruid;
			break;
		case GRPQUOTA:
			id = td->td_ucred->cr_rgid;
			break;
		default:
			error = EINVAL;
#if __FreeBSD_version < 1400018
			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
				vfs_unbusy(vfsp);
#endif
			goto done;
		}
	}
	/*
	 * Map BSD type to:
	 * ZFS_PROP_USERUSED,
	 * ZFS_PROP_USERQUOTA,
	 * ZFS_PROP_GROUPUSED,
	 * ZFS_PROP_GROUPQUOTA
	 */
	/*
	 * quota_type is left unset for commands other than the four below;
	 * those commands never read it.  An invalid 'type' sets EINVAL here
	 * but is also caught by the MAXQUOTAS range check that follows.
	 */
	switch (cmd) {
	case Q_SETQUOTA:
	case Q_SETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERQUOTA;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPQUOTA;
		else
			error = EINVAL;
		break;
	case Q_GETQUOTA:
	case Q_GETQUOTA32:
		if (type == USRQUOTA)
			quota_type = ZFS_PROP_USERUSED;
		else if (type == GRPQUOTA)
			quota_type = ZFS_PROP_GROUPUSED;
		else
			error = EINVAL;
		break;
	}

	/*
	 * Depending on the cmd, we may need to get
	 * the ruid and domain (see fuidstr_to_sid?),
	 * the fuid (how?), or other information.
	 * Create fuid using zfs_fuid_create(zfsvfs, id,
	 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
	 * I think I can use just the id?
	 *
	 * Look at zfs_id_overquota() to look up a quota.
	 * zap_lookup(something, quotaobj, fuidstring,
	 *     sizeof (long long), 1, &quota)
	 *
	 * See zfs_set_userquota() to set a quota.
	 */
	if ((uint32_t)type >= MAXQUOTAS) {
		error = EINVAL;
		goto done;
	}

	switch (cmd) {
	case Q_GETQUOTASIZE:
		/* ZFS quotas are always 64-bit. */
		bitsize = 64;
		error = copyout(&bitsize, arg, sizeof (int));
		break;
	case Q_QUOTAON:
		// As far as I can tell, you can't turn quotas on or off on zfs
		error = 0;
#if __FreeBSD_version < 1400018
		vfs_unbusy(vfsp);
#endif
		break;
	case Q_QUOTAOFF:
		error = ENOTSUP;
#if __FreeBSD_version < 1400018
		vfs_unbusy(vfsp);
#endif
		break;
	case Q_SETQUOTA:
		/* Only the block hard limit is honored; see zfs_getquota(). */
		error = copyin(arg, &dqblk, sizeof (dqblk));
		if (error == 0)
			error = zfs_set_userquota(zfsvfs, quota_type,
			    "", id, dbtob(dqblk.dqb_bhardlimit));
		break;
	case Q_GETQUOTA:
		error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
		if (error == 0)
			error = copyout(&dqblk, arg, sizeof (dqblk));
		break;
	default:
		error = EINVAL;
		break;
	}
done:
	zfs_exit(zfsvfs, FTAG);
	return (error);
}
417 
418 
419 boolean_t
420 zfs_is_readonly(zfsvfs_t *zfsvfs)
421 {
422 	return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
423 }
424 
/*
 * VFS sync entry point.  With a specific vfsp, commits that filesystem's
 * intent log; with a NULL vfsp, forces all pools to sync out dirty data.
 * Always returns 0 except when zfs_enter() fails on the target filesystem.
 */
static int
zfs_sync(vfs_t *vfsp, int waitfor)
{

	/*
	 * Data integrity is job one.  We don't want a compromised kernel
	 * writing to the storage pool, so we never sync during panic.
	 */
	if (panicstr)
		return (0);

	/*
	 * Ignore the system syncher.  ZFS already commits async data
	 * at zfs_txg_timeout intervals.
	 */
	if (waitfor == MNT_LAZY)
		return (0);

	if (vfsp != NULL) {
		/*
		 * Sync a specific filesystem.
		 */
		zfsvfs_t *zfsvfs = vfsp->vfs_data;
		dsl_pool_t *dp;
		int error;

		if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
			return (error);
		dp = dmu_objset_pool(zfsvfs->z_os);

		/*
		 * If the system is shutting down, then skip any
		 * filesystems which may exist on a suspended pool.
		 */
		if (rebooting && spa_suspended(dp->dp_spa)) {
			zfs_exit(zfsvfs, FTAG);
			return (0);
		}

		/* z_log may be NULL (e.g. read-only mounts); nothing to do. */
		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, 0);

		zfs_exit(zfsvfs, FTAG);
	} else {
		/*
		 * Sync all ZFS filesystems.  This is what happens when you
		 * run sync(8).  Unlike other filesystems, ZFS honors the
		 * request by waiting for all pools to commit all dirty data.
		 */
		spa_sync_allpools();
	}

	return (0);
}
479 
480 static void
481 atime_changed_cb(void *arg, uint64_t newval)
482 {
483 	zfsvfs_t *zfsvfs = arg;
484 
485 	if (newval == TRUE) {
486 		zfsvfs->z_atime = TRUE;
487 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
488 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
489 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
490 	} else {
491 		zfsvfs->z_atime = FALSE;
492 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
493 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
494 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
495 	}
496 }
497 
498 static void
499 xattr_changed_cb(void *arg, uint64_t newval)
500 {
501 	zfsvfs_t *zfsvfs = arg;
502 
503 	if (newval == ZFS_XATTR_OFF) {
504 		zfsvfs->z_flags &= ~ZSB_XATTR;
505 	} else {
506 		zfsvfs->z_flags |= ZSB_XATTR;
507 
508 		if (newval == ZFS_XATTR_SA)
509 			zfsvfs->z_xattr_sa = B_TRUE;
510 		else
511 			zfsvfs->z_xattr_sa = B_FALSE;
512 	}
513 }
514 
515 static void
516 blksz_changed_cb(void *arg, uint64_t newval)
517 {
518 	zfsvfs_t *zfsvfs = arg;
519 	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
520 	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
521 	ASSERT(ISP2(newval));
522 
523 	zfsvfs->z_max_blksz = newval;
524 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
525 }
526 
527 static void
528 readonly_changed_cb(void *arg, uint64_t newval)
529 {
530 	zfsvfs_t *zfsvfs = arg;
531 
532 	if (newval) {
533 		/* XXX locking on vfs_flag? */
534 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
535 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
536 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
537 	} else {
538 		/* XXX locking on vfs_flag? */
539 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
540 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
541 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
542 	}
543 }
544 
545 static void
546 setuid_changed_cb(void *arg, uint64_t newval)
547 {
548 	zfsvfs_t *zfsvfs = arg;
549 
550 	if (newval == FALSE) {
551 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
552 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
553 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
554 	} else {
555 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
556 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
557 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
558 	}
559 }
560 
561 static void
562 exec_changed_cb(void *arg, uint64_t newval)
563 {
564 	zfsvfs_t *zfsvfs = arg;
565 
566 	if (newval == FALSE) {
567 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
568 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
569 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
570 	} else {
571 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
572 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
573 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
574 	}
575 }
576 
577 /*
578  * The nbmand mount option can be changed at mount time.
579  * We can't allow it to be toggled on live file systems or incorrect
580  * behavior may be seen from cifs clients
581  *
582  * This property isn't registered via dsl_prop_register(), but this callback
583  * will be called when a file system is first mounted
584  */
585 static void
586 nbmand_changed_cb(void *arg, uint64_t newval)
587 {
588 	zfsvfs_t *zfsvfs = arg;
589 	if (newval == FALSE) {
590 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
591 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
592 	} else {
593 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
594 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
595 	}
596 }
597 
598 static void
599 snapdir_changed_cb(void *arg, uint64_t newval)
600 {
601 	zfsvfs_t *zfsvfs = arg;
602 
603 	zfsvfs->z_show_ctldir = newval;
604 }
605 
606 static void
607 acl_mode_changed_cb(void *arg, uint64_t newval)
608 {
609 	zfsvfs_t *zfsvfs = arg;
610 
611 	zfsvfs->z_acl_mode = newval;
612 }
613 
614 static void
615 acl_inherit_changed_cb(void *arg, uint64_t newval)
616 {
617 	zfsvfs_t *zfsvfs = arg;
618 
619 	zfsvfs->z_acl_inherit = newval;
620 }
621 
622 static void
623 acl_type_changed_cb(void *arg, uint64_t newval)
624 {
625 	zfsvfs_t *zfsvfs = arg;
626 
627 	zfsvfs->z_acl_type = newval;
628 }
629 
/*
 * Register dsl property callbacks for a mounted filesystem and then
 * re-apply any temporary mount-option overrides (which registration
 * would otherwise clobber).  Returns 0 on success or an errno; on a
 * registration failure, all callbacks registered so far are torn down.
 */
static int
zfs_register_callbacks(vfs_t *vfsp)
{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	uint64_t nbmand;
	boolean_t readonly = B_FALSE;
	boolean_t do_readonly = B_FALSE;
	boolean_t setuid = B_FALSE;
	boolean_t do_setuid = B_FALSE;
	boolean_t exec = B_FALSE;
	boolean_t do_exec = B_FALSE;
	boolean_t xattr = B_FALSE;
	boolean_t atime = B_FALSE;
	boolean_t do_atime = B_FALSE;
	boolean_t do_xattr = B_FALSE;
	int error = 0;

	ASSERT3P(vfsp, !=, NULL);
	zfsvfs = vfsp->vfs_data;
	ASSERT3P(zfsvfs, !=, NULL);
	os = zfsvfs->z_os;

	/*
	 * This function can be called for a snapshot when we update snapshot's
	 * mount point, which isn't really supported.
	 */
	if (dmu_objset_is_snapshot(os))
		return (EOPNOTSUPP);

	/*
	 * The act of registering our callbacks will destroy any mount
	 * options we may have.  In order to enable temporary overrides
	 * of mount options, we stash away the current values and
	 * restore them after we register the callbacks.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
	    !spa_writeable(dmu_objset_spa(os))) {
		readonly = B_TRUE;
		do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
		readonly = B_FALSE;
		do_readonly = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
		setuid = B_FALSE;
		do_setuid = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
		setuid = B_TRUE;
		do_setuid = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
		exec = B_FALSE;
		do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
		exec = B_TRUE;
		do_exec = B_TRUE;
	}
	/* xattr has three states (off/dir/sa), so it is cached eagerly too. */
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
		zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
		do_xattr = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
		atime = B_FALSE;
		do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
		atime = B_TRUE;
		do_atime = B_TRUE;
	}

	/*
	 * We need to enter pool configuration here, so that we can use
	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
	 * dsl_prop_get_integer() can not be used, because it has to acquire
	 * spa_namespace_lock and we can not do that because we already hold
	 * z_teardown_lock.  The problem is that spa_write_cachefile() is called
	 * with spa_namespace_lock held and the function calls ZFS vnode
	 * operations to write the cache file and thus z_teardown_lock is
	 * acquired after spa_namespace_lock.
	 */
	ds = dmu_objset_ds(os);
	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);

	/*
	 * nbmand is a special property.  It can only be changed at
	 * mount time.
	 *
	 * This is weird, but it is documented to only be changeable
	 * at mount time.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		nbmand = B_FALSE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
		nbmand = B_TRUE;
	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		return (error);
	}

	/*
	 * Register property callbacks.
	 *
	 * It would probably be fine to just check for i/o error from
	 * the first prop_register(), but I guess I like to go
	 * overboard...
	 */
	error = dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
	    zfsvfs);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
	if (error)
		goto unregister;

	/*
	 * Invoke our callbacks to restore temporary mount options.
	 */
	if (do_readonly)
		readonly_changed_cb(zfsvfs, readonly);
	if (do_setuid)
		setuid_changed_cb(zfsvfs, setuid);
	if (do_exec)
		exec_changed_cb(zfsvfs, exec);
	if (do_xattr)
		xattr_changed_cb(zfsvfs, xattr);
	if (do_atime)
		atime_changed_cb(zfsvfs, atime);

	nbmand_changed_cb(zfsvfs, nbmand);

	return (0);

unregister:
	/* Partial registration: tear down whatever did get registered. */
	dsl_prop_unregister_all(ds, zfsvfs);
	return (error);
}
793 
794 /*
795  * Associate this zfsvfs with the given objset, which must be owned.
796  * This will cache a bunch of on-disk state from the objset in the
797  * zfsvfs.
798  */
799 static int
800 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
801 {
802 	int error;
803 	uint64_t val;
804 
805 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
806 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
807 	zfsvfs->z_os = os;
808 
809 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
810 	if (error != 0)
811 		return (error);
812 	if (zfsvfs->z_version >
813 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
814 		(void) printf("Can't mount a version %lld file system "
815 		    "on a version %lld pool\n. Pool must be upgraded to mount "
816 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
817 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
818 		return (SET_ERROR(ENOTSUP));
819 	}
820 	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
821 	if (error != 0)
822 		return (error);
823 	zfsvfs->z_norm = (int)val;
824 
825 	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
826 	if (error != 0)
827 		return (error);
828 	zfsvfs->z_utf8 = (val != 0);
829 
830 	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
831 	if (error != 0)
832 		return (error);
833 	zfsvfs->z_case = (uint_t)val;
834 
835 	error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
836 	if (error != 0)
837 		return (error);
838 	zfsvfs->z_acl_type = (uint_t)val;
839 
840 	/*
841 	 * Fold case on file systems that are always or sometimes case
842 	 * insensitive.
843 	 */
844 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
845 	    zfsvfs->z_case == ZFS_CASE_MIXED)
846 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
847 
848 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
849 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
850 
851 	uint64_t sa_obj = 0;
852 	if (zfsvfs->z_use_sa) {
853 		/* should either have both of these objects or none */
854 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
855 		    &sa_obj);
856 		if (error != 0)
857 			return (error);
858 
859 		error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
860 		if (error == 0 && val == ZFS_XATTR_SA)
861 			zfsvfs->z_xattr_sa = B_TRUE;
862 	}
863 
864 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
865 	    &zfsvfs->z_attr_table);
866 	if (error != 0)
867 		return (error);
868 
869 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
870 		sa_register_update_callback(os, zfs_sa_upgrade);
871 
872 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
873 	    &zfsvfs->z_root);
874 	if (error != 0)
875 		return (error);
876 	ASSERT3U(zfsvfs->z_root, !=, 0);
877 
878 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
879 	    &zfsvfs->z_unlinkedobj);
880 	if (error != 0)
881 		return (error);
882 
883 	error = zap_lookup(os, MASTER_NODE_OBJ,
884 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
885 	    8, 1, &zfsvfs->z_userquota_obj);
886 	if (error == ENOENT)
887 		zfsvfs->z_userquota_obj = 0;
888 	else if (error != 0)
889 		return (error);
890 
891 	error = zap_lookup(os, MASTER_NODE_OBJ,
892 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
893 	    8, 1, &zfsvfs->z_groupquota_obj);
894 	if (error == ENOENT)
895 		zfsvfs->z_groupquota_obj = 0;
896 	else if (error != 0)
897 		return (error);
898 
899 	error = zap_lookup(os, MASTER_NODE_OBJ,
900 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
901 	    8, 1, &zfsvfs->z_projectquota_obj);
902 	if (error == ENOENT)
903 		zfsvfs->z_projectquota_obj = 0;
904 	else if (error != 0)
905 		return (error);
906 
907 	error = zap_lookup(os, MASTER_NODE_OBJ,
908 	    zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
909 	    8, 1, &zfsvfs->z_userobjquota_obj);
910 	if (error == ENOENT)
911 		zfsvfs->z_userobjquota_obj = 0;
912 	else if (error != 0)
913 		return (error);
914 
915 	error = zap_lookup(os, MASTER_NODE_OBJ,
916 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
917 	    8, 1, &zfsvfs->z_groupobjquota_obj);
918 	if (error == ENOENT)
919 		zfsvfs->z_groupobjquota_obj = 0;
920 	else if (error != 0)
921 		return (error);
922 
923 	error = zap_lookup(os, MASTER_NODE_OBJ,
924 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
925 	    8, 1, &zfsvfs->z_projectobjquota_obj);
926 	if (error == ENOENT)
927 		zfsvfs->z_projectobjquota_obj = 0;
928 	else if (error != 0)
929 		return (error);
930 
931 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
932 	    &zfsvfs->z_fuid_obj);
933 	if (error == ENOENT)
934 		zfsvfs->z_fuid_obj = 0;
935 	else if (error != 0)
936 		return (error);
937 
938 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
939 	    &zfsvfs->z_shares_dir);
940 	if (error == ENOENT)
941 		zfsvfs->z_shares_dir = 0;
942 	else if (error != 0)
943 		return (error);
944 
945 	/*
946 	 * Only use the name cache if we are looking for a
947 	 * name on a file system that does not require normalization
948 	 * or case folding.  We can also look there if we happen to be
949 	 * on a non-normalizing, mixed sensitivity file system IF we
950 	 * are looking for the exact name (which is always the case on
951 	 * FreeBSD).
952 	 */
953 	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
954 	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
955 	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
956 
957 	return (0);
958 }
959 
960 taskq_t *zfsvfs_taskq;
961 
962 static void
963 zfsvfs_task_unlinked_drain(void *context, int pending __unused)
964 {
965 
966 	zfs_unlinked_drain((zfsvfs_t *)context);
967 }
968 
969 int
970 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
971 {
972 	objset_t *os;
973 	zfsvfs_t *zfsvfs;
974 	int error;
975 	boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
976 
977 	/*
978 	 * XXX: Fix struct statfs so this isn't necessary!
979 	 *
980 	 * The 'osname' is used as the filesystem's special node, which means
981 	 * it must fit in statfs.f_mntfromname, or else it can't be
982 	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
983 	 * 'zfs unmount' to think it's not mounted when it is.
984 	 */
985 	if (strlen(osname) >= MNAMELEN)
986 		return (SET_ERROR(ENAMETOOLONG));
987 
988 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
989 
990 	error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
991 	    &os);
992 	if (error != 0) {
993 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
994 		return (error);
995 	}
996 
997 	error = zfsvfs_create_impl(zfvp, zfsvfs, os);
998 
999 	return (error);
1000 }
1001 
1002 
/*
 * Second-stage zfsvfs construction: initialize locks, lists and the
 * unlinked-drain task, then cache on-disk state via zfsvfs_init().
 * On failure the objset is disowned and the zfsvfs freed, so the caller
 * must not touch either afterwards; *zfvp is set to NULL.
 */
int
zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
{
	int error;

	/* No vfs yet; z_vfs is attached later at mount time. */
	zfsvfs->z_vfs = NULL;
	zfsvfs->z_parent = zfsvfs;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));
	TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
	    zfsvfs_task_unlinked_drain, zfsvfs);
	ZFS_TEARDOWN_INIT(zfsvfs);
	ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	error = zfsvfs_init(zfsvfs, os);
	if (error != 0) {
		dmu_objset_disown(os, B_TRUE, zfsvfs);
		*zfvp = NULL;
		kmem_free(zfsvfs, sizeof (zfsvfs_t));
		return (error);
	}

	*zfvp = zfsvfs;
	return (0);
}
1034 
/*
 * Finish setting up a zfsvfs_t once its objset is owned: register
 * property callbacks, create kstats, open the ZIL and -- on an initial
 * mount -- drain the unlinked set and replay the intent log.
 *
 * 'mounting' is B_TRUE for a regular mount and B_FALSE when resuming
 * after an online recv/rollback (zfs_resume_fs()).  Returns 0 or errno.
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
		return (SET_ERROR(EROFS));

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		/* Fresh mount: the kstats must not exist yet. */
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
		error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
		if (error)
			return (error);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0) {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		} else {
			dsl_dir_t *dd;
			zap_stats_t zs;

			/* Seed the nunlinks kstat from the unlinked ZAP. */
			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
			    &zs) == 0) {
				dataset_kstats_update_nunlinks_kstat(
				    &zfsvfs->z_kstat, zs.zs_num_entries);
				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
				    "num_entries in unlinked set: %llu",
				    (u_longlong_t)zs.zs_num_entries);
			}

			zfs_unlinked_drain(zfsvfs);
			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
			dd->dd_activity_cancelled = B_FALSE;
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain().  (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.)  This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg.  This would write a "create
		 * object N" record to the intent log.  Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk.  So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/* Namecache is bypassed while replaying. */
				boolean_t use_nc = zfsvfs->z_use_namecache;
				zfsvfs->z_use_namecache = B_FALSE;
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
				zfsvfs->z_use_namecache = use_nc;
			}
		}

		/* restore readonly bit */
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	} else {
		/* Resuming: kstats survived the teardown, just reopen ZIL. */
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}
1151 
/*
 * Tear down and free a zfsvfs_t that is no longer in use.  Destroys
 * every lock and list initialized by zfsvfs_create_impl() and releases
 * the FUID state and kstats before freeing the structure itself.
 * Callers in this file disown the objset before calling this.
 */
void
zfsvfs_free(zfsvfs_t *zfsvfs)
{
	int i;

	zfs_fuid_destroy(zfsvfs);

	mutex_destroy(&zfsvfs->z_znodes_lock);
	mutex_destroy(&zfsvfs->z_lock);
	list_destroy(&zfsvfs->z_all_znodes);
	ZFS_TEARDOWN_DESTROY(zfsvfs);
	ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
	rw_destroy(&zfsvfs->z_fuid_lock);
	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
	dataset_kstats_destroy(&zfsvfs->z_kstat);
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
1170 
1171 static void
1172 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1173 {
1174 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1175 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1176 }
1177 
/*
 * Mount the dataset 'osname' on the FreeBSD mount described by 'vfsp':
 * create and own the zfsvfs_t, publish mount flags and the fsid, then
 * either configure a read-only snapshot mount or run the full
 * zfsvfs_setup() path.  On error the objset is disowned and the
 * zfsvfs_t is freed; on success zfs_active_fs_count is bumped.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;

	ASSERT3P(vfsp, !=, NULL);
	ASSERT3P(osname, !=, NULL);

	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

	/* Report the dataset's recordsize as the optimal I/O size. */
	if ((error = dsl_prop_get_integer(osname,
	    "recordsize", &recordsize, NULL)))
		goto out;
	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
	/*
	 * This can cause a loss of coherence between ARC and page cache
	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
	 */
	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;

#if defined(_KERNEL) && !defined(KMEM_DEBUG)
	/* Lockless lookup is enabled only in non-debug kernel builds. */
	vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID.  The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);

	/*
	 * Set features for file system.
	 */
	zfs_set_fuid_feature(zfsvfs);

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		/* Snapshots are always read-only with atime disabled. */
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if ((error = dsl_prop_get_integer(osname,
		    "xattr", &pval, NULL)))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		if ((error = dsl_prop_get_integer(osname,
		    "acltype", &pval, NULL)))
			goto out;
		acl_type_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
	} else {
		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
			goto out;
	}

	vfs_mountedfrom(vfsp, osname);

	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	if (error) {
		/* Release the objset hold and all zfsvfs_t resources. */
		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
		zfsvfs_free(zfsvfs);
	} else {
		atomic_inc_32(&zfs_active_fs_count);
	}

	return (error);
}
1272 
1273 static void
1274 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1275 {
1276 	objset_t *os = zfsvfs->z_os;
1277 
1278 	if (!dmu_objset_is_snapshot(os))
1279 		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1280 }
1281 
1282 static int
1283 getpoolname(const char *osname, char *poolname)
1284 {
1285 	char *p;
1286 
1287 	p = strchr(osname, '/');
1288 	if (p == NULL) {
1289 		if (strlen(osname) >= MAXNAMELEN)
1290 			return (ENAMETOOLONG);
1291 		(void) strcpy(poolname, osname);
1292 	} else {
1293 		if (p - osname >= MAXNAMELEN)
1294 			return (ENAMETOOLONG);
1295 		(void) strlcpy(poolname, osname, p - osname + 1);
1296 	}
1297 	return (0);
1298 }
1299 
/*
 * A leading '!' on the dataset name requests a checkpoint rewind at
 * pool import time.  Record that request in *checkpointrewind and
 * strip the marker from 'name' in place.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
	*checkpointrewind = (name[0] == '!');
	if (*checkpointrewind) {
		/* Shift the string left by one, including the NUL. */
		memmove(name, name + 1, strlen(name));
	}
}
1311 
/*
 * VFS_MOUNT entry point.  Validates the caller's privilege (falling
 * back to ZFS delegated 'mount' permission and mount-point ownership),
 * enforces zone/jail visibility, handles the MS_REMOUNT fast path, and
 * finally calls zfs_domount() to do the real work.  The dataset name
 * comes from the "from" mount option.
 */
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t	*td = curthread;
	vnode_t		*mvp = vfsp->mnt_vnodecovered;
	cred_t		*cr = td->td_ucred;
	char		*osname;
	int		error = 0;
	int		canwrite;
	bool		checkpointrewind, isctlsnap = false;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (SET_ERROR(EINVAL));

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/* Strip a leading '!' (checkpoint rewind request) from the name. */
	fetch_osname_options(osname, &checkpointrewind);
	/* Snapshot auto-mount under .zfs gets relaxed privilege checks. */
	isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) &&
	    strchr(osname, '@') != NULL);

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error && isctlsnap) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	} else if (error) {
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t		vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK1(mvp);
				goto out;
			}

			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				VOP_UNLOCK1(mvp);
				goto out;
			}
			VOP_UNLOCK1(mvp);
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		boolean_t mount_snapshot = B_FALSE;

		/*
		 * Snapshots may be mounted in .zfs for unjailed datasets
		 * if allowed by the jail param zfs.mount_snapshot.
		 */
		if (isctlsnap) {
			struct prison *pr;
			struct zfs_jailparam *zjp;

			pr = curthread->td_ucred->cr_prison;
			mtx_lock(&pr->pr_mtx);
			zjp = osd_jail_get(pr, zfs_jailparam_slot);
			mtx_unlock(&pr->pr_mtx);
			if (zjp && zjp->mount_snapshot)
				mount_snapshot = B_TRUE;
		}
		if (!mount_snapshot) {
			error = SET_ERROR(EPERM);
			goto out;
		}
	}

	vfsp->vfs_flag |= MNT_NFS4ACLS;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * Refresh mount options with z_teardown_lock blocking I/O while
		 * the filesystem is in an inconsistent state.
		 * The lock also serializes this code with filesystem
		 * manipulations between entry to zfs_suspend_fs() and return
		 * from zfs_resume_fs().
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfs_unregister_callbacks(zfsvfs);
		error = zfs_register_callbacks(vfsp);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
		char pname[MAXNAMELEN];

		error = getpoolname(osname, pname);
		if (error == 0)
			error = spa_import_rootpool(pname, checkpointrewind);
		if (error)
			goto out;
	}
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

out:
	return (error);
}
1449 
/*
 * VFS_STATFS entry point: fill in 'statp' with space and object counts
 * for this filesystem.  Space figures are expressed in f_bsize
 * (SPA_MINBLOCKSIZE) units.
 */
static int
zfs_statfs(vfs_t *vfsp, struct statfs *statp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	uint64_t refdbytes, availbytes, usedobjs, availobjs;
	int error;

	statp->f_version = STATFS_VERSION;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	dmu_objset_space(zfsvfs->z_os,
	    &refdbytes, &availbytes, &usedobjs, &availobjs);

	/*
	 * The underlying storage pool actually uses multiple block sizes.
	 * We report the fragsize as the smallest block size we support,
	 * and we report our blocksize as the filesystem's maximum blocksize.
	 */
	statp->f_bsize = SPA_MINBLOCKSIZE;
	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;

	/*
	 * The following report "total" blocks of various kinds in the
	 * file system, but reported in terms of f_frsize - the
	 * "fragment" size.
	 */

	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
	statp->f_bfree = availbytes / statp->f_bsize;
	statp->f_bavail = statp->f_bfree; /* no root reservation */

	/*
	 * statvfs() should really be called statufs(), because it assumes
	 * static metadata.  ZFS doesn't preallocate files, so the best
	 * we can do is report the max that could possibly fit in f_files,
	 * and that minus the number actually used in f_ffree.
	 * For f_ffree, report the smaller of the number of object available
	 * and the number of blocks (each object will take at least a block).
	 */
	statp->f_ffree = MIN(availobjs, statp->f_bfree);
	statp->f_files = statp->f_ffree + usedobjs;

	/*
	 * We're a zfs filesystem.
	 */
	strlcpy(statp->f_fstypename, "zfs",
	    sizeof (statp->f_fstypename));

	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
	    sizeof (statp->f_mntfromname));
	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
	    sizeof (statp->f_mntonname));

	statp->f_namemax = MAXNAMELEN - 1;

	zfs_exit(zfsvfs, FTAG);
	return (0);
}
1510 
1511 static int
1512 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1513 {
1514 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1515 	znode_t *rootzp;
1516 	int error;
1517 
1518 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1519 		return (error);
1520 
1521 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1522 	if (error == 0)
1523 		*vpp = ZTOV(rootzp);
1524 
1525 	zfs_exit(zfsvfs, FTAG);
1526 
1527 	if (error == 0) {
1528 		error = vn_lock(*vpp, flags);
1529 		if (error != 0) {
1530 			VN_RELE(*vpp);
1531 			*vpp = NULL;
1532 		}
1533 	}
1534 	return (error);
1535 }
1536 
1537 /*
1538  * Teardown the zfsvfs::z_os.
1539  *
1540  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1541  * and 'z_teardown_inactive_lock' held.
1542  */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t	*zp;
	dsl_dir_t *dd;

	/*
	 * If someone has not already unmounted this file system,
	 * drain the zrele_taskq to ensure all active references to the
	 * zfsvfs_t have been handled only then can it be safely destroyed.
	 */
	if (zfsvfs->z_os) {
		/*
		 * If we're unmounting we have to wait for the list to
		 * drain completely.
		 *
		 * If we're not unmounting there's no guarantee the list
		 * will drain completely, but zreles run from the taskq
		 * may add the parents of dir-based xattrs to the taskq
		 * so we want to wait for these.
		 *
		 * We can safely check z_all_znodes for being empty because the
		 * VFS has already blocked operations which add to it.
		 */
		int round = 0;
		while (!list_is_empty(&zfsvfs->z_all_znodes)) {
			taskq_wait_outstanding(dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)), 0);
			/* Non-unmount callers give up after two passes. */
			if (++round > 1 && !unmounting)
				break;
		}
	}
	/* Block out new vops; see the function header about lock lifetime. */
	ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
#ifdef FREEBSD_NAMECACHE
#if __FreeBSD_version >= 1300117
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#else
		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
#endif
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		if (zp->z_sa_hdl != NULL) {
			zfs_znode_dmu_fini(zp);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data
	 */
	if (!zfs_is_readonly(zfsvfs))
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);
	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
	/* Wake anything blocked in dsl_dir activity waiters. */
	dsl_dir_cancel_waiters(dd);

	return (0);
}
1666 
/*
 * VFS_UNMOUNT entry point.  A caller lacking unmount privilege must
 * hold the delegated 'mount' permission on the dataset.  With MS_FORCE
 * the filesystem is marked unmounted up front so no new vops start
 * while vnodes are flushed.  On success the objset is disowned and the
 * zfsvfs_t freed via zfs_freevfs().
 */
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		if (dsl_deleg_access((char *)vfsp->vfs_resource,
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE). This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
	/* Cancel or wait out any in-flight unlinked-drain task. */
	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
		taskqueue_drain(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);

	VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	zfs_freevfs(vfsp);

	return (0);
}
1744 
/*
 * VFS_VGET entry point: translate an inode number into a referenced,
 * locked vnode.  The vnode lock is taken after zfs_exit() so the
 * teardown lock is not held across it; the zfs_zget() reference keeps
 * the vnode alive in between.
 */
static int
zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
{
	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
	znode_t		*zp;
	int 		err;

	/*
	 * zfs_zget() can't operate on virtual entries like .zfs/ or
	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
	 * This will make NFS to switch to LOOKUP instead of using VGET.
	 */
	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
		return (EOPNOTSUPP);

	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
		return (err);
	err = zfs_zget(zfsvfs, ino, &zp);
	/* Files pending deletion must not be resurrected via vget. */
	if (err == 0 && zp->z_unlinked) {
		vrele(ZTOV(zp));
		err = EINVAL;
	}
	if (err == 0)
		*vpp = ZTOV(zp);
	zfs_exit(zfsvfs, FTAG);
	if (err == 0) {
		err = vn_lock(*vpp, flags);
		if (err != 0)
			vrele(*vpp);
	}
	if (err != 0)
		*vpp = NULL;
	return (err);
}
1780 
/*
 * VFS_CHECKEXP entry point: NFS export access check.  Delegates to the
 * stock FreeBSD implementation, always using the parent filesystem's
 * vfs_t (see the comment in the body).  The prototype changed in
 * FreeBSD 1300098, hence the conditional signature.
 */
static int
#if __FreeBSD_version >= 1300098
zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int *secflavors)
#else
zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors)
#endif
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	/*
	 * If this is regular file system vfsp is the same as
	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
	 * zfsvfs->z_parent->z_vfs represents parent file system
	 * which we have to use here, because only this file system
	 * has mnt_export configured.
	 */
	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
	    credanonp, numsecflavors, secflavors));
}
1802 
1803 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
1804 	"struct fid bigger than SHORT_FID_LEN");
1805 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
1806 	"struct fid bigger than LONG_FID_LEN");
1807 
/*
 * VFS_FHTOVP entry point: translate an NFS file handle into a locked,
 * referenced vnode.  Handles both the short (regular filesystem) and
 * long (snapshot) ZFS fid formats, plus the virtual .zfs directories.
 * The handle's embedded generation number is validated against the
 * znode's to reject stale handles.
 */
static int
zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
{
	struct componentname cn;
	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
	znode_t		*zp;
	vnode_t		*dvp;
	uint64_t	object = 0;
	uint64_t	fid_gen = 0;
	uint64_t	setgen = 0;
	uint64_t	gen_mask;
	uint64_t	zp_gen;
	int 		i, err;

	*vpp = NULL;

	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
		return (err);

	/*
	 * On FreeBSD we can get snapshot's mount point or its parent file
	 * system mount point depending if snapshot is already mounted or not.
	 */
	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
		uint64_t	objsetid = 0;

		/* Decode the little-endian byte arrays stored in the fid. */
		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

		zfs_exit(zfsvfs, FTAG);

		/* Switch to the snapshot's own zfsvfs for the rest. */
		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
		if (err)
			return (SET_ERROR(EINVAL));
		if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
			return (err);
	}

	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
		zfid_short_t	*zfid = (zfid_short_t *)fidp;

		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
	} else {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* A long fid reaching here with a non-zero setgen is malformed. */
	if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
		zfs_exit(zfsvfs, FTAG);
		dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
		    (u_longlong_t)fid_gen, (u_longlong_t)setgen);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
	 * directory tree. If the object == zfsvfs->z_shares_dir, then
	 * we are in the .zfs/shares directory tree.
	 */
	if ((fid_gen == 0 &&
	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
		zfs_exit(zfsvfs, FTAG);
		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
		if (object == ZFSCTL_INO_SNAPDIR) {
			cn.cn_nameptr = "snapshot";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN | LOCKLEAF;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else if (object == zfsvfs->z_shares_dir) {
			/*
			 * XXX This branch must not be taken,
			 * if it is, then the lookup below will
			 * explode.
			 */
			cn.cn_nameptr = "shares";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else {
			*vpp = dvp;
		}
		/* err is 0 on every path reaching here. */
		return (err);
	}

	/* 'i' is sizeof (zf_gen) after the decode loop above. */
	gen_mask = -1ULL >> (64 - 8 * i);

	dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
	    (u_longlong_t)fid_gen,
	    (u_longlong_t)gen_mask);
	if ((err = zfs_zget(zfsvfs, object, &zp))) {
		zfs_exit(zfsvfs, FTAG);
		return (err);
	}
	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (uint64_t));
	zp_gen = zp_gen & gen_mask;
	/* Generation 0 is stored as 1 in the fid (see matching encode). */
	if (zp_gen == 0)
		zp_gen = 1;
	if (zp->z_unlinked || zp_gen != fid_gen) {
		dprintf("znode gen (%llu) != fid gen (%llu)\n",
		    (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
		vrele(ZTOV(zp));
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	*vpp = ZTOV(zp);
	zfs_exit(zfsvfs, FTAG);
	err = vn_lock(*vpp, flags);
	if (err == 0)
		vnode_create_vobject(*vpp, zp->z_size, curthread);
	else
		*vpp = NULL;
	return (err);
}
1938 
1939 /*
1940  * Block out VOPs and close zfsvfs_t::z_os
1941  *
1942  * Note, if successful, then we return with the 'z_teardown_lock' and
1943  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
1944  * dataset and objset intact so that they can be atomically handed off during
1945  * a subsequent rollback or recv operation and the resume thereafter.
1946  */
1947 int
1948 zfs_suspend_fs(zfsvfs_t *zfsvfs)
1949 {
1950 	int error;
1951 
1952 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1953 		return (error);
1954 
1955 	return (0);
1956 }
1957 
1958 /*
1959  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
1960  * is an invariant across any of the operations that can be performed while the
1961  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
1962  * are the same: the relevant objset and associated dataset are owned by
1963  * zfsvfs, held, and long held on entry.
1964  */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	/* Both teardown locks must still be write-held from suspend. */
	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	ds->ds_dir->dd_activity_cancelled = B_FALSE;
	VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs.  If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via zfs_enter_verify_zp
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}
2025 
2026 static void
2027 zfs_freevfs(vfs_t *vfsp)
2028 {
2029 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2030 
2031 	zfsvfs_free(zfsvfs);
2032 
2033 	atomic_dec_32(&zfs_active_fs_count);
2034 }
2035 
2036 #ifdef __i386__
2037 static int desiredvnodes_backup;
2038 #include <sys/vmmeter.h>
2039 
2040 
2041 #include <vm/vm_page.h>
2042 #include <vm/vm_object.h>
2043 #include <vm/vm_kern.h>
2044 #include <vm/vm_map.h>
2045 #endif
2046 
/*
 * On i386, shrink the system-wide desiredvnodes limit (ZFS vnodes are
 * heavier than the UFS inodes the default is sized for).  Only applies
 * if the administrator has not tuned the value; the original is saved
 * in desiredvnodes_backup for zfs_vnodes_adjust_back().  No-op on
 * other architectures.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int newdesiredvnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * We calculate newdesiredvnodes the same way it is done in
	 * vntblinit(). If it is equal to desiredvnodes, it means that
	 * it wasn't tuned by the administrator and we can tune it down.
	 */
	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (newdesiredvnodes == desiredvnodes)
		desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}
2067 
/*
 * Undo zfs_vnodes_adjust(): restore the desiredvnodes value saved
 * before the i386 tuning was applied.  No-op on other platforms.
 */
static void
zfs_vnodes_adjust_back(void)
{

#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
2076 
#if __FreeBSD_version >= 1300139
/* Lock and marker vnode serializing vnlru_free_vfsops() calls below. */
static struct sx zfs_vnlru_lock;
static struct vnode *zfs_vnlru_marker;
#endif
/* ARC prune callback handle; registered in zfs_init(), removed in zfs_fini(). */
static arc_prune_t *zfs_prune;
2082 
2083 static void
2084 zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
2085 {
2086 	if (nr_to_scan > INT_MAX)
2087 		nr_to_scan = INT_MAX;
2088 #if __FreeBSD_version >= 1300139
2089 	sx_xlock(&zfs_vnlru_lock);
2090 	vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
2091 	sx_xunlock(&zfs_vnlru_lock);
2092 #else
2093 	vnlru_free(nr_to_scan, &zfs_vfsops);
2094 #endif
2095 }
2096 
/*
 * One-time ZPL module initialization: set up the .zfs control directory
 * machinery, the znode cache, the i386 vnode-limit adjustment, the DMU
 * objset callback for DMU_OST_ZFS, the zfsvfs taskq, and the ARC prune
 * callback used to reclaim vnodes under memory pressure.
 * Mirrored by zfs_fini().
 */
void
zfs_init(void)
{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	/*
	 * Reduce number of vnodes. Originally number of vnodes is calculated
	 * with UFS inode in mind. We reduce it here, because it's too big for
	 * ZFS/i386.
	 */
	zfs_vnodes_adjust();

	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);

	/* Single-threaded taskq for deferred zfsvfs work. */
	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);

#if __FreeBSD_version >= 1300139
	zfs_vnlru_marker = vnlru_alloc_marker();
	sx_init(&zfs_vnlru_lock, "zfs vnlru lock");
#endif
	/* Let the ARC trigger vnode reclamation when it needs memory back. */
	zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL);
}
2130 
/*
 * ZPL module teardown: undo zfs_init() in reverse order, finishing by
 * restoring the pre-load vnode limit.
 */
void
zfs_fini(void)
{
	arc_remove_prune_callback(zfs_prune);
#if __FreeBSD_version >= 1300139
	vnlru_free_marker(zfs_vnlru_marker);
	sx_destroy(&zfs_vnlru_lock);
#endif

	taskq_destroy(zfsvfs_taskq);
	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
}
2145 
2146 int
2147 zfs_busy(void)
2148 {
2149 	return (zfs_active_fs_count != 0);
2150 }
2151 
2152 /*
2153  * Release VOPs and unmount a suspended filesystem.
2154  */
2155 int
2156 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2157 {
2158 	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
2159 	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
2160 
2161 	/*
2162 	 * We already own this, so just hold and rele it to update the
2163 	 * objset_t, as the one we had before may have been evicted.
2164 	 */
2165 	objset_t *os;
2166 	VERIFY3P(ds->ds_owner, ==, zfsvfs);
2167 	VERIFY(dsl_dataset_long_held(ds));
2168 	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2169 	dsl_pool_config_enter(dp, FTAG);
2170 	VERIFY0(dmu_objset_from_ds(ds, &os));
2171 	dsl_pool_config_exit(dp, FTAG);
2172 	zfsvfs->z_os = os;
2173 
2174 	/* release the VOPs */
2175 	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
2176 	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
2177 
2178 	/*
2179 	 * Try to force unmount this file system.
2180 	 */
2181 	(void) zfs_umount(zfsvfs->z_vfs, 0);
2182 	zfsvfs->z_unmounted = B_TRUE;
2183 	return (0);
2184 }
2185 
2186 int
2187 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2188 {
2189 	int error;
2190 	objset_t *os = zfsvfs->z_os;
2191 	dmu_tx_t *tx;
2192 
2193 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2194 		return (SET_ERROR(EINVAL));
2195 
2196 	if (newvers < zfsvfs->z_version)
2197 		return (SET_ERROR(EINVAL));
2198 
2199 	if (zfs_spa_version_map(newvers) >
2200 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
2201 		return (SET_ERROR(ENOTSUP));
2202 
2203 	tx = dmu_tx_create(os);
2204 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2205 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2206 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2207 		    ZFS_SA_ATTRS);
2208 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2209 	}
2210 	error = dmu_tx_assign(tx, TXG_WAIT);
2211 	if (error) {
2212 		dmu_tx_abort(tx);
2213 		return (error);
2214 	}
2215 
2216 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2217 	    8, 1, &newvers, tx);
2218 
2219 	if (error) {
2220 		dmu_tx_commit(tx);
2221 		return (error);
2222 	}
2223 
2224 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2225 		uint64_t sa_obj;
2226 
2227 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2228 		    SPA_VERSION_SA);
2229 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2230 		    DMU_OT_NONE, 0, tx);
2231 
2232 		error = zap_add(os, MASTER_NODE_OBJ,
2233 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2234 		ASSERT0(error);
2235 
2236 		VERIFY0(sa_set_sa_object(os, sa_obj));
2237 		sa_register_update_callback(os, zfs_sa_upgrade);
2238 	}
2239 
2240 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2241 	    "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2242 	    (uintmax_t)newvers);
2243 	dmu_tx_commit(tx);
2244 
2245 	zfsvfs->z_version = newvers;
2246 	os->os_version = newvers;
2247 
2248 	zfs_set_fuid_feature(zfsvfs);
2249 
2250 	return (0);
2251 }
2252 
2253 /*
2254  * Return true if the corresponding vfs's unmounted flag is set.
2255  * Otherwise return false.
2256  * If this function returns true we know VFS unmount has been initiated.
2257  */
2258 boolean_t
2259 zfs_get_vfs_flag_unmounted(objset_t *os)
2260 {
2261 	zfsvfs_t *zfvp;
2262 	boolean_t unmounted = B_FALSE;
2263 
2264 	ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
2265 
2266 	mutex_enter(&os->os_user_ptr_lock);
2267 	zfvp = dmu_objset_get_user(os);
2268 	if (zfvp != NULL && zfvp->z_vfs != NULL &&
2269 	    (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2270 		unmounted = B_TRUE;
2271 	mutex_exit(&os->os_user_ptr_lock);
2272 
2273 	return (unmounted);
2274 }
2275 
#ifdef _KERNEL
/*
 * After a dataset rename, rewrite the f_mntfromname of every mount whose
 * source was the old dataset name itself or one of its descendants
 * ("oldname/child") or snapshots ("oldname@snap"), substituting the new
 * name for the old prefix.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char newpath[MAXPATHLEN];
	struct mount *mp;
	char *from;
	size_t prefixlen;

	prefixlen = strlen(oldname);

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		from = mp->mnt_stat.f_mntfromname;
		if (strcmp(from, oldname) == 0) {
			/* Exact match: the renamed dataset itself. */
			(void) strlcpy(from, newname,
			    sizeof (mp->mnt_stat.f_mntfromname));
		} else if (strncmp(from, oldname, prefixlen) == 0 &&
		    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
			/* Child dataset or snapshot: splice in the new prefix. */
			(void) snprintf(newpath, sizeof (newpath), "%s%s",
			    newname, from + prefixlen);
			(void) strlcpy(from, newpath,
			    sizeof (mp->mnt_stat.f_mntfromname));
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif
2307 
2308 /*
2309  * Find a prison with ZFS info.
2310  * Return the ZFS info and the (locked) prison.
2311  */
2312 static struct zfs_jailparam *
2313 zfs_jailparam_find(struct prison *spr, struct prison **prp)
2314 {
2315 	struct prison *pr;
2316 	struct zfs_jailparam *zjp;
2317 
2318 	for (pr = spr; ; pr = pr->pr_parent) {
2319 		mtx_lock(&pr->pr_mtx);
2320 		if (pr == &prison0) {
2321 			zjp = &zfs_jailparam0;
2322 			break;
2323 		}
2324 		zjp = osd_jail_get(pr, zfs_jailparam_slot);
2325 		if (zjp != NULL)
2326 			break;
2327 		mtx_unlock(&pr->pr_mtx);
2328 	}
2329 	*prp = pr;
2330 
2331 	return (zjp);
2332 }
2333 
2334 /*
2335  * Ensure a prison has its own ZFS info.  If zjpp is non-null, point it to the
2336  * ZFS info and lock the prison.
2337  */
2338 static void
2339 zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
2340 {
2341 	struct prison *ppr;
2342 	struct zfs_jailparam *zjp, *nzjp;
2343 	void **rsv;
2344 
2345 	/* If this prison already has ZFS info, return that. */
2346 	zjp = zfs_jailparam_find(pr, &ppr);
2347 	if (ppr == pr)
2348 		goto done;
2349 
2350 	/*
2351 	 * Allocate a new info record.  Then check again, in case something
2352 	 * changed during the allocation.
2353 	 */
2354 	mtx_unlock(&ppr->pr_mtx);
2355 	nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
2356 	rsv = osd_reserve(zfs_jailparam_slot);
2357 	zjp = zfs_jailparam_find(pr, &ppr);
2358 	if (ppr == pr) {
2359 		free(nzjp, M_PRISON);
2360 		osd_free_reserved(rsv);
2361 		goto done;
2362 	}
2363 	/* Inherit the initial values from the ancestor. */
2364 	mtx_lock(&pr->pr_mtx);
2365 	(void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
2366 	(void) memcpy(nzjp, zjp, sizeof (*zjp));
2367 	zjp = nzjp;
2368 	mtx_unlock(&ppr->pr_mtx);
2369 done:
2370 	if (zjpp != NULL)
2371 		*zjpp = zjp;
2372 	else
2373 		mtx_unlock(&pr->pr_mtx);
2374 }
2375 
2376 /*
2377  * Jail OSD methods for ZFS VFS info.
2378  */
2379 static int
2380 zfs_jailparam_create(void *obj, void *data)
2381 {
2382 	struct prison *pr = obj;
2383 	struct vfsoptlist *opts = data;
2384 	int jsys;
2385 
2386 	if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
2387 	    jsys == JAIL_SYS_INHERIT)
2388 		return (0);
2389 	/*
2390 	 * Inherit a prison's initial values from its parent
2391 	 * (different from JAIL_SYS_INHERIT which also inherits changes).
2392 	 */
2393 	zfs_jailparam_alloc(pr, NULL);
2394 	return (0);
2395 }
2396 
/*
 * Jail OSD "get" method: report this prison's ZFS parameters into the
 * caller's option list.  A prison with its own record reports zfs=new
 * plus its values; an inheriting prison reports zfs=inherit and zeroed
 * parameters.
 */
static int
zfs_jailparam_get(void *obj, void *data)
{
	struct prison *ppr, *pr = obj;
	struct vfsoptlist *opts = data;
	struct zfs_jailparam *zjp;
	int jsys, error;

	zjp = zfs_jailparam_find(pr, &ppr);
	jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
	error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
	/* ENOENT just means the caller didn't ask for this option. */
	if (error != 0 && error != ENOENT)
		goto done;
	if (jsys == JAIL_SYS_NEW) {
		error = vfs_setopt(opts, "zfs.mount_snapshot",
		    &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
		if (error != 0 && error != ENOENT)
			goto done;
	} else {
		/*
		 * If this prison is inheriting its ZFS info, report
		 * empty/zero parameters.
		 */
		static int mount_snapshot = 0;

		error = vfs_setopt(opts, "zfs.mount_snapshot",
		    &mount_snapshot, sizeof (mount_snapshot));
		if (error != 0 && error != ENOENT)
			goto done;
	}
	error = 0;
done:
	/* zfs_jailparam_find() returned with ppr->pr_mtx held. */
	mtx_unlock(&ppr->pr_mtx);
	return (error);
}
2432 
/*
 * Jail OSD "set" method: apply "zfs=new|inherit" and "zfs.mount_snapshot"
 * options to a prison.  Specifying any "zfs.*" parameter implies zfs=new.
 * A child jail may not be granted more than its parent allows.
 * Option validity has already been checked by zfs_jailparam_check().
 */
static int
zfs_jailparam_set(void *obj, void *data)
{
	struct prison *pr = obj;
	struct prison *ppr;
	struct vfsoptlist *opts = data;
	int error, jsys, mount_snapshot;

	/* Set the parameters, which should be correct. */
	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
	if (error == ENOENT)
		jsys = -1;
	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
	    sizeof (mount_snapshot));
	if (error == ENOENT)
		mount_snapshot = -1;
	else
		/* An explicit parameter forces a private record. */
		jsys = JAIL_SYS_NEW;
	switch (jsys) {
	case JAIL_SYS_NEW:
	{
		/* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
		struct zfs_jailparam *zjp;

		/*
		 * A child jail cannot have more permissions than its parent
		 */
		if (pr->pr_parent != &prison0) {
			zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
			mtx_unlock(&ppr->pr_mtx);
			if (zjp->mount_snapshot < mount_snapshot) {
				return (EPERM);
			}
		}
		zfs_jailparam_alloc(pr, &zjp);
		if (mount_snapshot != -1)
			zjp->mount_snapshot = mount_snapshot;
		/* zfs_jailparam_alloc() returned with pr->pr_mtx held. */
		mtx_unlock(&pr->pr_mtx);
		break;
	}
	case JAIL_SYS_INHERIT:
		/* "zfs=inherit": inherit the parent's ZFS info. */
		mtx_lock(&pr->pr_mtx);
		osd_jail_del(pr, zfs_jailparam_slot);
		mtx_unlock(&pr->pr_mtx);
		break;
	case -1:
		/*
		 * If the setting being changed is not ZFS related
		 * then do nothing.
		 */
		break;
	}

	return (0);
}
2489 
2490 static int
2491 zfs_jailparam_check(void *obj __unused, void *data)
2492 {
2493 	struct vfsoptlist *opts = data;
2494 	int error, jsys, mount_snapshot;
2495 
2496 	/* Check that the parameters are correct. */
2497 	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2498 	if (error != ENOENT) {
2499 		if (error != 0)
2500 			return (error);
2501 		if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
2502 			return (EINVAL);
2503 	}
2504 	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2505 	    sizeof (mount_snapshot));
2506 	if (error != ENOENT) {
2507 		if (error != 0)
2508 			return (error);
2509 		if (mount_snapshot != 0 && mount_snapshot != 1)
2510 			return (EINVAL);
2511 	}
2512 	return (0);
2513 }
2514 
/*
 * Jail OSD destructor: free the per-prison ZFS parameter record when a
 * prison is destroyed or reverts to inheriting from its parent.
 */
static void
zfs_jailparam_destroy(void *data)
{

	free(data, M_PRISON);
}
2521 
/*
 * Register the jail OSD slot for ZFS parameters and seed every existing
 * prison with a private copy of the defaults.
 */
static void
zfs_jailparam_sysinit(void *arg __unused)
{
	struct prison *pr;
	osd_method_t  methods[PR_MAXMETHOD] = {
		[PR_METHOD_CREATE] = zfs_jailparam_create,
		[PR_METHOD_GET] = zfs_jailparam_get,
		[PR_METHOD_SET] = zfs_jailparam_set,
		[PR_METHOD_CHECK] = zfs_jailparam_check,
	};

	zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
	/* Copy the defaults to any existing prisons. */
	sx_slock(&allprison_lock);
	TAILQ_FOREACH(pr, &allprison, pr_list)
		zfs_jailparam_alloc(pr, NULL);
	sx_sunlock(&allprison_lock);
}
2540 
/*
 * Deregister the jail OSD slot; per-prison records are released via
 * zfs_jailparam_destroy() during deregistration.
 */
static void
zfs_jailparam_sysuninit(void *arg __unused)
{

	osd_jail_deregister(zfs_jailparam_slot);
}
2547 
/* Hook jail-parameter setup/teardown into kernel module init/uninit. */
SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
	zfs_jailparam_sysinit, NULL);
SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
	zfs_jailparam_sysuninit, NULL);
2552