1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24  * All rights reserved.
25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
28  */
29 
30 /* Portions Copyright 2010 Robert Milkowski */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/sysmacros.h>
37 #include <sys/kmem.h>
38 #include <sys/acl.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/mntent.h>
42 #include <sys/mount.h>
43 #include <sys/cmn_err.h>
44 #include <sys/zfs_znode.h>
45 #include <sys/zfs_dir.h>
46 #include <sys/zil.h>
47 #include <sys/fs/zfs.h>
48 #include <sys/dmu.h>
49 #include <sys/dsl_prop.h>
50 #include <sys/dsl_dataset.h>
51 #include <sys/dsl_deleg.h>
52 #include <sys/spa.h>
53 #include <sys/zap.h>
54 #include <sys/sa.h>
55 #include <sys/sa_impl.h>
56 #include <sys/policy.h>
57 #include <sys/atomic.h>
58 #include <sys/zfs_ioctl.h>
59 #include <sys/zfs_ctldir.h>
60 #include <sys/zfs_fuid.h>
61 #include <sys/sunddi.h>
62 #include <sys/dmu_objset.h>
63 #include <sys/dsl_dir.h>
64 #include <sys/spa_boot.h>
65 #include <sys/jail.h>
66 #include <ufs/ufs/quota.h>
67 #include <sys/zfs_quota.h>
68 
69 #include "zfs_comutil.h"
70 
/*
 * Fallback definitions for mount kernel flags, used only when the headers
 * included above do not already define them.
 */
#ifndef	MNTK_VMSETSIZE_BUG
#define	MNTK_VMSETSIZE_BUG	0
#endif
#ifndef	MNTK_NOMSYNC
#define	MNTK_NOMSYNC	8
#endif
77 
/* BEGIN CSTYLED */
/* Mutex registered under the name "zfs_debug" and set up via MTX_SYSINIT. */
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);

/* Root of the vfs.zfs sysctl tree. */
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

/* vfs.zfs.super_owner: read-write integer knob, defaults to 0. */
int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
    "File system owner can perform privileged operation on his file systems");

/* vfs.zfs.debug: read-write integer that is also a tunable (CTLFLAG_RWTUN). */
int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
	"Debug level");

/*
 * vfs.zfs.version.*: read-only nodes exporting the compile-time ACL, SPA
 * and ZPL version constants.
 */
SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
    "ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
    "SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
    "ZPL_VERSION");
/* END CSTYLED */
103 
/*
 * Forward declarations of the file-local VFS operations that are wired
 * into zfs_vfsops below.
 */
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
/*
 * The checkexp signature depends on the FreeBSD version: newer kernels pass
 * 64-bit export flags and a plain int array of security flavors.
 */
#if __FreeBSD_version >= 1300098
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int *secflavors);
#else
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors);
#endif
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_freevfs(vfs_t *vfsp);
120 
/*
 * VFS operations vector for ZFS.  On FreeBSD >= 1300049 the root lookup is
 * routed through vfs_cache_root, with zfs_root installed as the
 * vfs_cachedroot handler; older kernels call zfs_root directly.
 */
struct vfsops zfs_vfsops = {
	.vfs_mount =		zfs_mount,
	.vfs_unmount =		zfs_umount,
#if __FreeBSD_version >= 1300049
	.vfs_root =		vfs_cache_root,
	.vfs_cachedroot = zfs_root,
#else
	.vfs_root =		zfs_root,
#endif
	.vfs_statfs =		zfs_statfs,
	.vfs_vget =		zfs_vget,
	.vfs_sync =		zfs_sync,
	.vfs_checkexp =		zfs_checkexp,
	.vfs_fhtovp =		zfs_fhtovp,
	.vfs_quotactl =		zfs_quotactl,
};

/* Register the "zfs" file system type with the jail and delegated-admin flags. */
VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
139 
/*
 * Count of active ZFS file systems.  Keeping this count is what lets us
 * prevent the module from being unloaded after a forced unmount
 * (umount -f).
 */
static uint32_t	zfs_active_fs_count = 0;
146 
147 int
148 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
149     char *setpoint)
150 {
151 	int error;
152 	zfsvfs_t *zfvp;
153 	vfs_t *vfsp;
154 	objset_t *os;
155 	uint64_t tmp = *val;
156 
157 	error = dmu_objset_from_ds(ds, &os);
158 	if (error != 0)
159 		return (error);
160 
161 	error = getzfsvfs_impl(os, &zfvp);
162 	if (error != 0)
163 		return (error);
164 	if (zfvp == NULL)
165 		return (ENOENT);
166 	vfsp = zfvp->z_vfs;
167 	switch (zfs_prop) {
168 	case ZFS_PROP_ATIME:
169 		if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
170 			tmp = 0;
171 		if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
172 			tmp = 1;
173 		break;
174 	case ZFS_PROP_DEVICES:
175 		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
176 			tmp = 0;
177 		if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
178 			tmp = 1;
179 		break;
180 	case ZFS_PROP_EXEC:
181 		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
182 			tmp = 0;
183 		if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
184 			tmp = 1;
185 		break;
186 	case ZFS_PROP_SETUID:
187 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
188 			tmp = 0;
189 		if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
190 			tmp = 1;
191 		break;
192 	case ZFS_PROP_READONLY:
193 		if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
194 			tmp = 0;
195 		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
196 			tmp = 1;
197 		break;
198 	case ZFS_PROP_XATTR:
199 		if (zfvp->z_flags & ZSB_XATTR)
200 			tmp = zfvp->z_xattr;
201 		break;
202 	case ZFS_PROP_NBMAND:
203 		if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
204 			tmp = 0;
205 		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
206 			tmp = 1;
207 		break;
208 	default:
209 		vfs_unbusy(vfsp);
210 		return (ENOENT);
211 	}
212 
213 	vfs_unbusy(vfsp);
214 	if (tmp != *val) {
215 		(void) strcpy(setpoint, "temporary");
216 		*val = tmp;
217 	}
218 	return (0);
219 }
220 
221 static int
222 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
223 {
224 	int error = 0;
225 	char buf[32];
226 	uint64_t usedobj, quotaobj;
227 	uint64_t quota, used = 0;
228 	timespec_t now;
229 
230 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
231 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
232 
233 	if (quotaobj == 0 || zfsvfs->z_replay) {
234 		error = ENOENT;
235 		goto done;
236 	}
237 	(void) sprintf(buf, "%llx", (longlong_t)id);
238 	if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
239 	    buf, sizeof (quota), 1, &quota)) != 0) {
240 		dprintf("%s(%d): quotaobj lookup failed\n",
241 		    __FUNCTION__, __LINE__);
242 		goto done;
243 	}
244 	/*
245 	 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
246 	 * So we set them to be the same.
247 	 */
248 	dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
249 	error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
250 	if (error && error != ENOENT) {
251 		dprintf("%s(%d):  usedobj failed; %d\n",
252 		    __FUNCTION__, __LINE__, error);
253 		goto done;
254 	}
255 	dqp->dqb_curblocks = btodb(used);
256 	dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
257 	vfs_timestamp(&now);
258 	/*
259 	 * Setting this to 0 causes FreeBSD quota(8) to print
260 	 * the number of days since the epoch, which isn't
261 	 * particularly useful.
262 	 */
263 	dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
264 done:
265 	return (error);
266 }
267 
268 static int
269 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
270 {
271 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
272 	struct thread *td;
273 	int cmd, type, error = 0;
274 	int bitsize;
275 	zfs_userquota_prop_t quota_type;
276 	struct dqblk64 dqblk = { 0 };
277 
278 	td = curthread;
279 	cmd = cmds >> SUBCMDSHIFT;
280 	type = cmds & SUBCMDMASK;
281 
282 	ZFS_ENTER(zfsvfs);
283 	if (id == -1) {
284 		switch (type) {
285 		case USRQUOTA:
286 			id = td->td_ucred->cr_ruid;
287 			break;
288 		case GRPQUOTA:
289 			id = td->td_ucred->cr_rgid;
290 			break;
291 		default:
292 			error = EINVAL;
293 			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
294 				vfs_unbusy(vfsp);
295 			goto done;
296 		}
297 	}
298 	/*
299 	 * Map BSD type to:
300 	 * ZFS_PROP_USERUSED,
301 	 * ZFS_PROP_USERQUOTA,
302 	 * ZFS_PROP_GROUPUSED,
303 	 * ZFS_PROP_GROUPQUOTA
304 	 */
305 	switch (cmd) {
306 	case Q_SETQUOTA:
307 	case Q_SETQUOTA32:
308 		if (type == USRQUOTA)
309 			quota_type = ZFS_PROP_USERQUOTA;
310 		else if (type == GRPQUOTA)
311 			quota_type = ZFS_PROP_GROUPQUOTA;
312 		else
313 			error = EINVAL;
314 		break;
315 	case Q_GETQUOTA:
316 	case Q_GETQUOTA32:
317 		if (type == USRQUOTA)
318 			quota_type = ZFS_PROP_USERUSED;
319 		else if (type == GRPQUOTA)
320 			quota_type = ZFS_PROP_GROUPUSED;
321 		else
322 			error = EINVAL;
323 		break;
324 	}
325 
326 	/*
327 	 * Depending on the cmd, we may need to get
328 	 * the ruid and domain (see fuidstr_to_sid?),
329 	 * the fuid (how?), or other information.
330 	 * Create fuid using zfs_fuid_create(zfsvfs, id,
331 	 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
332 	 * I think I can use just the id?
333 	 *
334 	 * Look at zfs_id_overquota() to look up a quota.
335 	 * zap_lookup(something, quotaobj, fuidstring,
336 	 *     sizeof (long long), 1, &quota)
337 	 *
338 	 * See zfs_set_userquota() to set a quota.
339 	 */
340 	if ((uint32_t)type >= MAXQUOTAS) {
341 		error = EINVAL;
342 		goto done;
343 	}
344 
345 	switch (cmd) {
346 	case Q_GETQUOTASIZE:
347 		bitsize = 64;
348 		error = copyout(&bitsize, arg, sizeof (int));
349 		break;
350 	case Q_QUOTAON:
351 		// As far as I can tell, you can't turn quotas on or off on zfs
352 		error = 0;
353 		vfs_unbusy(vfsp);
354 		break;
355 	case Q_QUOTAOFF:
356 		error = ENOTSUP;
357 		vfs_unbusy(vfsp);
358 		break;
359 	case Q_SETQUOTA:
360 		error = copyin(arg, &dqblk, sizeof (dqblk));
361 		if (error == 0)
362 			error = zfs_set_userquota(zfsvfs, quota_type,
363 			    "", id, dbtob(dqblk.dqb_bhardlimit));
364 		break;
365 	case Q_GETQUOTA:
366 		error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
367 		if (error == 0)
368 			error = copyout(&dqblk, arg, sizeof (dqblk));
369 		break;
370 	default:
371 		error = EINVAL;
372 		break;
373 	}
374 done:
375 	ZFS_EXIT(zfsvfs);
376 	return (error);
377 }
378 
379 
380 boolean_t
381 zfs_is_readonly(zfsvfs_t *zfsvfs)
382 {
383 	return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
384 }
385 
386 /*ARGSUSED*/
387 static int
388 zfs_sync(vfs_t *vfsp, int waitfor)
389 {
390 
391 	/*
392 	 * Data integrity is job one.  We don't want a compromised kernel
393 	 * writing to the storage pool, so we never sync during panic.
394 	 */
395 	if (panicstr)
396 		return (0);
397 
398 	/*
399 	 * Ignore the system syncher.  ZFS already commits async data
400 	 * at zfs_txg_timeout intervals.
401 	 */
402 	if (waitfor == MNT_LAZY)
403 		return (0);
404 
405 	if (vfsp != NULL) {
406 		/*
407 		 * Sync a specific filesystem.
408 		 */
409 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
410 		dsl_pool_t *dp;
411 		int error;
412 
413 		error = vfs_stdsync(vfsp, waitfor);
414 		if (error != 0)
415 			return (error);
416 
417 		ZFS_ENTER(zfsvfs);
418 		dp = dmu_objset_pool(zfsvfs->z_os);
419 
420 		/*
421 		 * If the system is shutting down, then skip any
422 		 * filesystems which may exist on a suspended pool.
423 		 */
424 		if (rebooting && spa_suspended(dp->dp_spa)) {
425 			ZFS_EXIT(zfsvfs);
426 			return (0);
427 		}
428 
429 		if (zfsvfs->z_log != NULL)
430 			zil_commit(zfsvfs->z_log, 0);
431 
432 		ZFS_EXIT(zfsvfs);
433 	} else {
434 		/*
435 		 * Sync all ZFS filesystems.  This is what happens when you
436 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
437 		 * request by waiting for all pools to commit all dirty data.
438 		 */
439 		spa_sync_allpools();
440 	}
441 
442 	return (0);
443 }
444 
445 static void
446 atime_changed_cb(void *arg, uint64_t newval)
447 {
448 	zfsvfs_t *zfsvfs = arg;
449 
450 	if (newval == TRUE) {
451 		zfsvfs->z_atime = TRUE;
452 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
453 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
454 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
455 	} else {
456 		zfsvfs->z_atime = FALSE;
457 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
458 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
459 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
460 	}
461 }
462 
463 static void
464 xattr_changed_cb(void *arg, uint64_t newval)
465 {
466 	zfsvfs_t *zfsvfs = arg;
467 
468 	if (newval == ZFS_XATTR_OFF) {
469 		zfsvfs->z_flags &= ~ZSB_XATTR;
470 	} else {
471 		zfsvfs->z_flags |= ZSB_XATTR;
472 
473 		if (newval == ZFS_XATTR_SA)
474 			zfsvfs->z_xattr_sa = B_TRUE;
475 		else
476 			zfsvfs->z_xattr_sa = B_FALSE;
477 	}
478 }
479 
480 static void
481 blksz_changed_cb(void *arg, uint64_t newval)
482 {
483 	zfsvfs_t *zfsvfs = arg;
484 	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
485 	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
486 	ASSERT(ISP2(newval));
487 
488 	zfsvfs->z_max_blksz = newval;
489 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
490 }
491 
492 static void
493 readonly_changed_cb(void *arg, uint64_t newval)
494 {
495 	zfsvfs_t *zfsvfs = arg;
496 
497 	if (newval) {
498 		/* XXX locking on vfs_flag? */
499 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
500 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
501 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
502 	} else {
503 		/* XXX locking on vfs_flag? */
504 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
505 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
506 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
507 	}
508 }
509 
510 static void
511 setuid_changed_cb(void *arg, uint64_t newval)
512 {
513 	zfsvfs_t *zfsvfs = arg;
514 
515 	if (newval == FALSE) {
516 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
517 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
518 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
519 	} else {
520 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
521 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
522 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
523 	}
524 }
525 
526 static void
527 exec_changed_cb(void *arg, uint64_t newval)
528 {
529 	zfsvfs_t *zfsvfs = arg;
530 
531 	if (newval == FALSE) {
532 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
533 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
534 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
535 	} else {
536 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
537 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
538 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
539 	}
540 }
541 
542 /*
543  * The nbmand mount option can be changed at mount time.
544  * We can't allow it to be toggled on live file systems or incorrect
545  * behavior may be seen from cifs clients
546  *
547  * This property isn't registered via dsl_prop_register(), but this callback
548  * will be called when a file system is first mounted
549  */
550 static void
551 nbmand_changed_cb(void *arg, uint64_t newval)
552 {
553 	zfsvfs_t *zfsvfs = arg;
554 	if (newval == FALSE) {
555 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
556 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
557 	} else {
558 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
559 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
560 	}
561 }
562 
563 static void
564 snapdir_changed_cb(void *arg, uint64_t newval)
565 {
566 	zfsvfs_t *zfsvfs = arg;
567 
568 	zfsvfs->z_show_ctldir = newval;
569 }
570 
571 static void
572 vscan_changed_cb(void *arg, uint64_t newval)
573 {
574 	zfsvfs_t *zfsvfs = arg;
575 
576 	zfsvfs->z_vscan = newval;
577 }
578 
579 static void
580 acl_mode_changed_cb(void *arg, uint64_t newval)
581 {
582 	zfsvfs_t *zfsvfs = arg;
583 
584 	zfsvfs->z_acl_mode = newval;
585 }
586 
587 static void
588 acl_inherit_changed_cb(void *arg, uint64_t newval)
589 {
590 	zfsvfs_t *zfsvfs = arg;
591 
592 	zfsvfs->z_acl_inherit = newval;
593 }
594 
595 static int
596 zfs_register_callbacks(vfs_t *vfsp)
597 {
598 	struct dsl_dataset *ds = NULL;
599 	objset_t *os = NULL;
600 	zfsvfs_t *zfsvfs = NULL;
601 	uint64_t nbmand;
602 	boolean_t readonly = B_FALSE;
603 	boolean_t do_readonly = B_FALSE;
604 	boolean_t setuid = B_FALSE;
605 	boolean_t do_setuid = B_FALSE;
606 	boolean_t exec = B_FALSE;
607 	boolean_t do_exec = B_FALSE;
608 	boolean_t xattr = B_FALSE;
609 	boolean_t atime = B_FALSE;
610 	boolean_t do_atime = B_FALSE;
611 	boolean_t do_xattr = B_FALSE;
612 	int error = 0;
613 
614 	ASSERT(vfsp);
615 	zfsvfs = vfsp->vfs_data;
616 	ASSERT(zfsvfs);
617 	os = zfsvfs->z_os;
618 
619 	/*
620 	 * This function can be called for a snapshot when we update snapshot's
621 	 * mount point, which isn't really supported.
622 	 */
623 	if (dmu_objset_is_snapshot(os))
624 		return (EOPNOTSUPP);
625 
626 	/*
627 	 * The act of registering our callbacks will destroy any mount
628 	 * options we may have.  In order to enable temporary overrides
629 	 * of mount options, we stash away the current values and
630 	 * restore them after we register the callbacks.
631 	 */
632 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
633 	    !spa_writeable(dmu_objset_spa(os))) {
634 		readonly = B_TRUE;
635 		do_readonly = B_TRUE;
636 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
637 		readonly = B_FALSE;
638 		do_readonly = B_TRUE;
639 	}
640 	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
641 		setuid = B_FALSE;
642 		do_setuid = B_TRUE;
643 	} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
644 		setuid = B_TRUE;
645 		do_setuid = B_TRUE;
646 	}
647 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
648 		exec = B_FALSE;
649 		do_exec = B_TRUE;
650 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
651 		exec = B_TRUE;
652 		do_exec = B_TRUE;
653 	}
654 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
655 		zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
656 		do_xattr = B_TRUE;
657 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
658 		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
659 		do_xattr = B_TRUE;
660 	} else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
661 		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
662 		do_xattr = B_TRUE;
663 	} else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
664 		zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
665 		do_xattr = B_TRUE;
666 	}
667 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
668 		atime = B_FALSE;
669 		do_atime = B_TRUE;
670 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
671 		atime = B_TRUE;
672 		do_atime = B_TRUE;
673 	}
674 
675 	/*
676 	 * We need to enter pool configuration here, so that we can use
677 	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
678 	 * dsl_prop_get_integer() can not be used, because it has to acquire
679 	 * spa_namespace_lock and we can not do that because we already hold
680 	 * z_teardown_lock.  The problem is that spa_write_cachefile() is called
681 	 * with spa_namespace_lock held and the function calls ZFS vnode
682 	 * operations to write the cache file and thus z_teardown_lock is
683 	 * acquired after spa_namespace_lock.
684 	 */
685 	ds = dmu_objset_ds(os);
686 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
687 
688 	/*
689 	 * nbmand is a special property.  It can only be changed at
690 	 * mount time.
691 	 *
692 	 * This is weird, but it is documented to only be changeable
693 	 * at mount time.
694 	 */
695 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
696 		nbmand = B_FALSE;
697 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
698 		nbmand = B_TRUE;
699 	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) {
700 		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
701 		return (error);
702 	}
703 
704 	/*
705 	 * Register property callbacks.
706 	 *
707 	 * It would probably be fine to just check for i/o error from
708 	 * the first prop_register(), but I guess I like to go
709 	 * overboard...
710 	 */
711 	error = dsl_prop_register(ds,
712 	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
713 	error = error ? error : dsl_prop_register(ds,
714 	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
715 	error = error ? error : dsl_prop_register(ds,
716 	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
717 	error = error ? error : dsl_prop_register(ds,
718 	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
719 	error = error ? error : dsl_prop_register(ds,
720 	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
721 	error = error ? error : dsl_prop_register(ds,
722 	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
723 	error = error ? error : dsl_prop_register(ds,
724 	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
725 	error = error ? error : dsl_prop_register(ds,
726 	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
727 	error = error ? error : dsl_prop_register(ds,
728 	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
729 	    zfsvfs);
730 	error = error ? error : dsl_prop_register(ds,
731 	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
732 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
733 	if (error)
734 		goto unregister;
735 
736 	/*
737 	 * Invoke our callbacks to restore temporary mount options.
738 	 */
739 	if (do_readonly)
740 		readonly_changed_cb(zfsvfs, readonly);
741 	if (do_setuid)
742 		setuid_changed_cb(zfsvfs, setuid);
743 	if (do_exec)
744 		exec_changed_cb(zfsvfs, exec);
745 	if (do_xattr)
746 		xattr_changed_cb(zfsvfs, xattr);
747 	if (do_atime)
748 		atime_changed_cb(zfsvfs, atime);
749 
750 	nbmand_changed_cb(zfsvfs, nbmand);
751 
752 	return (0);
753 
754 unregister:
755 	dsl_prop_unregister_all(ds, zfsvfs);
756 	return (error);
757 }
758 
759 /*
760  * Associate this zfsvfs with the given objset, which must be owned.
761  * This will cache a bunch of on-disk state from the objset in the
762  * zfsvfs.
763  */
764 static int
765 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
766 {
767 	int error;
768 	uint64_t val;
769 
770 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
771 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
772 	zfsvfs->z_os = os;
773 
774 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
775 	if (error != 0)
776 		return (error);
777 	if (zfsvfs->z_version >
778 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
779 		(void) printf("Can't mount a version %lld file system "
780 		    "on a version %lld pool\n. Pool must be upgraded to mount "
781 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
782 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
783 		return (SET_ERROR(ENOTSUP));
784 	}
785 	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
786 	if (error != 0)
787 		return (error);
788 	zfsvfs->z_norm = (int)val;
789 
790 	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
791 	if (error != 0)
792 		return (error);
793 	zfsvfs->z_utf8 = (val != 0);
794 
795 	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
796 	if (error != 0)
797 		return (error);
798 	zfsvfs->z_case = (uint_t)val;
799 
800 	/*
801 	 * Fold case on file systems that are always or sometimes case
802 	 * insensitive.
803 	 */
804 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
805 	    zfsvfs->z_case == ZFS_CASE_MIXED)
806 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
807 
808 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
809 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
810 
811 	uint64_t sa_obj = 0;
812 	if (zfsvfs->z_use_sa) {
813 		/* should either have both of these objects or none */
814 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
815 		    &sa_obj);
816 		if (error != 0)
817 			return (error);
818 	}
819 
820 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
821 	    &zfsvfs->z_attr_table);
822 	if (error != 0)
823 		return (error);
824 
825 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
826 		sa_register_update_callback(os, zfs_sa_upgrade);
827 
828 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
829 	    &zfsvfs->z_root);
830 	if (error != 0)
831 		return (error);
832 	ASSERT(zfsvfs->z_root != 0);
833 
834 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
835 	    &zfsvfs->z_unlinkedobj);
836 	if (error != 0)
837 		return (error);
838 
839 	error = zap_lookup(os, MASTER_NODE_OBJ,
840 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
841 	    8, 1, &zfsvfs->z_userquota_obj);
842 	if (error == ENOENT)
843 		zfsvfs->z_userquota_obj = 0;
844 	else if (error != 0)
845 		return (error);
846 
847 	error = zap_lookup(os, MASTER_NODE_OBJ,
848 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
849 	    8, 1, &zfsvfs->z_groupquota_obj);
850 	if (error == ENOENT)
851 		zfsvfs->z_groupquota_obj = 0;
852 	else if (error != 0)
853 		return (error);
854 
855 	error = zap_lookup(os, MASTER_NODE_OBJ,
856 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
857 	    8, 1, &zfsvfs->z_projectquota_obj);
858 	if (error == ENOENT)
859 		zfsvfs->z_projectquota_obj = 0;
860 	else if (error != 0)
861 		return (error);
862 
863 	error = zap_lookup(os, MASTER_NODE_OBJ,
864 	    zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
865 	    8, 1, &zfsvfs->z_userobjquota_obj);
866 	if (error == ENOENT)
867 		zfsvfs->z_userobjquota_obj = 0;
868 	else if (error != 0)
869 		return (error);
870 
871 	error = zap_lookup(os, MASTER_NODE_OBJ,
872 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
873 	    8, 1, &zfsvfs->z_groupobjquota_obj);
874 	if (error == ENOENT)
875 		zfsvfs->z_groupobjquota_obj = 0;
876 	else if (error != 0)
877 		return (error);
878 
879 	error = zap_lookup(os, MASTER_NODE_OBJ,
880 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
881 	    8, 1, &zfsvfs->z_projectobjquota_obj);
882 	if (error == ENOENT)
883 		zfsvfs->z_projectobjquota_obj = 0;
884 	else if (error != 0)
885 		return (error);
886 
887 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
888 	    &zfsvfs->z_fuid_obj);
889 	if (error == ENOENT)
890 		zfsvfs->z_fuid_obj = 0;
891 	else if (error != 0)
892 		return (error);
893 
894 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
895 	    &zfsvfs->z_shares_dir);
896 	if (error == ENOENT)
897 		zfsvfs->z_shares_dir = 0;
898 	else if (error != 0)
899 		return (error);
900 
901 	/*
902 	 * Only use the name cache if we are looking for a
903 	 * name on a file system that does not require normalization
904 	 * or case folding.  We can also look there if we happen to be
905 	 * on a non-normalizing, mixed sensitivity file system IF we
906 	 * are looking for the exact name (which is always the case on
907 	 * FreeBSD).
908 	 */
909 	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
910 	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
911 	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
912 
913 	return (0);
914 }
915 
/*
 * Global task queue pointer for zfsvfs work.
 * NOTE(review): presumably the queue on which z_unlinked_drain_task is run;
 * its creation and use are outside this part of the file — confirm.
 */
taskq_t *zfsvfs_taskq;
917 
918 static void
919 zfsvfs_task_unlinked_drain(void *context, int pending __unused)
920 {
921 
922 	zfs_unlinked_drain((zfsvfs_t *)context);
923 }
924 
925 int
926 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
927 {
928 	objset_t *os;
929 	zfsvfs_t *zfsvfs;
930 	int error;
931 	boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
932 
933 	/*
934 	 * XXX: Fix struct statfs so this isn't necessary!
935 	 *
936 	 * The 'osname' is used as the filesystem's special node, which means
937 	 * it must fit in statfs.f_mntfromname, or else it can't be
938 	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
939 	 * 'zfs unmount' to think it's not mounted when it is.
940 	 */
941 	if (strlen(osname) >= MNAMELEN)
942 		return (SET_ERROR(ENAMETOOLONG));
943 
944 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
945 
946 	error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
947 	    &os);
948 	if (error != 0) {
949 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
950 		return (error);
951 	}
952 
953 	error = zfsvfs_create_impl(zfvp, zfsvfs, os);
954 
955 	return (error);
956 }
957 
958 
959 int
960 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
961 {
962 	int error;
963 
964 	zfsvfs->z_vfs = NULL;
965 	zfsvfs->z_parent = zfsvfs;
966 
967 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
968 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
969 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
970 	    offsetof(znode_t, z_link_node));
971 	TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
972 	    zfsvfs_task_unlinked_drain, zfsvfs);
973 #ifdef DIAGNOSTIC
974 	rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
975 #else
976 	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
977 #endif
978 	ZFS_INIT_TEARDOWN_INACTIVE(zfsvfs);
979 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
980 	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
981 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
982 
983 	error = zfsvfs_init(zfsvfs, os);
984 	if (error != 0) {
985 		dmu_objset_disown(os, B_TRUE, zfsvfs);
986 		*zfvp = NULL;
987 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
988 		return (error);
989 	}
990 
991 	*zfvp = zfsvfs;
992 	return (0);
993 }
994 
/*
 * Finish bringing a zfsvfs_t online: register the property callbacks,
 * open the ZIL and publish the zfsvfs as the objset's user pointer.
 *
 * When 'mounting' is B_TRUE this additionally creates the dataset kstats,
 * drains the unlinked set (only if the mount is not read-only) and, if the
 * pool is writeable, replays the intent log (or destroys it when
 * zil_replay_disable is set).
 *
 * Returns 0 on success, EROFS if a read-write mount is requested on an
 * objset with an incompatible encryption version, or the error returned
 * by zfs_register_callbacks().
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
		return (SET_ERROR(EROFS));

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		/* The kstats must not already exist for this zfsvfs. */
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
		dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0) {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		} else {
			dsl_dir_t *dd;
			zap_stats_t zs;

			/*
			 * Seed the unlinked-count kstat from the current
			 * size of the unlinked set; a failure to read the
			 * ZAP stats is simply skipped.
			 */
			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
			    &zs) == 0) {
				dataset_kstats_update_nunlinks_kstat(
				    &zfsvfs->z_kstat, zs.zs_num_entries);
				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
				    "num_entries in unlinked set: %llu",
				    zs.zs_num_entries);
			}

			zfs_unlinked_drain(zfsvfs);
			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
			dd->dd_activity_cancelled = B_FALSE;
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain().  (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.)  This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg.  This would write a "create
		 * object N" record to the intent log.  Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk.  So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/*
				 * The name cache is switched off for the
				 * duration of the replay and its previous
				 * setting restored afterwards.
				 */
				boolean_t use_nc = zfsvfs->z_use_namecache;
				zfsvfs->z_use_namecache = B_FALSE;
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
				zfsvfs->z_use_namecache = use_nc;
			}
		}

		/* restore readonly bit */
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}
1105 
1106 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1107 
1108 void
1109 zfsvfs_free(zfsvfs_t *zfsvfs)
1110 {
1111 	int i;
1112 
1113 	/*
1114 	 * This is a barrier to prevent the filesystem from going away in
1115 	 * zfs_znode_move() until we can safely ensure that the filesystem is
1116 	 * not unmounted. We consider the filesystem valid before the barrier
1117 	 * and invalid after the barrier.
1118 	 */
1119 	rw_enter(&zfsvfs_lock, RW_READER);
1120 	rw_exit(&zfsvfs_lock);
1121 
1122 	zfs_fuid_destroy(zfsvfs);
1123 
1124 	mutex_destroy(&zfsvfs->z_znodes_lock);
1125 	mutex_destroy(&zfsvfs->z_lock);
1126 	ASSERT(zfsvfs->z_nr_znodes == 0);
1127 	list_destroy(&zfsvfs->z_all_znodes);
1128 	rrm_destroy(&zfsvfs->z_teardown_lock);
1129 	ZFS_DESTROY_TEARDOWN_INACTIVE(zfsvfs);
1130 	rw_destroy(&zfsvfs->z_fuid_lock);
1131 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1132 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1133 	dataset_kstats_destroy(&zfsvfs->z_kstat);
1134 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
1135 }
1136 
1137 static void
1138 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1139 {
1140 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1141 	if (zfsvfs->z_vfs) {
1142 		if (zfsvfs->z_use_fuids) {
1143 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1144 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1145 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1146 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1147 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1148 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1149 		} else {
1150 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1151 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1152 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1153 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1154 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1155 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1156 		}
1157 	}
1158 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1159 }
1160 
/*
 * Mount the objset named 'osname' on 'vfsp'.
 *
 * Creates the zfsvfs_t, fills in the mount's block sizes, kernel flags,
 * fsid and feature set, and then either applies the fixed settings used
 * for snapshots or runs zfsvfs_setup() for a regular filesystem.
 *
 * On success the active filesystem count is incremented and 0 is
 * returned.  On any failure after zfsvfs_create() succeeded the objset is
 * disowned, the zfsvfs_t is freed and the errno value is returned.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;

	ASSERT(vfsp);
	ASSERT(osname);

	/* The objset is opened read-only if the mount itself is read-only. */
	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

	if ((error = dsl_prop_get_integer(osname,
	    "recordsize", &recordsize, NULL)))
		goto out;
	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
	/*
	 * This can cause a loss of coherence between ARC and page cache
	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
	 */
	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;

#if defined(_KERNEL) && !defined(KMEM_DEBUG)
	vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID.  The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);

	/*
	 * Set features for file system.
	 */
	zfs_set_fuid_feature(zfsvfs);
	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
	}
	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		/*
		 * Snapshots skip zfsvfs_setup(): atime is forced off,
		 * readonly is forced on, and only the "xattr" property is
		 * read from the dataset.
		 */
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if ((error = dsl_prop_get_integer(osname,
		    "xattr", &pval, NULL)))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

		/* Publish the zfsvfs as the objset's user pointer. */
		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
	} else {
		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
			goto out;
	}

	vfs_mountedfrom(vfsp, osname);

	/* The .zfs control directory is only created for non-snapshots. */
	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	/* Shared exit: undo the ownership on failure, count the mount on success. */
	if (error) {
		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
		zfsvfs_free(zfsvfs);
	} else {
		atomic_inc_32(&zfs_active_fs_count);
	}

	return (error);
}
1260 
1261 static void
1262 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1263 {
1264 	objset_t *os = zfsvfs->z_os;
1265 
1266 	if (!dmu_objset_is_snapshot(os))
1267 		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1268 }
1269 
/*
 * Extract the pool name from a dataset name.
 *
 * The pool name is everything in 'osname' up to (not including) the first
 * '/', or all of 'osname' when it contains no '/'.  It is written to
 * 'poolname' as a NUL-terminated string; the caller must supply a buffer
 * of at least MAXNAMELEN bytes.
 *
 * Returns 0 on success, or ENAMETOOLONG (leaving 'poolname' untouched)
 * when the pool name would not fit in MAXNAMELEN bytes including the
 * terminator.
 */
static int
getpoolname(const char *osname, char *poolname)
{
	const char *slash = strchr(osname, '/');
	size_t namelen;

	if (slash != NULL)
		namelen = (size_t)(slash - osname);
	else
		namelen = strlen(osname);

	if (namelen >= MAXNAMELEN)
		return (ENAMETOOLONG);

	(void) memcpy(poolname, osname, namelen);
	poolname[namelen] = '\0';
	return (0);
}
1288 
/*
 * VFS mount entry point.
 *
 * Reads the dataset name from the "from" mount option, applies the
 * privilege and delegated-permission checks, and then either refreshes
 * the property callbacks (remount) or hands off to zfs_domount().  For an
 * initial root mount the pool named in the dataset path is imported first.
 *
 * Returns 0 on success or an errno value.
 */
/*ARGSUSED*/
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t	*td = curthread;
	vnode_t		*mvp = vfsp->mnt_vnodecovered;
	cred_t		*cr = td->td_ucred;
	char		*osname;
	int		error = 0;
	int		canwrite;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (SET_ERROR(EINVAL));

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error) {
		/*
		 * Each "goto out" inside this block returns the
		 * secpolicy_fs_mount() failure, since 'error' is not
		 * reassigned here.
		 */
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t		vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK1(mvp);
				goto out;
			}

			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				VOP_UNLOCK1(mvp);
				goto out;
			}
			VOP_UNLOCK1(mvp);
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	vfsp->vfs_flag |= MNT_NFS4ACLS;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * Refresh mount options with z_teardown_lock blocking I/O while
		 * the filesystem is in an inconsistent state.
		 * The lock also serializes this code with filesystem
		 * manipulations between entry to zfs_suspend_fs() and return
		 * from zfs_resume_fs().
		 */
		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
		zfs_unregister_callbacks(zfsvfs);
		error = zfs_register_callbacks(vfsp);
		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
		goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
		char pname[MAXNAMELEN];

		error = getpoolname(osname, pname);
		if (error == 0)
			error = spa_import_rootpool(pname, false);
		if (error)
			goto out;
	}
	/* Giant is dropped around the actual mount and re-acquired after. */
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

out:
	return (error);
}
1401 
1402 static int
1403 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1404 {
1405 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1406 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
1407 
1408 	statp->f_version = STATFS_VERSION;
1409 
1410 	ZFS_ENTER(zfsvfs);
1411 
1412 	dmu_objset_space(zfsvfs->z_os,
1413 	    &refdbytes, &availbytes, &usedobjs, &availobjs);
1414 
1415 	/*
1416 	 * The underlying storage pool actually uses multiple block sizes.
1417 	 * We report the fragsize as the smallest block size we support,
1418 	 * and we report our blocksize as the filesystem's maximum blocksize.
1419 	 */
1420 	statp->f_bsize = SPA_MINBLOCKSIZE;
1421 	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1422 
1423 	/*
1424 	 * The following report "total" blocks of various kinds in the
1425 	 * file system, but reported in terms of f_frsize - the
1426 	 * "fragment" size.
1427 	 */
1428 
1429 	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1430 	statp->f_bfree = availbytes / statp->f_bsize;
1431 	statp->f_bavail = statp->f_bfree; /* no root reservation */
1432 
1433 	/*
1434 	 * statvfs() should really be called statufs(), because it assumes
1435 	 * static metadata.  ZFS doesn't preallocate files, so the best
1436 	 * we can do is report the max that could possibly fit in f_files,
1437 	 * and that minus the number actually used in f_ffree.
1438 	 * For f_ffree, report the smaller of the number of object available
1439 	 * and the number of blocks (each object will take at least a block).
1440 	 */
1441 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
1442 	statp->f_files = statp->f_ffree + usedobjs;
1443 
1444 	/*
1445 	 * We're a zfs filesystem.
1446 	 */
1447 	strlcpy(statp->f_fstypename, "zfs",
1448 	    sizeof (statp->f_fstypename));
1449 
1450 	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1451 	    sizeof (statp->f_mntfromname));
1452 	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1453 	    sizeof (statp->f_mntonname));
1454 
1455 	statp->f_namemax = MAXNAMELEN - 1;
1456 
1457 	ZFS_EXIT(zfsvfs);
1458 	return (0);
1459 }
1460 
1461 static int
1462 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1463 {
1464 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1465 	znode_t *rootzp;
1466 	int error;
1467 
1468 	ZFS_ENTER(zfsvfs);
1469 
1470 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1471 	if (error == 0)
1472 		*vpp = ZTOV(rootzp);
1473 
1474 	ZFS_EXIT(zfsvfs);
1475 
1476 	if (error == 0) {
1477 		error = vn_lock(*vpp, flags);
1478 		if (error != 0) {
1479 			VN_RELE(*vpp);
1480 			*vpp = NULL;
1481 		}
1482 	}
1483 	return (error);
1484 }
1485 
1486 /*
1487  * Teardown the zfsvfs::z_os.
1488  *
1489  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1490  * and 'z_teardown_inactive_lock' held.
1491  */
1492 static int
1493 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1494 {
1495 	znode_t	*zp;
1496 	dsl_dir_t *dd;
1497 
1498 	/*
1499 	 * If someone has not already unmounted this file system,
1500 	 * drain the zrele_taskq to ensure all active references to the
1501 	 * zfsvfs_t have been handled only then can it be safely destroyed.
1502 	 */
1503 	if (zfsvfs->z_os) {
1504 		/*
1505 		 * If we're unmounting we have to wait for the list to
1506 		 * drain completely.
1507 		 *
1508 		 * If we're not unmounting there's no guarantee the list
1509 		 * will drain completely, but zreles run from the taskq
1510 		 * may add the parents of dir-based xattrs to the taskq
1511 		 * so we want to wait for these.
1512 		 *
1513 		 * We can safely read z_nr_znodes without locking because the
1514 		 * VFS has already blocked operations which add to the
1515 		 * z_all_znodes list and thus increment z_nr_znodes.
1516 		 */
1517 		int round = 0;
1518 		while (zfsvfs->z_nr_znodes > 0) {
1519 			taskq_wait_outstanding(dsl_pool_zrele_taskq(
1520 			    dmu_objset_pool(zfsvfs->z_os)), 0);
1521 			if (++round > 1 && !unmounting)
1522 				break;
1523 		}
1524 	}
1525 	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1526 
1527 	if (!unmounting) {
1528 		/*
1529 		 * We purge the parent filesystem's vfsp as the parent
1530 		 * filesystem and all of its snapshots have their vnode's
1531 		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
1532 		 * 'z_parent' is self referential for non-snapshots.
1533 		 */
1534 #ifdef FREEBSD_NAMECACHE
1535 		cache_purgevfs(zfsvfs->z_parent->z_vfs);
1536 #endif
1537 	}
1538 
1539 	/*
1540 	 * Close the zil. NB: Can't close the zil while zfs_inactive
1541 	 * threads are blocked as zil_close can call zfs_inactive.
1542 	 */
1543 	if (zfsvfs->z_log) {
1544 		zil_close(zfsvfs->z_log);
1545 		zfsvfs->z_log = NULL;
1546 	}
1547 
1548 	ZFS_WLOCK_TEARDOWN_INACTIVE(zfsvfs);
1549 
1550 	/*
1551 	 * If we are not unmounting (ie: online recv) and someone already
1552 	 * unmounted this file system while we were doing the switcheroo,
1553 	 * or a reopen of z_os failed then just bail out now.
1554 	 */
1555 	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1556 		ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
1557 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1558 		return (SET_ERROR(EIO));
1559 	}
1560 
1561 	/*
1562 	 * At this point there are no vops active, and any new vops will
1563 	 * fail with EIO since we have z_teardown_lock for writer (only
1564 	 * relevant for forced unmount).
1565 	 *
1566 	 * Release all holds on dbufs.
1567 	 */
1568 	mutex_enter(&zfsvfs->z_znodes_lock);
1569 	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1570 	    zp = list_next(&zfsvfs->z_all_znodes, zp))
1571 		if (zp->z_sa_hdl) {
1572 			ASSERT(ZTOV(zp)->v_count >= 0);
1573 			zfs_znode_dmu_fini(zp);
1574 		}
1575 	mutex_exit(&zfsvfs->z_znodes_lock);
1576 
1577 	/*
1578 	 * If we are unmounting, set the unmounted flag and let new vops
1579 	 * unblock.  zfs_inactive will have the unmounted behavior, and all
1580 	 * other vops will fail with EIO.
1581 	 */
1582 	if (unmounting) {
1583 		zfsvfs->z_unmounted = B_TRUE;
1584 		ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
1585 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1586 	}
1587 
1588 	/*
1589 	 * z_os will be NULL if there was an error in attempting to reopen
1590 	 * zfsvfs, so just return as the properties had already been
1591 	 * unregistered and cached data had been evicted before.
1592 	 */
1593 	if (zfsvfs->z_os == NULL)
1594 		return (0);
1595 
1596 	/*
1597 	 * Unregister properties.
1598 	 */
1599 	zfs_unregister_callbacks(zfsvfs);
1600 
1601 	/*
1602 	 * Evict cached data
1603 	 */
1604 	if (!zfs_is_readonly(zfsvfs))
1605 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1606 	dmu_objset_evict_dbufs(zfsvfs->z_os);
1607 	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
1608 	dsl_dir_cancel_waiters(dd);
1609 
1610 	return (0);
1611 }
1612 
/*
 * VFS unmount entry point.
 *
 * Checks the caller's privilege (falling back to the delegated mount
 * permission), unmounts any snapshots under .zfs, flushes the vnodes,
 * cancels or drains the unlinked-drain task, tears the zfsvfs down,
 * disowns the objset and finally frees the zfsvfs via zfs_freevfs().
 *
 * Returns 0 on success.  A non-zero return comes from one of the checks
 * performed before zfsvfs_teardown() is called.
 */
/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		/* Not privileged: allow it only with delegated permission. */
		if (dsl_deleg_access((char *)vfsp->vfs_resource,
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE). This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
	/*
	 * Cancel the pending unlinked-drain task; while the cancel reports
	 * failure, drain the task and try again.
	 */
	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
		taskqueue_drain(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);

	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	/* Frees the zfsvfs_t; it must not be used past this point. */
	zfs_freevfs(vfsp);

	return (0);
}
1691 
1692 static int
1693 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1694 {
1695 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1696 	znode_t		*zp;
1697 	int 		err;
1698 
1699 	/*
1700 	 * zfs_zget() can't operate on virtual entries like .zfs/ or
1701 	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1702 	 * This will make NFS to switch to LOOKUP instead of using VGET.
1703 	 */
1704 	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
1705 	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
1706 		return (EOPNOTSUPP);
1707 
1708 	ZFS_ENTER(zfsvfs);
1709 	err = zfs_zget(zfsvfs, ino, &zp);
1710 	if (err == 0 && zp->z_unlinked) {
1711 		vrele(ZTOV(zp));
1712 		err = EINVAL;
1713 	}
1714 	if (err == 0)
1715 		*vpp = ZTOV(zp);
1716 	ZFS_EXIT(zfsvfs);
1717 	if (err == 0) {
1718 		err = vn_lock(*vpp, flags);
1719 		if (err != 0)
1720 			vrele(*vpp);
1721 	}
1722 	if (err != 0)
1723 		*vpp = NULL;
1724 	return (err);
1725 }
1726 
1727 static int
1728 #if __FreeBSD_version >= 1300098
1729 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1730     struct ucred **credanonp, int *numsecflavors, int *secflavors)
1731 #else
1732 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
1733     struct ucred **credanonp, int *numsecflavors, int **secflavors)
1734 #endif
1735 {
1736 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1737 
1738 	/*
1739 	 * If this is regular file system vfsp is the same as
1740 	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1741 	 * zfsvfs->z_parent->z_vfs represents parent file system
1742 	 * which we have to use here, because only this file system
1743 	 * has mnt_export configured.
1744 	 */
1745 	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1746 	    credanonp, numsecflavors, secflavors));
1747 }
1748 
1749 CTASSERT(SHORT_FID_LEN <= sizeof (struct fid));
1750 CTASSERT(LONG_FID_LEN <= sizeof (struct fid));
1751 
1752 static int
1753 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
1754 {
1755 	struct componentname cn;
1756 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1757 	znode_t		*zp;
1758 	vnode_t		*dvp;
1759 	uint64_t	object = 0;
1760 	uint64_t	fid_gen = 0;
1761 	uint64_t	gen_mask;
1762 	uint64_t	zp_gen;
1763 	int 		i, err;
1764 
1765 	*vpp = NULL;
1766 
1767 	ZFS_ENTER(zfsvfs);
1768 
1769 	/*
1770 	 * On FreeBSD we can get snapshot's mount point or its parent file
1771 	 * system mount point depending if snapshot is already mounted or not.
1772 	 */
1773 	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1774 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
1775 		uint64_t	objsetid = 0;
1776 		uint64_t	setgen = 0;
1777 
1778 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1779 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1780 
1781 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1782 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1783 
1784 		ZFS_EXIT(zfsvfs);
1785 
1786 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1787 		if (err)
1788 			return (SET_ERROR(EINVAL));
1789 		ZFS_ENTER(zfsvfs);
1790 	}
1791 
1792 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1793 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
1794 
1795 		for (i = 0; i < sizeof (zfid->zf_object); i++)
1796 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1797 
1798 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
1799 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1800 	} else {
1801 		ZFS_EXIT(zfsvfs);
1802 		return (SET_ERROR(EINVAL));
1803 	}
1804 
1805 	/*
1806 	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1807 	 * directory tree. If the object == zfsvfs->z_shares_dir, then
1808 	 * we are in the .zfs/shares directory tree.
1809 	 */
1810 	if ((fid_gen == 0 &&
1811 	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
1812 	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
1813 		ZFS_EXIT(zfsvfs);
1814 		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
1815 		if (object == ZFSCTL_INO_SNAPDIR) {
1816 			cn.cn_nameptr = "snapshot";
1817 			cn.cn_namelen = strlen(cn.cn_nameptr);
1818 			cn.cn_nameiop = LOOKUP;
1819 			cn.cn_flags = ISLASTCN | LOCKLEAF;
1820 			cn.cn_lkflags = flags;
1821 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1822 			vput(dvp);
1823 		} else if (object == zfsvfs->z_shares_dir) {
1824 			/*
1825 			 * XXX This branch must not be taken,
1826 			 * if it is, then the lookup below will
1827 			 * explode.
1828 			 */
1829 			cn.cn_nameptr = "shares";
1830 			cn.cn_namelen = strlen(cn.cn_nameptr);
1831 			cn.cn_nameiop = LOOKUP;
1832 			cn.cn_flags = ISLASTCN;
1833 			cn.cn_lkflags = flags;
1834 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1835 			vput(dvp);
1836 		} else {
1837 			*vpp = dvp;
1838 		}
1839 		return (err);
1840 	}
1841 
1842 	gen_mask = -1ULL >> (64 - 8 * i);
1843 
1844 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1845 	if ((err = zfs_zget(zfsvfs, object, &zp))) {
1846 		ZFS_EXIT(zfsvfs);
1847 		return (err);
1848 	}
1849 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1850 	    sizeof (uint64_t));
1851 	zp_gen = zp_gen & gen_mask;
1852 	if (zp_gen == 0)
1853 		zp_gen = 1;
1854 	if (zp->z_unlinked || zp_gen != fid_gen) {
1855 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
1856 		vrele(ZTOV(zp));
1857 		ZFS_EXIT(zfsvfs);
1858 		return (SET_ERROR(EINVAL));
1859 	}
1860 
1861 	*vpp = ZTOV(zp);
1862 	ZFS_EXIT(zfsvfs);
1863 	err = vn_lock(*vpp, flags);
1864 	if (err == 0)
1865 		vnode_create_vobject(*vpp, zp->z_size, curthread);
1866 	else
1867 		*vpp = NULL;
1868 	return (err);
1869 }
1870 
1871 /*
1872  * Block out VOPs and close zfsvfs_t::z_os
1873  *
1874  * Note, if successful, then we return with the 'z_teardown_lock' and
1875  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
1876  * dataset and objset intact so that they can be atomically handed off during
1877  * a subsequent rollback or recv operation and the resume thereafter.
1878  */
1879 int
1880 zfs_suspend_fs(zfsvfs_t *zfsvfs)
1881 {
1882 	int error;
1883 
1884 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1885 		return (error);
1886 
1887 	return (0);
1888 }
1889 
1890 /*
1891  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
1892  * is an invariant across any of the operations that can be performed while the
1893  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
1894  * are the same: the relevant objset and associated dataset are owned by
1895  * zfsvfs, held, and long held on entry.
1896  */
1897 int
1898 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
1899 {
1900 	int err;
1901 	znode_t *zp;
1902 
1903 	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
1904 	ASSERT(ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs));
1905 
1906 	/*
1907 	 * We already own this, so just update the objset_t, as the one we
1908 	 * had before may have been evicted.
1909 	 */
1910 	objset_t *os;
1911 	VERIFY3P(ds->ds_owner, ==, zfsvfs);
1912 	VERIFY(dsl_dataset_long_held(ds));
1913 	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
1914 	dsl_pool_config_enter(dp, FTAG);
1915 	VERIFY0(dmu_objset_from_ds(ds, &os));
1916 	dsl_pool_config_exit(dp, FTAG);
1917 
1918 	err = zfsvfs_init(zfsvfs, os);
1919 	if (err != 0)
1920 		goto bail;
1921 
1922 	ds->ds_dir->dd_activity_cancelled = B_FALSE;
1923 	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
1924 
1925 	zfs_set_fuid_feature(zfsvfs);
1926 
1927 	/*
1928 	 * Attempt to re-establish all the active znodes with
1929 	 * their dbufs.  If a zfs_rezget() fails, then we'll let
1930 	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
1931 	 * when they try to use their znode.
1932 	 */
1933 	mutex_enter(&zfsvfs->z_znodes_lock);
1934 	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1935 	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1936 		(void) zfs_rezget(zp);
1937 	}
1938 	mutex_exit(&zfsvfs->z_znodes_lock);
1939 
1940 bail:
1941 	/* release the VOPs */
1942 	ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
1943 	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1944 
1945 	if (err) {
1946 		/*
1947 		 * Since we couldn't setup the sa framework, try to force
1948 		 * unmount this file system.
1949 		 */
1950 		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
1951 			vfs_ref(zfsvfs->z_vfs);
1952 			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
1953 		}
1954 	}
1955 	return (err);
1956 }
1957 
/*
 * Free the zfsvfs_t attached to 'vfsp' and drop the count of active ZFS
 * filesystems that zfs_busy() reports on.
 */
static void
zfs_freevfs(vfs_t *vfsp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;

	zfsvfs_free(zfsvfs);

	atomic_dec_32(&zfs_active_fs_count);
}
1967 
1968 #ifdef __i386__
1969 static int desiredvnodes_backup;
1970 #include <sys/vmmeter.h>
1971 
1972 
1973 #include <vm/vm_page.h>
1974 #include <vm/vm_object.h>
1975 #include <vm/vm_kern.h>
1976 #include <vm/vm_map.h>
1977 #endif
1978 
/*
 * On i386 only: remember the current desiredvnodes in
 * desiredvnodes_backup and, if the administrator has not tuned it,
 * reduce it to three quarters of the computed default.  Compiles to an
 * empty function on every other architecture.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int newdesiredvnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * We calculate newdesiredvnodes the same way it is done in
	 * vntblinit(). If it is equal to desiredvnodes, it means that
	 * it wasn't tuned by the administrator and we can tune it down.
	 */
	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (newdesiredvnodes == desiredvnodes)
		desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}
1999 
/*
 * On i386 only: restore desiredvnodes to the value saved by
 * zfs_vnodes_adjust().  Compiles to an empty function elsewhere.
 */
static void
zfs_vnodes_adjust_back(void)
{

#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
2008 
/*
 * One-time ZPL initialization: announce the filesystem version, set up
 * the .zfs control directory and znode subsystems, adjust the vnode
 * limit, register the ZFS objset type and create the zfsvfs task queue.
 */
void
zfs_init(void)
{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	/*
	 * Reduce number of vnodes. Originally number of vnodes is calculated
	 * with UFS inode in mind. We reduce it here, because it's too big for
	 * ZFS/i386.
	 */
	zfs_vnodes_adjust();

	/* Register the per-file info callback for DMU_OST_ZFS objsets. */
	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);

	/* Task queue that zfs_umount() cancels and drains on unmount. */
	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
}
2036 
/*
 * Undo zfs_init(): destroy the zfsvfs task queue, shut down the .zfs
 * control directory and znode subsystems, and restore the vnode limit.
 */
void
zfs_fini(void)
{
	taskq_destroy(zfsvfs_taskq);
	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
}
2045 
2046 int
2047 zfs_busy(void)
2048 {
2049 	return (zfs_active_fs_count != 0);
2050 }
2051 
2052 /*
2053  * Release VOPs and unmount a suspended filesystem.
2054  */
2055 int
2056 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2057 {
2058 	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
2059 	ASSERT(ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs));
2060 
2061 	/*
2062 	 * We already own this, so just hold and rele it to update the
2063 	 * objset_t, as the one we had before may have been evicted.
2064 	 */
2065 	objset_t *os;
2066 	VERIFY3P(ds->ds_owner, ==, zfsvfs);
2067 	VERIFY(dsl_dataset_long_held(ds));
2068 	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2069 	dsl_pool_config_enter(dp, FTAG);
2070 	VERIFY0(dmu_objset_from_ds(ds, &os));
2071 	dsl_pool_config_exit(dp, FTAG);
2072 	zfsvfs->z_os = os;
2073 
2074 	/* release the VOPs */
2075 	ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs);
2076 	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2077 
2078 	/*
2079 	 * Try to force unmount this file system.
2080 	 */
2081 	(void) zfs_umount(zfsvfs->z_vfs, 0);
2082 	zfsvfs->z_unmounted = B_TRUE;
2083 	return (0);
2084 }
2085 
2086 int
2087 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2088 {
2089 	int error;
2090 	objset_t *os = zfsvfs->z_os;
2091 	dmu_tx_t *tx;
2092 
2093 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2094 		return (SET_ERROR(EINVAL));
2095 
2096 	if (newvers < zfsvfs->z_version)
2097 		return (SET_ERROR(EINVAL));
2098 
2099 	if (zfs_spa_version_map(newvers) >
2100 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
2101 		return (SET_ERROR(ENOTSUP));
2102 
2103 	tx = dmu_tx_create(os);
2104 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2105 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2106 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2107 		    ZFS_SA_ATTRS);
2108 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2109 	}
2110 	error = dmu_tx_assign(tx, TXG_WAIT);
2111 	if (error) {
2112 		dmu_tx_abort(tx);
2113 		return (error);
2114 	}
2115 
2116 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2117 	    8, 1, &newvers, tx);
2118 
2119 	if (error) {
2120 		dmu_tx_commit(tx);
2121 		return (error);
2122 	}
2123 
2124 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2125 		uint64_t sa_obj;
2126 
2127 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2128 		    SPA_VERSION_SA);
2129 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2130 		    DMU_OT_NONE, 0, tx);
2131 
2132 		error = zap_add(os, MASTER_NODE_OBJ,
2133 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2134 		ASSERT0(error);
2135 
2136 		VERIFY(0 == sa_set_sa_object(os, sa_obj));
2137 		sa_register_update_callback(os, zfs_sa_upgrade);
2138 	}
2139 
2140 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2141 	    "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2142 	    (uintmax_t)newvers);
2143 	dmu_tx_commit(tx);
2144 
2145 	zfsvfs->z_version = newvers;
2146 	os->os_version = newvers;
2147 
2148 	zfs_set_fuid_feature(zfsvfs);
2149 
2150 	return (0);
2151 }
2152 
2153 /*
2154  * Read a property stored within the master node.
2155  */
2156 int
2157 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2158 {
2159 	uint64_t *cached_copy = NULL;
2160 
2161 	/*
2162 	 * Figure out where in the objset_t the cached copy would live, if it
2163 	 * is available for the requested property.
2164 	 */
2165 	if (os != NULL) {
2166 		switch (prop) {
2167 		case ZFS_PROP_VERSION:
2168 			cached_copy = &os->os_version;
2169 			break;
2170 		case ZFS_PROP_NORMALIZE:
2171 			cached_copy = &os->os_normalization;
2172 			break;
2173 		case ZFS_PROP_UTF8ONLY:
2174 			cached_copy = &os->os_utf8only;
2175 			break;
2176 		case ZFS_PROP_CASE:
2177 			cached_copy = &os->os_casesensitivity;
2178 			break;
2179 		default:
2180 			break;
2181 		}
2182 	}
2183 	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2184 		*value = *cached_copy;
2185 		return (0);
2186 	}
2187 
2188 	/*
2189 	 * If the property wasn't cached, look up the file system's value for
2190 	 * the property. For the version property, we look up a slightly
2191 	 * different string.
2192 	 */
2193 	const char *pname;
2194 	int error = ENOENT;
2195 	if (prop == ZFS_PROP_VERSION) {
2196 		pname = ZPL_VERSION_STR;
2197 	} else {
2198 		pname = zfs_prop_to_name(prop);
2199 	}
2200 
2201 	if (os != NULL) {
2202 		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2203 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2204 	}
2205 
2206 	if (error == ENOENT) {
2207 		/* No value set, use the default value */
2208 		switch (prop) {
2209 		case ZFS_PROP_VERSION:
2210 			*value = ZPL_VERSION;
2211 			break;
2212 		case ZFS_PROP_NORMALIZE:
2213 		case ZFS_PROP_UTF8ONLY:
2214 			*value = 0;
2215 			break;
2216 		case ZFS_PROP_CASE:
2217 			*value = ZFS_CASE_SENSITIVE;
2218 			break;
2219 		default:
2220 			return (error);
2221 		}
2222 		error = 0;
2223 	}
2224 
2225 	/*
2226 	 * If one of the methods for getting the property value above worked,
2227 	 * copy it into the objset_t's cache.
2228 	 */
2229 	if (error == 0 && cached_copy != NULL) {
2230 		*cached_copy = *value;
2231 	}
2232 
2233 	return (error);
2234 }
2235 
2236 /*
2237  * Return true if the corresponding vfs's unmounted flag is set.
2238  * Otherwise return false.
2239  * If this function returns true we know VFS unmount has been initiated.
2240  */
2241 boolean_t
2242 zfs_get_vfs_flag_unmounted(objset_t *os)
2243 {
2244 	zfsvfs_t *zfvp;
2245 	boolean_t unmounted = B_FALSE;
2246 
2247 	ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
2248 
2249 	mutex_enter(&os->os_user_ptr_lock);
2250 	zfvp = dmu_objset_get_user(os);
2251 	if (zfvp != NULL && zfvp->z_vfs != NULL &&
2252 	    (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2253 		unmounted = B_TRUE;
2254 	mutex_exit(&os->os_user_ptr_lock);
2255 
2256 	return (unmounted);
2257 }
2258 
2259 #ifdef _KERNEL
2260 void
2261 zfsvfs_update_fromname(const char *oldname, const char *newname)
2262 {
2263 	char tmpbuf[MAXPATHLEN];
2264 	struct mount *mp;
2265 	char *fromname;
2266 	size_t oldlen;
2267 
2268 	oldlen = strlen(oldname);
2269 
2270 	mtx_lock(&mountlist_mtx);
2271 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2272 		fromname = mp->mnt_stat.f_mntfromname;
2273 		if (strcmp(fromname, oldname) == 0) {
2274 			(void) strlcpy(fromname, newname,
2275 			    sizeof (mp->mnt_stat.f_mntfromname));
2276 			continue;
2277 		}
2278 		if (strncmp(fromname, oldname, oldlen) == 0 &&
2279 		    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
2280 			(void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s",
2281 			    newname, fromname + oldlen);
2282 			(void) strlcpy(fromname, tmpbuf,
2283 			    sizeof (mp->mnt_stat.f_mntfromname));
2284 			continue;
2285 		}
2286 	}
2287 	mtx_unlock(&mountlist_mtx);
2288 }
2289 #endif
2290