1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/sysmacros.h>
30 #include <sys/kmem.h>
31 #include <sys/pathname.h>
32 #include <sys/vnode.h>
33 #include <sys/vfs.h>
34 #include <sys/vfs_opreg.h>
35 #include <sys/mntent.h>
36 #include <sys/mount.h>
37 #include <sys/cmn_err.h>
38 #include <sys/zfs_znode.h>
39 #include <sys/zfs_dir.h>
40 #include <sys/zil.h>
41 #include <sys/fs/zfs.h>
42 #include <sys/dmu.h>
43 #include <sys/dsl_prop.h>
44 #include <sys/dsl_dataset.h>
45 #include <sys/dsl_deleg.h>
46 #include <sys/spa.h>
47 #include <sys/zap.h>
48 #include <sys/varargs.h>
49 #include <sys/policy.h>
50 #include <sys/atomic.h>
51 #include <sys/mkdev.h>
52 #include <sys/modctl.h>
53 #include <sys/zfs_ioctl.h>
54 #include <sys/zfs_ctldir.h>
55 #include <sys/zfs_fuid.h>
56 #include <sys/sunddi.h>
57 #include <sys/dnlc.h>
58 #include <sys/dmu_objset.h>
59 #include <sys/spa_boot.h>
60
61 #ifdef __NetBSD__
/* Provides ddi_name_to_major(); is there a better place for this include? */
63 #include <sys/ddi.h>
64 #include <sys/systm.h>
65 #endif
66
67 int zfsfstype;
68 vfsops_t *zfs_vfsops = NULL;
69 static major_t zfs_major;
70 static minor_t zfs_minor;
71 static kmutex_t zfs_dev_mtx;
72
73 int zfs_debug_level;
74 kmutex_t zfs_debug_mtx;
75
76 /* XXX NetBSD static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);*/
77 static int zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len);
78 static int zfs_umount(vfs_t *vfsp, int fflag);
79 static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
80 static int zfs_statvfs(vfs_t *vfsp, struct statvfs *statp);
81 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
82 static int zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp);
83 static int zfs_start(vfs_t *vfsp, int flags);
84 static void zfs_freevfs(vfs_t *vfsp);
85
86 void zfs_init(void);
87 void zfs_fini(void);
88
89
90 extern const struct vnodeopv_desc zfs_vnodeop_opv_desc;
91
92 static const struct vnodeopv_desc * const zfs_vnodeop_descs[] = {
93 &zfs_vnodeop_opv_desc,
94 NULL,
95 };
96
97 static struct vfsops zfs_vfsops_template = {
98 .vfs_name = MOUNT_ZFS,
99 .vfs_min_mount_data = sizeof(struct zfs_args),
100 .vfs_opv_descs = zfs_vnodeop_descs,
101 .vfs_mount = zfs_mount,
102 .vfs_unmount = zfs_umount,
103 .vfs_root = zfs_root,
104 .vfs_statvfs = zfs_statvfs,
105 .vfs_sync = zfs_sync,
106 .vfs_vget = zfs_vget,
107 .vfs_loadvnode = zfs_loadvnode,
108 .vfs_fhtovp = zfs_fhtovp,
109 .vfs_init = zfs_init,
110 .vfs_done = zfs_fini,
111 .vfs_start = zfs_start,
112 .vfs_renamelock_enter = (void*)nullop,
113 .vfs_renamelock_exit = (void*)nullop,
114 .vfs_reinit = (void *)nullop,
115 .vfs_vptofh = (void *)eopnotsupp,
116 .vfs_fhtovp = (void *)eopnotsupp,
117 .vfs_quotactl = (void *)eopnotsupp,
118 .vfs_extattrctl = (void *)eopnotsupp,
119 .vfs_snapshot = (void *)eopnotsupp,
120 .vfs_fsync = (void *)eopnotsupp,
121 };
122
123 /*
124 * We need to keep a count of active fs's.
125 * This is necessary to prevent our module
126 * from being unloaded after a umount -f
127 */
128 static uint32_t zfs_active_fs_count = 0;
129
130 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
131 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
132 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
133 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
134
135 /*
136 * MO_DEFAULT is not used since the default value is determined
137 * by the equivalent property.
138 */
139 static mntopt_t mntopts[] = {
140 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
141 { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
142 { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
143 { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
144 };
145
146 static mntopts_t zfs_mntopts = {
147 sizeof (mntopts) / sizeof (mntopt_t),
148 mntopts
149 };
150
151 static bool
zfs_sync_selector(void * cl,struct vnode * vp)152 zfs_sync_selector(void *cl, struct vnode *vp)
153 {
154 znode_t *zp;
155
156 /*
157 * Skip the vnode/inode if inaccessible, or if the
158 * atime is clean.
159 */
160 zp = VTOZ(vp);
161 return zp != NULL && vp->v_type != VNON && zp->z_atime_dirty != 0
162 && !zp->z_unlinked;
163 }
164
165 /*ARGSUSED*/
166 int
zfs_sync(vfs_t * vfsp,int flag,cred_t * cr)167 zfs_sync(vfs_t *vfsp, int flag, cred_t *cr)
168 {
169 zfsvfs_t *zfsvfs = vfsp->vfs_data;
170 znode_t *zp;
171 vnode_t *vp;
172 struct vnode_iterator *marker;
173 dmu_tx_t *tx;
174 int error;
175
176
177 error = 0;
178
179 /*
180 * Data integrity is job one. We don't want a compromised kernel
181 * writing to the storage pool, so we never sync during panic.
182 */
183 if (panicstr)
184 return (0);
185
186 /*
187 * On NetBSD, we need to push out atime updates. Solaris does
188 * this during VOP_INACTIVE, but that does not work well with the
189 * BSD VFS, so we do it in batch here.
190 */
191 vfs_vnode_iterator_init(vfsp, &marker);
192 while ((vp = vfs_vnode_iterator_next(marker, zfs_sync_selector, NULL)))
193 {
194 error = vn_lock(vp, LK_EXCLUSIVE);
195 if (error) {
196 vrele(vp);
197 continue;
198 }
199 zp = VTOZ(vp);
200 tx = dmu_tx_create(zfsvfs->z_os);
201 dmu_tx_hold_bonus(tx, zp->z_id);
202 error = dmu_tx_assign(tx, TXG_WAIT);
203 if (error) {
204 dmu_tx_abort(tx);
205 } else {
206 dmu_buf_will_dirty(zp->z_dbuf, tx);
207 mutex_enter(&zp->z_lock);
208 zp->z_atime_dirty = 0;
209 mutex_exit(&zp->z_lock);
210 dmu_tx_commit(tx);
211 }
212 vput(vp);
213 }
214 vfs_vnode_iterator_destroy(marker);
215
216 /*
217 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
218 * to sync metadata, which they would otherwise cache indefinitely.
219 * Semantically, the only requirement is that the sync be initiated.
220 * The DMU syncs out txgs frequently, so there's nothing to do.
221 */
222 if ((flag & MNT_LAZY) != 0)
223 return (0);
224
225 if (vfsp != NULL) {
226 /*
227 * Sync a specific filesystem.
228 */
229 zfsvfs_t *zfsvfs = vfsp->vfs_data;
230 dsl_pool_t *dp;
231
232 ZFS_ENTER(zfsvfs);
233 dp = dmu_objset_pool(zfsvfs->z_os);
234
235 /*
236 * If the system is shutting down, then skip any
237 * filesystems which may exist on a suspended pool.
238 */
239 if (sys_shutdown && spa_suspended(dp->dp_spa)) {
240 ZFS_EXIT(zfsvfs);
241 return (0);
242 }
243
244 if (zfsvfs->z_log != NULL)
245 zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
246 else
247 txg_wait_synced(dp, 0);
248 ZFS_EXIT(zfsvfs);
249 } else {
250 /*
251 * Sync all ZFS filesystems. This is what happens when you
252 * run sync(1M). Unlike other filesystems, ZFS honors the
253 * request by waiting for all pools to commit all dirty data.
254 */
255 spa_sync_allpools();
256 }
257
258 return (0);
259 }
260
261 static int
zfs_create_unique_device(dev_t * dev)262 zfs_create_unique_device(dev_t *dev)
263 {
264 major_t new_major;
265
266 do {
267 ASSERT3U(zfs_minor, <=, MAXMIN);
268 minor_t start = zfs_minor;
269 do {
270 mutex_enter(&zfs_dev_mtx);
271 if (zfs_minor >= MAXMIN) {
272 /*
273 * If we're still using the real major
274 * keep out of /dev/zfs and /dev/zvol minor
275 * number space. If we're using a getudev()'ed
276 * major number, we can use all of its minors.
277 */
278 if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
279 zfs_minor = ZFS_MIN_MINOR;
280 else
281 zfs_minor = 0;
282 } else {
283 zfs_minor++;
284 }
285 *dev = makedevice(zfs_major, zfs_minor);
286 mutex_exit(&zfs_dev_mtx);
287 } while (vfs_devismounted(*dev) && zfs_minor != start);
288 break;
289 #ifndef __NetBSD__
290 if (zfs_minor == start) {
291 /*
292 * We are using all ~262,000 minor numbers for the
293 * current major number. Create a new major number.
294 */
295 if ((new_major = getudev()) == (major_t)-1) {
296 cmn_err(CE_WARN,
297 "zfs_mount: Can't get unique major "
298 "device number.");
299 return (-1);
300 }
301 mutex_enter(&zfs_dev_mtx);
302 zfs_major = new_major;
303 zfs_minor = 0;
304
305 mutex_exit(&zfs_dev_mtx);
306 } else {
307 break;
308 }
309 /* CONSTANTCONDITION */
310 #endif
311 } while (1);
312
313 return (0);
314 }
315
316 static void
atime_changed_cb(void * arg,uint64_t newval)317 atime_changed_cb(void *arg, uint64_t newval)
318 {
319 zfsvfs_t *zfsvfs = arg;
320
321 if (newval == TRUE) {
322 zfsvfs->z_atime = TRUE;
323 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
324 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
325 } else {
326 zfsvfs->z_atime = FALSE;
327 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
328 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
329 }
330 }
331
332 static void
xattr_changed_cb(void * arg,uint64_t newval)333 xattr_changed_cb(void *arg, uint64_t newval)
334 {
335 zfsvfs_t *zfsvfs = arg;
336
337 if (newval == TRUE) {
338 /* XXX locking on vfs_flag? */
339 #ifdef TODO
340 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
341 #endif
342 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
343 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
344 } else {
345 /* XXX locking on vfs_flag? */
346 #ifdef TODO
347 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
348 #endif
349 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
350 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
351 }
352 }
353
354 static void
blksz_changed_cb(void * arg,uint64_t newval)355 blksz_changed_cb(void *arg, uint64_t newval)
356 {
357 zfsvfs_t *zfsvfs = arg;
358
359 if (newval < SPA_MINBLOCKSIZE ||
360 newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
361 newval = SPA_MAXBLOCKSIZE;
362
363 zfsvfs->z_max_blksz = newval;
364 zfsvfs->z_vfs->vfs_bsize = newval;
365 }
366
367 static void
readonly_changed_cb(void * arg,uint64_t newval)368 readonly_changed_cb(void *arg, uint64_t newval)
369 {
370 zfsvfs_t *zfsvfs = arg;
371
372 if (newval) {
373 /* XXX locking on vfs_flag? */
374 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
375 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
376 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
377 } else {
378 /* XXX locking on vfs_flag? */
379 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
380 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
381 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
382 }
383 }
384
385 static void
devices_changed_cb(void * arg,uint64_t newval)386 devices_changed_cb(void *arg, uint64_t newval)
387 {
388 zfsvfs_t *zfsvfs = arg;
389
390 if (newval == FALSE) {
391 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
392 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
393 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
394 } else {
395 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
396 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
397 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
398 }
399 }
400
401 static void
setuid_changed_cb(void * arg,uint64_t newval)402 setuid_changed_cb(void *arg, uint64_t newval)
403 {
404 zfsvfs_t *zfsvfs = arg;
405
406 if (newval == FALSE) {
407 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
408 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
409 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
410 } else {
411 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
412 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
413 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
414 }
415 }
416
417 static void
exec_changed_cb(void * arg,uint64_t newval)418 exec_changed_cb(void *arg, uint64_t newval)
419 {
420 zfsvfs_t *zfsvfs = arg;
421
422 if (newval == FALSE) {
423 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
424 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
425 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
426 } else {
427 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
428 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
429 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
430 }
431 }
432
433 /*
434 * The nbmand mount option can be changed at mount time.
435 * We can't allow it to be toggled on live file systems or incorrect
436 * behavior may be seen from cifs clients
437 *
438 * This property isn't registered via dsl_prop_register(), but this callback
439 * will be called when a file system is first mounted
440 */
441 static void
nbmand_changed_cb(void * arg,uint64_t newval)442 nbmand_changed_cb(void *arg, uint64_t newval)
443 {
444 zfsvfs_t *zfsvfs = arg;
445 if (newval == FALSE) {
446 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
447 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
448 } else {
449 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
450 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
451 }
452 }
453
454 static void
snapdir_changed_cb(void * arg,uint64_t newval)455 snapdir_changed_cb(void *arg, uint64_t newval)
456 {
457 zfsvfs_t *zfsvfs = arg;
458
459 zfsvfs->z_show_ctldir = newval;
460 }
461
462 static void
vscan_changed_cb(void * arg,uint64_t newval)463 vscan_changed_cb(void *arg, uint64_t newval)
464 {
465 zfsvfs_t *zfsvfs = arg;
466
467 zfsvfs->z_vscan = newval;
468 }
469
470 static void
acl_mode_changed_cb(void * arg,uint64_t newval)471 acl_mode_changed_cb(void *arg, uint64_t newval)
472 {
473 zfsvfs_t *zfsvfs = arg;
474
475 zfsvfs->z_acl_mode = newval;
476 }
477
478 static void
acl_inherit_changed_cb(void * arg,uint64_t newval)479 acl_inherit_changed_cb(void *arg, uint64_t newval)
480 {
481 zfsvfs_t *zfsvfs = arg;
482
483 zfsvfs->z_acl_inherit = newval;
484 }
485
486 static int
zfs_register_callbacks(vfs_t * vfsp)487 zfs_register_callbacks(vfs_t *vfsp)
488 {
489 struct dsl_dataset *ds = NULL;
490 objset_t *os = NULL;
491 zfsvfs_t *zfsvfs = NULL;
492 uint64_t nbmand;
493 int readonly, do_readonly = B_FALSE;
494 int setuid, do_setuid = B_FALSE;
495 int exec, do_exec = B_FALSE;
496 int devices, do_devices = B_FALSE;
497 int xattr, do_xattr = B_FALSE;
498 int atime, do_atime = B_FALSE;
499 int error = 0;
500
501 ASSERT(vfsp);
502 zfsvfs = vfsp->vfs_data;
503 ASSERT(zfsvfs);
504 os = zfsvfs->z_os;
505
506 /*
507 * The act of registering our callbacks will destroy any mount
508 * options we may have. In order to enable temporary overrides
509 * of mount options, we stash away the current values and
510 * restore them after we register the callbacks.
511 */
512 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
513 readonly = B_TRUE;
514 do_readonly = B_TRUE;
515 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
516 readonly = B_FALSE;
517 do_readonly = B_TRUE;
518 }
519 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
520 devices = B_FALSE;
521 setuid = B_FALSE;
522 do_devices = B_TRUE;
523 do_setuid = B_TRUE;
524 } else {
525 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
526 devices = B_FALSE;
527 do_devices = B_TRUE;
528 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
529 devices = B_TRUE;
530 do_devices = B_TRUE;
531 }
532
533 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
534 setuid = B_FALSE;
535 do_setuid = B_TRUE;
536 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
537 setuid = B_TRUE;
538 do_setuid = B_TRUE;
539 }
540 }
541 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
542 exec = B_FALSE;
543 do_exec = B_TRUE;
544 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
545 exec = B_TRUE;
546 do_exec = B_TRUE;
547 }
548 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
549 xattr = B_FALSE;
550 do_xattr = B_TRUE;
551 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
552 xattr = B_TRUE;
553 do_xattr = B_TRUE;
554 }
555 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
556 atime = B_FALSE;
557 do_atime = B_TRUE;
558 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
559 atime = B_TRUE;
560 do_atime = B_TRUE;
561 }
562
563 /*
564 * nbmand is a special property. It can only be changed at
565 * mount time.
566 *
567 * This is weird, but it is documented to only be changeable
568 * at mount time.
569 */
570 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
571 nbmand = B_FALSE;
572 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
573 nbmand = B_TRUE;
574 } else {
575 char osname[MAXNAMELEN];
576
577 dmu_objset_name(os, osname);
578 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
579 NULL)) {
580 return (error);
581 }
582 }
583
584 /*
585 * Register property callbacks.
586 *
587 * It would probably be fine to just check for i/o error from
588 * the first prop_register(), but I guess I like to go
589 * overboard...
590 */
591 ds = dmu_objset_ds(os);
592 error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
593 error = error ? error : dsl_prop_register(ds,
594 "xattr", xattr_changed_cb, zfsvfs);
595 error = error ? error : dsl_prop_register(ds,
596 "recordsize", blksz_changed_cb, zfsvfs);
597 error = error ? error : dsl_prop_register(ds,
598 "readonly", readonly_changed_cb, zfsvfs);
599 error = error ? error : dsl_prop_register(ds,
600 "devices", devices_changed_cb, zfsvfs);
601 error = error ? error : dsl_prop_register(ds,
602 "setuid", setuid_changed_cb, zfsvfs);
603 error = error ? error : dsl_prop_register(ds,
604 "exec", exec_changed_cb, zfsvfs);
605 error = error ? error : dsl_prop_register(ds,
606 "snapdir", snapdir_changed_cb, zfsvfs);
607 error = error ? error : dsl_prop_register(ds,
608 "aclmode", acl_mode_changed_cb, zfsvfs);
609 error = error ? error : dsl_prop_register(ds,
610 "aclinherit", acl_inherit_changed_cb, zfsvfs);
611 error = error ? error : dsl_prop_register(ds,
612 "vscan", vscan_changed_cb, zfsvfs);
613 if (error)
614 goto unregister;
615
616 /*
617 * Invoke our callbacks to restore temporary mount options.
618 */
619 if (do_readonly)
620 readonly_changed_cb(zfsvfs, readonly);
621 if (do_setuid)
622 setuid_changed_cb(zfsvfs, setuid);
623 if (do_exec)
624 exec_changed_cb(zfsvfs, exec);
625 if (do_devices)
626 devices_changed_cb(zfsvfs, devices);
627 if (do_xattr)
628 xattr_changed_cb(zfsvfs, xattr);
629 if (do_atime)
630 atime_changed_cb(zfsvfs, atime);
631
632 nbmand_changed_cb(zfsvfs, nbmand);
633
634 return (0);
635
636 unregister:
637 /*
638 * We may attempt to unregister some callbacks that are not
639 * registered, but this is OK; it will simply return ENOMSG,
640 * which we will ignore.
641 */
642 (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
643 (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
644 (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
645 (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
646 (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
647 (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
648 (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
649 (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
650 (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
651 (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
652 zfsvfs);
653 (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
654 return (error);
655
656 }
657
658 static void
uidacct(objset_t * os,boolean_t isgroup,uint64_t fuid,int64_t delta,dmu_tx_t * tx)659 uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
660 int64_t delta, dmu_tx_t *tx)
661 {
662 uint64_t used = 0;
663 char buf[32];
664 int err;
665 uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
666
667 if (delta == 0)
668 return;
669
670 (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
671 err = zap_lookup(os, obj, buf, 8, 1, &used);
672 ASSERT(err == 0 || err == ENOENT);
673 /* no underflow/overflow */
674 ASSERT(delta > 0 || used >= -delta);
675 ASSERT(delta < 0 || used + delta > used);
676 used += delta;
677 if (used == 0)
678 err = zap_remove(os, obj, buf, tx);
679 else
680 err = zap_update(os, obj, buf, 8, 1, &used, tx);
681 ASSERT(err == 0);
682 }
683
684 static int
zfs_space_delta_cb(dmu_object_type_t bonustype,void * bonus,uint64_t * userp,uint64_t * groupp)685 zfs_space_delta_cb(dmu_object_type_t bonustype, void *bonus,
686 uint64_t *userp, uint64_t *groupp)
687 {
688 znode_phys_t *znp = bonus;
689
690 if (bonustype != DMU_OT_ZNODE)
691 return (ENOENT);
692
693 *userp = znp->zp_uid;
694 *groupp = znp->zp_gid;
695 return (0);
696 }
697
698 static void
fuidstr_to_sid(zfsvfs_t * zfsvfs,const char * fuidstr,char * domainbuf,int buflen,uid_t * ridp)699 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
700 char *domainbuf, int buflen, uid_t *ridp)
701 {
702 uint64_t fuid;
703 const char *domain;
704
705 fuid = strtonum(fuidstr, NULL);
706
707 domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
708 if (domain)
709 (void) strlcpy(domainbuf, domain, buflen);
710 else
711 domainbuf[0] = '\0';
712 *ridp = FUID_RID(fuid);
713 }
714
715 static uint64_t
zfs_userquota_prop_to_obj(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type)716 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
717 {
718 switch (type) {
719 case ZFS_PROP_USERUSED:
720 return (DMU_USERUSED_OBJECT);
721 case ZFS_PROP_GROUPUSED:
722 return (DMU_GROUPUSED_OBJECT);
723 case ZFS_PROP_USERQUOTA:
724 return (zfsvfs->z_userquota_obj);
725 case ZFS_PROP_GROUPQUOTA:
726 return (zfsvfs->z_groupquota_obj);
727 }
728 return (0);
729 }
730
731 int
zfs_userspace_many(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type,uint64_t * cookiep,void * vbuf,uint64_t * bufsizep)732 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
733 uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
734 {
735 int error;
736 zap_cursor_t zc;
737 zap_attribute_t za;
738 zfs_useracct_t *buf = vbuf;
739 uint64_t obj;
740
741 if (!dmu_objset_userspace_present(zfsvfs->z_os))
742 return (ENOTSUP);
743
744 obj = zfs_userquota_prop_to_obj(zfsvfs, type);
745 if (obj == 0) {
746 *bufsizep = 0;
747 return (0);
748 }
749
750 for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
751 (error = zap_cursor_retrieve(&zc, &za)) == 0;
752 zap_cursor_advance(&zc)) {
753 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
754 *bufsizep)
755 break;
756
757 fuidstr_to_sid(zfsvfs, za.za_name,
758 buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
759
760 buf->zu_space = za.za_first_integer;
761 buf++;
762 }
763 if (error == ENOENT)
764 error = 0;
765
766 ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
767 *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
768 *cookiep = zap_cursor_serialize(&zc);
769 zap_cursor_fini(&zc);
770 return (error);
771 }
772
773 /*
774 * buf must be big enough (eg, 32 bytes)
775 */
776 static int
id_to_fuidstr(zfsvfs_t * zfsvfs,const char * domain,uid_t rid,char * buf,size_t buflen,boolean_t addok)777 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
778 char *buf, size_t buflen, boolean_t addok)
779 {
780 uint64_t fuid;
781 int domainid = 0;
782
783 if (domain && domain[0]) {
784 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
785 if (domainid == -1)
786 return (ENOENT);
787 }
788 fuid = FUID_ENCODE(domainid, rid);
789 (void) snprintf(buf, buflen, "%llx", (longlong_t)fuid);
790 return (0);
791 }
792
793 int
zfs_userspace_one(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type,const char * domain,uint64_t rid,uint64_t * valp)794 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
795 const char *domain, uint64_t rid, uint64_t *valp)
796 {
797 char buf[32];
798 int err;
799 uint64_t obj;
800
801 *valp = 0;
802
803 if (!dmu_objset_userspace_present(zfsvfs->z_os))
804 return (ENOTSUP);
805
806 obj = zfs_userquota_prop_to_obj(zfsvfs, type);
807 if (obj == 0)
808 return (0);
809
810 err = id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof(buf), FALSE);
811 if (err)
812 return (err);
813
814 err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
815 if (err == ENOENT)
816 err = 0;
817 return (err);
818 }
819
820 int
zfs_set_userquota(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type,const char * domain,uint64_t rid,uint64_t quota)821 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
822 const char *domain, uint64_t rid, uint64_t quota)
823 {
824 char buf[32];
825 int err;
826 dmu_tx_t *tx;
827 uint64_t *objp;
828 boolean_t fuid_dirtied;
829
830 if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
831 return (EINVAL);
832
833 if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
834 return (ENOTSUP);
835
836 objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
837 &zfsvfs->z_groupquota_obj;
838
839 err = id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof(buf), B_TRUE);
840 if (err)
841 return (err);
842 fuid_dirtied = zfsvfs->z_fuid_dirty;
843
844 tx = dmu_tx_create(zfsvfs->z_os);
845 dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
846 if (*objp == 0) {
847 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
848 zfs_userquota_prop_prefixes[type]);
849 }
850 if (fuid_dirtied)
851 zfs_fuid_txhold(zfsvfs, tx);
852 err = dmu_tx_assign(tx, TXG_WAIT);
853 if (err) {
854 dmu_tx_abort(tx);
855 return (err);
856 }
857
858 mutex_enter(&zfsvfs->z_lock);
859 if (*objp == 0) {
860 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
861 DMU_OT_NONE, 0, tx);
862 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
863 zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
864 }
865 mutex_exit(&zfsvfs->z_lock);
866
867 if (quota == 0) {
868 err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
869 if (err == ENOENT)
870 err = 0;
871 } else {
872 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx);
873 }
874 ASSERT(err == 0);
875 if (fuid_dirtied)
876 zfs_fuid_sync(zfsvfs, tx);
877 dmu_tx_commit(tx);
878 return (err);
879 }
880
881 boolean_t
zfs_usergroup_overquota(zfsvfs_t * zfsvfs,boolean_t isgroup,uint64_t fuid)882 zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
883 {
884 char buf[32];
885 uint64_t used, quota, usedobj, quotaobj;
886 int err;
887
888 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
889 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
890
891 if (quotaobj == 0 || zfsvfs->z_replay)
892 return (B_FALSE);
893
894 (void) snprintf(buf, sizeof(buf), "%llx", (longlong_t)fuid);
895 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a);
896 if (err != 0)
897 return (B_FALSE);
898
899 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
900 if (err != 0)
901 return (B_FALSE);
902 return (used >= quota);
903 }
904
905 int
zfsvfs_create(const char * osname,zfsvfs_t ** zfvp)906 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
907 {
908 objset_t *os;
909 zfsvfs_t *zfsvfs;
910 uint64_t zval;
911 int i, error;
912
913 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
914
915 /*
916 * We claim to always be readonly so we can open snapshots;
917 * other ZPL code will prevent us from writing to snapshots.
918 */
919 error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
920 if (error) {
921 kmem_free(zfsvfs, sizeof (zfsvfs_t));
922 return (error);
923 }
924
925 /*
926 * Initialize the zfs-specific filesystem structure.
927 * Should probably make this a kmem cache, shuffle fields,
928 * and just bzero up to z_hold_mtx[].
929 */
930 zfsvfs->z_vfs = NULL;
931 zfsvfs->z_parent = zfsvfs;
932 zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
933 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
934 zfsvfs->z_os = os;
935
936 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
937 if (error) {
938 goto out;
939 } else if (zfsvfs->z_version > ZPL_VERSION) {
940 (void) printf("Mismatched versions: File system "
941 "is version %llu on-disk format, which is "
942 "incompatible with this software version %lld!",
943 (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
944 error = ENOTSUP;
945 goto out;
946 }
947
948 if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
949 goto out;
950 zfsvfs->z_norm = (int)zval;
951
952 if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
953 goto out;
954 zfsvfs->z_utf8 = (zval != 0);
955
956 if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
957 goto out;
958 zfsvfs->z_case = (uint_t)zval;
959
960 /*
961 * Fold case on file systems that are always or sometimes case
962 * insensitive.
963 */
964 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
965 zfsvfs->z_case == ZFS_CASE_MIXED)
966 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
967
968 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
969
970 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
971 &zfsvfs->z_root);
972 if (error)
973 goto out;
974 ASSERT(zfsvfs->z_root != 0);
975
976 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
977 &zfsvfs->z_unlinkedobj);
978 if (error)
979 goto out;
980
981 error = zap_lookup(os, MASTER_NODE_OBJ,
982 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
983 8, 1, &zfsvfs->z_userquota_obj);
984 if (error && error != ENOENT)
985 goto out;
986
987 error = zap_lookup(os, MASTER_NODE_OBJ,
988 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
989 8, 1, &zfsvfs->z_groupquota_obj);
990 if (error && error != ENOENT)
991 goto out;
992
993 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
994 &zfsvfs->z_fuid_obj);
995 if (error && error != ENOENT)
996 goto out;
997
998 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
999 &zfsvfs->z_shares_dir);
1000 if (error && error != ENOENT)
1001 goto out;
1002
1003 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1004 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1005 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1006 offsetof(znode_t, z_link_node));
1007 rrw_init(&zfsvfs->z_teardown_lock);
1008 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
1009 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1010 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1011 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1012
1013 *zfvp = zfsvfs;
1014 return (0);
1015
1016 out:
1017 dmu_objset_disown(os, zfsvfs);
1018 *zfvp = NULL;
1019 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1020 return (error);
1021 }
1022
1023 static int
zfsvfs_setup(zfsvfs_t * zfsvfs,boolean_t mounting)1024 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1025 {
1026 int error;
1027
1028 error = zfs_register_callbacks(zfsvfs->z_vfs);
1029 if (error)
1030 return (error);
1031
1032 /*
1033 * Set the objset user_ptr to track its zfsvfs.
1034 */
1035 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1036 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1037 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1038
1039 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1040 if (zil_disable) {
1041 zil_destroy(zfsvfs->z_log, B_FALSE);
1042 zfsvfs->z_log = NULL;
1043 }
1044
1045 /*
1046 * If we are not mounting (ie: online recv), then we don't
1047 * have to worry about replaying the log as we blocked all
1048 * operations out since we closed the ZIL.
1049 */
1050 if (mounting) {
1051 boolean_t readonly;
1052
1053 /*
1054 * During replay we remove the read only flag to
1055 * allow replays to succeed.
1056 */
1057 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1058 if (readonly != 0)
1059 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1060 else
1061 zfs_unlinked_drain(zfsvfs);
1062
1063 if (zfsvfs->z_log) {
1064 /*
1065 * Parse and replay the intent log.
1066 *
1067 * Because of ziltest, this must be done after
1068 * zfs_unlinked_drain(). (Further note: ziltest
1069 * doesn't use readonly mounts, where
1070 * zfs_unlinked_drain() isn't called.) This is because
1071 * ziltest causes spa_sync() to think it's committed,
1072 * but actually it is not, so the intent log contains
1073 * many txg's worth of changes.
1074 *
1075 * In particular, if object N is in the unlinked set in
1076 * the last txg to actually sync, then it could be
1077 * actually freed in a later txg and then reallocated
1078 * in a yet later txg. This would write a "create
1079 * object N" record to the intent log. Normally, this
1080 * would be fine because the spa_sync() would have
1081 * written out the fact that object N is free, before
1082 * we could write the "create object N" intent log
1083 * record.
1084 *
1085 * But when we are in ziltest mode, we advance the "open
1086 * txg" without actually spa_sync()-ing the changes to
1087 * disk. So we would see that object N is still
1088 * allocated and in the unlinked set, and there is an
1089 * intent log record saying to allocate it.
1090 */
1091 zfsvfs->z_replay = B_TRUE;
1092 zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
1093 zfsvfs->z_replay = B_FALSE;
1094 }
1095 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
1096 }
1097
1098 return (0);
1099 }
1100
1101 void
zfsvfs_free(zfsvfs_t * zfsvfs)1102 zfsvfs_free(zfsvfs_t *zfsvfs)
1103 {
1104 int i;
1105 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1106
1107 /*
1108 * This is a barrier to prevent the filesystem from going away in
1109 * zfs_znode_move() until we can safely ensure that the filesystem is
1110 * not unmounted. We consider the filesystem valid before the barrier
1111 * and invalid after the barrier.
1112 */
1113 rw_enter(&zfsvfs_lock, RW_READER);
1114 rw_exit(&zfsvfs_lock);
1115
1116 zfs_fuid_destroy(zfsvfs);
1117 mutex_destroy(&zfsvfs->z_znodes_lock);
1118 mutex_destroy(&zfsvfs->z_lock);
1119 list_destroy(&zfsvfs->z_all_znodes);
1120 rrw_destroy(&zfsvfs->z_teardown_lock);
1121 rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1122 rw_destroy(&zfsvfs->z_fuid_lock);
1123 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1124 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1125 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1126 }
1127
1128 static void
zfs_set_fuid_feature(zfsvfs_t * zfsvfs)1129 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1130 {
1131 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1132 if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) {
1133 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1134 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1135 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1136 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1137 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1138 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1139 }
1140 }
1141
1142 static int
zfs_domount(vfs_t * vfsp,char * osname)1143 zfs_domount(vfs_t *vfsp, char *osname)
1144 {
1145 dev_t mount_dev;
1146 uint64_t recordsize, fsid_guid;
1147 int error = 0;
1148 zfsvfs_t *zfsvfs;
1149
1150 ASSERT(vfsp);
1151 ASSERT(osname);
1152
1153 error = zfsvfs_create(osname, &zfsvfs);
1154 if (error)
1155 return (error);
1156 zfsvfs->z_vfs = vfsp;
1157 zfsvfs->z_parent = zfsvfs;
1158 zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
1159 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
1160
1161 /* Initialize the generic filesystem structure. */
1162 vfsp->vfs_data = NULL;
1163
1164 if (zfs_create_unique_device(&mount_dev) == -1) {
1165 error = ENODEV;
1166 goto out;
1167 }
1168 ASSERT(vfs_devismounted(mount_dev) == 0);
1169
1170 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1171 NULL))
1172 goto out;
1173
1174 vfsp->vfs_bsize = DEV_BSIZE;
1175 vfsp->vfs_flag |= VFS_NOTRUNC;
1176 vfsp->vfs_data = zfsvfs;
1177
1178 /*
1179 * The fsid is 64 bits, composed of an 8-bit fs type, which
1180 * separates our fsid from any other filesystem types, and a
1181 * 56-bit objset unique ID. The objset unique ID is unique to
1182 * all objsets open on this system, provided by unique_create().
1183 * The 8-bit fs type must be put in the low bits of fsid[1]
1184 * because that's where other Solaris filesystems put it.
1185 */
1186 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1187 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1188 vfsp->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid;
1189 vfsp->mnt_stat.f_fsidx.__fsid_val[1] = ((fsid_guid>>32) << 8) |
1190 zfsfstype & 0xFF;
1191
1192 dprintf("zfs_domount vrele after vfsp->vfs_count %d\n", vfsp->vfs_count);
1193 /*
1194 * Set features for file system.
1195 */
1196 zfs_set_fuid_feature(zfsvfs);
1197 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
1198 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1199 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1200 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
1201 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
1202 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1203 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1204 }
1205 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
1206
1207 if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1208 uint64_t pval;
1209
1210 atime_changed_cb(zfsvfs, B_FALSE);
1211 readonly_changed_cb(zfsvfs, B_TRUE);
1212 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
1213 goto out;
1214 xattr_changed_cb(zfsvfs, pval);
1215 zfsvfs->z_issnap = B_TRUE;
1216
1217 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1218 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1219 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1220 } else {
1221 error = zfsvfs_setup(zfsvfs, B_TRUE);
1222 }
1223
1224 dprintf("zfs_vfsops.c zfs_domount called\n");
1225 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
1226
1227 if (!zfsvfs->z_issnap)
1228 zfsctl_create(zfsvfs);
1229 out:
1230 if (error) {
1231 dmu_objset_disown(zfsvfs->z_os, zfsvfs);
1232 zfsvfs_free(zfsvfs);
1233 } else {
1234 atomic_add_32(&zfs_active_fs_count, 1);
1235 }
1236 return (error);
1237 }
1238
1239 void
zfs_unregister_callbacks(zfsvfs_t * zfsvfs)1240 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1241 {
1242 objset_t *os = zfsvfs->z_os;
1243 struct dsl_dataset *ds;
1244
1245 /*
1246 * Unregister properties.
1247 */
1248 if (!dmu_objset_is_snapshot(os)) {
1249 ds = dmu_objset_ds(os);
1250 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
1251 zfsvfs) == 0);
1252
1253 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
1254 zfsvfs) == 0);
1255
1256 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
1257 zfsvfs) == 0);
1258
1259 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
1260 zfsvfs) == 0);
1261
1262 VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
1263 zfsvfs) == 0);
1264
1265 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
1266 zfsvfs) == 0);
1267
1268 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
1269 zfsvfs) == 0);
1270
1271 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
1272 zfsvfs) == 0);
1273
1274 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
1275 zfsvfs) == 0);
1276
1277 VERIFY(dsl_prop_unregister(ds, "aclinherit",
1278 acl_inherit_changed_cb, zfsvfs) == 0);
1279
1280 VERIFY(dsl_prop_unregister(ds, "vscan",
1281 vscan_changed_cb, zfsvfs) == 0);
1282 }
1283 }
1284
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * str:    NUL-terminated string; every character must be '0'..'9'.
 * objnum: receives the converted value on success and is left untouched
 *         on failure.
 *
 * Returns 0 on success.  Returns EINVAL if a non-digit character is
 * found or if the value does not fit in 64 bits (previously such input
 * silently wrapped around).  An empty string converts to 0.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
	const uint64_t max = ~(uint64_t)0;
	uint64_t num = 0;

	for (; *str != '\0'; str++) {
		uint64_t digit;

		if (*str < '0' || *str > '9')
			return (EINVAL);
		digit = (uint64_t)(*str - '0');

		/* num * 10 + digit must not exceed the 64-bit maximum */
		if (num > (max - digit) / 10)
			return (EINVAL);

		num = num * 10 + digit;
	}

	*objnum = num;
	return (0);
}
1303
1304 /*
1305 * The boot path passed from the boot loader is in the form of
1306 * "rootpool-name/root-filesystem-object-number'. Convert this
1307 * string to a dataset name: "rootpool-name/root-filesystem-name".
1308 */
1309 static int
zfs_parse_bootfs(char * bpath,char * outpath)1310 zfs_parse_bootfs(char *bpath, char *outpath)
1311 {
1312 char *slashp;
1313 uint64_t objnum;
1314 int error;
1315
1316 if (*bpath == 0 || *bpath == '/')
1317 return (EINVAL);
1318
1319 (void) strcpy(outpath, bpath);
1320
1321 slashp = strchr(bpath, '/');
1322
1323 /* if no '/', just return the pool name */
1324 if (slashp == NULL) {
1325 return (0);
1326 }
1327
1328 /* if not a number, just return the root dataset name */
1329 if (str_to_uint64(slashp+1, &objnum)) {
1330 return (0);
1331 }
1332
1333 *slashp = '\0';
1334 error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
1335 *slashp = '/';
1336
1337 return (error);
1338 }
1339
1340
/*
 * zfs_check_global_label:
 *	Decide whether the hex label string 'hexsl' is acceptable for
 *	mounting dataset 'dsname' into the global zone proper.
 *
 *	With PORT_SOLARIS defined: the default label and admin_high are
 *	always accepted; admin_low is accepted only when the dataset's
 *	readonly property can be read and is non-zero; everything else
 *	yields EACCES.
 *
 *	Without PORT_SOLARIS no check is performed and 0 is returned.
 */
int
zfs_check_global_label(const char *dsname, const char *hexsl)
{
#ifdef PORT_SOLARIS
	uint64_t rdonly;

	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0 ||
	    strcasecmp(hexsl, ADMIN_HIGH) == 0)
		return (0);

	if (strcasecmp(hexsl, ADMIN_LOW) != 0)
		return (EACCES);

	/* admin_low: the dataset must be readonly */
	if (dsl_prop_get_integer(dsname,
	    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL) != 0)
		return (EACCES);

	return (rdonly != 0 ? 0 : EACCES);
#else
	return (0);
#endif
}
1372
/*
 * zfs_mount_label_policy:
 *	Determine whether the mount is allowed according to MAC check,
 *	by comparing (where appropriate) the label of the dataset against
 *	the label of the zone being mounted into.  If the dataset has
 *	no label, create one.
 *
 *	The whole check is only compiled when PORT_SOLARIS is defined;
 *	otherwise the function unconditionally allows the mount.
 *
 *	vfsp:	mount being set up; its mount point selects the zone and
 *		it may be switched to read-only by this function.
 *	osname:	name of the dataset being mounted.
 *
 *	Returns:
 *		 0 :	access allowed
 *		>0 :	error code, such as EACCES
 */
static int
zfs_mount_label_policy(vfs_t *vfsp, char *osname)
{
#ifdef PORT_SOLARIS
	int		error, retv;
	zone_t		*mntzone = NULL;
	ts_label_t	*mnt_tsl;
	bslabel_t	*mnt_sl;
	bslabel_t	ds_sl;
	char		ds_hexsl[MAXNAMELEN];

	retv = EACCES;				/* assume the worst */

	/*
	 * Start by getting the dataset label if it exists.
	 */
	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
	if (error)
		return (EACCES);

	/*
	 * If labeling is NOT enabled, then disallow the mount of datasets
	 * which have a non-default label already.  No other label checks
	 * are needed.
	 */
	if (!is_system_labeled()) {
		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
			return (0);
		return (EACCES);
	}

	/*
	 * Get the label of the mountpoint.  If mounting into the global
	 * zone (i.e. mountpoint is not within an active zone and the
	 * zoned property is off), the label must be default or
	 * admin_low/admin_high only; no other checks are needed.
	 */
	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
	if (mntzone->zone_id == GLOBAL_ZONEID) {
		uint64_t zoned;

		/* The zone reference is not needed on this path. */
		zone_rele(mntzone);

		if (dsl_prop_get_integer(osname,
		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
			return (EACCES);
		if (!zoned)
			return (zfs_check_global_label(osname, ds_hexsl));
		else
			/*
			 * This is the case of a zone dataset being mounted
			 * initially, before the zone has been fully created;
			 * allow this mount into global zone.
			 */
			return (0);
	}

	/* Hold the zone's label while it is being compared. */
	mnt_tsl = mntzone->zone_slabel;
	ASSERT(mnt_tsl != NULL);
	label_hold(mnt_tsl);
	mnt_sl = label2bslabel(mnt_tsl);

	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
		/*
		 * The dataset doesn't have a real label, so fabricate one
		 * from the mount point's label and store it as a local
		 * property.  Access is granted only if both steps succeed.
		 */
		char *str = NULL;

		if (l_to_str_internal(mnt_sl, &str) == 0 &&
		    dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
		    ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
			retv = 0;
		if (str != NULL)
			kmem_free(str, strlen(str) + 1);
	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
		/*
		 * Now compare labels to complete the MAC check.  If the
		 * labels are equal then allow access.  If the mountpoint
		 * label dominates the dataset label, allow readonly access.
		 * Otherwise, access is denied.
		 */
		if (blequal(mnt_sl, &ds_sl))
			retv = 0;
		else if (bldominates(mnt_sl, &ds_sl)) {
			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
			retv = 0;
		}
	}

	/* Drop the label hold and the zone reference taken above. */
	label_rele(mnt_tsl);
	zone_rele(mntzone);
	return (retv);
#else   /* PORT_SOLARIS */
	return (0);
#endif
}
1481
#ifndef __NetBSD__
/*
 * Root filesystem entry point (not compiled on NetBSD).
 *
 * 'why' selects the operation:
 *	ROOT_INIT	import the root pool, resolve the boot filesystem
 *			name, mount it on vfsp and publish its root vnode
 *			as rootvp.  A second ROOT_INIT returns EBUSY.
 *	ROOT_REMOUNT	switch the filesystem to read-write and re-register
 *			the property callbacks.
 *	ROOT_UNMOUNT	unregister the callbacks and sync.
 * Any other value returns ENOTSUP.
 *
 * Returns 0 on success or an error number.
 */
static int
zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
{
	int error = 0;
	static int zfsrootdone = 0;
	zfsvfs_t *zfsvfs = NULL;
	znode_t *zp = NULL;
	vnode_t *vp = NULL;
	char *zfs_bootfs;
	char *zfs_devid;

	ASSERT(vfsp);

	/*
	 * The filesystem that we mount as root is defined in the
	 * boot property "zfs-bootfs" with a format of
	 * "poolname/root-dataset-objnum".
	 */
	if (why == ROOT_INIT) {
		/* Only the first ROOT_INIT request is honoured. */
		if (zfsrootdone++)
			return (EBUSY);
		/*
		 * the process of doing a spa_load will require the
		 * clock to be set before we could (for example) do
		 * something better by looking at the timestamp on
		 * an uberblock, so just set it to -1.
		 */
		clkset(-1);

		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
			    "bootfs name");
			return (EINVAL);
		}
		/* The device id is optional; it is freed right after use. */
		zfs_devid = spa_get_bootprop("diskdevid");
		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
		if (zfs_devid)
			spa_free_bootprop(zfs_devid);
		if (error) {
			spa_free_bootprop(zfs_bootfs);
			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
			    error);
			return (error);
		}
		/* Turn "pool/objnum" into a dataset name in rootfs.bo_name. */
		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
			spa_free_bootprop(zfs_bootfs);
			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
			    error);
			return (error);
		}

		spa_free_bootprop(zfs_bootfs);

		/* From here on every exit goes through 'out' to unlock. */
		if (error = vfs_lock(vfsp))
			return (error);

		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
			goto out;
		}

		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
		ASSERT(zfsvfs);
		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
			goto out;
		}

		/* Flag the vnode as a filesystem root and publish it. */
		vp = ZTOV(zp);
		mutex_enter(&vp->v_lock);
		vp->v_flag |= VROOT;
		mutex_exit(&vp->v_lock);
		rootvp = vp;

		/*
		 * Leave rootvp held.  The root file system is never unmounted.
		 */

		vfs_add((struct vnode *)0, vfsp,
		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
out:
		vfs_unlock(vfsp);
		return (error);
	} else if (why == ROOT_REMOUNT) {
		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
		vfsp->vfs_flag |= VFS_REMOUNT;

		/* refresh mount options */
		zfs_unregister_callbacks(vfsp->vfs_data);
		return (zfs_register_callbacks(vfsp));

	} else if (why == ROOT_UNMOUNT) {
		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
		(void) zfs_sync(vfsp, 0, 0);
		return (0);
	}

	/*
	 * if "why" is equal to anything else other than ROOT_INIT,
	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
	 */
	return (ENOTSUP);
}
#endif /*__NetBSD__ */
1587
1588 /*ARGSUSED*/
1589 static int
zfs_mount(vfs_t * vfsp,const char * path,void * data,size_t * data_len)1590 zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len)
1591 {
1592 char *osname;
1593 pathname_t spn;
1594 vnode_t *mvp = vfsp->mnt_vnodecovered;
1595 struct mounta *uap = data;
1596 int error = 0;
1597 int canwrite;
1598 cred_t *cr;
1599
1600 crget(cr);
1601 dprintf("zfs_vfsops.c zfs_mount called\n");
1602 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
1603 if (mvp->v_type != VDIR)
1604 return (ENOTDIR);
1605
1606 if (uap == NULL)
1607 return (EINVAL);
1608
1609 mutex_enter(mvp->v_interlock);
1610 if ((uap->flags & MS_REMOUNT) == 0 &&
1611 (uap->flags & MS_OVERLAY) == 0 &&
1612 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1613 mutex_exit(mvp->v_interlock);
1614 return (EBUSY);
1615 }
1616 mutex_exit(mvp->v_interlock);
1617
1618 /*
1619 * ZFS does not support passing unparsed data in via MS_DATA.
1620 * Users should use the MS_OPTIONSTR interface; this means
1621 * that all option parsing is already done and the options struct
1622 * can be interrogated.
1623 */
1624 if ((uap->flags & MS_DATA) && uap->datalen > 0)
1625 return (EINVAL);
1626
1627 osname = PNBUF_GET();
1628
1629 strlcpy(osname, uap->fspec, strlen(uap->fspec) + 1);
1630
1631 /*
1632 * Check for mount privilege?
1633 *
1634 * If we don't have privilege then see if
1635 * we have local permission to allow it
1636 */
1637 error = secpolicy_fs_mount(cr, mvp, vfsp);
1638 if (error) {
1639 error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
1640 if (error == 0) {
1641 vattr_t vattr;
1642
1643 /*
1644 * Make sure user is the owner of the mount point
1645 * or has sufficient privileges.
1646 */
1647
1648 vattr.va_mask = AT_UID;
1649
1650 if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
1651 goto out;
1652 }
1653
1654 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
1655 VOP_ACCESS(mvp, VWRITE, cr) != 0) {
1656 error = EPERM;
1657 goto out;
1658 }
1659
1660 /* XXX NetBSD secpolicy_fs_mount_clearopts(cr, vfsp);*/
1661 } else {
1662 goto out;
1663 }
1664 }
1665
1666 /*
1667 * Refuse to mount a filesystem if we are in a local zone and the
1668 * dataset is not visible.
1669 */
1670 if (!INGLOBALZONE(curproc) &&
1671 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1672 error = EPERM;
1673 goto out;
1674 }
1675
1676 error = zfs_mount_label_policy(vfsp, osname);
1677 if (error)
1678 goto out;
1679
1680 /*
1681 * When doing a remount, we simply refresh our temporary properties
1682 * according to those options set in the current VFS options.
1683 */
1684 if (uap->flags & MS_REMOUNT) {
1685 /* refresh mount options */
1686 zfs_unregister_callbacks(vfsp->vfs_data);
1687 error = zfs_register_callbacks(vfsp);
1688 goto out;
1689 }
1690
1691 /* Mark ZFS as MP SAFE */
1692 vfsp->mnt_iflag |= IMNT_MPSAFE;
1693
1694 error = zfs_domount(vfsp, osname);
1695
1696 vfs_getnewfsid(vfsp);
1697
1698 /* setup zfs mount info */
1699 strlcpy(vfsp->mnt_stat.f_mntfromname, osname,
1700 sizeof(vfsp->mnt_stat.f_mntfromname));
1701 set_statvfs_info(path, UIO_USERSPACE, vfsp->mnt_stat.f_mntfromname,
1702 UIO_SYSSPACE, vfsp->mnt_op->vfs_name, vfsp, curlwp);
1703
1704 /*
1705 * Add an extra VFS_HOLD on our parent vfs so that it can't
1706 * disappear due to a forced unmount.
1707 */
1708 if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
1709 VFS_HOLD(mvp->v_vfsp);
1710
1711 out:
1712 PNBUF_PUT(osname);
1713 return (error);
1714 }
1715
1716 static int
zfs_statvfs(vfs_t * vfsp,struct statvfs * statp)1717 zfs_statvfs(vfs_t *vfsp, struct statvfs *statp)
1718 {
1719 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1720 dev_t dev;
1721 uint64_t refdbytes, availbytes, usedobjs, availobjs;
1722
1723 ZFS_ENTER(zfsvfs);
1724
1725 dmu_objset_space(zfsvfs->z_os,
1726 &refdbytes, &availbytes, &usedobjs, &availobjs);
1727
1728 /*
1729 * The underlying storage pool actually uses multiple block sizes.
1730 * We report the fragsize as the smallest block size we support,
1731 * and we report our blocksize as the filesystem's maximum blocksize.
1732 */
1733 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
1734 statp->f_bsize = zfsvfs->z_max_blksz;
1735
1736 /*
1737 * The following report "total" blocks of various kinds in the
1738 * file system, but reported in terms of f_frsize - the
1739 * "fragment" size.
1740 */
1741
1742 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1743 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
1744 statp->f_bavail = statp->f_bfree; /* no root reservation */
1745
1746 /*
1747 * statvfs() should really be called statufs(), because it assumes
1748 * static metadata. ZFS doesn't preallocate files, so the best
1749 * we can do is report the max that could possibly fit in f_files,
1750 * and that minus the number actually used in f_ffree.
1751 * For f_ffree, report the smaller of the number of object available
1752 * and the number of blocks (each object will take at least a block).
1753 */
1754 statp->f_ffree = MIN(availobjs, statp->f_bfree);
1755 statp->f_favail = statp->f_ffree; /* no "root reservation" */
1756 statp->f_files = statp->f_ffree + usedobjs;
1757
1758 statp->f_fsid = vfsp->mnt_stat.f_fsidx.__fsid_val[0];
1759
1760 /*
1761 * We're a zfs filesystem.
1762 */
1763 (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
1764 (void) strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1765 sizeof(statp->f_mntfromname));
1766 (void) strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1767 sizeof(statp->f_mntonname));
1768
1769 statp->f_namemax = ZFS_MAXNAMELEN;
1770
1771 /*
1772 * We have all of 32 characters to stuff a string here.
1773 * Is there anything useful we could/should provide?
1774 */
1775 #ifndef __NetBSD__
1776 bzero(statp->f_fstr, sizeof (statp->f_fstr));
1777 #endif
1778 ZFS_EXIT(zfsvfs);
1779 return (0);
1780 }
1781
1782 static int
zfs_root(vfs_t * vfsp,vnode_t ** vpp)1783 zfs_root(vfs_t *vfsp, vnode_t **vpp)
1784 {
1785 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1786 znode_t *rootzp;
1787 int error;
1788
1789 ZFS_ENTER(zfsvfs);
1790 dprintf("zfs_root called\n");
1791 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1792 if (error == 0)
1793 *vpp = ZTOV(rootzp);
1794 dprintf("vpp -> %d, error %d -- %p\n", (*vpp)->v_type, error, *vpp);
1795 ZFS_EXIT(zfsvfs);
1796 if (error == 0)
1797 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1798 KASSERT((error != 0) || (*vpp != NULL));
1799 KASSERT((error != 0) || (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE));
1800 return (error);
1801 }
1802
/*
 * Teardown the zfsvfs::z_os.
 *
 * Blocks out vnode operations, closes the ZIL, releases every znode's
 * dbuf hold, unregisters the property callbacks and evicts cached dbufs.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held (except on the EIO path below,
 * where both are dropped before returning).
 *
 * Returns 0 on success, or EIO when not unmounting and the filesystem
 * was already unmounted or has no objset.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t	*zp;

	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
		return (EIO);
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp))
		if (zp->z_dbuf) {
			ASSERT(ZTOV(zp)->v_count > 0);
			zfs_znode_dmu_fini(zp);
		}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data.  If the first attempt reports a non-zero
	 * result, wait for the pool to sync and try once more.
	 */
	if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
		(void) dmu_objset_evict_dbufs(zfsvfs->z_os);
	}

	return (0);
}
1898
1899 /*ARGSUSED*/
1900 static int
zfs_umount(vfs_t * vfsp,int fflag)1901 zfs_umount(vfs_t *vfsp, int fflag)
1902 {
1903 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1904 objset_t *os;
1905 int ret, flags = 0;
1906 cred_t *cr;
1907
1908 vnode_t *vpp;
1909 int counter;
1910
1911 counter = 0;
1912
1913 dprintf("ZFS_UMOUNT called\n");
1914
1915 /*TAILQ_FOREACH(vpp, &vfsp->mnt_vnodelist, v_mntvnodes) {
1916 printf("vnode list vnode number %d -- vnode address %p\n", counter, vpp);
1917 vprint("ZFS vfsp vnode list", vpp);
1918 counter++;
1919 } */
1920
1921 crget(cr);
1922 #ifdef TODO
1923 ret = secpolicy_fs_unmount(cr, vfsp);
1924 if (ret) {
1925 ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1926 ZFS_DELEG_PERM_MOUNT, cr);
1927 if (ret)
1928 return (ret);
1929 }
1930 #endif
1931 /*
1932 * We purge the parent filesystem's vfsp as the parent filesystem
1933 * and all of its snapshots have their vnode's v_vfsp set to the
1934 * parent's filesystem's vfsp. Note, 'z_parent' is self
1935 * referential for non-snapshots.
1936 */
1937 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1938
1939 /*
1940 * Unmount any snapshots mounted under .zfs before unmounting the
1941 * dataset itself.
1942 */
1943 if (zfsvfs->z_ctldir != NULL &&
1944 (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
1945 return (ret);
1946 }
1947
1948 #if 0
1949 if (!(fflag & MS_FORCE)) {
1950 /*
1951 * Check the number of active vnodes in the file system.
1952 * Our count is maintained in the vfs structure, but the
1953 * number is off by 1 to indicate a hold on the vfs
1954 * structure itself.
1955 *
1956 * The '.zfs' directory maintains a reference of its
1957 * own, and any active references underneath are
1958 * reflected in the vnode count.
1959 */
1960 if (zfsvfs->z_ctldir == NULL) {
1961 if (vfsp->vfs_count > 1){
1962 return (EBUSY);
1963 }
1964 } else {
1965 if (vfsp->vfs_count > 2 ||
1966 zfsvfs->z_ctldir->v_count > 1) {
1967 return (EBUSY);
1968 }
1969 }
1970 }
1971 #endif
1972 ret = vflush(vfsp, NULL, (ISSET(fflag, MS_FORCE)? FORCECLOSE : 0));
1973 if (ret != 0)
1974 return ret;
1975 vfsp->vfs_flag |= VFS_UNMOUNTED;
1976
1977 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1978 os = zfsvfs->z_os;
1979
1980 /*
1981 * z_os will be NULL if there was an error in
1982 * attempting to reopen zfsvfs.
1983 */
1984 if (os != NULL) {
1985 /*
1986 * Unset the objset user_ptr.
1987 */
1988 mutex_enter(&os->os_user_ptr_lock);
1989 dmu_objset_set_user(os, NULL);
1990 mutex_exit(&os->os_user_ptr_lock);
1991
1992 /*
1993 * Finally release the objset
1994 */
1995 dmu_objset_disown(os, zfsvfs);
1996 }
1997
1998 /*
1999 * We can now safely destroy the '.zfs' directory node.
2000 */
2001 if (zfsvfs->z_ctldir != NULL)
2002 zfsctl_destroy(zfsvfs);
2003
2004 return (0);
2005 }
2006
2007 static int
zfs_vget(vfs_t * vfsp,ino_t ino,vnode_t ** vpp)2008 zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp)
2009 {
2010 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2011 znode_t *zp;
2012 int err;
2013
2014 dprintf("zfs_vget called\n");
2015 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
2016
2017 ZFS_ENTER(zfsvfs);
2018 err = zfs_zget(zfsvfs, ino, &zp);
2019 if (err == 0 && zp->z_unlinked) {
2020 VN_RELE(ZTOV(zp));
2021 err = EINVAL;
2022 }
2023 if (err != 0)
2024 *vpp = NULL;
2025 else {
2026 *vpp = ZTOV(zp);
2027 /* XXX NetBSD how to get flags for vn_lock ? */
2028 vn_lock(*vpp, 0);
2029 }
2030 ZFS_EXIT(zfsvfs);
2031 return (err);
2032 }
2033
2034 static int
zfs_fhtovp(vfs_t * vfsp,fid_t * fidp,vnode_t ** vpp)2035 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
2036 {
2037 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2038 znode_t *zp;
2039 uint64_t object = 0;
2040 uint64_t fid_gen = 0;
2041 uint64_t gen_mask;
2042 uint64_t zp_gen;
2043 int i, err;
2044
2045 *vpp = NULL;
2046
2047 dprintf("zfs_fhtovp called\n");
2048 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
2049
2050 ZFS_ENTER(zfsvfs);
2051
2052 if (fidp->fid_len == LONG_FID_LEN) {
2053 zfid_long_t *zlfid = (zfid_long_t *)fidp;
2054 uint64_t objsetid = 0;
2055 uint64_t setgen = 0;
2056
2057 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
2058 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
2059
2060 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
2061 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
2062
2063 ZFS_EXIT(zfsvfs);
2064
2065 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
2066 if (err)
2067 return (EINVAL);
2068 ZFS_ENTER(zfsvfs);
2069 }
2070
2071 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
2072 zfid_short_t *zfid = (zfid_short_t *)fidp;
2073
2074 for (i = 0; i < sizeof (zfid->zf_object); i++)
2075 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
2076
2077 for (i = 0; i < sizeof (zfid->zf_gen); i++)
2078 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
2079 } else {
2080 ZFS_EXIT(zfsvfs);
2081 return (EINVAL);
2082 }
2083
2084 /* A zero fid_gen means we are in the .zfs control directories */
2085 if (fid_gen == 0 &&
2086 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
2087 *vpp = zfsvfs->z_ctldir;
2088 ASSERT(*vpp != NULL);
2089 if (object == ZFSCTL_INO_SNAPDIR) {
2090 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
2091 0, NULL, NULL, NULL, NULL, NULL) == 0);
2092 } else {
2093 VN_HOLD(*vpp);
2094 }
2095 ZFS_EXIT(zfsvfs);
2096 /* XXX: LK_RETRY? */
2097 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
2098 return (0);
2099 }
2100
2101 gen_mask = -1ULL >> (64 - 8 * i);
2102
2103 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
2104 if (err = zfs_zget(zfsvfs, object, &zp)) {
2105 ZFS_EXIT(zfsvfs);
2106 return (err);
2107 }
2108 zp_gen = zp->z_phys->zp_gen & gen_mask;
2109 if (zp_gen == 0)
2110 zp_gen = 1;
2111 if (zp->z_unlinked || zp_gen != fid_gen) {
2112 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
2113 VN_RELE(ZTOV(zp));
2114 ZFS_EXIT(zfsvfs);
2115 return (EINVAL);
2116 }
2117
2118 *vpp = ZTOV(zp);
2119 /* XXX: LK_RETRY? */
2120 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
2121 ZFS_EXIT(zfsvfs);
2122 return (0);
2123 }
2124
2125 /*
2126 * Block out VOPs and close zfsvfs_t::z_os
2127 *
2128 * Note, if successful, then we return with the 'z_teardown_lock' and
2129 * 'z_teardown_inactive_lock' write held.
2130 */
2131 int
zfs_suspend_fs(zfsvfs_t * zfsvfs)2132 zfs_suspend_fs(zfsvfs_t *zfsvfs)
2133 {
2134 int error;
2135
2136 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
2137 return (error);
2138 dmu_objset_disown(zfsvfs->z_os, zfsvfs);
2139
2140 return (0);
2141 }
2142
/*
 * Reopen zfsvfs_t::z_os and release VOPs.
 *
 * Must be called with 'z_teardown_lock' and 'z_teardown_inactive_lock'
 * held for writing (asserted below); both are released before returning.
 *
 * zfsvfs: filesystem to resume.
 * osname: name of the objset to take ownership of again.
 *
 * Returns 0 on success.  If the objset cannot be owned, z_os is set to
 * NULL, a forced unmount of the filesystem is attempted and the error
 * from dmu_objset_own() is returned.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
{
	int err;

	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));

	err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
	    &zfsvfs->z_os);
	if (err) {
		/* Record that there is no objset any more. */
		zfsvfs->z_os = NULL;
	} else {
		znode_t *zp;

		/* Setup in "not mounting" mode must not fail here. */
		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);

		/*
		 * Attempt to re-establish all the active znodes with
		 * their dbufs.  If a zfs_rezget() fails, then we'll let
		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
		 * when they try to use their znode.
		 */
		mutex_enter(&zfsvfs->z_znodes_lock);
		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
			(void) zfs_rezget(zp);
		}
		mutex_exit(&zfsvfs->z_znodes_lock);

	}

	/* release the VOPs */
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);

	if (err) {
		/*
		 * Since we couldn't reopen zfsvfs::z_os, force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curlwp);
	}
	return (err);
}
2192
2193 static void
zfs_freevfs(vfs_t * vfsp)2194 zfs_freevfs(vfs_t *vfsp)
2195 {
2196 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2197
2198 /*
2199 * If this is a snapshot, we have an extra VFS_HOLD on our parent
2200 * from zfs_mount(). Release it here.
2201 */
2202 if (zfsvfs->z_issnap)
2203 VFS_RELE(zfsvfs->z_parent->z_vfs);
2204
2205 zfsvfs_free(zfsvfs);
2206
2207 atomic_add_32(&zfs_active_fs_count, -1);
2208 }
2209
2210 /*
2211 * VFS_INIT() initialization. Note that there is no VFS_FINI(),
2212 * so we can't safely do any non-idempotent initialization here.
2213 * Leave that to zfs_init() and zfs_fini(), which are called
2214 * from the module's _init() and _fini() entry points.
2215 */
2216 /*ARGSUSED*/
2217 int
zfs_vfsinit(int fstype,char * name)2218 zfs_vfsinit(int fstype, char *name)
2219 {
2220 int error;
2221
2222 zfsfstype = fstype;
2223
2224 /*
2225 * Setup vfsops and vnodeops tables.
2226 */
2227 error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
2228
2229 error = zfs_create_op_tables();
2230 if (error) {
2231 zfs_remove_op_tables();
2232 cmn_err(CE_WARN, "zfs: bad vnode ops template");
2233 vfs_freevfsops_by_type(zfsfstype);
2234 return (error);
2235 }
2236
2237 mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
2238 mutex_init(&zfs_debug_mtx, NULL, MUTEX_DEFAULT, NULL);
2239
2240 /*
2241 * Unique major number for all zfs mounts.
2242 * If we run out of 32-bit minors, we'll getudev() another major.
2243 */
2244 zfs_major = ddi_name_to_major(ZFS_DRIVER);
2245 zfs_minor = ZFS_MIN_MINOR;
2246
2247 return (0);
2248 }
2249
2250 int
zfs_vfsfini(void)2251 zfs_vfsfini(void)
2252 {
2253 int err;
2254
2255 err = vfs_detach(&zfs_vfsops_template);
2256 if (err != 0)
2257 return err;
2258
2259 mutex_destroy(&zfs_debug_mtx);
2260 mutex_destroy(&zfs_dev_mtx);
2261
2262 return 0;
2263 }
2264
2265 void
zfs_init(void)2266 zfs_init(void)
2267 {
2268 /*
2269 * Initialize .zfs directory structures
2270 */
2271 zfsctl_init();
2272
2273 /*
2274 * Initialize znode cache, vnode ops, etc...
2275 */
2276 zfs_znode_init();
2277
2278 dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
2279 }
2280
/*
 * Counterpart of zfs_init(): shut down the .zfs control directory
 * support and the znode layer, in the same order they were brought up.
 */
void
zfs_fini(void)
{
	zfsctl_fini();		/* .zfs directory structures */
	zfs_znode_fini();	/* znode layer */
}
2287
2288 int
zfs_busy(void)2289 zfs_busy(void)
2290 {
2291 return (zfs_active_fs_count != 0);
2292 }
2293
2294 int
zfs_set_version(zfsvfs_t * zfsvfs,uint64_t newvers)2295 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2296 {
2297 int error;
2298 objset_t *os = zfsvfs->z_os;
2299 dmu_tx_t *tx;
2300
2301 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2302 return (EINVAL);
2303
2304 if (newvers < zfsvfs->z_version)
2305 return (EINVAL);
2306
2307 tx = dmu_tx_create(os);
2308 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2309 error = dmu_tx_assign(tx, TXG_WAIT);
2310 if (error) {
2311 dmu_tx_abort(tx);
2312 return (error);
2313 }
2314 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2315 8, 1, &newvers, tx);
2316
2317 if (error) {
2318 dmu_tx_commit(tx);
2319 return (error);
2320 }
2321
2322 spa_history_internal_log(LOG_DS_UPGRADE,
2323 dmu_objset_spa(os), tx, CRED(),
2324 "oldver=%llu newver=%llu dataset = %llu",
2325 zfsvfs->z_version, newvers, dmu_objset_id(os));
2326
2327 dmu_tx_commit(tx);
2328
2329 zfsvfs->z_version = newvers;
2330
2331 if (zfsvfs->z_version >= ZPL_VERSION_FUID)
2332 zfs_set_fuid_feature(zfsvfs);
2333
2334 return (0);
2335 }
2336
2337 /*
2338 * Read a property stored within the master node.
2339 */
2340 int
zfs_get_zplprop(objset_t * os,zfs_prop_t prop,uint64_t * value)2341 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2342 {
2343 const char *pname;
2344 int error = ENOENT;
2345
2346 /*
2347 * Look up the file system's value for the property. For the
2348 * version property, we look up a slightly different string.
2349 */
2350 if (prop == ZFS_PROP_VERSION)
2351 pname = ZPL_VERSION_STR;
2352 else
2353 pname = zfs_prop_to_name(prop);
2354
2355 if (os != NULL)
2356 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2357
2358 if (error == ENOENT) {
2359 /* No value set, use the default value */
2360 switch (prop) {
2361 case ZFS_PROP_VERSION:
2362 *value = ZPL_VERSION;
2363 break;
2364 case ZFS_PROP_NORMALIZE:
2365 case ZFS_PROP_UTF8ONLY:
2366 *value = 0;
2367 break;
2368 case ZFS_PROP_CASE:
2369 *value = ZFS_CASE_SENSITIVE;
2370 break;
2371 default:
2372 return (error);
2373 }
2374 error = 0;
2375 }
2376 return (error);
2377 }
2378
2379 static int
zfs_start(vfs_t * vfsp,int flags)2380 zfs_start(vfs_t *vfsp, int flags)
2381 {
2382
2383 return (0);
2384 }
2385
2386
#ifdef TODO
/*
 * File system definition and module linkage, compiled only when TODO
 * is defined.
 * NOTE(review): these look like the OpenSolaris vfsdef/modlfs
 * registration structures that have not been wired up in this port;
 * confirm before enabling.
 */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	MNTTYPE_ZFS,
	zfs_vfsinit,
	VSW_HASPROTO | VSW_CANRWRO | VSW_CANREMOUNT | VSW_VOLATILEDEV |
	    VSW_STATS | VSW_XID,
	&zfs_mntopts
};

struct modlfs zfs_modlfs = {
	&mod_fsops,
	"ZFS filesystem version " SPA_VERSION_STRING,
	&vfw
};
#endif
2401