1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/param.h>
29 #include <sys/errno.h>
30 #include <sys/vfs.h>
31 #include <sys/vnode.h>
32 #include <sys/uio.h>
33 #include <sys/pathname.h>
34 #include <sys/kmem.h>
35 #include <sys/cred.h>
36 #include <sys/statvfs.h>
37 #include <sys/fs/lofs_info.h>
38 #include <sys/fs/lofs_node.h>
39 #include <sys/mount.h>
40 #include <sys/mntent.h>
41 #include <sys/mkdev.h>
42 #include <sys/priv.h>
43 #include <sys/sysmacros.h>
44 #include <sys/systm.h>
45 #include <sys/cmn_err.h>
46 #include <sys/policy.h>
47 #include <sys/tsol/label.h>
48 #include "fs/fs_subr.h"
49 
50 /*
51  * This is the loadable module wrapper.
52  */
53 #include <sys/modctl.h>
54 
/* Mount option table; the full definition appears later in this file. */
static mntopts_t lofs_mntopts;

/* File system initialization routine; defined later in this file. */
static int lofsinit(int, char *);

/*
 * File system definition for lofs.  It is referenced from the modlfs
 * linkage structure below.
 */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"lofs",			/* file system type name */
	lofsinit,		/* initialization routine */
	VSW_HASPROTO|VSW_STATS,
	&lofs_mntopts		/* mount option table */
};
66 
/*
 * Stuff needed to support "zonedevfs" mode.
 *
 * lofs_major is obtained from getudev() in lofsinit().  lo_mount() pairs it
 * with successive lofs_minor values until it finds a dev_t that is not
 * already mounted.  lofs_minor_lock is held while lofs_minor is advanced.
 */
static major_t lofs_major;
static minor_t lofs_minor;
static kmutex_t lofs_minor_lock;
73 
74 /*
75  * LOFS mount options table
76  */
77 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
78 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
79 static char *zonedevfs_cancel[] = { MNTOPT_LOFS_NOZONEDEVFS, NULL };
80 static char *nozonedevfs_cancel[] = { MNTOPT_LOFS_ZONEDEVFS, NULL };
81 static char *sub_cancel[] = { MNTOPT_LOFS_NOSUB, NULL };
82 static char *nosub_cancel[] = { MNTOPT_LOFS_SUB, NULL };
83 
84 static mntopt_t mntopts[] = {
85 /*
86  *	option name		cancel option	default arg	flags
87  *		private data
88  */
89 	{ MNTOPT_XATTR,		xattr_cancel,	NULL,		0,
90 		(void *)0 },
91 	{ MNTOPT_NOXATTR,	noxattr_cancel,	NULL,		0,
92 		(void *)0 },
93 	{ MNTOPT_LOFS_ZONEDEVFS,	zonedevfs_cancel,	NULL,	0,
94 		(void *)0 },
95 	{ MNTOPT_LOFS_NOZONEDEVFS,	nozonedevfs_cancel,	NULL,	0,
96 		(void *)0 },
97 	{ MNTOPT_LOFS_SUB,	sub_cancel,	NULL,		0,
98 		(void *)0 },
99 	{ MNTOPT_LOFS_NOSUB,	nosub_cancel,	NULL,		0,
100 		(void *)0 },
101 };
102 
103 static mntopts_t lofs_mntopts = {
104 	sizeof (mntopts) / sizeof (mntopt_t),
105 	mntopts
106 };
107 
/*
 * Module linkage information for the kernel.
 *
 * modlfs ties the file system definition (vfw) to the mod_fsops module
 * operations; modlinkage is the structure passed to mod_install() and
 * mod_info() by _init() and _info().
 */

static struct modlfs modlfs = {
	&mod_fsops, "filesystem for lofs", &vfw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};
119 
120 /*
121  * This is the module initialization routine.
122  */
123 
124 int
125 _init(void)
126 {
127 	int status;
128 
129 	lofs_subrinit();
130 	status = mod_install(&modlinkage);
131 	if (status != 0) {
132 		/*
133 		 * Cleanup previously initialized work.
134 		 */
135 		lofs_subrfini();
136 	}
137 
138 	return (status);
139 }
140 
/*
 * Don't allow the lofs module to be unloaded for now.
 * There is a memory leak if it gets unloaded.
 *
 * Always returns EBUSY.
 */

int
_fini(void)
{
	return (EBUSY);
}
151 
152 int
153 _info(struct modinfo *modinfop)
154 {
155 	return (mod_info(&modlinkage, modinfop));
156 }
157 
158 
static int lofsfstype;	/* fs type index, recorded by lofsinit() */
vfsops_t *lo_vfsops;	/* vfs ops vector, filled in by lofsinit() */
161 
162 /*
163  * lo mount vfsop
164  * Set up mount info record and attach it to vfs struct.
165  */
166 /*ARGSUSED*/
167 static int
168 lo_mount(struct vfs *vfsp,
169 	struct vnode *vp,
170 	struct mounta *uap,
171 	struct cred *cr)
172 {
173 	int error;
174 	struct vnode *srootvp = NULL;	/* the server's root */
175 	struct vnode *realrootvp;
176 	struct loinfo *li;
177 	int is_zonedevfs = 0;
178 	int nodev;
179 
180 	nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL);
181 
182 	if ((error = secpolicy_fs_mount(cr, vp, vfsp)) != 0)
183 		return (EPERM);
184 
185 	/*
186 	 * Loopback devices which get "nodevices" added can be done without
187 	 * "nodevices" set because we cannot import devices into a zone
188 	 * with loopback.  Note that we have all zone privileges when
189 	 * this happens; if not, we'd have gotten "nosuid".
190 	 */
191 	if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
192 		vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY);
193 
194 	/*
195 	 * We must ensure that only the global zone applies the 'zonedevfs'
196 	 * option; we don't want non-global zones to be able to establish
197 	 * lofs mounts using the special dev_t we use to ensure that the
198 	 * contents of a zone's /dev cannot be victim to link(2) or rename(2).
199 	 * See below, where we set all of this up.
200 	 *
201 	 * Since this is more like a privilege check, we use crgetzoneid(cr)
202 	 * instead of getzoneid().
203 	 */
204 	is_zonedevfs = vfs_optionisset(vfsp, MNTOPT_LOFS_ZONEDEVFS, NULL);
205 	if (crgetzoneid(cr) != GLOBAL_ZONEID && is_zonedevfs)
206 		return (EPERM);
207 
208 	mutex_enter(&vp->v_lock);
209 	if (!(uap->flags & MS_OVERLAY) &&
210 	    (vp->v_count != 1 || (vp->v_flag & VROOT))) {
211 		mutex_exit(&vp->v_lock);
212 		return (EBUSY);
213 	}
214 	mutex_exit(&vp->v_lock);
215 
216 	/*
217 	 * Find real root, and make vfs point to real vfs
218 	 */
219 	if (error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ?
220 		UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP,
221 	    &realrootvp))
222 		return (error);
223 
224 	/*
225 	 * Enforce MAC policy if needed.
226 	 *
227 	 * Loopback mounts must not allow writing up. The dominance test
228 	 * is intended to prevent a global zone caller from accidentally
229 	 * creating write-up conditions between two labeled zones.
230 	 * Local zones can't violate MAC on their own without help from
231 	 * the global zone because they can't name a pathname that
232 	 * they don't already have.
233 	 *
234 	 * The special case check for the NET_MAC_AWARE process flag is
235 	 * to support the case of the automounter in the global zone. We
236 	 * permit automounting of local zone directories such as home
237 	 * directories, into the global zone as required by setlabel,
238 	 * zonecopy, and saving of desktop sessions. Such mounts are
239 	 * trusted not to expose the contents of one zone's directories
240 	 * to another by leaking them through the global zone.
241 	 */
242 	if (is_system_labeled() && crgetzoneid(cr) == GLOBAL_ZONEID) {
243 		char	specname[MAXPATHLEN];
244 		zone_t	*from_zptr;
245 		zone_t	*to_zptr;
246 
247 		if (vnodetopath(NULL, realrootvp, specname,
248 		    sizeof (specname), CRED()) != 0)
249 			return (EACCES);
250 
251 		from_zptr = zone_find_by_path(specname);
252 		to_zptr = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
253 
254 		/*
255 		 * Special case for zone devfs: the zone for /dev will
256 		 * incorrectly appear as the global zone since it's not
257 		 * under the zone rootpath.  So for zone devfs check allow
258 		 * read-write mounts.
259 		 *
260 		 * Second special case for scratch zones used for Live Upgrade:
261 		 * this is used to mount the zone's root from /root to /a in
262 		 * the scratch zone.  As with the other special case, this
263 		 * appears to be outside of the zone because it's not under
264 		 * the zone rootpath, which is $ZONEPATH/lu in the scratch
265 		 * zone case.
266 		 */
267 
268 		if (from_zptr != to_zptr && !is_zonedevfs &&
269 		    !(to_zptr->zone_flags & ZF_IS_SCRATCH)) {
270 			/*
271 			 * We know at this point that the labels aren't equal
272 			 * because the zone pointers aren't equal, and zones
273 			 * can't share a label.
274 			 *
275 			 * If the source is the global zone then making
276 			 * it available to a local zone must be done in
277 			 * read-only mode as the label will become admin_low.
278 			 *
279 			 * If it is a mount between local zones then if
280 			 * the current process is in the global zone and has
281 			 * the NET_MAC_AWARE flag, then regular read-write
282 			 * access is allowed.  If it's in some other zone, but
283 			 * the label on the mount point dominates the original
284 			 * source, then allow the mount as read-only
285 			 * ("read-down").
286 			 */
287 			if (from_zptr->zone_id == GLOBAL_ZONEID) {
288 				/* make the mount read-only */
289 				vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
290 			} else { /* cross-zone mount */
291 				if (to_zptr->zone_id == GLOBAL_ZONEID &&
292 				    /* LINTED: no consequent */
293 				    getpflags(NET_MAC_AWARE, cr) != 0) {
294 					/* Allow the mount as read-write */
295 				} else if (bldominates(
296 				    label2bslabel(to_zptr->zone_slabel),
297 				    label2bslabel(from_zptr->zone_slabel))) {
298 					/* make the mount read-only */
299 					vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
300 				} else {
301 					zone_rele(to_zptr);
302 					zone_rele(from_zptr);
303 					return (EACCES);
304 				}
305 			}
306 		}
307 		zone_rele(to_zptr);
308 		zone_rele(from_zptr);
309 	}
310 
311 	/*
312 	 * realrootvp may be an AUTOFS node, in which case we
313 	 * perform a VOP_ACCESS() to trigger the mount of the
314 	 * intended filesystem, so we loopback mount the intended
315 	 * filesystem instead of the AUTOFS filesystem.
316 	 */
317 	(void) VOP_ACCESS(realrootvp, 0, 0, cr);
318 
319 	/*
320 	 * We're interested in the top most filesystem.
321 	 * This is specially important when uap->spec is a trigger
322 	 * AUTOFS node, since we're really interested in mounting the
323 	 * filesystem AUTOFS mounted as result of the VOP_ACCESS()
324 	 * call not the AUTOFS node itself.
325 	 */
326 	if (vn_mountedvfs(realrootvp) != NULL) {
327 		if (error = traverse(&realrootvp)) {
328 			VN_RELE(realrootvp);
329 			return (error);
330 		}
331 	}
332 
333 	/*
334 	 * Allocate a vfs info struct and attach it
335 	 */
336 	li = kmem_zalloc(sizeof (struct loinfo), KM_SLEEP);
337 	li->li_realvfs = realrootvp->v_vfsp;
338 	li->li_mountvfs = vfsp;
339 
340 	/*
341 	 * Set mount flags to be inherited by loopback vfs's
342 	 */
343 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
344 		li->li_mflag |= VFS_RDONLY;
345 	}
346 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
347 		li->li_mflag |= (VFS_NOSETUID|VFS_NODEVICES);
348 	}
349 	if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
350 		li->li_mflag |= VFS_NODEVICES;
351 	}
352 	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
353 		li->li_mflag |= VFS_NOSETUID;
354 	}
355 	/*
356 	 * Permissive flags are added to the "deny" bitmap.
357 	 */
358 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
359 		li->li_dflag |= VFS_XATTR;
360 	}
361 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
362 		li->li_dflag |= VFS_NBMAND;
363 	}
364 
365 	/*
366 	 * Propagate inheritable mount flags from the real vfs.
367 	 */
368 	if ((li->li_realvfs->vfs_flag & VFS_RDONLY) &&
369 	    !vfs_optionisset(vfsp, MNTOPT_RO, NULL))
370 		vfs_setmntopt(vfsp, MNTOPT_RO, NULL,
371 		    VFS_NODISPLAY);
372 	if ((li->li_realvfs->vfs_flag & VFS_NOSETUID) &&
373 	    !vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
374 		vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL,
375 		    VFS_NODISPLAY);
376 	if ((li->li_realvfs->vfs_flag & VFS_NODEVICES) &&
377 	    !vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
378 		vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL,
379 		    VFS_NODISPLAY);
380 	/*
381 	 * Permissive flags such as VFS_XATTR, as opposed to restrictive flags
382 	 * such as VFS_RDONLY, are handled differently.  An explicit
383 	 * MNTOPT_NOXATTR should override the underlying filesystem's VFS_XATTR.
384 	 */
385 	if ((li->li_realvfs->vfs_flag & VFS_XATTR) &&
386 	    !vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL) &&
387 	    !vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
388 		vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL,
389 		    VFS_NODISPLAY);
390 	if ((li->li_realvfs->vfs_flag & VFS_NBMAND) &&
391 	    !vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL) &&
392 	    !vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
393 		vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL,
394 		    VFS_NODISPLAY);
395 
396 	li->li_refct = 0;
397 	vfsp->vfs_data = (caddr_t)li;
398 	vfsp->vfs_bcount = 0;
399 	vfsp->vfs_fstype = lofsfstype;
400 	vfsp->vfs_bsize = li->li_realvfs->vfs_bsize;
401 
402 	/*
403 	 * Test to see if we need to be in "zone /dev" mode.  In zonedevfs
404 	 * mode, we pull a nasty trick; we make sure that the lofs dev_t does
405 	 * *not* reflect the underlying device, so that no renames or links
406 	 * can occur to or from the /dev hierarchy.
407 	 */
408 	if (is_zonedevfs) {
409 		dev_t dev;
410 
411 		mutex_enter(&lofs_minor_lock);
412 		do {
413 			lofs_minor = (lofs_minor + 1) & MAXMIN32;
414 			dev = makedevice(lofs_major, lofs_minor);
415 		} while (vfs_devismounted(dev));
416 		mutex_exit(&lofs_minor_lock);
417 
418 		vfsp->vfs_dev = dev;
419 		vfs_make_fsid(&vfsp->vfs_fsid, dev, lofsfstype);
420 
421 		li->li_flag |= LO_ZONEDEVFS;
422 	} else {
423 		vfsp->vfs_dev = li->li_realvfs->vfs_dev;
424 		vfsp->vfs_fsid.val[0] = li->li_realvfs->vfs_fsid.val[0];
425 		vfsp->vfs_fsid.val[1] = li->li_realvfs->vfs_fsid.val[1];
426 	}
427 
428 	if (vfs_optionisset(vfsp, MNTOPT_LOFS_NOSUB, NULL)) {
429 		li->li_flag |= LO_NOSUB;
430 	}
431 
432 	/*
433 	 * Setup the hashtable. If the root of this mount isn't a directory,
434 	 * there's no point in allocating a large hashtable. A table with one
435 	 * bucket is sufficient.
436 	 */
437 	if (realrootvp->v_type != VDIR)
438 		lsetup(li, 1);
439 	else
440 		lsetup(li, 0);
441 
442 	/*
443 	 * Make the root vnode
444 	 */
445 	srootvp = makelonode(realrootvp, li, 0);
446 	srootvp->v_flag |= VROOT;
447 	li->li_rootvp = srootvp;
448 
449 #ifdef LODEBUG
450 	lo_dprint(4, "lo_mount: vfs %p realvfs %p root %p realroot %p li %p\n",
451 	    vfsp, li->li_realvfs, srootvp, realrootvp, li);
452 #endif
453 	return (0);
454 }
455 
456 /*
457  * Undo loopback mount
458  */
459 static int
460 lo_unmount(struct vfs *vfsp, int flag, struct cred *cr)
461 {
462 	struct loinfo *li;
463 
464 	if (secpolicy_fs_unmount(cr, vfsp) != 0)
465 		return (EPERM);
466 
467 	/*
468 	 * Forced unmount is not supported by this file system
469 	 * and thus, ENOTSUP, is being returned.
470 	 */
471 	if (flag & MS_FORCE)
472 		return (ENOTSUP);
473 
474 	li = vtoli(vfsp);
475 #ifdef LODEBUG
476 	lo_dprint(4, "lo_unmount(%p) li %p\n", vfsp, li);
477 #endif
478 	if (li->li_refct != 1 || li->li_rootvp->v_count != 1) {
479 #ifdef LODEBUG
480 		lo_dprint(4, "refct %d v_ct %d\n", li->li_refct,
481 		    li->li_rootvp->v_count);
482 #endif
483 		return (EBUSY);
484 	}
485 	VN_RELE(li->li_rootvp);
486 	return (0);
487 }
488 
489 /*
490  * Find root of lofs mount.
491  */
492 static int
493 lo_root(struct vfs *vfsp, struct vnode **vpp)
494 {
495 	*vpp = vtoli(vfsp)->li_rootvp;
496 #ifdef LODEBUG
497 	lo_dprint(4, "lo_root(0x%p) = %p\n", vfsp, *vpp);
498 #endif
499 	/*
500 	 * If the root of the filesystem is a special file, return the specvp
501 	 * version of the vnode. We don't save the specvp vnode in our
502 	 * hashtable since that's exclusively for lnodes.
503 	 */
504 	if (IS_DEVVP(*vpp)) {
505 		struct vnode *svp;
506 
507 		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, kcred);
508 		if (svp == NULL)
509 			return (ENOSYS);
510 		*vpp = svp;
511 	} else {
512 		VN_HOLD(*vpp);
513 	}
514 
515 	return (0);
516 }
517 
518 /*
519  * Get file system statistics.
520  */
521 static int
522 lo_statvfs(register struct vfs *vfsp, struct statvfs64 *sbp)
523 {
524 	vnode_t *realrootvp;
525 
526 #ifdef LODEBUG
527 	lo_dprint(4, "lostatvfs %p\n", vfsp);
528 #endif
529 	/*
530 	 * Using realrootvp->v_vfsp (instead of the realvfsp that was
531 	 * cached) is necessary to make lofs work woth forced UFS unmounts.
532 	 * In the case of a forced unmount, UFS stores a set of dummy vfsops
533 	 * in all the (i)vnodes in the filesystem. The dummy ops simply
534 	 * returns back EIO.
535 	 */
536 	(void) lo_realvfs(vfsp, &realrootvp);
537 	if (realrootvp != NULL)
538 		return (VFS_STATVFS(realrootvp->v_vfsp, sbp));
539 	else
540 		return (EIO);
541 }
542 
/*
 * LOFS doesn't have any data or metadata to flush, pending I/O on the
 * underlying filesystem will be flushed when such filesystem is synched.
 *
 * The arguments are used only for the LODEBUG trace; the routine always
 * returns 0.
 */
/* ARGSUSED */
static int
lo_sync(struct vfs *vfsp,
	short flag,
	struct cred *cr)
{
#ifdef LODEBUG
	lo_dprint(4, "lo_sync: %p\n", vfsp);
#endif
	return (0);
}
558 
559 /*
560  * Obtain the vnode from the underlying filesystem.
561  */
562 static int
563 lo_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
564 {
565 	vnode_t *realrootvp;
566 
567 #ifdef LODEBUG
568 	lo_dprint(4, "lo_vget: %p\n", vfsp);
569 #endif
570 	(void) lo_realvfs(vfsp, &realrootvp);
571 	if (realrootvp != NULL)
572 		return (VFS_VGET(realrootvp->v_vfsp, vpp, fidp));
573 	else
574 		return (EIO);
575 }
576 
577 /*
578  * Free mount-specific data.
579  */
580 static void
581 lo_freevfs(struct vfs *vfsp)
582 {
583 	struct loinfo *li = vtoli(vfsp);
584 
585 	ldestroy(li);
586 	kmem_free(li, sizeof (struct loinfo));
587 }
588 
589 static int
590 lofsinit(int fstyp, char *name)
591 {
592 	static const fs_operation_def_t lo_vfsops_template[] = {
593 		VFSNAME_MOUNT, lo_mount,
594 		VFSNAME_UNMOUNT, lo_unmount,
595 		VFSNAME_ROOT, lo_root,
596 		VFSNAME_STATVFS, lo_statvfs,
597 		VFSNAME_SYNC, (fs_generic_func_p) lo_sync,
598 		VFSNAME_VGET, lo_vget,
599 		VFSNAME_FREEVFS, (fs_generic_func_p) lo_freevfs,
600 		NULL, NULL
601 	};
602 	int error;
603 
604 	error = vfs_setfsops(fstyp, lo_vfsops_template, &lo_vfsops);
605 	if (error != 0) {
606 		cmn_err(CE_WARN, "lofsinit: bad vfs ops template");
607 		return (error);
608 	}
609 
610 	error = vn_make_ops(name, lo_vnodeops_template, &lo_vnodeops);
611 	if (error != 0) {
612 		(void) vfs_freevfsops_by_type(fstyp);
613 		cmn_err(CE_WARN, "lofsinit: bad vnode ops template");
614 		return (error);
615 	}
616 
617 	lofsfstype = fstyp;
618 
619 	if ((lofs_major = getudev()) == (major_t)-1) {
620 		(void) vfs_freevfsops_by_type(fstyp);
621 		cmn_err(CE_WARN, "lofsinit: Can't get unique device number.");
622 		return (ENXIO);
623 	}
624 
625 	lofs_minor = 0;
626 	mutex_init(&lofs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
627 
628 	return (0);
629 }
630