1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * vnode ops for the devfs
31  *
32  * For leaf vnode special files (VCHR|VBLK) specfs will always see the VOP
33  * first because dv_find always performs leaf vnode substitution, returning
34  * a specfs vnode with an s_realvp pointing to the devfs leaf vnode. This
35  * means that the only leaf special file VOP operations that devfs will see
36  * after VOP_LOOKUP are the ones that specfs forwards.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/param.h>
41 #include <sys/t_lock.h>
42 #include <sys/systm.h>
43 #include <sys/sysmacros.h>
44 #include <sys/user.h>
45 #include <sys/time.h>
46 #include <sys/vfs.h>
47 #include <sys/vnode.h>
48 #include <sys/file.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/kmem.h>
52 #include <sys/uio.h>
53 #include <sys/errno.h>
54 #include <sys/stat.h>
55 #include <sys/cred.h>
56 #include <sys/dirent.h>
57 #include <sys/pathname.h>
58 #include <sys/cmn_err.h>
59 #include <sys/debug.h>
60 #include <sys/policy.h>
61 #include <sys/modctl.h>
62 
63 #include <fs/fs_subr.h>
64 #include <sys/fs/dv_node.h>
65 
66 extern struct vattr	dv_vattr_dir, dv_vattr_file;
67 extern dev_t rconsdev;
68 
69 /*
70  * Open of devices (leaf nodes) is handled by specfs.
71  * There is nothing to do to open a directory
72  */
73 /*ARGSUSED*/
74 static int
75 devfs_open(struct vnode **vpp, int flag, struct cred *cred)
76 {
77 	struct dv_node	*dv = VTODV(*vpp);
78 
79 	dcmn_err2(("devfs_open %s\n", dv->dv_name));
80 	ASSERT((*vpp)->v_type == VDIR);
81 	return (0);
82 }
83 
84 /*
85  * Close of devices (leaf nodes) is handled by specfs.
86  * There is nothing much to do inorder to close a directory.
87  */
88 /*ARGSUSED1*/
89 static int
90 devfs_close(struct vnode *vp, int flag, int count,
91     offset_t offset, struct cred *cred)
92 {
93 	struct dv_node	*dv = VTODV(vp);
94 
95 	dcmn_err2(("devfs_close %s\n", dv->dv_name));
96 	ASSERT(vp->v_type == VDIR);
97 
98 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
99 	cleanshares(vp, ttoproc(curthread)->p_pid);
100 	return (0);
101 }
102 
103 /*
104  * Read of devices (leaf nodes) is handled by specfs.
105  * Read of directories is not supported.
106  */
107 /*ARGSUSED*/
108 static int
109 devfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
110 	struct caller_context *ct)
111 {
112 	dcmn_err2(("devfs_read %s\n", VTODV(vp)->dv_name));
113 	ASSERT(vp->v_type == VDIR);
114 	ASSERT(RW_READ_HELD(&VTODV(vp)->dv_contents));
115 	return (EISDIR);
116 }
117 
118 /*
119  * Write of devices (leaf nodes) is handled by specfs.
120  * Write of directories is not supported.
121  */
122 /*ARGSUSED*/
123 static int
124 devfs_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
125 	struct caller_context *ct)
126 {
127 	dcmn_err2(("devfs_write %s\n", VTODV(vp)->dv_name));
128 	ASSERT(vp->v_type == VDIR);
129 	ASSERT(RW_WRITE_HELD(&VTODV(vp)->dv_contents));
130 	return (EISDIR);
131 }
132 
133 /*
134  * Ioctls to device (leaf nodes) is handled by specfs.
135  * Ioctl to directories is not supported.
136  */
137 /*ARGSUSED*/
138 static int
139 devfs_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
140     struct cred *cred, int *rvalp)
141 {
142 	dcmn_err2(("devfs_ioctl %s\n", VTODV(vp)->dv_name));
143 	ASSERT(vp->v_type == VDIR);
144 
145 	return (ENOTTY);	/* no ioctls supported */
146 }
147 
148 /*
149  * We can be asked directly about the attributes of directories, or
150  * (via sp->s_realvp) about the filesystem attributes of special files.
151  *
152  * For directories, we just believe the attribute store
153  * though we mangle the nodeid, fsid, and rdev to convince userland we
154  * really are a different filesystem.
155  *
156  * For special files, a little more fakery is required.
157  *
158  * If the attribute store is not there (read only root), we believe our
159  * memory based attributes.
160  */
161 static int
162 devfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr)
163 {
164 	struct dv_node	*dv = VTODV(vp);
165 	int		error = 0;
166 	uint_t		mask;
167 
168 	/*
169 	 * Message goes to console only. Otherwise, the message
170 	 * causes devfs_getattr to be invoked again... infinite loop
171 	 */
172 	dcmn_err2(("?devfs_getattr %s\n", dv->dv_name));
173 	ASSERT(dv->dv_attr || dv->dv_attrvp);
174 
175 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
176 		cmn_err(CE_WARN,	/* panic ? */
177 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
178 		return (ENOENT);
179 	}
180 
181 	if (dv->dv_attr) {
182 		/*
183 		 * obtain from the memory version of attribute.
184 		 * preserve mask for those that optimize.
185 		 * devfs specific fields are already merged on creation.
186 		 */
187 		mask = vap->va_mask;
188 		*vap = *dv->dv_attr;
189 		vap->va_mask = mask;
190 	} else {
191 		/* obtain from attribute store and merge */
192 		error = VOP_GETATTR(dv->dv_attrvp, vap, flags, cr);
193 		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
194 		dv_vattr_merge(dv, vap);
195 	}
196 
197 	/*
198 	 * Restrict the permissions of the node fronting the console
199 	 * to 0600 with root as the owner.  This prevents a non-root
200 	 * user from gaining access to a serial terminal (like /dev/term/a)
201 	 * which is in reality serving as the console device (/dev/console).
202 	 */
203 	if (vp->v_rdev == rconsdev) {
204 		mode_t	rconsmask = S_IXUSR|S_IRWXG|S_IRWXO;
205 		vap->va_mode &= (~rconsmask);
206 		vap->va_uid = 0;
207 	}
208 
209 	return (error);
210 }
211 
212 static int devfs_unlocked_access(void *, int, struct cred *);
213 
214 /*ARGSUSED4*/
215 static int
216 devfs_setattr_dir(
217 	struct dv_node *dv,
218 	struct vnode *vp,
219 	struct vattr *vap,
220 	int flags,
221 	struct cred *cr)
222 {
223 	struct vattr	*map;
224 	long int	mask;
225 	int		error = 0;
226 	struct vattr	vattr;
227 
228 	ASSERT(dv->dv_attr || dv->dv_attrvp);
229 
230 	ASSERT(vp->v_type == VDIR);
231 	ASSERT((dv->dv_flags & DV_NO_FSPERM) == 0);
232 
233 	if (vap->va_mask & AT_NOSET)
234 		return (EINVAL);
235 
236 	/* to ensure consistency, single thread setting of attributes */
237 	rw_enter(&dv->dv_contents, RW_WRITER);
238 
239 again:	if (dv->dv_attr) {
240 
241 		error = secpolicy_vnode_setattr(cr, vp, vap, dv->dv_attr,
242 					flags, devfs_unlocked_access, dv);
243 
244 		if (error)
245 			goto out;
246 
247 		/*
248 		 * Apply changes to the memory based attribute. This code
249 		 * is modeled after the tmpfs implementation of memory
250 		 * based vnodes
251 		 */
252 		map = dv->dv_attr;
253 		mask = vap->va_mask;
254 
255 		/* Change file access modes. */
256 		if (mask & AT_MODE) {
257 			map->va_mode &= S_IFMT;
258 			map->va_mode |= vap->va_mode & ~S_IFMT;
259 		}
260 		if (mask & AT_UID)
261 			map->va_uid = vap->va_uid;
262 		if (mask & AT_GID)
263 			map->va_gid = vap->va_gid;
264 		if (mask & AT_ATIME)
265 			map->va_atime = vap->va_atime;
266 		if (mask & AT_MTIME)
267 			map->va_mtime = vap->va_mtime;
268 
269 		if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME))
270 			gethrestime(&map->va_ctime);
271 	} else {
272 		/* use the backing attribute store */
273 		ASSERT(dv->dv_attrvp);
274 
275 		/*
276 		 * See if we are changing something we care about
277 		 * the persistence of - return success if we don't care.
278 		 */
279 		if (vap->va_mask & (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) {
280 			/* Set the attributes */
281 			error = VOP_SETATTR(dv->dv_attrvp,
282 				vap, flags, cr, NULL);
283 			dsysdebug(error,
284 				("vop_setattr %s %d\n", dv->dv_name, error));
285 
286 			/*
287 			 * Some file systems may return EROFS for a setattr
288 			 * on a readonly file system.  In this case we create
289 			 * our own memory based attribute.
290 			 */
291 			if (error == EROFS) {
292 				/*
293 				 * obtain attributes from existing file
294 				 * that we will modify and switch to memory
295 				 * based attribute until attribute store is
296 				 * read/write.
297 				 */
298 				vattr = dv_vattr_dir;
299 				if (VOP_GETATTR(dv->dv_attrvp, &vattr,
300 				    flags, cr) == 0) {
301 					dv->dv_attr = kmem_alloc(
302 					    sizeof (struct vattr), KM_SLEEP);
303 					*dv->dv_attr = vattr;
304 					dv_vattr_merge(dv, dv->dv_attr);
305 					goto again;
306 				}
307 			}
308 		}
309 	}
310 out:
311 	rw_exit(&dv->dv_contents);
312 	return (error);
313 }
314 
315 
316 /*
317  * Compare the uid/gid/mode changes requested for a setattr
318  * operation with the same details of a node's default minor
319  * perm information.  Return 0 if identical.
320  */
321 static int
322 dv_setattr_cmp(struct vattr *map, mperm_t *mp)
323 {
324 	if ((map->va_mode & S_IAMB) != (mp->mp_mode & S_IAMB))
325 		return (1);
326 	if (map->va_uid != mp->mp_uid)
327 		return (1);
328 	if (map->va_gid != mp->mp_gid)
329 		return (1);
330 	return (0);
331 }
332 
333 
334 /*ARGSUSED4*/
335 static int
336 devfs_setattr(
337 	struct vnode *vp,
338 	struct vattr *vap,
339 	int flags,
340 	struct cred *cr,
341 	caller_context_t *ct)
342 {
343 	struct dv_node	*dv = VTODV(vp);
344 	struct dv_node	*ddv;
345 	struct vnode	*dvp;
346 	struct vattr	*map;
347 	long int	mask;
348 	int		error = 0;
349 	struct vattr	*free_vattr = NULL;
350 	struct vattr	*vattrp = NULL;
351 	mperm_t		mp;
352 	int		persist;
353 
354 	/*
355 	 * Message goes to console only. Otherwise, the message
356 	 * causes devfs_getattr to be invoked again... infinite loop
357 	 */
358 	dcmn_err2(("?devfs_setattr %s\n", dv->dv_name));
359 	ASSERT(dv->dv_attr || dv->dv_attrvp);
360 
361 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
362 		cmn_err(CE_WARN,	/* panic ? */
363 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
364 		return (ENOENT);
365 	}
366 
367 	if (vap->va_mask & AT_NOSET)
368 		return (EINVAL);
369 
370 	/*
371 	 * If we are changing something we don't care about
372 	 * the persistence of, return success.
373 	 */
374 	if ((vap->va_mask &
375 	    (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) == 0)
376 		return (0);
377 
378 	/*
379 	 * If driver overrides fs perm, disallow chmod
380 	 * and do not create attribute nodes.
381 	 */
382 	if (dv->dv_flags & DV_NO_FSPERM) {
383 		ASSERT(dv->dv_attr);
384 		if (vap->va_mask & (AT_MODE | AT_UID | AT_GID))
385 			return (EPERM);
386 		if ((vap->va_mask & (AT_ATIME|AT_MTIME)) == 0)
387 			return (0);
388 		rw_enter(&dv->dv_contents, RW_WRITER);
389 		if (vap->va_mask & AT_ATIME)
390 			dv->dv_attr->va_atime = vap->va_atime;
391 		if (vap->va_mask & AT_MTIME)
392 			dv->dv_attr->va_mtime = vap->va_mtime;
393 		rw_exit(&dv->dv_contents);
394 		return (0);
395 	}
396 
397 	/*
398 	 * Directories are always created but device nodes are
399 	 * only used to persist non-default permissions.
400 	 */
401 	if (vp->v_type == VDIR) {
402 		ASSERT(dv->dv_attr || dv->dv_attrvp);
403 		return (devfs_setattr_dir(dv, vp, vap, flags, cr));
404 	}
405 
406 	/*
407 	 * Allocate now before we take any locks
408 	 */
409 	vattrp = kmem_zalloc(sizeof (*vattrp), KM_SLEEP);
410 
411 	/* to ensure consistency, single thread setting of attributes */
412 	rw_enter(&dv->dv_contents, RW_WRITER);
413 
414 	/*
415 	 * We don't need to create an attribute node
416 	 * to persist access or modification times.
417 	 */
418 	persist = (vap->va_mask & (AT_MODE | AT_UID | AT_GID));
419 
420 	/*
421 	 * If persisting something, get the default permissions
422 	 * for this minor to compare against what the attributes
423 	 * are now being set to.  Default ordering is:
424 	 *	- minor_perm match for this minor
425 	 *	- mode supplied by ddi_create_priv_minor_node
426 	 *	- devfs defaults
427 	 */
428 	if (persist) {
429 		if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) != 0) {
430 			mp.mp_uid = dv_vattr_file.va_uid;
431 			mp.mp_gid = dv_vattr_file.va_gid;
432 			mp.mp_mode = dv_vattr_file.va_mode;
433 			if (dv->dv_flags & DV_DFLT_MODE) {
434 				ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
435 				mp.mp_mode &= ~S_IAMB;
436 				mp.mp_mode |= dv->dv_dflt_mode;
437 				dcmn_err5(("%s: setattr priv default 0%o\n",
438 				    dv->dv_name, mp.mp_mode));
439 			} else {
440 				dcmn_err5(("%s: setattr devfs default 0%o\n",
441 				    dv->dv_name, mp.mp_mode));
442 			}
443 		} else {
444 			dcmn_err5(("%s: setattr minor perm default 0%o\n",
445 			    dv->dv_name, mp.mp_mode));
446 		}
447 	}
448 
449 	/*
450 	 * If we don't have a vattr for this node, construct one.
451 	 */
452 	if (dv->dv_attr) {
453 		free_vattr = vattrp;
454 		vattrp = NULL;
455 	} else {
456 		ASSERT(dv->dv_attrvp);
457 		ASSERT(vp->v_type != VDIR);
458 		*vattrp = dv_vattr_file;
459 		error = VOP_GETATTR(dv->dv_attrvp, vattrp, 0, cr);
460 		dsysdebug(error, ("vop_getattr %s %d\n",
461 			dv->dv_name, error));
462 		if (error)
463 			goto out;
464 		dv->dv_attr = vattrp;
465 		dv_vattr_merge(dv, dv->dv_attr);
466 		vattrp = NULL;
467 	}
468 
469 	error = secpolicy_vnode_setattr(cr, vp, vap, dv->dv_attr,
470 					flags, devfs_unlocked_access, dv);
471 	if (error) {
472 		dsysdebug(error, ("devfs_setattr %s secpolicy error %d\n",
473 			dv->dv_name, error));
474 		goto out;
475 	}
476 
477 	/*
478 	 * Apply changes to the memory based attribute. This code
479 	 * is modeled after the tmpfs implementation of memory
480 	 * based vnodes
481 	 */
482 	map = dv->dv_attr;
483 	mask = vap->va_mask;
484 
485 	/* Change file access modes. */
486 	if (mask & AT_MODE) {
487 		map->va_mode &= S_IFMT;
488 		map->va_mode |= vap->va_mode & ~S_IFMT;
489 	}
490 	if (mask & AT_UID)
491 		map->va_uid = vap->va_uid;
492 	if (mask & AT_GID)
493 		map->va_gid = vap->va_gid;
494 	if (mask & AT_ATIME)
495 		map->va_atime = vap->va_atime;
496 	if (mask & AT_MTIME)
497 		map->va_mtime = vap->va_mtime;
498 
499 	if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME)) {
500 		gethrestime(&map->va_ctime);
501 	}
502 
503 	/*
504 	 * A setattr to defaults means we no longer need the
505 	 * shadow node as a persistent store, unless there
506 	 * are ACLs.  Otherwise create a shadow node if one
507 	 * doesn't exist yet.
508 	 */
509 	if (persist) {
510 		if ((dv_setattr_cmp(map, &mp) == 0) &&
511 		    ((dv->dv_flags & DV_ACL) == 0)) {
512 
513 			if (dv->dv_attrvp) {
514 				ddv = dv->dv_dotdot;
515 				ASSERT(ddv->dv_attrvp);
516 				error = VOP_REMOVE(ddv->dv_attrvp,
517 				    dv->dv_name, cr);
518 				dsysdebug(error,
519 				    ("vop_remove %s %s %d\n",
520 				    ddv->dv_name, dv->dv_name, error));
521 
522 				if (error == EROFS)
523 					error = 0;
524 				VN_RELE(dv->dv_attrvp);
525 				dv->dv_attrvp = NULL;
526 			}
527 			ASSERT(dv->dv_attr);
528 		} else {
529 			if (mask & AT_MODE)
530 				dcmn_err5(("%s persisting mode 0%o\n",
531 					dv->dv_name, vap->va_mode));
532 			if (mask & AT_UID)
533 				dcmn_err5(("%s persisting uid %d\n",
534 					dv->dv_name, vap->va_uid));
535 			if (mask & AT_GID)
536 				dcmn_err5(("%s persisting gid %d\n",
537 					dv->dv_name, vap->va_gid));
538 
539 			if (dv->dv_attrvp == NULL) {
540 				dvp = DVTOV(dv->dv_dotdot);
541 				dv_shadow_node(dvp, dv->dv_name, vp,
542 				    NULL, NULLVP, cr,
543 				    DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
544 			}
545 			if (dv->dv_attrvp) {
546 				error = VOP_SETATTR(dv->dv_attrvp,
547 				    vap, flags, cr, NULL);
548 				dsysdebug(error, ("vop_setattr %s %d\n",
549 				    dv->dv_name, error));
550 			}
551 			/*
552 			 * Some file systems may return EROFS for a setattr
553 			 * on a readonly file system.  In this case save
554 			 * as our own memory based attribute.
555 			 * NOTE: ufs is NOT one of these (see ufs_iupdat).
556 			 */
557 			if (dv->dv_attr && dv->dv_attrvp && error == 0) {
558 				vattrp = dv->dv_attr;
559 				dv->dv_attr = NULL;
560 			} else if (error == EROFS)
561 				error = 0;
562 		}
563 	}
564 
565 out:
566 	rw_exit(&dv->dv_contents);
567 
568 	if (vattrp)
569 		kmem_free(vattrp, sizeof (*vattrp));
570 	if (free_vattr)
571 		kmem_free(free_vattr, sizeof (*free_vattr));
572 	return (error);
573 }
574 
575 static int
576 devfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
577 {
578 	switch (cmd) {
579 	case _PC_ACL_ENABLED:
580 		/*
581 		 * We rely on the underlying filesystem for ACLs,
582 		 * so direct the query for ACL support there.
583 		 * ACL support isn't relative to the file
584 		 * and we can't guarantee that the dv node
585 		 * has an attribute node, so any valid
586 		 * attribute node will suffice.
587 		 */
588 		ASSERT(dvroot);
589 		ASSERT(dvroot->dv_attrvp);
590 		return (VOP_PATHCONF(dvroot->dv_attrvp, cmd, valp, cr));
591 		/*NOTREACHED*/
592 	}
593 
594 	return (fs_pathconf(vp, cmd, valp, cr));
595 }
596 
597 /*
598  * Let avp handle security attributes (acl's).
599  */
600 static int
601 devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
602     struct cred *cr)
603 {
604 	dvnode_t *dv = VTODV(vp);
605 	struct vnode *avp;
606 	int	error;
607 
608 	dcmn_err2(("devfs_getsecattr %s\n", dv->dv_name));
609 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
610 
611 	rw_enter(&dv->dv_contents, RW_READER);
612 
613 	avp = dv->dv_attrvp;
614 
615 	/* fabricate the acl */
616 	if (avp == NULL) {
617 		error = fs_fab_acl(vp, vsap, flags, cr);
618 		rw_exit(&dv->dv_contents);
619 		return (error);
620 	}
621 
622 	error = VOP_GETSECATTR(avp, vsap, flags, cr);
623 	dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
624 	rw_exit(&dv->dv_contents);
625 	return (error);
626 }
627 
628 /*
629  * Set security attributes (acl's)
630  *
631  * Note that the dv_contents lock has already been acquired
632  * by the caller's VOP_RWLOCK.
633  */
634 static int
635 devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
636     struct cred *cr)
637 {
638 	dvnode_t *dv = VTODV(vp);
639 	struct vnode *avp;
640 	int	error;
641 
642 	dcmn_err2(("devfs_setsecattr %s\n", dv->dv_name));
643 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
644 	ASSERT(RW_LOCK_HELD(&dv->dv_contents));
645 
646 	/*
647 	 * Not a supported operation on drivers not providing
648 	 * file system based permissions.
649 	 */
650 	if (dv->dv_flags & DV_NO_FSPERM)
651 		return (ENOTSUP);
652 
653 	/*
654 	 * To complete, the setsecattr requires an underlying attribute node.
655 	 */
656 	if (dv->dv_attrvp == NULL) {
657 		ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
658 		dv_shadow_node(DVTOV(dv->dv_dotdot), dv->dv_name, vp,
659 		    NULL, NULLVP, cr, DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
660 	}
661 
662 	if ((avp = dv->dv_attrvp) == NULL) {
663 		dcmn_err2(("devfs_setsecattr %s: "
664 		    "cannot construct attribute node\n", dv->dv_name));
665 		return (fs_nosys());
666 	}
667 
668 	/*
669 	 * The acl(2) system call issues a VOP_RWLOCK before setting an ACL.
670 	 * Since backing file systems expect the lock to be held before seeing
671 	 * a VOP_SETSECATTR ACL, we need to issue the VOP_RWLOCK to the backing
672 	 * store before forwarding the ACL.
673 	 */
674 	(void) VOP_RWLOCK(avp, V_WRITELOCK_TRUE, NULL);
675 	error = VOP_SETSECATTR(avp, vsap, flags, cr);
676 	dsysdebug(error, ("vop_setsecattr %s %d\n", VTODV(vp)->dv_name, error));
677 	VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
678 
679 	/*
680 	 * Set DV_ACL if we have a non-trivial set of ACLs.  It is not
681 	 * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
682 	 * VOP_GETSECATTR calls.
683 	 */
684 	if (fs_acl_nontrivial(avp, cr))
685 		dv->dv_flags |= DV_ACL;
686 	return (error);
687 }
688 
689 /*
690  * This function is used for secpolicy_setattr().  It must call an
691  * access() like function while it is already holding the
692  * dv_contents lock.  We only care about this when dv_attr != NULL;
693  * so the unlocked access call only concerns itself with that
694  * particular branch of devfs_access().
695  */
696 static int
697 devfs_unlocked_access(void *vdv, int mode, struct cred *cr)
698 {
699 	struct dv_node *dv = vdv;
700 	int shift = 0;
701 	uid_t owner = dv->dv_attr->va_uid;
702 
703 	/* Check access based on owner, group and public permissions. */
704 	if (crgetuid(cr) != owner) {
705 		shift += 3;
706 		if (groupmember(dv->dv_attr->va_gid, cr) == 0)
707 			shift += 3;
708 	}
709 
710 	/* compute missing mode bits */
711 	mode &= ~(dv->dv_attr->va_mode << shift);
712 
713 	if (mode == 0)
714 		return (0);
715 
716 	return (secpolicy_vnode_access(cr, DVTOV(dv), owner, mode));
717 }
718 
719 static int
720 devfs_access(struct vnode *vp, int mode, int flags, struct cred *cr)
721 {
722 	struct dv_node	*dv = VTODV(vp);
723 	int		res;
724 
725 	dcmn_err2(("devfs_access %s\n", dv->dv_name));
726 	ASSERT(dv->dv_attr || dv->dv_attrvp);
727 
728 	/* restrict console access to privileged processes */
729 	if ((vp->v_rdev == rconsdev) && secpolicy_console(cr) != 0) {
730 		return (EACCES);
731 	}
732 
733 	if (dv->dv_attr && ((dv->dv_flags & DV_ACL) == 0)) {
734 		rw_enter(&dv->dv_contents, RW_READER);
735 		if (dv->dv_attr) {
736 			res = devfs_unlocked_access(dv, mode, cr);
737 			rw_exit(&dv->dv_contents);
738 			return (res);
739 		}
740 		rw_exit(&dv->dv_contents);
741 	}
742 	return (VOP_ACCESS(dv->dv_attrvp, mode, flags, cr));
743 }
744 
745 /*
746  * Lookup
747  *
748  * Given the directory vnode and the name of the component, return
749  * the corresponding held vnode for that component.
750  *
751  * Of course in these fictional filesystems, nothing's ever quite
752  * -that- simple.
753  *
754  * devfs name	type		shadow (fs attributes)	type	comments
755  * -------------------------------------------------------------------------
756  * drv[@addr]	VDIR		drv[@addr]		VDIR	nexus driver
757  * drv[@addr]:m	VCHR/VBLK	drv[@addr]:m		VREG	leaf driver
758  * drv[@addr]	VCHR/VBLK	drv[@addr]:.default	VREG	leaf driver
759  * -------------------------------------------------------------------------
760  *
761  * The following names are reserved for the attribute filesystem (which
762  * could easily be another layer on top of this one - we simply need to
763  * hold the vnode of the thing we're looking at)
764  *
765  * attr name	type		shadow (fs attributes)	type	comments
766  * -------------------------------------------------------------------------
767  * drv[@addr]	VDIR		-			-	attribute dir
768  * minorname	VDIR		-			-	minorname
769  * attribute	VREG		-			-	attribute
770  * -------------------------------------------------------------------------
771  *
772  * Examples:
773  *
774  *	devfs:/devices/.../mm@0:zero		VCHR
775  *	shadow:/.devices/.../mm@0:zero		VREG, fs attrs
776  *	devfs:/devices/.../mm@0:/zero/attr	VREG, driver attribute
777  *
778  *	devfs:/devices/.../sd@0,0:a		VBLK
779  *	shadow:/.devices/.../sd@0,0:a		VREG, fs attrs
780  *	devfs:/devices/.../sd@0,0:/a/.type	VREG, "ddi_block:chan"
781  *
782  *	devfs:/devices/.../mm@0			VCHR
783  *	shadow:/.devices/.../mm@0:.default	VREG, fs attrs
784  *	devfs:/devices/.../mm@0:/.default/attr	VREG, driver attribute
785  *	devfs:/devices/.../mm@0:/.default/.type	VREG, "ddi_pseudo"
786  *
787  *	devfs:/devices/.../obio			VDIR
788  *	shadow:/devices/.../obio		VDIR, needed for fs attrs.
789  *	devfs:/devices/.../obio:/.default/attr	VDIR, driver attribute
790  *
 * We also need to be able to deal with "old" devices that have gone away,
792  * though I think that provided we return them with readdir, they can
793  * be removed (i.e. they don't have to respond to lookup, though it might
794  * be weird if they didn't ;-)
795  *
796  * Lookup has side-effects.
797  *
798  * - It will create directories and fs attribute files in the shadow hierarchy.
799  * - It should cause non-SID devices to be probed (ask the parent nexi).
800  */
801 /*ARGSUSED3*/
802 static int
803 devfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
804     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred)
805 {
806 	ASSERT(dvp->v_type == VDIR);
807 	dcmn_err2(("devfs_lookup: %s\n", nm));
808 	return (dv_find(VTODV(dvp), nm, vpp, pnp, rdir, cred, 0));
809 }
810 
811 /*
812  * devfs nodes can't really be created directly by userland - however,
813  * we do allow creates to find existing nodes:
814  *
815  * - any create fails if the node doesn't exist - EROFS.
816  * - creating an existing directory read-only succeeds, otherwise EISDIR.
817  * - exclusive creates fail if the node already exists - EEXIST.
818  * - failure to create the snode for an existing device - ENOSYS.
819  */
820 /*ARGSUSED2*/
821 static int
822 devfs_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
823     int mode, struct vnode **vpp, struct cred *cred, int flag)
824 {
825 	int error;
826 	struct vnode *vp;
827 
828 	dcmn_err2(("devfs_create %s\n", nm));
829 	error = dv_find(VTODV(dvp), nm, &vp, NULL, NULLVP, cred, 0);
830 	if (error == 0) {
831 		if (excl == EXCL)
832 			error = EEXIST;
833 		else if (vp->v_type == VDIR && (mode & VWRITE))
834 			error = EISDIR;
835 		else
836 			error = VOP_ACCESS(vp, mode, 0, cred);
837 
838 		if (error) {
839 			VN_RELE(vp);
840 		} else
841 			*vpp = vp;
842 	} else if (error == ENOENT)
843 		error = EROFS;
844 
845 	return (error);
846 }
847 
848 /*
849  * If DV_BUILD is set, we call into nexus driver to do a BUS_CONFIG_ALL.
850  * Otherwise, simply return cached dv_node's. Hotplug code always call
851  * devfs_clean() to invalid the dv_node cache.
852  */
853 static int
854 devfs_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp)
855 {
856 	struct dv_node *ddv, *dv;
857 	struct dirent64 *de, *bufp;
858 	offset_t diroff;
859 	offset_t	soff;
860 	size_t reclen, movesz;
861 	int error;
862 	struct vattr va;
863 	size_t bufsz;
864 
865 	ddv = VTODV(dvp);
866 	dcmn_err2(("devfs_readdir %s: offset %lld len %ld\n",
867 	    ddv->dv_name, uiop->uio_loffset, uiop->uio_iov->iov_len));
868 	ASSERT(ddv->dv_attr || ddv->dv_attrvp);
869 	ASSERT(RW_READ_HELD(&ddv->dv_contents));
870 
871 	if (uiop->uio_loffset >= MAXOFF_T) {
872 		if (eofp)
873 			*eofp = 1;
874 		return (0);
875 	}
876 
877 	if (uiop->uio_iovcnt != 1)
878 		return (EINVAL);
879 
880 	if (dvp->v_type != VDIR)
881 		return (ENOTDIR);
882 
883 	/* Load the initial contents */
884 	if (ddv->dv_flags & DV_BUILD) {
885 		if (!rw_tryupgrade(&ddv->dv_contents)) {
886 			rw_exit(&ddv->dv_contents);
887 			rw_enter(&ddv->dv_contents, RW_WRITER);
888 		}
889 
890 		/* recheck and fill */
891 		if (ddv->dv_flags & DV_BUILD)
892 			dv_filldir(ddv);
893 
894 		rw_downgrade(&ddv->dv_contents);
895 	}
896 
897 	soff = uiop->uio_offset;
898 	bufsz = uiop->uio_iov->iov_len;
899 	de = bufp = kmem_alloc(bufsz, KM_SLEEP);
900 	movesz = 0;
901 	dv = (struct dv_node *)-1;
902 
903 	/*
904 	 * Move as many entries into the uio structure as it will take.
905 	 * Special case "." and "..".
906 	 */
907 	diroff = 0;
908 	if (soff == 0) {				/* . */
909 		reclen = DIRENT64_RECLEN(strlen("."));
910 		if ((movesz + reclen) > bufsz)
911 			goto full;
912 		de->d_ino = (ino64_t)ddv->dv_ino;
913 		de->d_off = (off64_t)diroff + 1;
914 		de->d_reclen = (ushort_t)reclen;
915 
916 		/* use strncpy(9f) to zero out uninitialized bytes */
917 
918 		(void) strncpy(de->d_name, ".", DIRENT64_NAMELEN(reclen));
919 		movesz += reclen;
920 		de = (dirent64_t *)((char *)de + reclen);
921 		dcmn_err3(("devfs_readdir: A: diroff %lld, soff %lld: '%s' "
922 		    "reclen %lu\n", diroff, soff, ".", reclen));
923 	}
924 
925 	diroff++;
926 	if (soff <= 1) {				/* .. */
927 		reclen = DIRENT64_RECLEN(strlen(".."));
928 		if ((movesz + reclen) > bufsz)
929 			goto full;
930 		de->d_ino = (ino64_t)ddv->dv_dotdot->dv_ino;
931 		de->d_off = (off64_t)diroff + 1;
932 		de->d_reclen = (ushort_t)reclen;
933 
934 		/* use strncpy(9f) to zero out uninitialized bytes */
935 
936 		(void) strncpy(de->d_name, "..", DIRENT64_NAMELEN(reclen));
937 		movesz += reclen;
938 		de = (dirent64_t *)((char *)de + reclen);
939 		dcmn_err3(("devfs_readdir: B: diroff %lld, soff %lld: '%s' "
940 		    "reclen %lu\n", diroff, soff, "..", reclen));
941 	}
942 
943 	diroff++;
944 	for (dv = ddv->dv_dot; dv; dv = dv->dv_next, diroff++) {
945 		/*
946 		 * although DDM_INTERNAL_PATH minor nodes are skipped for
947 		 * readdirs outside the kernel, they still occupy directory
948 		 * offsets
949 		 */
950 		if (diroff < soff ||
951 		    ((dv->dv_flags & DV_INTERNAL) && (cred != kcred)))
952 			continue;
953 
954 		reclen = DIRENT64_RECLEN(strlen(dv->dv_name));
955 		if ((movesz + reclen) > bufsz) {
956 			dcmn_err3(("devfs_readdir: C: diroff "
957 			    "%lld, soff %lld: '%s' reclen %lu\n",
958 			    diroff, soff, dv->dv_name, reclen));
959 			goto full;
960 		}
961 		de->d_ino = (ino64_t)dv->dv_ino;
962 		de->d_off = (off64_t)diroff + 1;
963 		de->d_reclen = (ushort_t)reclen;
964 
965 		/* use strncpy(9f) to zero out uninitialized bytes */
966 
967 		ASSERT(strlen(dv->dv_name) + 1 <=
968 		    DIRENT64_NAMELEN(reclen));
969 		(void) strncpy(de->d_name, dv->dv_name,
970 		    DIRENT64_NAMELEN(reclen));
971 
972 		movesz += reclen;
973 		de = (dirent64_t *)((char *)de + reclen);
974 		dcmn_err4(("devfs_readdir: D: diroff "
975 		    "%lld, soff %lld: '%s' reclen %lu\n", diroff, soff,
976 		    dv->dv_name, reclen));
977 	}
978 
979 	/* the buffer is full, or we exhausted everything */
980 full:	dcmn_err3(("devfs_readdir: moving %lu bytes: "
981 	    "diroff %lld, soff %lld, dv %p\n",
982 	    movesz, diroff, soff, (void *)dv));
983 
984 	if ((movesz == 0) && dv)
985 		error = EINVAL;		/* cannot be represented */
986 	else {
987 		error = uiomove(bufp, movesz, UIO_READ, uiop);
988 		if (error == 0) {
989 			if (eofp)
990 				*eofp = dv ? 0 : 1;
991 			uiop->uio_offset = diroff;
992 		}
993 
994 		va.va_mask = AT_ATIME;
995 		gethrestime(&va.va_atime);
996 		rw_exit(&ddv->dv_contents);
997 		(void) devfs_setattr(dvp, &va, 0, cred, NULL);
998 		rw_enter(&ddv->dv_contents, RW_READER);
999 	}
1000 
1001 	kmem_free(bufp, bufsz);
1002 	return (error);
1003 }
1004 
1005 /*ARGSUSED*/
1006 static int
1007 devfs_fsync(struct vnode *vp, int syncflag, struct cred *cred)
1008 {
1009 	/*
1010 	 * Message goes to console only. Otherwise, the message
1011 	 * causes devfs_fsync to be invoked again... infinite loop
1012 	 */
1013 	dcmn_err2(("devfs_fsync %s\n", VTODV(vp)->dv_name));
1014 	return (0);
1015 }
1016 
1017 /*
1018  * Normally, we leave the dv_node here at count of 0.
1019  * The node will be destroyed when dv_cleandir() is called.
1020  *
1021  * Stale dv_node's are already unlinked from the fs tree,
1022  * so dv_cleandir() won't find them. We destroy such nodes
1023  * immediately.
1024  */
1025 /*ARGSUSED1*/
1026 static void
1027 devfs_inactive(struct vnode *vp, struct cred *cred)
1028 {
1029 	int destroy;
1030 	struct dv_node *dv = VTODV(vp);
1031 
1032 	dcmn_err2(("devfs_inactive: %s\n", dv->dv_name));
1033 	mutex_enter(&vp->v_lock);
1034 	ASSERT(vp->v_count >= 1);
1035 	--vp->v_count;
1036 	destroy = (DV_STALE(dv) && vp->v_count == 0);
1037 	mutex_exit(&vp->v_lock);
1038 
1039 	/* stale nodes cannot be rediscovered, destroy it here */
1040 	if (destroy)
1041 		dv_destroy(dv, 0);
1042 }
1043 
1044 /*
1045  * XXX Why do we need this?  NFS mounted /dev directories?
1046  * XXX Talk to peter staubach about this.
1047  */
1048 static int
1049 devfs_fid(struct vnode *vp, struct fid *fidp)
1050 {
1051 	struct dv_node	*dv = VTODV(vp);
1052 	struct dv_fid	*dv_fid;
1053 
1054 	if (fidp->fid_len < (sizeof (struct dv_fid) - sizeof (ushort_t))) {
1055 		fidp->fid_len = sizeof (struct dv_fid) - sizeof (ushort_t);
1056 		return (ENOSPC);
1057 	}
1058 
1059 	dv_fid = (struct dv_fid *)fidp;
1060 	bzero(dv_fid, sizeof (struct dv_fid));
1061 	dv_fid->dvfid_len = (int)sizeof (struct dv_fid) - sizeof (ushort_t);
1062 	dv_fid->dvfid_ino = dv->dv_ino;
1063 	/* dv_fid->dvfid_gen = dv->tn_gen; XXX ? */
1064 
1065 	return (0);
1066 }
1067 
1068 /*
1069  * This pair of routines bracket all VOP_READ, VOP_WRITE
1070  * and VOP_READDIR requests.  The contents lock stops things
1071  * moving around while we're looking at them.
1072  *
1073  * Also used by file and record locking.
1074  */
1075 /*ARGSUSED2*/
1076 static int
1077 devfs_rwlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1078 {
1079 	dcmn_err2(("devfs_rwlock %s\n", VTODV(vp)->dv_name));
1080 	rw_enter(&VTODV(vp)->dv_contents, write_flag ? RW_WRITER : RW_READER);
1081 	return (write_flag);
1082 }
1083 
1084 /*ARGSUSED1*/
1085 static void
1086 devfs_rwunlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1087 {
1088 	dcmn_err2(("devfs_rwunlock %s\n", VTODV(vp)->dv_name));
1089 	rw_exit(&VTODV(vp)->dv_contents);
1090 }
1091 
1092 /*
1093  * XXX	Should probably do a better job of computing the maximum
1094  *	offset available in the directory.
1095  */
1096 /*ARGSUSED1*/
1097 static int
1098 devfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp)
1099 {
1100 	ASSERT(vp->v_type == VDIR);
1101 	dcmn_err2(("devfs_seek %s\n", VTODV(vp)->dv_name));
1102 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1103 }
1104 
1105 vnodeops_t *dv_vnodeops;
1106 
1107 const fs_operation_def_t dv_vnodeops_template[] = {
1108 	VOPNAME_OPEN, devfs_open,
1109 	VOPNAME_CLOSE, devfs_close,
1110 	VOPNAME_READ, devfs_read,
1111 	VOPNAME_WRITE, devfs_write,
1112 	VOPNAME_IOCTL, devfs_ioctl,
1113 	VOPNAME_GETATTR, devfs_getattr,
1114 	VOPNAME_SETATTR, devfs_setattr,
1115 	VOPNAME_ACCESS, devfs_access,
1116 	VOPNAME_LOOKUP, devfs_lookup,
1117 	VOPNAME_CREATE, devfs_create,
1118 	VOPNAME_READDIR, devfs_readdir,
1119 	VOPNAME_FSYNC, devfs_fsync,
1120 	VOPNAME_INACTIVE, (fs_generic_func_p) devfs_inactive,
1121 	VOPNAME_FID, devfs_fid,
1122 	VOPNAME_RWLOCK, devfs_rwlock,
1123 	VOPNAME_RWUNLOCK, (fs_generic_func_p) devfs_rwunlock,
1124 	VOPNAME_SEEK, devfs_seek,
1125 	VOPNAME_PATHCONF, devfs_pathconf,
1126 	VOPNAME_DISPOSE, fs_error,
1127 	VOPNAME_SETSECATTR, devfs_setsecattr,
1128 	VOPNAME_GETSECATTR, devfs_getsecattr,
1129 	NULL, NULL
1130 };
1131