xref: /original-bsd/sys/miscfs/nullfs/null_vnops.c (revision c4f3b704)
1 /*
2  * Copyright (c) 1992, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * John Heidemann of the UCLA Ficus project.
7  *
8  * %sccs.include.redist.c%
9  *
10  *	@(#)null_vnops.c	8.6 (Berkeley) 05/27/95
11  *
12  * Ancestors:
13  *	@(#)lofs_vnops.c	1.2 (Berkeley) 6/18/92
14  *	$Id: lofs_vnops.c,v 1.11 1992/05/30 10:05:43 jsp Exp jsp $
15  *	...and...
16  *	@(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project
17  */
18 
19 /*
20  * Null Layer
21  *
22  * (See mount_null(8) for more information.)
23  *
24  * The null layer duplicates a portion of the file system
25  * name space under a new name.  In this respect, it is
26  * similar to the loopback file system.  It differs from
27  * the loopback fs in two respects:  it is implemented using
28  * a stackable layers technique, and its "null-node"s stack above
29  * all lower-layer vnodes, not just over directory vnodes.
30  *
31  * The null layer has two purposes.  First, it serves as a demonstration
32  * of layering by providing a layer which does nothing.  (It actually
33  * does everything the loopback file system does, which is slightly
34  * more than nothing.)  Second, the null layer can serve as a prototype
35  * layer.  Since it provides all necessary layer framework,
36  * new file system layers can be created very easily by starting
37  * with a null layer.
38  *
39  * The remainder of this man page examines the null layer as a basis
40  * for constructing new layers.
41  *
42  *
43  * INSTANTIATING NEW NULL LAYERS
44  *
45  * New null layers are created with mount_null(8).
46  * Mount_null(8) takes two arguments, the pathname
47  * of the lower vfs (target-pn) and the pathname where the null
48  * layer will appear in the namespace (alias-pn).  After
49  * the null layer is put into place, the contents
50  * of target-pn subtree will be aliased under alias-pn.
51  *
52  *
53  * OPERATION OF A NULL LAYER
54  *
55  * The null layer is the minimum file system layer,
56  * simply bypassing all possible operations to the lower layer
57  * for processing there.  The majority of its activity centers
58  * on the bypass routine, through which nearly all vnode operations
59  * pass.
60  *
61  * The bypass routine accepts arbitrary vnode operations for
62  * handling by the lower layer.  It begins by examining vnode
63  * operation arguments and replacing any null-nodes by their
64  * lower-layer equivalents.  It then invokes the operation
65  * on the lower layer.  Finally, it replaces the null-nodes
66  * in the arguments and, if a vnode is returned by the operation,
67  * stacks a null-node on top of the returned vnode.
68  *
69  * Although bypass handles most operations, vop_getattr, vop_lock,
70  * vop_unlock, vop_inactive, vop_reclaim, and vop_print are not
71  * bypassed. Vop_getattr must change the fsid being returned.
72  * Vop_lock and vop_unlock must handle any locking for the
73  * current vnode as well as pass the lock request down.
74  * Vop_inactive and vop_reclaim are not bypassed so that
75  * they can handle freeing null-layer specific data. Vop_print
76  * is not bypassed to avoid excessive debugging information.
77  * Also, certain vnode operations change the locking state within
78  * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
79  * and symlink). Ideally these operations should not change the
80  * lock state, but should be changed to let the caller of the
81  * function unlock them. Otherwise all intermediate vnode layers
82  * (such as union, umapfs, etc) must catch these functions to do
83  * the necessary locking at their layer.
84  *
85  *
86  * INSTANTIATING VNODE STACKS
87  *
88  * Mounting associates the null layer with a lower layer,
89  * effectively stacking two VFSes.  Vnode stacks are instead
90  * created on demand as files are accessed.
91  *
92  * The initial mount creates a single vnode stack for the
93  * root of the new null layer.  All other vnode stacks
94  * are created as a result of vnode operations on
95  * this or other null vnode stacks.
96  *
97  * New vnode stacks come into existence as a result of
98  * an operation which returns a vnode.
99  * The bypass routine stacks a null-node above the new
100  * vnode before returning it to the caller.
101  *
102  * For example, imagine mounting a null layer with
103  * "mount_null /usr/include /dev/layer/null".
104  * Changing directory to /dev/layer/null will assign
105  * the root null-node (which was created when the null layer was mounted).
106  * Now consider opening "sys".  A vop_lookup would be
107  * done on the root null-node.  This operation would bypass through
108  * to the lower layer which would return a vnode representing
109  * the UFS "sys".  Null_bypass then builds a null-node
110  * aliasing the UFS "sys" and returns this to the caller.
111  * Later operations on the null-node "sys" will repeat this
112  * process when constructing other vnode stacks.
113  *
114  *
115  * CREATING OTHER FILE SYSTEM LAYERS
116  *
117  * One of the easiest ways to construct new file system layers is to make
118  * a copy of the null layer, rename all files and variables, and
119  * then begin modifying the copy.  Sed can be used to easily rename
120  * all variables.
121  *
122  * The umap layer is an example of a layer descended from the
123  * null layer.
124  *
125  *
126  * INVOKING OPERATIONS ON LOWER LAYERS
127  *
128  * There are two techniques to invoke operations on a lower layer
129  * when the operation cannot be completely bypassed.  Each method
130  * is appropriate in different situations.  In both cases,
131  * it is the responsibility of the aliasing layer to make
132  * the operation arguments "correct" for the lower layer
133  * by mapping any vnode arguments to the lower layer.
134  *
135  * The first approach is to call the aliasing layer's bypass routine.
136  * This method is most suitable when you wish to invoke the operation
137  * currently being handled on the lower layer.  It has the advantage
138  * that the bypass routine already must do argument mapping.
139  * An example of this is null_getattr in the null layer.
140  *
141  * A second approach is to directly invoke vnode operations on
142  * the lower layer with the VOP_OPERATIONNAME interface.
143  * The advantage of this method is that it is easy to invoke
144  * arbitrary operations on the lower layer.  The disadvantage
145  * is that vnode arguments must be manually mapped.
146  *
147  */
148 
149 #include <sys/param.h>
150 #include <sys/systm.h>
151 #include <sys/proc.h>
152 #include <sys/time.h>
153 #include <sys/types.h>
154 #include <sys/vnode.h>
155 #include <sys/mount.h>
156 #include <sys/namei.h>
157 #include <sys/malloc.h>
158 #include <sys/buf.h>
159 #include <miscfs/nullfs/null.h>
160 
161 
/*
 * Debugging switch: when nonzero, null_bypass printf's the name of
 * every operation that passes through it.
 */
int null_bug_bypass = 0;
163 
164 /*
165  * This is the 10-Apr-92 bypass routine.
166  *    This version has been optimized for speed, throwing away some
167  * safety checks.  It should still always work, but it's not as
168  * robust to programmer errors.
169  *    Define SAFETY to include some error checking code.
170  *
171  * In general, we map all vnodes going down and unmap them on the way back.
172  * As an exception to this, vnodes can be marked "unmapped" by setting
173  * the Nth bit in operation's vdesc_flags.
174  *
175  * Also, some BSD vnode operations have the side effect of vrele'ing
176  * their arguments.  With stacking, the reference counts are held
177  * by the upper node, not the lower one, so we must handle these
178  * side-effects here.  This is not of concern in Sun-derived systems
179  * since there are no such side-effects.
180  *
181  * This makes the following assumptions:
182  * - only one returned vpp
183  * - no INOUT vpp's (Sun's vop_open has one of these)
184  * - the vnode operation vector of the first vnode should be used
185  *   to determine what implementation of the op should be invoked
186  * - all mapped vnodes are of our vnode-type (NEEDSWORK:
187  *   problems on rmdir'ing mount points and renaming?)
188  */
189 int
190 null_bypass(ap)
191 	struct vop_generic_args /* {
192 		struct vnodeop_desc *a_desc;
193 		<other random data follows, presumably>
194 	} */ *ap;
195 {
196 	extern int (**null_vnodeop_p)();  /* not extern, really "forward" */
197 	register struct vnode **this_vp_p;
198 	int error;
199 	struct vnode *old_vps[VDESC_MAX_VPS];
200 	struct vnode **vps_p[VDESC_MAX_VPS];
201 	struct vnode ***vppp;
202 	struct vnodeop_desc *descp = ap->a_desc;
203 	int reles, i;
204 
205 	if (null_bug_bypass)
206 		printf ("null_bypass: %s\n", descp->vdesc_name);
207 
208 #ifdef SAFETY
209 	/*
210 	 * We require at least one vp.
211 	 */
212 	if (descp->vdesc_vp_offsets == NULL ||
213 	    descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
214 		panic ("null_bypass: no vp's in map.\n");
215 #endif
216 
217 	/*
218 	 * Map the vnodes going in.
219 	 * Later, we'll invoke the operation based on
220 	 * the first mapped vnode's operation vector.
221 	 */
222 	reles = descp->vdesc_flags;
223 	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
224 		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
225 			break;   /* bail out at end of list */
226 		vps_p[i] = this_vp_p =
227 			VOPARG_OFFSETTO(struct vnode**,descp->vdesc_vp_offsets[i],ap);
228 		/*
229 		 * We're not guaranteed that any but the first vnode
230 		 * are of our type.  Check for and don't map any
231 		 * that aren't.  (We must always map first vp or vclean fails.)
232 		 */
233 		if (i && (*this_vp_p == NULL ||
234 		    (*this_vp_p)->v_op != null_vnodeop_p)) {
235 			old_vps[i] = NULL;
236 		} else {
237 			old_vps[i] = *this_vp_p;
238 			*(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p);
239 			/*
240 			 * XXX - Several operations have the side effect
241 			 * of vrele'ing their vp's.  We must account for
242 			 * that.  (This should go away in the future.)
243 			 */
244 			if (reles & 1)
245 				VREF(*this_vp_p);
246 		}
247 
248 	}
249 
250 	/*
251 	 * Call the operation on the lower layer
252 	 * with the modified argument structure.
253 	 */
254 	error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap);
255 
256 	/*
257 	 * Maintain the illusion of call-by-value
258 	 * by restoring vnodes in the argument structure
259 	 * to their original value.
260 	 */
261 	reles = descp->vdesc_flags;
262 	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
263 		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
264 			break;   /* bail out at end of list */
265 		if (old_vps[i]) {
266 			*(vps_p[i]) = old_vps[i];
267 			if (reles & 1)
268 				vrele(*(vps_p[i]));
269 		}
270 	}
271 
272 	/*
273 	 * Map the possible out-going vpp
274 	 * (Assumes that the lower layer always returns
275 	 * a VREF'ed vpp unless it gets an error.)
276 	 */
277 	if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET &&
278 	    !(descp->vdesc_flags & VDESC_NOMAP_VPP) &&
279 	    !error) {
280 		/*
281 		 * XXX - even though some ops have vpp returned vp's,
282 		 * several ops actually vrele this before returning.
283 		 * We must avoid these ops.
284 		 * (This should go away when these ops are regularized.)
285 		 */
286 		if (descp->vdesc_flags & VDESC_VPP_WILLRELE)
287 			goto out;
288 		vppp = VOPARG_OFFSETTO(struct vnode***,
289 				 descp->vdesc_vpp_offset,ap);
290 		error = null_node_create(old_vps[0]->v_mount, **vppp, *vppp);
291 	}
292 
293  out:
294 	return (error);
295 }
296 
297 /*
298  * We have to carry on the locking protocol on the null layer vnodes
299  * as we progress through the tree. We also have to enforce read-only
300  * if this layer is mounted read-only.
301  */
302 null_lookup(ap)
303 	struct vop_lookup_args /* {
304 		struct vnode * a_dvp;
305 		struct vnode ** a_vpp;
306 		struct componentname * a_cnp;
307 	} */ *ap;
308 {
309 	struct componentname *cnp = ap->a_cnp;
310 	struct proc *p = cnp->cn_proc;
311 	int flags = cnp->cn_flags;
312 	struct vop_lock_args lockargs;
313 	struct vop_unlock_args unlockargs;
314 	struct vnode *dvp, *vp;
315 	int error;
316 
317 	if ((flags & ISLASTCN) && (ap->a_dvp->v_mount->mnt_flag & MNT_RDONLY) &&
318 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
319 		return (EROFS);
320 	error = null_bypass(ap);
321 	if (error == EJUSTRETURN && (flags & ISLASTCN) &&
322 	    (ap->a_dvp->v_mount->mnt_flag & MNT_RDONLY) &&
323 	    (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
324 		error = EROFS;
325 	/*
326 	 * We must do the same locking and unlocking at this layer as
327 	 * is done in the layers below us. We could figure this out
328 	 * based on the error return and the LASTCN, LOCKPARENT, and
329 	 * LOCKLEAF flags. However, it is more expidient to just find
330 	 * out the state of the lower level vnodes and set ours to the
331 	 * same state.
332 	 */
333 	dvp = ap->a_dvp;
334 	vp = *ap->a_vpp;
335 	if (dvp == vp)
336 		return (error);
337 	if (!VOP_ISLOCKED(dvp)) {
338 		unlockargs.a_vp = dvp;
339 		unlockargs.a_flags = 0;
340 		unlockargs.a_p = p;
341 		vop_nounlock(&unlockargs);
342 	}
343 	if (vp != NULL && VOP_ISLOCKED(vp)) {
344 		lockargs.a_vp = vp;
345 		lockargs.a_flags = LK_SHARED;
346 		lockargs.a_p = p;
347 		vop_nolock(&lockargs);
348 	}
349 	return (error);
350 }
351 
352 /*
353  * Setattr call. Disallow write attempts if the layer is mounted read-only.
354  */
355 int
356 null_setattr(ap)
357 	struct vop_setattr_args /* {
358 		struct vnodeop_desc *a_desc;
359 		struct vnode *a_vp;
360 		struct vattr *a_vap;
361 		struct ucred *a_cred;
362 		struct proc *a_p;
363 	} */ *ap;
364 {
365 	struct vnode *vp = ap->a_vp;
366 	struct vattr *vap = ap->a_vap;
367 
368   	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
369 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.ts_sec != VNOVAL ||
370 	    vap->va_mtime.ts_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
371 	    (vp->v_mount->mnt_flag & MNT_RDONLY))
372 		return (EROFS);
373 	if (vap->va_size != VNOVAL) {
374  		switch (vp->v_type) {
375  		case VDIR:
376  			return (EISDIR);
377  		case VCHR:
378  		case VBLK:
379  		case VSOCK:
380  		case VFIFO:
381 			return (0);
382 		case VREG:
383 		case VLNK:
384  		default:
385 			/*
386 			 * Disallow write attempts if the filesystem is
387 			 * mounted read-only.
388 			 */
389 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
390 				return (EROFS);
391 		}
392 	}
393 	return (null_bypass(ap));
394 }
395 
396 /*
397  *  We handle getattr only to change the fsid.
398  */
399 int
400 null_getattr(ap)
401 	struct vop_getattr_args /* {
402 		struct vnode *a_vp;
403 		struct vattr *a_vap;
404 		struct ucred *a_cred;
405 		struct proc *a_p;
406 	} */ *ap;
407 {
408 	int error;
409 
410 	if (error = null_bypass(ap))
411 		return (error);
412 	/* Requires that arguments be restored. */
413 	ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
414 	return (0);
415 }
416 
417 int
418 null_access(ap)
419 	struct vop_access_args /* {
420 		struct vnode *a_vp;
421 		int  a_mode;
422 		struct ucred *a_cred;
423 		struct proc *a_p;
424 	} */ *ap;
425 {
426 	struct vnode *vp = ap->a_vp;
427 	mode_t mode = ap->a_mode;
428 
429 	/*
430 	 * Disallow write attempts on read-only layers;
431 	 * unless the file is a socket, fifo, or a block or
432 	 * character device resident on the file system.
433 	 */
434 	if (mode & VWRITE) {
435 		switch (vp->v_type) {
436 		case VDIR:
437 		case VLNK:
438 		case VREG:
439 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
440 				return (EROFS);
441 			break;
442 		}
443 	}
444 	return (null_bypass(ap));
445 }
446 
447 /*
448  * We need to process our own vnode lock and then clear the
449  * interlock flag as it applies only to our vnode, not the
450  * vnodes below us on the stack.
451  */
452 int
453 null_lock(ap)
454 	struct vop_lock_args /* {
455 		struct vnode *a_vp;
456 		int a_flags;
457 		struct proc *a_p;
458 	} */ *ap;
459 {
460 
461 	vop_nolock(ap);
462 	if ((ap->a_flags & LK_TYPE_MASK) == LK_DRAIN)
463 		return (0);
464 	ap->a_flags &= ~LK_INTERLOCK;
465 	return (null_bypass(ap));
466 }
467 
468 /*
469  * We need to process our own vnode unlock and then clear the
470  * interlock flag as it applies only to our vnode, not the
471  * vnodes below us on the stack.
472  */
473 int
474 null_unlock(ap)
475 	struct vop_unlock_args /* {
476 		struct vnode *a_vp;
477 		int a_flags;
478 		struct proc *a_p;
479 	} */ *ap;
480 {
481 	struct vnode *vp = ap->a_vp;
482 
483 	vop_nounlock(ap);
484 	ap->a_flags &= ~LK_INTERLOCK;
485 	return (null_bypass(ap));
486 }
487 
488 int
489 null_inactive(ap)
490 	struct vop_inactive_args /* {
491 		struct vnode *a_vp;
492 		struct proc *a_p;
493 	} */ *ap;
494 {
495 	/*
496 	 * Do nothing (and _don't_ bypass).
497 	 * Wait to vrele lowervp until reclaim,
498 	 * so that until then our null_node is in the
499 	 * cache and reusable.
500 	 *
501 	 * NEEDSWORK: Someday, consider inactive'ing
502 	 * the lowervp and then trying to reactivate it
503 	 * with capabilities (v_id)
504 	 * like they do in the name lookup cache code.
505 	 * That's too much work for now.
506 	 */
507 	VOP_UNLOCK(ap->a_vp, 0, ap->a_p);
508 	return (0);
509 }
510 
511 int
512 null_reclaim(ap)
513 	struct vop_reclaim_args /* {
514 		struct vnode *a_vp;
515 		struct proc *a_p;
516 	} */ *ap;
517 {
518 	struct vnode *vp = ap->a_vp;
519 	struct null_node *xp = VTONULL(vp);
520 	struct vnode *lowervp = xp->null_lowervp;
521 
522 	/*
523 	 * Note: in vop_reclaim, vp->v_op == dead_vnodeop_p,
524 	 * so we can't call VOPs on ourself.
525 	 */
526 	/* After this assignment, this node will not be re-used. */
527 	xp->null_lowervp = NULL;
528 	LIST_REMOVE(xp, null_hash);
529 	FREE(vp->v_data, M_TEMP);
530 	vp->v_data = NULL;
531 	vrele (lowervp);
532 	return (0);
533 }
534 
535 int
536 null_print(ap)
537 	struct vop_print_args /* {
538 		struct vnode *a_vp;
539 	} */ *ap;
540 {
541 	register struct vnode *vp = ap->a_vp;
542 	printf ("\ttag VT_NULLFS, vp=%x, lowervp=%x\n", vp, NULLVPTOLOWERVP(vp));
543 	return (0);
544 }
545 
546 /*
547  * XXX - vop_strategy must be hand coded because it has no
548  * vnode in its arguments.
549  * This goes away with a merged VM/buffer cache.
550  */
551 int
552 null_strategy(ap)
553 	struct vop_strategy_args /* {
554 		struct buf *a_bp;
555 	} */ *ap;
556 {
557 	struct buf *bp = ap->a_bp;
558 	int error;
559 	struct vnode *savedvp;
560 
561 	savedvp = bp->b_vp;
562 	bp->b_vp = NULLVPTOLOWERVP(bp->b_vp);
563 
564 	error = VOP_STRATEGY(bp);
565 
566 	bp->b_vp = savedvp;
567 
568 	return (error);
569 }
570 
571 /*
572  * XXX - like vop_strategy, vop_bwrite must be hand coded because it has no
573  * vnode in its arguments.
574  * This goes away with a merged VM/buffer cache.
575  */
576 int
577 null_bwrite(ap)
578 	struct vop_bwrite_args /* {
579 		struct buf *a_bp;
580 	} */ *ap;
581 {
582 	struct buf *bp = ap->a_bp;
583 	int error;
584 	struct vnode *savedvp;
585 
586 	savedvp = bp->b_vp;
587 	bp->b_vp = NULLVPTOLOWERVP(bp->b_vp);
588 
589 	error = VOP_BWRITE(bp);
590 
591 	bp->b_vp = savedvp;
592 
593 	return (error);
594 }
595 
596 /*
597  * Global vfs data structures
598  */
599 int (**null_vnodeop_p)();
600 struct vnodeopv_entry_desc null_vnodeop_entries[] = {
601 	{ &vop_default_desc, null_bypass },
602 
603 	{ &vop_lookup_desc, null_lookup },
604 	{ &vop_setattr_desc, null_setattr },
605 	{ &vop_getattr_desc, null_getattr },
606 	{ &vop_access_desc, null_access },
607 	{ &vop_lock_desc, null_lock },
608 	{ &vop_unlock_desc, null_unlock },
609 	{ &vop_inactive_desc, null_inactive },
610 	{ &vop_reclaim_desc, null_reclaim },
611 	{ &vop_print_desc, null_print },
612 
613 	{ &vop_strategy_desc, null_strategy },
614 	{ &vop_bwrite_desc, null_bwrite },
615 
616 	{ (struct vnodeop_desc*)NULL, (int(*)())NULL }
617 };
618 struct vnodeopv_desc null_vnodeop_opv_desc =
619 	{ &null_vnodeop_p, null_vnodeop_entries };
620