/*	$NetBSD: vfs_subr.c,v 1.449 2016/05/26 11:07:33 hannken Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, by Andrew Doran,
 * by Marshall Kirk McKusick and Greg Ganger at the University of Michigan.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.449 2016/05/26 11:07:33 hannken Exp $");

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"
#endif

#define _VFS_VNODE_PRIVATE	/* for vcache_print(). */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/namei.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/module.h>

#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm_ddb.h>

const enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
const int	vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}

int doforce = 1;		/* 1 => permit forcible unmounting */
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */

extern struct mount *dead_rootmount;

/*
 * Local declarations.
 */

static void vn_initialize_syncerd(void);

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit(void)
{

	vn_initialize_syncerd();
	vfs_mount_sysinit();
	vfs_vnode_sysinit();
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
	  bool catch_p, int slptimeo)
{
	struct buf *bp, *nbp;
	int error;
	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
	    (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);

	/* XXXUBC this doesn't look at flags or slp* */
	mutex_enter(vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
	if (error) {
		return error;
	}

	if (flags & V_SAVE) {
		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
		if (error)
			return (error);
		KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
	}

	mutex_enter(&bufcache_lock);
restart:
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}

	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		/*
		 * XXX Since there are no node locks for NFS, I believe
		 * there is a slight chance that a delayed write will
		 * occur while sleeping just above, so check for it.
		 */
		if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
			printf("buffer still DELWRI\n");
#endif
			bp->b_cflags |= BC_BUSY | BC_VFLUSH;
			mutex_exit(&bufcache_lock);
			VOP_BWRITE(bp->b_vp, bp);
			mutex_enter(&bufcache_lock);
			goto restart;
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}

#ifdef DIAGNOSTIC
	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
		panic("vinvalbuf: flush failed, vp %p", vp);
#endif

	mutex_exit(&bufcache_lock);

	return (0);
}
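
/*
 * Example (illustrative sketch, not compiled): a reclaim or revoke path
 * typically tries to flush and save dirty buffers first and falls back
 * to discarding them; the credential and lwp shown are assumptions about
 * the caller's context.
 *
 *	if (vinvalbuf(vp, V_SAVE, curlwp->l_cred, curlwp, false, 0) != 0)
 *		(void)vinvalbuf(vp, 0, curlwp->l_cred, curlwp, false, 0);
 */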

/*
 * Destroy any in core blocks past the truncation length.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch_p, int slptimeo)
{
	struct buf *bp, *nbp;
	int error;
	voff_t off;

	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
	mutex_enter(vp->v_interlock);
	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
	if (error) {
		return error;
	}

	mutex_enter(&bufcache_lock);
restart:
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}

	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}
	mutex_exit(&bufcache_lock);

	return (0);
}
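
/*
 * Example (sketch): a file system shrinking a file to "newsize" bytes
 * would destroy the cached blocks past the new end.  How the boundary
 * block is rounded is the caller's business; a simple shift is shown.
 *
 *	lbn = (daddr_t)(newsize >> vp->v_mount->mnt_fs_bshift);
 *	error = vtruncbuf(vp, lbn, false, 0);
 */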

/*
 * Flush all dirty buffers from a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vflushbuf(struct vnode *vp, int flags)
{
	struct buf *bp, *nbp;
	int error, pflags;
	bool dirty, sync;

	sync = (flags & FSYNC_WAIT) != 0;
	pflags = PGO_CLEANIT | PGO_ALLPAGES |
		(sync ? PGO_SYNCIO : 0) |
		((flags & FSYNC_LAZY) ? PGO_LAZY : 0);
	mutex_enter(vp->v_interlock);
	(void) VOP_PUTPAGES(vp, 0, 0, pflags);

loop:
	mutex_enter(&bufcache_lock);
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		if ((bp->b_cflags & BC_BUSY))
			continue;
		if ((bp->b_oflags & BO_DELWRI) == 0)
			panic("vflushbuf: not dirty, bp %p", bp);
		bp->b_cflags |= BC_BUSY | BC_VFLUSH;
		mutex_exit(&bufcache_lock);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || !sync)
			(void) bawrite(bp);
		else {
			error = bwrite(bp);
			if (error)
				return error;
		}
		goto loop;
	}
	mutex_exit(&bufcache_lock);

	if (!sync)
		return 0;

	mutex_enter(vp->v_interlock);
	while (vp->v_numoutput != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
	dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
	mutex_exit(vp->v_interlock);

	if (dirty) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}

	return 0;
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev_t dev, vnode_t **vpp)
{
	struct vattr va;

	vattr_null(&va);
	va.va_type = VBLK;
	va.va_rdev = dev;

	return vcache_new(dead_rootmount, NULL, &va, NOCRED, vpp);
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev_t dev, vnode_t **vpp)
{
	struct vattr va;

	vattr_null(&va);
	va.va_type = VCHR;
	va.va_rdev = dev;

	return vcache_new(dead_rootmount, NULL, &va, NOCRED, vpp);
}
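
/*
 * Example (sketch): early swap or dump configuration needs a vnode for a
 * block device before any file system provides one; "rootdev" stands for
 * whatever dev_t the caller has resolved.
 *
 *	vnode_t *devvp;
 *
 *	if (bdevvp(rootdev, &devvp) != 0)
 *		panic("cannot get vnode for root device");
 */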

/*
 * Associate a buffer with a vnode.  There must already be a hold on
 * the vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{

	KASSERT(bp->b_vp == NULL);
	KASSERT(bp->b_objlock == &buffer_lock);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT((bp->b_cflags & BC_BUSY) != 0);
	KASSERT(!cv_has_waiters(&bp->b_done));

	vholdl(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;

	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
	bp->b_objlock = vp->v_interlock;
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct vnode *vp = bp->b_vp;

	KASSERT(vp != NULL);
	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT((bp->b_cflags & BC_BUSY) != 0);
	KASSERT(!cv_has_waiters(&bp->b_done));

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	if (vp->v_uobj.uo_npages == 0 && (vp->v_iflag & VI_ONWORKLST) &&
	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
		vp->v_iflag &= ~VI_WRMAPDIRTY;
		vn_syncer_remove_from_worklist(vp);
	}

	bp->b_objlock = &buffer_lock;
	bp->b_vp = NULL;
	holdrelel(vp);
}

/*
 * Reassign a buffer from one vnode list to another.
 * The list reassignment must be within the same vnode.
 * Used to assign file specific control information
 * (indirect blocks) to the list to which they belong.
 */
void
reassignbuf(struct buf *bp, struct vnode *vp)
{
	struct buflists *listheadp;
	int delayx;

	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((bp->b_cflags & BC_BUSY) != 0);

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_oflags & BO_DELWRI) == 0) {
		listheadp = &vp->v_cleanblkhd;
		if (vp->v_uobj.uo_npages == 0 &&
		    (vp->v_iflag & VI_ONWORKLST) &&
		    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
			vp->v_iflag &= ~VI_WRMAPDIRTY;
			vn_syncer_remove_from_worklist(vp);
		}
	} else {
		listheadp = &vp->v_dirtyblkhd;
		if ((vp->v_iflag & VI_ONWORKLST) == 0) {
			switch (vp->v_type) {
			case VDIR:
				delayx = dirdelay;
				break;
			case VBLK:
				if (spec_node_getmountedfs(vp) != NULL) {
					delayx = metadelay;
					break;
				}
				/* fall through */
			default:
				delayx = filedelay;
				break;
			}
			if (!vp->v_mount ||
			    (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
				vn_syncer_add_to_worklist(vp, delayx);
		}
	}
	bufinsvn(bp, listheadp);
}
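
/*
 * Worked example: when a buffer on a device vnode (type VBLK backing a
 * mounted file system, i.e. metadata) first becomes dirty, the vnode is
 * queued with metadelay (10s); a directory vnode would get dirdelay (15s)
 * and a regular file filedelay (30s), matching the delay policy described
 * above the syncer below.
 */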

/*
 * Lookup a vnode by device number and return it referenced.
 */
int
vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
{

	return (spec_node_lookup_by_dev(type, dev, vpp) == 0);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
	vnode_t *vp;
	dev_t dev;
	int mn;

	for (mn = minl; mn <= minh; mn++) {
		dev = makedev(maj, mn);
		while (spec_node_lookup_by_dev(type, dev, &vp) == 0) {
			VOP_REVOKE(vp, REVOKEALL);
			vrele(vp);
		}
	}
}
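
/*
 * Example (sketch, hypothetical driver): a detach routine revokes every
 * vnode referring to its devices before the devsw goes away, so stale
 * opens fail cleanly instead of reaching a detached driver.  "bmajor",
 * "cmajor" and "nunits" are assumed to be known to the driver.
 *
 *	vdevgone(bmajor, 0, nunits - 1, VBLK);
 *	vdevgone(cmajor, 0, nunits - 1, VCHR);
 */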

/*
 * The filesystem synchronizer mechanism - syncer.
 *
 * It is useful to delay writes of file data and filesystem metadata for
 * a certain amount of time so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To implement this,
 * vnodes are appended to a "workitem" queue.
 *
 * Most pending metadata should not wait for more than ten seconds.  Thus,
 * device vnodes backing mounted file systems (i.e. metadata) are delayed
 * only about half the time that file data is delayed.  Similarly,
 * directory updates are more critical, so they are delayed only about a
 * third of the time that file data is delayed.
 *
 * There are SYNCER_MAXDELAY queues that are processed in a round-robin
 * manner at a rate of one each second (driven off the filesystem syncer
 * thread).  The syncer_delayno variable indicates the next queue that is
 * to be processed.  Items that need to be processed soon are placed in
 * this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of e.g. fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) % syncer_last]
 *
 * The flag VI_ONWORKLST indicates that a vnode is currently on the queue.
 */

#define SYNCER_MAXDELAY		32

typedef TAILQ_HEAD(synclist, vnode) synclist_t;

static void	vn_syncer_add1(struct vnode *, int);
static void	sysctl_vfs_syncfs_setup(struct sysctllog **);

/*
 * Defines and variables for the syncer process.
 */
int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;			/* max time to delay syncing data */
time_t filedelay = 30;			/* time to delay syncing files */
time_t dirdelay  = 15;			/* time to delay syncing directories */
time_t metadelay = 10;			/* time to delay syncing metadata */
time_t lockdelay = 1;			/* time to delay if locking fails */

kmutex_t		syncer_mutex;	/* used to freeze syncer, long term */
static kmutex_t		syncer_data_lock; /* short term lock on data structs */

static int		syncer_delayno = 0;
static long		syncer_last;
static synclist_t *	syncer_workitem_pending;

static void
vn_initialize_syncerd(void)
{
	int i;

	syncer_last = SYNCER_MAXDELAY + 2;

	sysctl_vfs_syncfs_setup(NULL);

	syncer_workitem_pending =
	    kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP);

	for (i = 0; i < syncer_last; i++)
		TAILQ_INIT(&syncer_workitem_pending[i]);

	mutex_init(&syncer_mutex, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE);
}

/*
 * Return delay factor appropriate for the given file system.  For
 * WAPBL we use the sync vnode to burst out metadata updates: sync
 * those file systems more frequently.
 */
static inline int
sync_delay(struct mount *mp)
{

	return mp->mnt_wapbl != NULL ? metadelay : syncdelay;
}

/*
 * Compute the next slot index from delay.
 */
static inline int
sync_delay_slot(int delayx)
{

	if (delayx > syncer_maxdelay - 2)
		delayx = syncer_maxdelay - 2;
	return (syncer_delayno + delayx) % syncer_last;
}
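
/*
 * Worked example: with SYNCER_MAXDELAY == 32, syncer_last is 34.  A
 * request for a 40 second delay is first clamped to syncer_maxdelay - 2
 * == 30; if syncer_delayno is currently 10, the item lands in slot
 * (10 + 30) % 34 == 6, i.e. the wheel wraps around.
 */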

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add1(struct vnode *vp, int delayx)
{
	synclist_t *slp;

	KASSERT(mutex_owned(&syncer_data_lock));

	if (vp->v_iflag & VI_ONWORKLST) {
		/*
		 * Remove in order to adjust the position of the vnode.
		 * Note: called from sched_sync(), which will not hold
		 * interlock, therefore we cannot modify v_iflag here.
		 */
		slp = &syncer_workitem_pending[vp->v_synclist_slot];
		TAILQ_REMOVE(slp, vp, v_synclist);
	} else {
		KASSERT(mutex_owned(vp->v_interlock));
		vp->v_iflag |= VI_ONWORKLST;
	}

	vp->v_synclist_slot = sync_delay_slot(delayx);

	slp = &syncer_workitem_pending[vp->v_synclist_slot];
	TAILQ_INSERT_TAIL(slp, vp, v_synclist);
}

void
vn_syncer_add_to_worklist(struct vnode *vp, int delayx)
{

	KASSERT(mutex_owned(vp->v_interlock));

	mutex_enter(&syncer_data_lock);
	vn_syncer_add1(vp, delayx);
	mutex_exit(&syncer_data_lock);
}

/*
 * Remove an item from the syncer work queue.
 */
void
vn_syncer_remove_from_worklist(struct vnode *vp)
{
	synclist_t *slp;

	KASSERT(mutex_owned(vp->v_interlock));

	mutex_enter(&syncer_data_lock);
	if (vp->v_iflag & VI_ONWORKLST) {
		vp->v_iflag &= ~VI_ONWORKLST;
		slp = &syncer_workitem_pending[vp->v_synclist_slot];
		TAILQ_REMOVE(slp, vp, v_synclist);
	}
	mutex_exit(&syncer_data_lock);
}

/*
 * Add this mount point to the syncer.
 */
void
vfs_syncer_add_to_worklist(struct mount *mp)
{
	static int start, incr, next;
	int vdelay;

	KASSERT(mutex_owned(&mp->mnt_updating));
	KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) == 0);

	/*
	 * We attempt to scatter the mount points on the list
	 * so that they will go off at evenly distributed times
	 * even if all the filesystems are mounted at once.
	 */

	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	mp->mnt_iflag |= IMNT_ONWORKLIST;
	vdelay = sync_delay(mp);
	mp->mnt_synclist_slot = vdelay > 0 ? next % vdelay : 0;
}
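
/*
 * Worked example: with syncer_maxdelay == 32 the static state starts at
 * zero, so the first mount resets it to start 16/incr 32 and uses next
 * == 16.  The second mount computes next == 48, which overflows the
 * wheel and halves the stride (start 8, incr 16, next == 8); the third
 * gets 24, the fourth overflows again down to 4, and so on, spreading
 * the slots across the wheel.
 */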

/*
 * Remove the mount point from the syncer.
 */
void
vfs_syncer_remove_from_worklist(struct mount *mp)
{

	KASSERT(mutex_owned(&mp->mnt_updating));
	KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) != 0);

	mp->mnt_iflag &= ~IMNT_ONWORKLIST;
}

/*
 * Try lazy sync, return true on success.
 */
static bool
lazy_sync_vnode(struct vnode *vp)
{
	bool synced;

	KASSERT(mutex_owned(&syncer_data_lock));

	synced = false;
	/* We are locking in the wrong direction. */
	if (mutex_tryenter(vp->v_interlock)) {
		mutex_exit(&syncer_data_lock);
		if (vget(vp, LK_NOWAIT, false /* !wait */) == 0) {
			if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
				synced = true;
				(void) VOP_FSYNC(vp, curlwp->l_cred,
				    FSYNC_LAZY, 0, 0);
				vput(vp);
			} else
				vrele(vp);
		}
		mutex_enter(&syncer_data_lock);
	}
	return synced;
}

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void *arg)
{
	synclist_t *slp;
	struct vnode *vp;
	struct mount *mp, *nmp;
	time_t starttime;
	bool synced;

	for (;;) {
		mutex_enter(&syncer_mutex);

		starttime = time_second;

		/*
		 * Sync mounts whose dirty time has expired.
		 */
		mutex_enter(&mountlist_lock);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0 ||
			    mp->mnt_synclist_slot != syncer_delayno) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			mp->mnt_synclist_slot = sync_delay_slot(sync_delay(mp));
			if (vfs_busy(mp, &nmp))
				continue;
			VFS_SYNC(mp, MNT_LAZY, curlwp->l_cred);
			vfs_unbusy(mp, false, &nmp);
		}
		mutex_exit(&mountlist_lock);

		mutex_enter(&syncer_data_lock);

		/*
		 * Push files whose dirty time has expired.
		 */
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno >= syncer_last)
			syncer_delayno = 0;

		while ((vp = TAILQ_FIRST(slp)) != NULL) {
			synced = lazy_sync_vnode(vp);

			/*
			 * XXX The vnode may have been recycled, in which
			 * case it may have a new identity.
			 */
			if (TAILQ_FIRST(slp) == vp) {
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 *
				 * Try again sooner rather than later if
				 * we were unable to lock the vnode.  Lock
				 * failure should not prevent us from doing
				 * the sync "soon".
				 *
				 * If we did lock it and still arrive here,
				 * it is likely that lazy sync is in progress and
				 * so the vnode still has dirty metadata.
				 * syncdelay is mainly to get this vnode out
				 * of the way so we do not consider it again
				 * "soon" in this loop, so the delay time is
				 * not critical as long as it is not "soon".
				 * While write-back strategy is the file
				 * system's domain, we expect write-back to
				 * occur no later than syncdelay seconds
				 * into the future.
				 */
				vn_syncer_add1(vp,
				    synced ? syncdelay : lockdelay);
			}
		}
		mutex_exit(&syncer_mutex);

		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime) {
			kpause("syncer", false, hz, &syncer_data_lock);
		}
		mutex_exit(&syncer_data_lock);
	}
}

static void
sysctl_vfs_syncfs_setup(struct sysctllog **clog)
{
	const struct sysctlnode *rnode, *cnode;

	sysctl_createv(clog, 0, NULL, &rnode,
			CTLFLAG_PERMANENT,
			CTLTYPE_NODE, "sync",
			SYSCTL_DESCR("syncer options"),
			NULL, 0, NULL, 0,
			CTL_VFS, CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_QUAD, "delay",
			SYSCTL_DESCR("max time to delay syncing data"),
			NULL, 0, &syncdelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_QUAD, "filedelay",
			SYSCTL_DESCR("time to delay syncing files"),
			NULL, 0, &filedelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_QUAD, "dirdelay",
			SYSCTL_DESCR("time to delay syncing directories"),
			NULL, 0, &dirdelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_QUAD, "metadelay",
			SYSCTL_DESCR("time to delay syncing metadata"),
			NULL, 0, &metadelay, 0,
			CTL_CREATE, CTL_EOL);
}

/*
 * sysctl helper routine to return list of supported fstypes
 */
int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	sysctl_unlock();
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
		else {
			memset(bf, 0, sizeof(bf));
			if (first) {
				strncpy(bf, v->vfs_name, sizeof(bf));
				first = 0;
			} else {
				bf[0] = ' ';
				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
			}
			bf[sizeof(bf)-1] = '\0';
			slen = strlen(bf);
			if (left < slen + 1)
				break;
			v->vfs_refcount++;
			mutex_exit(&vfs_list_lock);
			/* +1 to copy out the trailing NUL byte */
			error = copyout(bf, where, slen + 1);
			mutex_enter(&vfs_list_lock);
			v->vfs_refcount--;
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	mutex_exit(&vfs_list_lock);
	sysctl_relock();
	*oldlenp = needed;
	return (error);
}
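
/*
 * Example (userland sketch, assuming the handler is attached as
 * vfs.generic.fstypes as on NetBSD): the routine supports the usual
 * two-step size probe, so callers ask for the length first.
 *
 *	size_t len;
 *	char *buf;
 *
 *	if (sysctlbyname("vfs.generic.fstypes", NULL, &len, NULL, 0) == -1)
 *		err(EXIT_FAILURE, "sysctl");
 *	buf = malloc(len);
 *	if (sysctlbyname("vfs.generic.fstypes", buf, &len, NULL, 0) == -1)
 *		err(EXIT_FAILURE, "sysctl");
 *	printf("%s\n", buf);
 */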

int kinfo_vdebug = 1;
int kinfo_vgetfailed;

#define KINFO_VNODESLOP	10

/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
	char *where = oldp;
	size_t *sizep = oldlenp;
	struct mount *mp, *nmp;
	vnode_t *vp, vbuf;
	struct vnode_iterator *marker;
	char *bp = where;
	char *ewhere;
	int error;

	if (namelen != 0)
		return (EOPNOTSUPP);
	if (newp != NULL)
		return (EPERM);

#define VPTRSZ	sizeof(vnode_t *)
#define VNODESZ	sizeof(vnode_t)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	sysctl_unlock();
	mutex_enter(&mountlist_lock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, &nmp)) {
			continue;
		}
		vfs_vnode_iterator_init(mp, &marker);
		while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) {
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				vrele(vp);
				vfs_vnode_iterator_destroy(marker);
				vfs_unbusy(mp, false, NULL);
				sysctl_relock();
				*sizep = bp - where;
				return (ENOMEM);
			}
			memcpy(&vbuf, vp, VNODESZ);
			if ((error = copyout(&vp, bp, VPTRSZ)) ||
			    (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
				vrele(vp);
				vfs_vnode_iterator_destroy(marker);
				vfs_unbusy(mp, false, NULL);
				sysctl_relock();
				return (error);
			}
			vrele(vp);
			bp += VPTRSZ + VNODESZ;
		}
		vfs_vnode_iterator_destroy(marker);
		vfs_unbusy(mp, false, &nmp);
	}
	mutex_exit(&mountlist_lock);
	sysctl_relock();

	*sizep = bp - where;
	return (0);
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{

	memset(vap, 0, sizeof(*vap));

	vap->va_type = VNON;

	/*
	 * Assign each member individually so that this stays safe even
	 * if the size and signedness of the members vary.
	 */
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_size = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_atime.tv_sec =
	    vap->va_mtime.tv_sec =
	    vap->va_ctime.tv_sec =
	    vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec =
	    vap->va_mtime.tv_nsec =
	    vap->va_ctime.tv_nsec =
	    vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_bytes = VNOVAL;
}
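
/*
 * Example (sketch): callers use vattr_null() to build a "change only
 * these attributes" request; fields left at VNOVAL are ignored by
 * VOP_SETATTR().  The vnode is assumed to be locked by the caller.
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);
 *	va.va_size = newsize;
 *	error = VOP_SETATTR(vp, &va, curlwp->l_cred);
 */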

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
#define ARRAY_PRINT(idx, arr) \
    ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")

const char * const vnode_tags[] = { VNODE_TAGS };
const char * const vnode_types[] = { VNODE_TYPES };
const char vnode_flagbits[] = VNODE_FLAGBITS;

/*
 * Print out a description of a vnode.
 */
void
vprint(const char *label, struct vnode *vp)
{
	char bf[96];
	int flag;

	flag = vp->v_iflag | vp->v_vflag | vp->v_uflag;
	snprintb(bf, sizeof(bf), vnode_flagbits, flag);

	if (label != NULL)
		printf("%s: ", label);
	printf("vnode @ %p, flags (%s)\n\ttag %s(%d), type %s(%d), "
	    "usecount %d, writecount %d, holdcount %d\n"
	    "\tfreelisthd %p, mount %p, data %p lock %p\n",
	    vp, bf, ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
	    ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt,
	    vp->v_freelisthd, vp->v_mount, vp->v_data, &vp->v_lock);
	vcache_print(vp, "\t", printf);
	if (vp->v_data != NULL) {
		printf("\t");
		VOP_PRINT(vp);
	}
}

/* Deprecated. Kept for KPI compatibility. */
int
vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
    mode_t acc_mode, kauth_cred_t cred)
{

#ifdef DIAGNOSTIC
	printf("vaccess: deprecated interface used.\n");
#endif /* DIAGNOSTIC */

	return kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(acc_mode,
	    type, file_mode), NULL /* This may panic. */, NULL,
	    genfs_can_access(type, file_mode, uid, gid, acc_mode, cred));
}
/*
 * Given a file system name, look up the vfsops for that file system,
 * or return NULL if the file system isn't present in the kernel.
 */
struct vfsops *
vfs_getopsbyname(const char *name)
{
	struct vfsops *v;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (strcmp(v->vfs_name, name) == 0)
			break;
	}
	if (v != NULL)
		v->vfs_refcount++;
	mutex_exit(&vfs_list_lock);

	return (v);
}

void
copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
{
	const struct statvfs *mbp;

	if (sbp == (mbp = &mp->mnt_stat))
		return;

	(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
	sbp->f_fsid = mbp->f_fsid;
	sbp->f_owner = mbp->f_owner;
	sbp->f_flag = mbp->f_flag;
	sbp->f_syncwrites = mbp->f_syncwrites;
	sbp->f_asyncwrites = mbp->f_asyncwrites;
	sbp->f_syncreads = mbp->f_syncreads;
	sbp->f_asyncreads = mbp->f_asyncreads;
	(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
	(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
	    sizeof(sbp->f_fstypename));
	(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
	    sizeof(sbp->f_mntonname));
	(void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
	    sizeof(sbp->f_mntfromname));
	sbp->f_namemax = mbp->f_namemax;
}

int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
    const char *vfsname, struct mount *mp, struct lwp *l)
{
	int error;
	size_t size;
	struct statvfs *sfs = &mp->mnt_stat;
	int (*fun)(const void *, void *, size_t, size_t *);

	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsname,
	    sizeof(mp->mnt_stat.f_fstypename));

	if (onp) {
		struct cwdinfo *cwdi = l->l_proc->p_cwdi;
		fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
		if (cwdi->cwdi_rdir != NULL) {
			size_t len;
			char *bp;
			char *path = PNBUF_GET();

			bp = path + MAXPATHLEN;
			*--bp = '\0';
			rw_enter(&cwdi->cwdi_lock, RW_READER);
			error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
			    path, MAXPATHLEN / 2, 0, l);
			rw_exit(&cwdi->cwdi_lock);
			if (error) {
				PNBUF_PUT(path);
				return error;
			}

			len = strlen(bp);
			if (len > sizeof(sfs->f_mntonname) - 1)
				len = sizeof(sfs->f_mntonname) - 1;
			(void)strncpy(sfs->f_mntonname, bp, len);
			PNBUF_PUT(path);

			if (len < sizeof(sfs->f_mntonname) - 1) {
				error = (*fun)(onp, &sfs->f_mntonname[len],
				    sizeof(sfs->f_mntonname) - len - 1, &size);
				if (error)
					return error;
				size += len;
			} else {
				size = len;
			}
		} else {
			error = (*fun)(onp, &sfs->f_mntonname,
			    sizeof(sfs->f_mntonname) - 1, &size);
			if (error)
				return error;
		}
		(void)memset(sfs->f_mntonname + size, 0,
		    sizeof(sfs->f_mntonname) - size);
	}

	if (fromp) {
		fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
		error = (*fun)(fromp, sfs->f_mntfromname,
		    sizeof(sfs->f_mntfromname) - 1, &size);
		if (error)
			return error;
		(void)memset(sfs->f_mntfromname + size, 0,
		    sizeof(sfs->f_mntfromname) - size);
	}
	return 0;
}

void
vfs_timestamp(struct timespec *ts)
{

	nanotime(ts);
}

time_t	rootfstime;			/* recorded root fs time, if known */
void
setrootfstime(time_t t)
{
	rootfstime = t;
}

static const uint8_t vttodt_tab[] = {
	[VNON]	=	DT_UNKNOWN,
	[VREG]	=	DT_REG,
	[VDIR]	=	DT_DIR,
	[VBLK]	=	DT_BLK,
	[VCHR]	=	DT_CHR,
	[VLNK]	=	DT_LNK,
	[VSOCK]	=	DT_SOCK,
	[VFIFO]	=	DT_FIFO,
	[VBAD]	=	DT_UNKNOWN
};

uint8_t
vtype2dt(enum vtype vt)
{

	CTASSERT(VBAD == __arraycount(vttodt_tab) - 1);
	return vttodt_tab[vt];
}
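
/*
 * Example (sketch, "dp" being a hypothetical struct dirent the caller
 * is filling in): a file system's readdir implementation can derive the
 * d_type field from the vnode type it already knows instead of
 * hardcoding DT_* values.
 *
 *	dp->d_type = vtype2dt(vp->v_type);
 */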

int
VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c)
{
	int error;

	KERNEL_LOCK(1, NULL);
	error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c);
	KERNEL_UNLOCK_ONE(NULL);

	return error;
}

int
VFS_START(struct mount *mp, int a)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_start))(mp, a);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_UNMOUNT(struct mount *mp, int a)
{
	int error;

	KERNEL_LOCK(1, NULL);
	error = (*(mp->mnt_op->vfs_unmount))(mp, a);
	KERNEL_UNLOCK_ONE(NULL);

	return error;
}

int
VFS_ROOT(struct mount *mp, struct vnode **a)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_root))(mp, a);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_QUOTACTL(struct mount *mp, struct quotactl_args *args)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_quotactl))(mp, args);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_STATVFS(struct mount *mp, struct statvfs *a)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_statvfs))(mp, a);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_sync))(mp, a, b);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_FHTOVP(struct mount *mp, struct fid *a, struct vnode **b)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b)
{
	int error;

	if ((vp->v_vflag & VV_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b);
	if ((vp->v_vflag & VV_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d)
{
	int error;

	KERNEL_LOCK(1, NULL);		/* XXXSMP check ffs */
	error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d);
	KERNEL_UNLOCK_ONE(NULL);	/* XXX */

	return error;
}

int
VFS_SUSPENDCTL(struct mount *mp, int a)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_suspendctl))(mp, a);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

#if defined(DDB) || defined(DEBUGPRINT)
static const char buf_flagbits[] = BUF_FLAGBITS;

void
vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...))
{
	char bf[1024];

	(*pr)("  vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%"
	    PRIx64 " dev 0x%x\n",
	    bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev);

	snprintb(bf, sizeof(bf),
	    buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags);
	(*pr)("  error %d flags 0x%s\n", bp->b_error, bf);

	(*pr)("  bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
		  bp->b_bufsize, bp->b_bcount, bp->b_resid);
	(*pr)("  data %p saveaddr %p\n",
		  bp->b_data, bp->b_saveaddr);
	(*pr)("  iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock);
}

void
vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...))
{
	char bf[256];

	uvm_object_printit(&vp->v_uobj, full, pr);
	snprintb(bf, sizeof(bf),
	    vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag);
	(*pr)("\nVNODE flags %s\n", bf);
	(*pr)("mp %p numoutput %d size 0x%llx writesize 0x%llx\n",
	      vp->v_mount, vp->v_numoutput, vp->v_size, vp->v_writesize);

	(*pr)("data %p writecount %ld holdcnt %ld\n",
	      vp->v_data, vp->v_writecount, vp->v_holdcnt);

	(*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
	      ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
	      ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
	      vp->v_mount, vp->v_mountedhere);

	(*pr)("v_lock %p\n", &vp->v_lock);

	vcache_print(vp, "", pr);

	if (full) {
		struct buf *bp;

		(*pr)("clean bufs:\n");
		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}

		(*pr)("dirty bufs:\n");
		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}
	}
}

void
vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...))
{
	char sbuf[256];

	(*pr)("vnodecovered = %p data = %p\n",
			mp->mnt_vnodecovered,mp->mnt_data);

	(*pr)("fs_bshift %d dev_bshift = %d\n",
			mp->mnt_fs_bshift,mp->mnt_dev_bshift);

	snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag);
	(*pr)("flag = %s\n", sbuf);

	snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag);
	(*pr)("iflag = %s\n", sbuf);

	(*pr)("refcnt = %d unmounting @ %p updating @ %p\n", mp->mnt_refcnt,
	    &mp->mnt_unmounting, &mp->mnt_updating);

	(*pr)("statvfs cache:\n");
	(*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize);
	(*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize);
	(*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize);

	(*pr)("\tblocks = %"PRIu64"\n",mp->mnt_stat.f_blocks);
	(*pr)("\tbfree = %"PRIu64"\n",mp->mnt_stat.f_bfree);
	(*pr)("\tbavail = %"PRIu64"\n",mp->mnt_stat.f_bavail);
	(*pr)("\tbresvd = %"PRIu64"\n",mp->mnt_stat.f_bresvd);

	(*pr)("\tfiles = %"PRIu64"\n",mp->mnt_stat.f_files);
	(*pr)("\tffree = %"PRIu64"\n",mp->mnt_stat.f_ffree);
	(*pr)("\tfavail = %"PRIu64"\n",mp->mnt_stat.f_favail);
	(*pr)("\tfresvd = %"PRIu64"\n",mp->mnt_stat.f_fresvd);

	(*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
			mp->mnt_stat.f_fsidx.__fsid_val[0],
			mp->mnt_stat.f_fsidx.__fsid_val[1]);

	(*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner);
	(*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax);

	snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag);

	(*pr)("\tflag = %s\n",sbuf);
	(*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites);
	(*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites);
	(*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads);
	(*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads);
	(*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename);
	(*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname);
	(*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname);

	{
		int cnt = 0;
		struct vnode *vp;
		(*pr)("locked vnodes =");
		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp)) {
				if ((++cnt % 6) == 0) {
					(*pr)(" %p,\n\t", vp);
				} else {
					(*pr)(" %p,", vp);
				}
			}
		}
		(*pr)("\n");
	}

	if (full) {
		int cnt = 0;
		struct vnode *vp;
		(*pr)("all vnodes =");
		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (!TAILQ_NEXT(vp, v_mntvnodes)) {
				(*pr)(" %p", vp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", vp);
			} else {
				(*pr)(" %p,", vp);
			}
		}
		(*pr)("\n");
	}
}

/*
 * List all of the locked vnodes in the system.
 */
void printlockedvnodes(void);

void
printlockedvnodes(void)
{
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	mutex_enter(&mountlist_lock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, &nmp)) {
			continue;
		}
		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp))
				vprint(NULL, vp);
		}
		mutex_enter(&mountlist_lock);
		vfs_unbusy(mp, false, &nmp);
	}
	mutex_exit(&mountlist_lock);
}

#endif /* DDB || DEBUGPRINT */