1 /* $OpenBSD: vfs_subr.c,v 1.325 2024/10/31 10:06:51 mvs Exp $ */
2 /* $NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $ */
3
4 /*
5 * Copyright (c) 1989, 1993
6 * The Regents of the University of California. All rights reserved.
7 * (c) UNIX System Laboratories, Inc.
8 * All or some portions of this file are derived from material licensed
9 * to the University of California by American Telephone and Telegraph
10 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
11 * the permission of UNIX System Laboratories, Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
38 */
39
40 /*
41 * External virtual filesystem routines
42 */
43
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/proc.h>
47 #include <sys/sysctl.h>
48 #include <sys/mount.h>
49 #include <sys/fcntl.h>
50 #include <sys/conf.h>
51 #include <sys/vnode.h>
52 #include <sys/lock.h>
53 #include <sys/lockf.h>
54 #include <sys/stat.h>
55 #include <sys/acct.h>
56 #include <sys/namei.h>
57 #include <sys/ucred.h>
58 #include <sys/buf.h>
59 #include <sys/errno.h>
60 #include <sys/malloc.h>
61 #include <sys/mbuf.h>
62 #include <sys/syscallargs.h>
63 #include <sys/pool.h>
64 #include <sys/tree.h>
65 #include <sys/specdev.h>
66 #include <sys/atomic.h>
67
68 #include <netinet/in.h>
69
70 #include <uvm/uvm_extern.h>
71 #include <uvm/uvm_vnode.h>
72
73 #include "softraid.h"
74
75 /*
76 * Locks used to protect data:
77 * a atomic
78 */
79
80 void sr_quiesce(void);
81
82 enum vtype iftovt_tab[16] = {
83 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
84 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
85 };
86
87 int vttoif_tab[9] = {
88 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
89 S_IFSOCK, S_IFIFO, S_IFMT,
90 };
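/*
 * Note: these tables back the IFTOVT()/VTTOIF() conversion macros in
 * <sys/vnode.h>: IFTOVT() indexes iftovt_tab with the S_IFMT bits of a
 * mode shifted down by 12, and VTTOIF() indexes vttoif_tab with the
 * vnode type.
 */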
91
92 int prtactive = 0; /* 1 => print out reclaim of active vnodes */
93 int suid_clear = 1; /* [a] 1 => clear SUID / SGID on owner change */
94
95 /*
96 * Insq/Remq for the vnode usage lists.
97 */
98 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
99 #define bufremvn(bp) { \
100 LIST_REMOVE(bp, b_vnbufs); \
101 LIST_NEXT(bp, b_vnbufs) = NOLIST; \
102 }
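/*
 * bufinsvn() puts a buffer on one of the per-vnode buffer lists
 * (v_cleanblkhd or v_dirtyblkhd); bufremvn() takes it off and marks it
 * as off-list by setting its forward pointer to NOLIST, which is what
 * brelvp() and reassignbuf() below test before removing a buffer again.
 */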
103
104 TAILQ_HEAD(freelst, vnode);
105 struct freelst vnode_hold_list; /* list of vnodes referencing buffers */
106 struct freelst vnode_free_list; /* vnode free list */
107
108 struct mntlist mountlist; /* mounted filesystem list */
109
110 void vclean(struct vnode *, int, struct proc *);
111
112 void insmntque(struct vnode *, struct mount *);
113 int getdevvp(dev_t, struct vnode **, enum vtype);
114
115 int vfs_hang_addrlist(struct mount *, struct netexport *,
116 struct export_args *);
117 int vfs_free_netcred(struct radix_node *, void *, u_int);
118 void vfs_free_addrlist(struct netexport *);
119 void vputonfreelist(struct vnode *);
120
121 int vflush_vnode(struct vnode *, void *);
122 int maxvnodes;
123
124 struct mutex vnode_mtx = MUTEX_INITIALIZER(IPL_BIO);
125
126 void vfs_unmountall(void);
127
128 #ifdef DEBUG
129 void printlockedvnodes(void);
130 #endif
131
132 struct pool vnode_pool;
133 struct pool uvm_vnode_pool;
134
135 static inline int rb_buf_compare(const struct buf *b1, const struct buf *b2);
136 RBT_GENERATE(buf_rb_bufs, buf, b_rbbufs, rb_buf_compare);
137
138 static inline int
139 rb_buf_compare(const struct buf *b1, const struct buf *b2)
140 {
141 if (b1->b_lblkno < b2->b_lblkno)
142 return(-1);
143 if (b1->b_lblkno > b2->b_lblkno)
144 return(1);
145 return(0);
146 }
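/*
 * rb_buf_compare() orders each vnode's red-black tree of buffers
 * (v_bufs_tree, generated by RBT_GENERATE above) by logical block
 * number, so a vnode's buffers can be looked up by b_lblkno.
 */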
147
148 /*
149 * Initialize the vnode management data structures.
150 */
151 void
152 vntblinit(void)
153 {
154 /* buffer cache may need a vnode for each buffer */
155 maxvnodes = 2 * initialvnodes;
156 pool_init(&vnode_pool, sizeof(struct vnode), 0, IPL_NONE,
157 PR_WAITOK, "vnodes", NULL);
158 pool_init(&uvm_vnode_pool, sizeof(struct uvm_vnode), 0, IPL_NONE,
159 PR_WAITOK, "uvmvnodes", NULL);
160 TAILQ_INIT(&vnode_hold_list);
161 TAILQ_INIT(&vnode_free_list);
162 TAILQ_INIT(&mountlist);
163 /*
164 * Initialize the filesystem syncer.
165 */
166 vn_initialize_syncerd();
167
168 #ifdef NFSSERVER
169 rn_init(sizeof(struct sockaddr_in));
170 #endif /* NFSSERVER */
171 }
172
173 /*
174 * Allocate a mount point.
175 *
176 * The returned mount point is marked as busy.
177 */
178 struct mount *
179 vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp)
180 {
181 struct mount *mp;
182
183 mp = malloc(sizeof(*mp), M_MOUNT, M_WAITOK|M_ZERO);
184 rw_init_flags(&mp->mnt_lock, "vfslock", RWL_IS_VNODE);
185 (void)vfs_busy(mp, VB_READ|VB_NOWAIT);
186
187 TAILQ_INIT(&mp->mnt_vnodelist);
188 mp->mnt_vnodecovered = vp;
189
190 atomic_inc_int(&vfsp->vfc_refcount);
191 mp->mnt_vfc = vfsp;
192 mp->mnt_op = vfsp->vfc_vfsops;
193 mp->mnt_flag = vfsp->vfc_flags;
194 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
195
196 return (mp);
197 }
198
199 /*
200 * Release a mount point.
201 */
202 void
203 vfs_mount_free(struct mount *mp)
204 {
205 atomic_dec_int(&mp->mnt_vfc->vfc_refcount);
206 free(mp, M_MOUNT, sizeof(*mp));
207 }
208
209 /*
210 * Mark a mount point as busy. Used to synchronize access and to delay
211 * unmounting.
212 *
213 * Default behaviour is to attempt getting a READ lock and in case of an
214 * ongoing unmount, to wait for it to finish and then return failure.
215 */
216 int
217 vfs_busy(struct mount *mp, int flags)
218 {
219 int rwflags = 0;
220
221 if (flags & VB_WRITE)
222 rwflags |= RW_WRITE;
223 else
224 rwflags |= RW_READ;
225
226 if (flags & VB_WAIT)
227 rwflags |= RW_SLEEPFAIL;
228 else
229 rwflags |= RW_NOSLEEP;
230
231 #ifdef WITNESS
232 if (flags & VB_DUPOK)
233 rwflags |= RW_DUPOK;
234 #endif
235
236 if (rw_enter(&mp->mnt_lock, rwflags))
237 return (EBUSY);
238
239 return (0);
240 }
241
242 /*
243 * Free a busy file system
244 */
245 void
246 vfs_unbusy(struct mount *mp)
247 {
248 rw_exit(&mp->mnt_lock);
249 }
250
251 int
252 vfs_isbusy(struct mount *mp)
253 {
254 return (rw_status(&mp->mnt_lock) != 0);
255 }
256
257 /*
258 * Lookup a filesystem type, and if found allocate and initialize
259 * a mount structure for it.
260 *
261 * Devname is usually updated by mount(8) after booting.
262 */
263 int
264 vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp)
265 {
266 struct vfsconf *vfsp;
267 struct mount *mp;
268
269 vfsp = vfs_byname(fstypename);
270 if (vfsp == NULL)
271 return (ENODEV);
272 mp = vfs_mount_alloc(NULLVP, vfsp);
273 mp->mnt_flag |= MNT_RDONLY;
274 mp->mnt_stat.f_mntonname[0] = '/';
275 strlcpy(mp->mnt_stat.f_mntfromname, devname, MNAMELEN);
276 strlcpy(mp->mnt_stat.f_mntfromspec, devname, MNAMELEN);
277 *mpp = mp;
278 return (0);
279 }
280
281 /*
282 * Lookup a mount point by filesystem identifier.
283 */
284 struct mount *
285 vfs_getvfs(fsid_t *fsid)
286 {
287 struct mount *mp;
288
289 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
290 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
291 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
292 return (mp);
293 }
294 }
295
296 return (NULL);
297 }
298
299
300 /*
301 * Get a new unique fsid
302 */
303 void
304 vfs_getnewfsid(struct mount *mp)
305 {
306 static u_short xxxfs_mntid;
307
308 fsid_t tfsid;
309 int mtype;
310
311 mtype = mp->mnt_vfc->vfc_typenum;
312 mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
313 mp->mnt_stat.f_fsid.val[1] = mtype;
314 if (xxxfs_mntid == 0)
315 ++xxxfs_mntid;
316 tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
317 tfsid.val[1] = mtype;
318 if (!TAILQ_EMPTY(&mountlist)) {
319 while (vfs_getvfs(&tfsid)) {
320 tfsid.val[0]++;
321 xxxfs_mntid++;
322 }
323 }
324 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
325 }
326
327 /*
328 * Set vnode attributes to VNOVAL
329 */
330 void
331 vattr_null(struct vattr *vap)
332 {
333
334 vap->va_type = VNON;
335 /*
336 * Don't get fancy: u_quad_t = u_int = VNOVAL leaves the u_quad_t
337 * with 2^31-1 instead of 2^64-1. Just write'm out and let
338 * the compiler do its job.
339 */
340 vap->va_mode = VNOVAL;
341 vap->va_nlink = VNOVAL;
342 vap->va_uid = VNOVAL;
343 vap->va_gid = VNOVAL;
344 vap->va_fsid = VNOVAL;
345 vap->va_fileid = VNOVAL;
346 vap->va_size = VNOVAL;
347 vap->va_blocksize = VNOVAL;
348 vap->va_atime.tv_sec = VNOVAL;
349 vap->va_atime.tv_nsec = VNOVAL;
350 vap->va_mtime.tv_sec = VNOVAL;
351 vap->va_mtime.tv_nsec = VNOVAL;
352 vap->va_ctime.tv_sec = VNOVAL;
353 vap->va_ctime.tv_nsec = VNOVAL;
354 vap->va_gen = VNOVAL;
355 vap->va_flags = VNOVAL;
356 vap->va_rdev = VNOVAL;
357 vap->va_bytes = VNOVAL;
358 vap->va_filerev = VNOVAL;
359 vap->va_vaflags = 0;
360 }
361
362 /*
363 * Routines having to do with the management of the vnode table.
364 */
365 long numvnodes;
366
367 /*
368 * Return the next vnode from the free list.
369 */
370 int
371 getnewvnode(enum vtagtype tag, struct mount *mp, const struct vops *vops,
372 struct vnode **vpp)
373 {
374 struct proc *p = curproc;
375 struct freelst *listhd;
376 static int toggle;
377 struct vnode *vp;
378 int s;
379
380 /*
381 * allow maxvnodes to increase if the buffer cache itself
382 * is big enough to justify it. (we don't shrink it ever)
383 */
384 maxvnodes = maxvnodes < bcstats.numbufs ? bcstats.numbufs
385 : maxvnodes;
386
387 /*
388 * We must choose whether to allocate a new vnode or recycle an
389 * existing one. The criterion for allocating a new one is that
390 * the total number of vnodes is less than the number desired or
391 * there are no vnodes on either free list. Generally we only
392 * want to recycle vnodes that have no buffers associated with
393 * them, so we look first on the vnode_free_list. If it is empty,
394 * we next consider vnodes with referencing buffers on the
395 * vnode_hold_list. The toggle ensures that half the time we
396 * will use a buffer from the vnode_hold_list, and half the time
397 * we will allocate a new one unless the list has grown to twice
398 * the desired size. We are reticent to recycle vnodes from the
399 * vnode_hold_list because we will lose the identity of all its
400 * referencing buffers.
401 */
402 toggle ^= 1;
403 if (numvnodes / 2 > maxvnodes)
404 toggle = 0;
405
406 s = splbio();
407 if ((numvnodes < maxvnodes) ||
408 ((TAILQ_FIRST(listhd = &vnode_free_list) == NULL) &&
409 ((TAILQ_FIRST(listhd = &vnode_hold_list) == NULL) || toggle))) {
410 splx(s);
411 vp = pool_get(&vnode_pool, PR_WAITOK | PR_ZERO);
412 vp->v_uvm = pool_get(&uvm_vnode_pool, PR_WAITOK | PR_ZERO);
413 vp->v_uvm->u_vnode = vp;
414 uvm_obj_init(&vp->v_uvm->u_obj, &uvm_vnodeops, 0);
415 RBT_INIT(buf_rb_bufs, &vp->v_bufs_tree);
416 cache_tree_init(&vp->v_nc_tree);
417 TAILQ_INIT(&vp->v_cache_dst);
418 numvnodes++;
419 } else {
420 TAILQ_FOREACH(vp, listhd, v_freelist) {
421 if (VOP_ISLOCKED(vp) == 0)
422 break;
423 }
424 /*
425 * Unless this is a bad time of the month, at most
426 * the first NCPUS items on the free list are
427 * locked, so this is close enough to being empty.
428 */
429 if (vp == NULL) {
430 splx(s);
431 tablefull("vnode");
432 *vpp = NULL;
433 return (ENFILE);
434 }
435
436 #ifdef DIAGNOSTIC
437 if (vp->v_usecount) {
438 vprint("free vnode", vp);
439 panic("free vnode isn't");
440 }
441 #endif
442
443 TAILQ_REMOVE(listhd, vp, v_freelist);
444 vp->v_bioflag &= ~VBIOONFREELIST;
445 splx(s);
446
447 if (vp->v_type != VBAD)
448 vgonel(vp, p);
449 #ifdef DIAGNOSTIC
450 if (vp->v_data) {
451 vprint("cleaned vnode", vp);
452 panic("cleaned vnode isn't");
453 }
454 s = splbio();
455 if (vp->v_numoutput)
456 panic("Clean vnode has pending I/O's");
457 splx(s);
458 #endif
459 vp->v_flag = 0;
460 vp->v_socket = NULL;
461 }
462 cache_purge(vp);
463 vp->v_type = VNON;
464 vp->v_tag = tag;
465 vp->v_op = vops;
466 insmntque(vp, mp);
467 *vpp = vp;
468 vp->v_usecount = 1;
469 vp->v_data = NULL;
470 return (0);
471 }
472
473 /*
474 * Move a vnode from one mount queue to another.
475 */
476 void
477 insmntque(struct vnode *vp, struct mount *mp)
478 {
479 /*
480 * Delete from old mount point vnode list, if on one.
481 */
482 if (vp->v_mount != NULL)
483 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
484 /*
485 * Insert into list of vnodes for the new mount point, if available.
486 */
487 if ((vp->v_mount = mp) != NULL)
488 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
489 }
490
491 /*
492 * Create a vnode for a block device.
493 * Used for root filesystem, argdev, and swap areas.
494 * Also used for memory file system special devices.
495 */
496 int
497 bdevvp(dev_t dev, struct vnode **vpp)
498 {
499 return (getdevvp(dev, vpp, VBLK));
500 }
501
502 /*
503 * Create a vnode for a character device.
504 * Used for console handling.
505 */
506 int
507 cdevvp(dev_t dev, struct vnode **vpp)
508 {
509 return (getdevvp(dev, vpp, VCHR));
510 }
511
512 /*
513 * Create a vnode for a device.
514 * Used by bdevvp (block device) for root file system etc.,
515 * and by cdevvp (character device) for console.
516 */
517 int
518 getdevvp(dev_t dev, struct vnode **vpp, enum vtype type)
519 {
520 struct vnode *vp;
521 struct vnode *nvp;
522 int error;
523
524 if (dev == NODEV) {
525 *vpp = NULLVP;
526 return (0);
527 }
528 error = getnewvnode(VT_NON, NULL, &spec_vops, &nvp);
529 if (error) {
530 *vpp = NULLVP;
531 return (error);
532 }
533 vp = nvp;
534 vp->v_type = type;
535 if ((nvp = checkalias(vp, dev, NULL)) != NULL) {
536 vput(vp);
537 vp = nvp;
538 }
539 if (vp->v_type == VCHR && cdevsw[major(vp->v_rdev)].d_type == D_TTY)
540 vp->v_flag |= VISTTY;
541 *vpp = vp;
542 return (0);
543 }
544
545 /*
546 * Check to see if the new vnode represents a special device
547 * for which we already have a vnode (either because of
548 * bdevvp() or because of a different vnode representing
549 * the same block device). If such an alias exists, deallocate
550 * the existing contents and return the aliased vnode. The
551 * caller is responsible for filling it with its new contents.
552 */
553 struct vnode *
554 checkalias(struct vnode *nvp, dev_t nvp_rdev, struct mount *mp)
555 {
556 struct proc *p = curproc;
557 struct vnode *vp;
558 struct vnodechain *vchain;
559
560 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
561 return (NULLVP);
562
563 vchain = &speclisth[SPECHASH(nvp_rdev)];
564 loop:
565 SLIST_FOREACH(vp, vchain, v_specnext) {
566 if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) {
567 continue;
568 }
569 /*
570 * Alias, but not in use, so flush it out.
571 */
572 if (vp->v_usecount == 0) {
573 vgonel(vp, p);
574 goto loop;
575 }
576 if (vget(vp, LK_EXCLUSIVE)) {
577 goto loop;
578 }
579 break;
580 }
581
582 /*
583 * Common case is actually in the if statement
584 */
585 if (vp == NULL || !(vp->v_tag == VT_NON && vp->v_type == VBLK)) {
586 nvp->v_specinfo = malloc(sizeof(struct specinfo), M_VNODE,
587 M_WAITOK);
588 nvp->v_rdev = nvp_rdev;
589 nvp->v_hashchain = vchain;
590 nvp->v_specmountpoint = NULL;
591 nvp->v_speclockf = NULL;
592 nvp->v_specbitmap = NULL;
593 if (nvp->v_type == VCHR &&
594 (cdevsw[major(nvp_rdev)].d_flags & D_CLONE) &&
595 (minor(nvp_rdev) >> CLONE_SHIFT == 0)) {
596 if (vp != NULLVP)
597 nvp->v_specbitmap = vp->v_specbitmap;
598 else
599 nvp->v_specbitmap = malloc(CLONE_MAPSZ,
600 M_VNODE, M_WAITOK | M_ZERO);
601 }
602 SLIST_INSERT_HEAD(vchain, nvp, v_specnext);
603 if (vp != NULLVP) {
604 nvp->v_flag |= VALIASED;
605 vp->v_flag |= VALIASED;
606 vput(vp);
607 }
608 return (NULLVP);
609 }
610
611 /*
612 * This code is the uncommon case. It is called in case
613 * we found an alias that was VT_NON && vtype of VBLK
614 * This means we found a block device that was created
615 * using bdevvp.
616 * An example of such a vnode is the root partition device vnode
617 * created in ffs_mountroot.
618 *
619 * The vnodes created by bdevvp should not be aliased (why?).
620 */
621
622 VOP_UNLOCK(vp);
623 vclean(vp, 0, p);
624 vp->v_op = nvp->v_op;
625 vp->v_tag = nvp->v_tag;
626 nvp->v_type = VNON;
627 insmntque(vp, mp);
628 return (vp);
629 }
630
631 /*
632 * Grab a particular vnode from the free list, increment its
633 * reference count and lock it. If the vnode lock bit is set,
634 * the vnode is being eliminated in vgone. In that case, we
635 * cannot grab it, so the process is awakened when the
636 * transition is completed, and an error code is returned to
637 * indicate that the vnode is no longer usable, possibly
638 * having been changed to a new file system type.
639 */
640 int
641 vget(struct vnode *vp, int flags)
642 {
643 int error, s, onfreelist;
644
645 /*
646 * If the vnode is in the process of being cleaned out for
647 * another use, we wait for the cleaning to finish and then
648 * return failure. Cleaning is determined by checking that
649 * the VXLOCK flag is set.
650 */
651 mtx_enter(&vnode_mtx);
652 if (vp->v_lflag & VXLOCK) {
653 if (flags & LK_NOWAIT) {
654 mtx_leave(&vnode_mtx);
655 return (EBUSY);
656 }
657
658 vp->v_lflag |= VXWANT;
659 msleep_nsec(vp, &vnode_mtx, PINOD, "vget", INFSLP);
660 mtx_leave(&vnode_mtx);
661 return (ENOENT);
662 }
663 mtx_leave(&vnode_mtx);
664
665 s = splbio();
666 onfreelist = vp->v_bioflag & VBIOONFREELIST;
667 if (vp->v_usecount == 0 && onfreelist) {
668 if (vp->v_holdcnt > 0)
669 TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
670 else
671 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
672 vp->v_bioflag &= ~VBIOONFREELIST;
673 }
674 splx(s);
675
676 vp->v_usecount++;
677 if (flags & LK_TYPE_MASK) {
678 if ((error = vn_lock(vp, flags)) != 0) {
679 vp->v_usecount--;
680 if (vp->v_usecount == 0 && onfreelist)
681 vputonfreelist(vp);
682 }
683 return (error);
684 }
685
686 return (0);
687 }
688
689
690 /* Vnode reference. */
691 void
692 vref(struct vnode *vp)
693 {
694 KERNEL_ASSERT_LOCKED();
695
696 #ifdef DIAGNOSTIC
697 if (vp->v_usecount == 0)
698 panic("vref used where vget required");
699 if (vp->v_type == VNON)
700 panic("vref on a VNON vnode");
701 #endif
702 vp->v_usecount++;
703 }
704
705 void
706 vputonfreelist(struct vnode *vp)
707 {
708 int s;
709 struct freelst *lst;
710
711 s = splbio();
712 #ifdef DIAGNOSTIC
713 if (vp->v_usecount != 0)
714 panic("Use count is not zero!");
715
716 /*
717 * If the hold count is still positive, one or many threads could still
718 * be waiting on the vnode lock inside uvn_io().
719 */
720 if (vp->v_holdcnt == 0 && vp->v_lockcount != 0)
721 panic("%s: lock count is not zero", __func__);
722
723 if (vp->v_bioflag & VBIOONFREELIST) {
724 vprint("vnode already on free list: ", vp);
725 panic("vnode already on free list");
726 }
727 #endif
728
729 vp->v_bioflag |= VBIOONFREELIST;
730 vp->v_bioflag &= ~VBIOERROR;
731
732 if (vp->v_holdcnt > 0)
733 lst = &vnode_hold_list;
734 else
735 lst = &vnode_free_list;
736
737 if (vp->v_type == VBAD)
738 TAILQ_INSERT_HEAD(lst, vp, v_freelist);
739 else
740 TAILQ_INSERT_TAIL(lst, vp, v_freelist);
741
742 splx(s);
743 }
744
745 /*
746 * vput(), just unlock and vrele()
747 */
748 void
749 vput(struct vnode *vp)
750 {
751 struct proc *p = curproc;
752 int s;
753
754 #ifdef DIAGNOSTIC
755 if (vp == NULL)
756 panic("vput: null vp");
757 #endif
758
759 #ifdef DIAGNOSTIC
760 if (vp->v_usecount == 0) {
761 vprint("vput: bad ref count", vp);
762 panic("vput: ref cnt");
763 }
764 #endif
765 vp->v_usecount--;
766 KASSERT(vp->v_usecount > 0 || vp->v_uvcount == 0);
767 if (vp->v_usecount > 0) {
768 VOP_UNLOCK(vp);
769 return;
770 }
771
772 #ifdef DIAGNOSTIC
773 if (vp->v_writecount != 0) {
774 vprint("vput: bad writecount", vp);
775 panic("vput: v_writecount != 0");
776 }
777 #endif
778
779 VOP_INACTIVE(vp, p);
780
781 s = splbio();
782 if (vp->v_usecount == 0 && !(vp->v_bioflag & VBIOONFREELIST))
783 vputonfreelist(vp);
784 splx(s);
785 }
786
787 /*
788 * Vnode release - use for active VNODES.
789 * If count drops to zero, call inactive routine and return to freelist.
790 * Returns 0 if it did not sleep.
791 */
792 int
793 vrele(struct vnode *vp)
794 {
795 struct proc *p = curproc;
796 int s;
797
798 #ifdef DIAGNOSTIC
799 if (vp == NULL)
800 panic("vrele: null vp");
801 #endif
802 #ifdef DIAGNOSTIC
803 if (vp->v_usecount == 0) {
804 vprint("vrele: bad ref count", vp);
805 panic("vrele: ref cnt");
806 }
807 #endif
808 vp->v_usecount--;
809 if (vp->v_usecount > 0) {
810 return (0);
811 }
812
813 #ifdef DIAGNOSTIC
814 if (vp->v_writecount != 0) {
815 vprint("vrele: bad writecount", vp);
816 panic("vrele: v_writecount != 0");
817 }
818 #endif
819
820 if (vn_lock(vp, LK_EXCLUSIVE)) {
821 #ifdef DIAGNOSTIC
822 vprint("vrele: cannot lock", vp);
823 #endif
824 return (1);
825 }
826
827 VOP_INACTIVE(vp, p);
828
829 s = splbio();
830 if (vp->v_usecount == 0 && !(vp->v_bioflag & VBIOONFREELIST))
831 vputonfreelist(vp);
832 splx(s);
833 return (1);
834 }
835
836 /* Page or buffer structure gets a reference. */
837 void
838 vhold(struct vnode *vp)
839 {
840 int s;
841
842 s = splbio();
843
844 /*
845 * If it is on the freelist and the hold count is currently
846 * zero, move it to the hold list.
847 */
848 if ((vp->v_bioflag & VBIOONFREELIST) &&
849 vp->v_holdcnt == 0 && vp->v_usecount == 0) {
850 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
851 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
852 }
853 vp->v_holdcnt++;
854
855 splx(s);
856 }
857
858 /* Lose interest in a vnode. */
859 void
860 vdrop(struct vnode *vp)
861 {
862 int s;
863
864 s = splbio();
865
866 #ifdef DIAGNOSTIC
867 if (vp->v_holdcnt == 0)
868 panic("vdrop: zero holdcnt");
869 #endif
870
871 vp->v_holdcnt--;
872
873 /*
874 * If it is on the holdlist and the hold count drops to
875 * zero, move it to the free list.
876 */
877 if ((vp->v_bioflag & VBIOONFREELIST) &&
878 vp->v_holdcnt == 0 && vp->v_usecount == 0) {
879 TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
880 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
881 }
882
883 splx(s);
884 }
885
886 /*
887 * Remove any vnodes in the vnode table belonging to mount point mp.
888 *
889 * If MNT_NOFORCE is specified, there should not be any active ones,
890 * return error if any are found (nb: this is a user error, not a
891 * system error). If MNT_FORCE is specified, detach any active vnodes
892 * that are found.
893 */
894 #ifdef DEBUG_SYSCTL
895 int busyprt = 0; /* print out busy vnodes */
896 struct ctldebug debug_vfs_busyprt = { "vfs_busyprt", &busyprt };
897 #endif
898
899 int
900 vfs_mount_foreach_vnode(struct mount *mp,
901 int (*func)(struct vnode *, void *), void *arg) {
902 struct vnode *vp, *nvp;
903 int error = 0;
904
905 loop:
906 TAILQ_FOREACH_SAFE(vp, &mp->mnt_vnodelist, v_mntvnodes, nvp) {
907 if (vp->v_mount != mp)
908 goto loop;
909
910 error = func(vp, arg);
911
912 if (error != 0)
913 break;
914 }
915
916 return (error);
917 }
918
919 struct vflush_args {
920 struct vnode *skipvp;
921 int busy;
922 int flags;
923 };
924
925 int
926 vflush_vnode(struct vnode *vp, void *arg)
927 {
928 struct vflush_args *va = arg;
929 struct proc *p = curproc;
930 int empty, s;
931
932 if (vp == va->skipvp) {
933 return (0);
934 }
935
936 if ((va->flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
937 return (0);
938 }
939
940 /*
941 * If WRITECLOSE is set, only flush out regular file
942 * vnodes open for writing.
943 */
944 if ((va->flags & WRITECLOSE) &&
945 (vp->v_writecount == 0 || vp->v_type != VREG)) {
946 return (0);
947 }
948
949 /*
950 * With v_usecount == 0, all we need to do is clear
951 * out the vnode data structures and we are done.
952 */
953 if (vp->v_usecount == 0) {
954 vgonel(vp, p);
955 return (0);
956 }
957
958 /*
959 * If FORCECLOSE is set, forcibly close the vnode.
960 * For block or character devices, revert to an
961 * anonymous device. For all other files, just kill them.
962 */
963 if (va->flags & FORCECLOSE) {
964 if (vp->v_type != VBLK && vp->v_type != VCHR) {
965 vgonel(vp, p);
966 } else {
967 vclean(vp, 0, p);
968 vp->v_op = &spec_vops;
969 insmntque(vp, NULL);
970 }
971 return (0);
972 }
973
974 /*
975 * If set, this is allowed to ignore vnodes which don't
976 * have changes pending to disk.
977 * XXX Might be nice to check per-fs "inode" flags, but
978 * generally the filesystem is sync'd already, right?
979 */
980 s = splbio();
981 empty = (va->flags & IGNORECLEAN) && LIST_EMPTY(&vp->v_dirtyblkhd);
982 splx(s);
983
984 if (empty)
985 return (0);
986
987 #if defined(DEBUG_SYSCTL) && (defined(DEBUG) || defined(DIAGNOSTIC))
988 if (busyprt)
989 vprint("vflush: busy vnode", vp);
990 #endif
991 va->busy++;
992 return (0);
993 }
994
995 int
996 vflush(struct mount *mp, struct vnode *skipvp, int flags)
997 {
998 struct vflush_args va;
999 va.skipvp = skipvp;
1000 va.busy = 0;
1001 va.flags = flags;
1002
1003 vfs_mount_foreach_vnode(mp, vflush_vnode, &va);
1004
1005 if (va.busy)
1006 return (EBUSY);
1007 return (0);
1008 }
1009
1010 /*
1011 * Disassociate the underlying file system from a vnode.
1012 */
1013 void
1014 vclean(struct vnode *vp, int flags, struct proc *p)
1015 {
1016 int active, do_wakeup = 0;
1017 int s;
1018
1019 /*
1020 * Check to see if the vnode is in use.
1021 * If so we have to reference it before we clean it out
1022 * so that its count cannot fall to zero and generate a
1023 * race against ourselves to recycle it.
1024 */
1025 if ((active = vp->v_usecount) != 0)
1026 vp->v_usecount++;
1027
1028 /*
1029 * Prevent the vnode from being recycled or
1030 * brought into use while we clean it out.
1031 */
1032 mtx_enter(&vnode_mtx);
1033 if (vp->v_lflag & VXLOCK)
1034 panic("vclean: deadlock");
1035 vp->v_lflag |= VXLOCK;
1036
1037 if (vp->v_lockcount > 0) {
1038 /*
1039 * Ensure that any thread currently waiting on the same lock has
1040 * observed that the vnode is about to be exclusively locked
1041 * before continuing.
1042 */
1043 msleep_nsec(&vp->v_lockcount, &vnode_mtx, PINOD, "vop_lock",
1044 INFSLP);
1045 KASSERT(vp->v_lockcount == 0);
1046 }
1047 mtx_leave(&vnode_mtx);
1048
1049 /*
1050 * Even if the count is zero, the VOP_INACTIVE routine may still
1051 * have the object locked while it cleans it out. The VOP_LOCK
1052 * ensures that the VOP_INACTIVE routine is done with its work.
1053 * For active vnodes, it ensures that no other activity can
1054 * occur while the underlying object is being cleaned out.
1055 */
1056 VOP_LOCK(vp, LK_EXCLUSIVE | LK_DRAIN);
1057
1058 /*
1059 * Clean out any VM data associated with the vnode.
1060 */
1061 uvm_vnp_terminate(vp);
1062 /*
1063 * Clean out any buffers associated with the vnode.
1064 */
1065 if (flags & DOCLOSE)
1066 vinvalbuf(vp, V_SAVE, NOCRED, p, 0, INFSLP);
1067 /*
1068 * If purging an active vnode, it must be closed and
1069 * deactivated before being reclaimed. Note that the
1070 * VOP_INACTIVE will unlock the vnode
1071 */
1072 if (active) {
1073 if (flags & DOCLOSE)
1074 VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1075 VOP_INACTIVE(vp, p);
1076 } else {
1077 /*
1078 * Any other processes trying to obtain this lock must first
1079 * wait for VXLOCK to clear, then call the new lock operation.
1080 */
1081 VOP_UNLOCK(vp);
1082 }
1083
1084 /*
1085 * Reclaim the vnode.
1086 */
1087 if (VOP_RECLAIM(vp, p))
1088 panic("vclean: cannot reclaim");
1089 if (active) {
1090 vp->v_usecount--;
1091 if (vp->v_usecount == 0) {
1092 s = splbio();
1093 if (vp->v_holdcnt > 0)
1094 panic("vclean: not clean");
1095 vputonfreelist(vp);
1096 splx(s);
1097 }
1098 }
1099 cache_purge(vp);
1100
1101 /*
1102 * Done with purge, notify sleepers of the grim news.
1103 */
1104 vp->v_op = &dead_vops;
1105 VN_KNOTE(vp, NOTE_REVOKE);
1106 vp->v_tag = VT_NON;
1107 #ifdef VFSLCKDEBUG
1108 vp->v_flag &= ~VLOCKSWORK;
1109 #endif
1110 mtx_enter(&vnode_mtx);
1111 vp->v_lflag &= ~VXLOCK;
1112 if (vp->v_lflag & VXWANT) {
1113 vp->v_lflag &= ~VXWANT;
1114 do_wakeup = 1;
1115 }
1116 mtx_leave(&vnode_mtx);
1117 if (do_wakeup)
1118 wakeup(vp);
1119 }
1120
1121 /*
1122 * Recycle an unused vnode to the front of the free list.
1123 */
1124 int
1125 vrecycle(struct vnode *vp, struct proc *p)
1126 {
1127 if (vp->v_usecount == 0) {
1128 vgonel(vp, p);
1129 return (1);
1130 }
1131 return (0);
1132 }
1133
1134 /*
1135 * Eliminate all activity associated with a vnode
1136 * in preparation for reuse.
1137 */
1138 void
1139 vgone(struct vnode *vp)
1140 {
1141 struct proc *p = curproc;
1142 vgonel(vp, p);
1143 }
1144
1145 /*
1146 * vgone, with struct proc.
1147 */
1148 void
1149 vgonel(struct vnode *vp, struct proc *p)
1150 {
1151 struct vnode *vq;
1152 struct vnode *vx;
1153 int s;
1154
1155 KASSERT(vp->v_uvcount == 0);
1156
1157 /*
1158 * If a vgone (or vclean) is already in progress,
1159 * wait until it is done and return.
1160 */
1161 mtx_enter(&vnode_mtx);
1162 if (vp->v_lflag & VXLOCK) {
1163 vp->v_lflag |= VXWANT;
1164 msleep_nsec(vp, &vnode_mtx, PINOD, "vgone", INFSLP);
1165 mtx_leave(&vnode_mtx);
1166 return;
1167 }
1168 mtx_leave(&vnode_mtx);
1169
1170 /*
1171 * Clean out the filesystem specific data.
1172 */
1173 vclean(vp, DOCLOSE, p);
1174 /*
1175 * Delete from old mount point vnode list, if on one.
1176 */
1177 if (vp->v_mount != NULL)
1178 insmntque(vp, NULL);
1179 /*
1180 * If special device, remove it from special device alias list
1181 * if it is on one.
1182 */
1183 if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
1184 vp->v_specinfo != NULL) {
1185 if ((vp->v_flag & VALIASED) == 0 && vp->v_type == VCHR &&
1186 (cdevsw[major(vp->v_rdev)].d_flags & D_CLONE) &&
1187 (minor(vp->v_rdev) >> CLONE_SHIFT == 0)) {
1188 free(vp->v_specbitmap, M_VNODE, CLONE_MAPSZ);
1189 }
1190 SLIST_REMOVE(vp->v_hashchain, vp, vnode, v_specnext);
1191 if (vp->v_flag & VALIASED) {
1192 vx = NULL;
1193 SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
1194 if (vq->v_rdev != vp->v_rdev ||
1195 vq->v_type != vp->v_type)
1196 continue;
1197 if (vx)
1198 break;
1199 vx = vq;
1200 }
1201 if (vx == NULL)
1202 panic("missing alias");
1203 if (vq == NULL)
1204 vx->v_flag &= ~VALIASED;
1205 vp->v_flag &= ~VALIASED;
1206 }
1207 lf_purgelocks(&vp->v_speclockf);
1208 free(vp->v_specinfo, M_VNODE, sizeof(struct specinfo));
1209 vp->v_specinfo = NULL;
1210 }
1211 /*
1212 * If it is on the freelist and not already at the head,
1213 * move it to the head of the list.
1214 */
1215 vp->v_type = VBAD;
1216
1217 /*
1218 * Move onto the free list, unless we were called from
1219 * getnewvnode and we're not on any free list
1220 */
1221 s = splbio();
1222 if (vp->v_usecount == 0 &&
1223 (vp->v_bioflag & VBIOONFREELIST)) {
1224 if (vp->v_holdcnt > 0)
1225 panic("vgonel: not clean");
1226
1227 if (TAILQ_FIRST(&vnode_free_list) != vp) {
1228 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1229 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1230 }
1231 }
1232 splx(s);
1233 }
1234
1235 /*
1236 * Lookup a vnode by device number.
1237 */
1238 int
1239 vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
1240 {
1241 struct vnode *vp;
1242 int rc = 0;
1243
1244 SLIST_FOREACH(vp, &speclisth[SPECHASH(dev)], v_specnext) {
1245 if (dev != vp->v_rdev || type != vp->v_type)
1246 continue;
1247 *vpp = vp;
1248 rc = 1;
1249 break;
1250 }
1251 return (rc);
1252 }
1253
1254 /*
1255 * Revoke all the vnodes corresponding to the specified minor number
1256 * range (endpoints inclusive) of the specified major.
1257 */
1258 void
1259 vdevgone(int maj, int minl, int minh, enum vtype type)
1260 {
1261 struct vnode *vp;
1262 int mn;
1263
1264 for (mn = minl; mn <= minh; mn++)
1265 if (vfinddev(makedev(maj, mn), type, &vp))
1266 VOP_REVOKE(vp, REVOKEALL);
1267 }
1268
1269 /*
1270 * Calculate the total number of references to a special device.
1271 */
1272 int
1273 vcount(struct vnode *vp)
1274 {
1275 struct vnode *vq;
1276 int count;
1277
1278 loop:
1279 if ((vp->v_flag & VALIASED) == 0)
1280 return (vp->v_usecount);
1281 count = 0;
1282 SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
1283 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1284 continue;
1285 /*
1286 * Alias, but not in use, so flush it out.
1287 */
1288 if (vq->v_usecount == 0 && vq != vp) {
1289 vgone(vq);
1290 goto loop;
1291 }
1292 count += vq->v_usecount;
1293 }
1294 return (count);
1295 }
1296
1297 #if defined(DEBUG) || defined(DIAGNOSTIC)
1298 /*
1299 * Print out a description of a vnode.
1300 */
1301 static char *typename[] =
1302 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
1303
1304 void
1305 vprint(char *label, struct vnode *vp)
1306 {
1307 char buf[64];
1308
1309 if (label != NULL)
1310 printf("%s: ", label);
1311 printf("%p, type %s, use %u, write %u, hold %u,",
1312 vp, typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1313 vp->v_holdcnt);
1314 buf[0] = '\0';
1315 if (vp->v_flag & VROOT)
1316 strlcat(buf, "|VROOT", sizeof buf);
1317 if (vp->v_flag & VTEXT)
1318 strlcat(buf, "|VTEXT", sizeof buf);
1319 if (vp->v_flag & VSYSTEM)
1320 strlcat(buf, "|VSYSTEM", sizeof buf);
1321 if (vp->v_lflag & VXLOCK)
1322 strlcat(buf, "|VXLOCK", sizeof buf);
1323 if (vp->v_lflag & VXWANT)
1324 strlcat(buf, "|VXWANT", sizeof buf);
1325 if (vp->v_bioflag & VBIOWAIT)
1326 strlcat(buf, "|VBIOWAIT", sizeof buf);
1327 if (vp->v_bioflag & VBIOONFREELIST)
1328 strlcat(buf, "|VBIOONFREELIST", sizeof buf);
1329 if (vp->v_bioflag & VBIOONSYNCLIST)
1330 strlcat(buf, "|VBIOONSYNCLIST", sizeof buf);
1331 if (vp->v_flag & VALIASED)
1332 strlcat(buf, "|VALIASED", sizeof buf);
1333 if (buf[0] != '\0')
1334 printf(" flags (%s)", &buf[1]);
1335 if (vp->v_data == NULL) {
1336 printf("\n");
1337 } else {
1338 printf("\n\t");
1339 VOP_PRINT(vp);
1340 }
1341 }
1342 #endif /* DEBUG || DIAGNOSTIC */
1343
1344 #ifdef DEBUG
1345 /*
1346 * List all of the locked vnodes in the system.
1347 * Called when debugging the kernel.
1348 */
1349 void
1350 printlockedvnodes(void)
1351 {
1352 struct mount *mp;
1353 struct vnode *vp;
1354
1355 printf("Locked vnodes\n");
1356
1357 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
1358 if (vfs_busy(mp, VB_READ|VB_NOWAIT))
1359 continue;
1360 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
1361 if (VOP_ISLOCKED(vp))
1362 vprint(NULL, vp);
1363 }
1364 vfs_unbusy(mp);
1365 }
1366
1367 }
1368 #endif
1369
1370 /*
1371 * Top level filesystem related information gathering.
1372 */
1373 int
1374 vfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
1375 size_t newlen, struct proc *p)
1376 {
1377 struct vfsconf *vfsp, *tmpvfsp;
1378 int ret;
1379
1380 /* all sysctl names at this level are at least name and field */
1381 if (namelen < 2)
1382 return (ENOTDIR); /* overloaded */
1383
1384 if (name[0] != VFS_GENERIC) {
1385 vfsp = vfs_bytypenum(name[0]);
1386 if (vfsp == NULL || vfsp->vfc_vfsops->vfs_sysctl == NULL)
1387 return (EOPNOTSUPP);
1388
1389 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
1390 oldp, oldlenp, newp, newlen, p));
1391 }
1392
1393 switch (name[1]) {
1394 case VFS_MAXTYPENUM:
1395 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
1396
1397 case VFS_CONF:
1398 if (namelen < 3)
1399 return (ENOTDIR); /* overloaded */
1400
1401 vfsp = vfs_bytypenum(name[2]);
1402 if (vfsp == NULL)
1403 return (EOPNOTSUPP);
1404
1405 /* Make a copy, clear out kernel pointers */
1406 tmpvfsp = malloc(sizeof(*tmpvfsp), M_TEMP, M_WAITOK|M_ZERO);
1407 memcpy(tmpvfsp, vfsp, sizeof(*tmpvfsp));
1408 tmpvfsp->vfc_vfsops = NULL;
1409
1410 ret = sysctl_rdstruct(oldp, oldlenp, newp, tmpvfsp,
1411 sizeof(struct vfsconf));
1412
1413 free(tmpvfsp, M_TEMP, sizeof(*tmpvfsp));
1414 return (ret);
1415 case VFS_BCACHESTAT: /* buffer cache statistics */
1416 ret = sysctl_rdstruct(oldp, oldlenp, newp, &bcstats,
1417 sizeof(struct bcachestats));
1418 return(ret);
1419 }
1420 return (EOPNOTSUPP);
1421 }
1422
1423 /*
1424 * Check to see if a filesystem is mounted on a block device.
1425 */
1426 int
1427 vfs_mountedon(struct vnode *vp)
1428 {
1429 struct vnode *vq;
1430 int error = 0;
1431
1432 if (vp->v_specmountpoint != NULL)
1433 return (EBUSY);
1434 if (vp->v_flag & VALIASED) {
1435 SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
1436 if (vq->v_rdev != vp->v_rdev ||
1437 vq->v_type != vp->v_type)
1438 continue;
1439 if (vq->v_specmountpoint != NULL) {
1440 error = EBUSY;
1441 break;
1442 }
1443 }
1444 }
1445 return (error);
1446 }
1447
1448 #ifdef NFSSERVER
1449 /*
1450 * Build hash lists of net addresses and hang them off the mount point.
1451 * Called by vfs_export() to set up the lists of export addresses.
1452 */
1453 int
1454 vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
1455 struct export_args *argp)
1456 {
1457 struct netcred *np;
1458 struct radix_node_head *rnh;
1459 int nplen, i;
1460 struct radix_node *rn;
1461 struct sockaddr *saddr, *smask = NULL;
1462 int error;
1463
1464 if (argp->ex_addrlen == 0) {
1465 if (mp->mnt_flag & MNT_DEFEXPORTED)
1466 return (EPERM);
1467 np = &nep->ne_defexported;
1468 /* fill in the kernel's ucred from userspace's xucred */
1469 if ((error = crfromxucred(&np->netc_anon, &argp->ex_anon)))
1470 return (error);
1471 mp->mnt_flag |= MNT_DEFEXPORTED;
1472 goto finish;
1473 }
1474 if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN ||
1475 argp->ex_addrlen < 0 || argp->ex_masklen < 0)
1476 return (EINVAL);
1477 nplen = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
1478 np = (struct netcred *)malloc(nplen, M_NETADDR, M_WAITOK|M_ZERO);
1479 np->netc_len = nplen;
1480 saddr = (struct sockaddr *)(np + 1);
1481 error = copyin(argp->ex_addr, saddr, argp->ex_addrlen);
1482 if (error)
1483 goto out;
1484 if (saddr->sa_len > argp->ex_addrlen)
1485 saddr->sa_len = argp->ex_addrlen;
1486 if (argp->ex_masklen) {
1487 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
1488 error = copyin(argp->ex_mask, smask, argp->ex_masklen);
1489 if (error)
1490 goto out;
1491 if (smask->sa_len > argp->ex_masklen)
1492 smask->sa_len = argp->ex_masklen;
1493 }
1494 /* fill in the kernel's ucred from userspace's xucred */
1495 if ((error = crfromxucred(&np->netc_anon, &argp->ex_anon)))
1496 goto out;
1497 i = saddr->sa_family;
1498 switch (i) {
1499 case AF_INET:
1500 if ((rnh = nep->ne_rtable_inet) == NULL) {
1501 if (!rn_inithead((void **)&nep->ne_rtable_inet,
1502 offsetof(struct sockaddr_in, sin_addr))) {
1503 error = ENOBUFS;
1504 goto out;
1505 }
1506 rnh = nep->ne_rtable_inet;
1507 }
1508 break;
1509 default:
1510 error = EINVAL;
1511 goto out;
1512 }
1513 rn = rn_addroute(saddr, smask, rnh, np->netc_rnodes, 0);
1514 if (rn == NULL || np != (struct netcred *)rn) { /* already exists */
1515 error = EPERM;
1516 goto out;
1517 }
1518 finish:
1519 np->netc_exflags = argp->ex_flags;
1520 return (0);
1521 out:
1522 free(np, M_NETADDR, np->netc_len);
1523 return (error);
1524 }
1525
1526 int
1527 vfs_free_netcred(struct radix_node *rn, void *w, u_int id)
1528 {
1529 struct radix_node_head *rnh = (struct radix_node_head *)w;
1530 struct netcred * np = (struct netcred *)rn;
1531
1532 rn_delete(rn->rn_key, rn->rn_mask, rnh, NULL);
1533 free(np, M_NETADDR, np->netc_len);
1534 return (0);
1535 }
1536
1537 /*
1538 * Free the net address hash lists that are hanging off the mount points.
1539 */
1540 void
1541 vfs_free_addrlist(struct netexport *nep)
1542 {
1543 struct radix_node_head *rnh;
1544
1545 if ((rnh = nep->ne_rtable_inet) != NULL) {
1546 rn_walktree(rnh, vfs_free_netcred, rnh);
1547 free(rnh, M_RTABLE, sizeof(*rnh));
1548 nep->ne_rtable_inet = NULL;
1549 }
1550 }
1551 #endif /* NFSSERVER */
1552
1553 int
1554 vfs_export(struct mount *mp, struct netexport *nep, struct export_args *argp)
1555 {
1556 #ifdef NFSSERVER
1557 int error;
1558
1559 if (argp->ex_flags & MNT_DELEXPORT) {
1560 vfs_free_addrlist(nep);
1561 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
1562 }
1563 if (argp->ex_flags & MNT_EXPORTED) {
1564 if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
1565 return (error);
1566 mp->mnt_flag |= MNT_EXPORTED;
1567 }
1568 return (0);
1569 #else
1570 return (ENOTSUP);
1571 #endif /* NFSSERVER */
1572 }
1573
1574 struct netcred *
1575 vfs_export_lookup(struct mount *mp, struct netexport *nep, struct mbuf *nam)
1576 {
1577 #ifdef NFSSERVER
1578 struct netcred *np;
1579 struct radix_node_head *rnh;
1580 struct sockaddr *saddr;
1581
1582 np = NULL;
1583 if (mp->mnt_flag & MNT_EXPORTED) {
1584 /*
1585 * Lookup in the export list first.
1586 */
1587 if (nam != NULL) {
1588 saddr = mtod(nam, struct sockaddr *);
1589 switch(saddr->sa_family) {
1590 case AF_INET:
1591 rnh = nep->ne_rtable_inet;
1592 break;
1593 default:
1594 rnh = NULL;
1595 break;
1596 }
1597 if (rnh != NULL)
1598 np = (struct netcred *)rn_match(saddr, rnh);
1599 }
1600 /*
1601 * If no address match, use the default if it exists.
1602 */
1603 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
1604 np = &nep->ne_defexported;
1605 }
1606 return (np);
1607 #else
1608 return (NULL);
1609 #endif /* NFSSERVER */
1610 }
1611
1612 /*
1613 * Do the usual access checking.
1614 * file_mode, uid and gid are from the vnode in question,
1615 * while acc_mode and cred are from the VOP_ACCESS parameter list
1616 */
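/*
 * Illustrative example of the checks below: a caller whose uid matches
 * the file owner and who asks for VREAD|VWRITE builds mask =
 * S_IRUSR|S_IWUSR (0600); a file_mode of 0640 satisfies it and access
 * is granted, while 0440 does not and EACCES is returned.
 */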
1617 int
1618 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
1619 mode_t acc_mode, struct ucred *cred)
1620 {
1621 mode_t mask;
1622
1623 /* User id 0 always gets read/write access. */
1624 if (cred->cr_uid == 0) {
1625 /* For VEXEC, at least one of the execute bits must be set. */
1626 if ((acc_mode & VEXEC) && type != VDIR &&
1627 (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
1628 return EACCES;
1629 return 0;
1630 }
1631
1632 mask = 0;
1633
1634 /* Otherwise, check the owner. */
1635 if (cred->cr_uid == uid) {
1636 if (acc_mode & VEXEC)
1637 mask |= S_IXUSR;
1638 if (acc_mode & VREAD)
1639 mask |= S_IRUSR;
1640 if (acc_mode & VWRITE)
1641 mask |= S_IWUSR;
1642 return (file_mode & mask) == mask ? 0 : EACCES;
1643 }
1644
1645 /* Otherwise, check the groups. */
1646 if (groupmember(gid, cred)) {
1647 if (acc_mode & VEXEC)
1648 mask |= S_IXGRP;
1649 if (acc_mode & VREAD)
1650 mask |= S_IRGRP;
1651 if (acc_mode & VWRITE)
1652 mask |= S_IWGRP;
1653 return (file_mode & mask) == mask ? 0 : EACCES;
1654 }
1655
1656 /* Otherwise, check everyone else. */
1657 if (acc_mode & VEXEC)
1658 mask |= S_IXOTH;
1659 if (acc_mode & VREAD)
1660 mask |= S_IROTH;
1661 if (acc_mode & VWRITE)
1662 mask |= S_IWOTH;
1663 return (file_mode & mask) == mask ? 0 : EACCES;
1664 }
1665
1666 int
1667 vnoperm(struct vnode *vp)
1668 {
1669 if (vp->v_flag & VROOT || vp->v_mount == NULL)
1670 return 0;
1671
1672 return (vp->v_mount->mnt_flag & MNT_NOPERM);
1673 }
1674
1675 struct rwlock vfs_stall_lock = RWLOCK_INITIALIZER("vfs_stall");
1676 unsigned int vfs_stalling = 0;
1677
1678 int
1679 vfs_stall(struct proc *p, int stall)
1680 {
1681 struct mount *mp;
1682 int allerror = 0, error;
1683
1684 if (stall) {
1685 atomic_inc_int(&vfs_stalling);
1686 rw_enter_write(&vfs_stall_lock);
1687 }
1688
1689 /*
1690 * The loop variable mp is protected by vfs_busy() so that it cannot
1691 * be unmounted while VFS_SYNC() sleeps. Traverse forward to keep the
1692 * lock order consistent with dounmount().
1693 */
1694 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
1695 if (stall) {
1696 error = vfs_busy(mp, VB_WRITE|VB_WAIT|VB_DUPOK);
1697 if (error) {
1698 printf("%s: busy\n", mp->mnt_stat.f_mntonname);
1699 allerror = error;
1700 continue;
1701 }
1702 uvm_vnp_sync(mp);
1703 error = VFS_SYNC(mp, MNT_WAIT, stall, p->p_ucred, p);
1704 if (error) {
1705 printf("%s: failed to sync\n",
1706 mp->mnt_stat.f_mntonname);
1707 vfs_unbusy(mp);
1708 allerror = error;
1709 continue;
1710 }
1711 mp->mnt_flag |= MNT_STALLED;
1712 } else {
1713 if (mp->mnt_flag & MNT_STALLED) {
1714 vfs_unbusy(mp);
1715 mp->mnt_flag &= ~MNT_STALLED;
1716 }
1717 }
1718 }
1719
1720 if (!stall) {
1721 rw_exit_write(&vfs_stall_lock);
1722 atomic_dec_int(&vfs_stalling);
1723 }
1724
1725 return (allerror);
1726 }
1727
1728 void
1729 vfs_stall_barrier(void)
1730 {
1731 if (__predict_false(vfs_stalling)) {
1732 rw_enter_read(&vfs_stall_lock);
1733 rw_exit_read(&vfs_stall_lock);
1734 }
1735 }
1736
1737 /*
1738 * Unmount all file systems.
1739 * We traverse the list in reverse order under the assumption that doing so
1740 * will avoid needing to worry about dependencies.
1741 */
1742 void
1743 vfs_unmountall(void)
1744 {
1745 struct mount *mp, *nmp;
1746 int allerror, error, again = 1;
1747
1748 retry:
1749 allerror = 0;
1750 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, nmp) {
1751 if (vfs_busy(mp, VB_WRITE|VB_NOWAIT))
1752 continue;
1753 /* XXX Here is a race, the next pointer is not locked. */
1754 if ((error = dounmount(mp, MNT_FORCE, curproc)) != 0) {
1755 printf("unmount of %s failed with error %d\n",
1756 mp->mnt_stat.f_mntonname, error);
1757 allerror = 1;
1758 }
1759 }
1760
1761 if (allerror) {
1762 printf("WARNING: some file systems would not unmount\n");
1763 if (again) {
1764 printf("retrying\n");
1765 again = 0;
1766 goto retry;
1767 }
1768 }
1769 }
1770
1771 /*
1772 * Sync and unmount file systems before shutting down.
1773 */
1774 void
1775 vfs_shutdown(struct proc *p)
1776 {
1777 #ifdef ACCOUNTING
1778 acct_shutdown();
1779 #endif
1780
1781 printf("syncing disks...");
1782
1783 if (panicstr == NULL) {
1784 /* Sync before unmount, in case we hang on something. */
1785 sys_sync(p, NULL, NULL);
1786 vfs_unmountall();
1787 }
1788
1789 #if NSOFTRAID > 0
1790 sr_quiesce();
1791 #endif
1792
1793 if (vfs_syncwait(p, 1))
1794 printf(" giving up\n");
1795 else
1796 printf(" done\n");
1797 }
1798
1799 /*
1800 * perform sync() operation and wait for buffers to flush.
1801 */
1802 int
1803 vfs_syncwait(struct proc *p, int verbose)
1804 {
1805 struct buf *bp;
1806 int iter, nbusy, dcount, s;
1807 #ifdef MULTIPROCESSOR
1808 int hold_count;
1809 #endif
1810
1811 sys_sync(p, NULL, NULL);
1812
1813 /* Wait for sync to finish. */
1814 dcount = 10000;
1815 for (iter = 0; iter < 20; iter++) {
1816 nbusy = 0;
1817 LIST_FOREACH(bp, &bufhead, b_list) {
1818 if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
1819 nbusy++;
1820 /*
1821 * With soft updates, some buffers that are
1822 * written will be remarked as dirty until other
1823 * buffers are written.
1824 *
1825 * XXX here be dragons. this should really go away
1826 * but should be carefully made to go away on its
1827 * own with testing.. XXX
1828 */
1829 if (bp->b_flags & B_DELWRI) {
1830 s = splbio();
1831 bremfree(bp);
1832 buf_acquire(bp);
1833 splx(s);
1834 nbusy++;
1835 bawrite(bp);
1836 if (dcount-- <= 0) {
1837 if (verbose)
1838 printf("softdep ");
1839 return 1;
1840 }
1841 }
1842 }
1843 if (nbusy == 0)
1844 break;
1845 if (verbose)
1846 printf("%d ", nbusy);
1847 #ifdef MULTIPROCESSOR
1848 if (_kernel_lock_held())
1849 hold_count = __mp_release_all(&kernel_lock);
1850 else
1851 hold_count = 0;
1852 #endif
1853 DELAY(40000 * iter);
1854 #ifdef MULTIPROCESSOR
1855 if (hold_count)
1856 __mp_acquire_count(&kernel_lock, hold_count);
1857 #endif
1858 }
1859
1860 return nbusy;
1861 }
1862
1863 /*
1864 * posix file system related system variables.
1865 */
1866 int
1867 fs_posix_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
1868 void *newp, size_t newlen, struct proc *p)
1869 {
1870 /* all sysctl names at this level are terminal */
1871 if (namelen != 1)
1872 return (ENOTDIR);
1873
1874 switch (name[0]) {
1875 case FS_POSIX_SETUID:
1876 return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
1877 &suid_clear));
1878 default:
1879 return (EOPNOTSUPP);
1880 }
1881 /* NOTREACHED */
1882 }
1883
1884 /*
1885 * file system related system variables.
1886 */
1887 int
1888 fs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
1889 size_t newlen, struct proc *p)
1890 {
1891 sysctlfn *fn;
1892
1893 switch (name[0]) {
1894 case FS_POSIX:
1895 fn = fs_posix_sysctl;
1896 break;
1897 default:
1898 return (EOPNOTSUPP);
1899 }
1900 return (*fn)(name + 1, namelen - 1, oldp, oldlenp, newp, newlen, p);
1901 }
1902
1903
1904 /*
1905 * Routines dealing with vnodes and buffers
1906 */
1907
1908 /*
1909 * Wait for all outstanding I/Os to complete
1910 *
1911 * Manipulates v_numoutput. Must be called at splbio()
1912 */
1913 int
1914 vwaitforio(struct vnode *vp, int slpflag, char *wmesg, uint64_t timeo)
1915 {
1916 int error = 0;
1917
1918 splassert(IPL_BIO);
1919
1920 while (vp->v_numoutput) {
1921 vp->v_bioflag |= VBIOWAIT;
1922 error = tsleep_nsec(&vp->v_numoutput,
1923 slpflag | (PRIBIO + 1), wmesg, timeo);
1924 if (error)
1925 break;
1926 }
1927
1928 return (error);
1929 }
1930
1931 /*
1932 * Update outstanding I/O count and do wakeup if requested.
1933 *
1934 * Manipulates v_numoutput. Must be called at splbio()
1935 */
1936 void
1937 vwakeup(struct vnode *vp)
1938 {
1939 splassert(IPL_BIO);
1940
1941 if (vp != NULL) {
1942 if (vp->v_numoutput-- == 0)
1943 panic("vwakeup: neg numoutput");
1944 if ((vp->v_bioflag & VBIOWAIT) && vp->v_numoutput == 0) {
1945 vp->v_bioflag &= ~VBIOWAIT;
1946 wakeup(&vp->v_numoutput);
1947 }
1948 }
1949 }
1950
1951 /*
1952 * Flush out and invalidate all buffers associated with a vnode.
1953 * Called with the underlying object locked.
1954 */
1955 int
1956 vinvalbuf(struct vnode *vp, int flags, struct ucred *cred, struct proc *p,
1957 int slpflag, uint64_t slptimeo)
1958 {
1959 struct buf *bp;
1960 struct buf *nbp, *blist;
1961 int s, error;
1962
1963 #ifdef VFSLCKDEBUG
1964 if ((vp->v_flag & VLOCKSWORK) && !VOP_ISLOCKED(vp))
1965 panic("%s: vp isn't locked, vp %p", __func__, vp);
1966 #endif
1967
1968 if (flags & V_SAVE) {
1969 s = splbio();
1970 vwaitforio(vp, 0, "vinvalbuf", INFSLP);
1971 if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
1972 splx(s);
1973 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
1974 return (error);
1975 s = splbio();
1976 if (vp->v_numoutput > 0 ||
1977 !LIST_EMPTY(&vp->v_dirtyblkhd))
1978 panic("%s: dirty bufs, vp %p", __func__, vp);
1979 }
1980 splx(s);
1981 }
1982 loop:
1983 s = splbio();
1984 for (;;) {
1985 int count = 0;
1986 if ((blist = LIST_FIRST(&vp->v_cleanblkhd)) &&
1987 (flags & V_SAVEMETA))
1988 while (blist && blist->b_lblkno < 0)
1989 blist = LIST_NEXT(blist, b_vnbufs);
1990 if (blist == NULL &&
1991 (blist = LIST_FIRST(&vp->v_dirtyblkhd)) &&
1992 (flags & V_SAVEMETA))
1993 while (blist && blist->b_lblkno < 0)
1994 blist = LIST_NEXT(blist, b_vnbufs);
1995 if (!blist)
1996 break;
1997
1998 for (bp = blist; bp; bp = nbp) {
1999 nbp = LIST_NEXT(bp, b_vnbufs);
2000 if (flags & V_SAVEMETA && bp->b_lblkno < 0)
2001 continue;
2002 if (bp->b_flags & B_BUSY) {
2003 bp->b_flags |= B_WANTED;
2004 error = tsleep_nsec(bp, slpflag | (PRIBIO + 1),
2005 "vinvalbuf", slptimeo);
2006 if (error) {
2007 splx(s);
2008 return (error);
2009 }
2010 break;
2011 }
2012 bremfree(bp);
2013 /*
2014 * XXX Since there are no node locks for NFS, I believe
2015 * there is a slight chance that a delayed write will
2016 * occur while sleeping just above, so check for it.
2017 */
2018 if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
2019 buf_acquire(bp);
2020 splx(s);
2021 (void) VOP_BWRITE(bp);
2022 goto loop;
2023 }
2024 buf_acquire_nomap(bp);
2025 bp->b_flags |= B_INVAL;
2026 brelse(bp);
2027 count++;
2028 /*
2029 * XXX Temporary workaround XXX
2030 *
2031 * If this is a gigantisch vnode and we are
2032 * trashing a ton of buffers, drop the lock
2033 * and yield every so often. The longer term
2034 * fix is to add a separate list for these
2035 * invalid buffers so we don't have to do the
2036 * work to free these here.
2037 */
2038 if (count > 100) {
2039 splx(s);
2040 sched_pause(yield);
2041 goto loop;
2042 }
2043 }
2044 }
2045 if (!(flags & V_SAVEMETA) &&
2046 (!LIST_EMPTY(&vp->v_dirtyblkhd) || !LIST_EMPTY(&vp->v_cleanblkhd)))
2047 panic("%s: flush failed, vp %p", __func__, vp);
2048 splx(s);
2049 return (0);
2050 }
2051
2052 void
2053 vflushbuf(struct vnode *vp, int sync)
2054 {
2055 struct buf *bp, *nbp;
2056 int s;
2057
2058 loop:
2059 s = splbio();
2060 LIST_FOREACH_SAFE(bp, &vp->v_dirtyblkhd, b_vnbufs, nbp) {
2061 if ((bp->b_flags & B_BUSY))
2062 continue;
2063 if ((bp->b_flags & B_DELWRI) == 0)
2064 panic("vflushbuf: not dirty");
2065 bremfree(bp);
2066 buf_acquire(bp);
2067 splx(s);
2068 /*
2069 * Wait for I/O associated with indirect blocks to complete,
2070 * since there is no way to quickly wait for them below.
2071 */
2072 if (bp->b_vp == vp || sync == 0)
2073 (void) bawrite(bp);
2074 else
2075 (void) bwrite(bp);
2076 goto loop;
2077 }
2078 if (sync == 0) {
2079 splx(s);
2080 return;
2081 }
2082 vwaitforio(vp, 0, "vflushbuf", INFSLP);
2083 if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
2084 splx(s);
2085 #ifdef DIAGNOSTIC
2086 vprint("vflushbuf: dirty", vp);
2087 #endif
2088 goto loop;
2089 }
2090 splx(s);
2091 }
2092
2093 /*
2094 * Associate a buffer with a vnode.
2095 *
2096 * Manipulates buffer vnode queues. Must be called at splbio().
2097 */
2098 void
2099 bgetvp(struct vnode *vp, struct buf *bp)
2100 {
2101 splassert(IPL_BIO);
2102
2103
2104 if (bp->b_vp)
2105 panic("bgetvp: not free");
2106 vhold(vp);
2107 bp->b_vp = vp;
2108 if (vp->v_type == VBLK || vp->v_type == VCHR)
2109 bp->b_dev = vp->v_rdev;
2110 else
2111 bp->b_dev = NODEV;
2112 /*
2113 * Insert onto list for new vnode.
2114 */
2115 bufinsvn(bp, &vp->v_cleanblkhd);
2116 }
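/*
 * A minimal usage sketch (illustrative only): a caller that has just
 * allocated a buffer binds it to its vnode with bio interrupts
 * blocked, e.g.
 *
 *	s = splbio();
 *	bgetvp(vp, bp);
 *	splx(s);
 *
 * The buffer starts out on the vnode's clean list; reassignbuf()
 * below moves it to the dirty list once it is marked B_DELWRI.
 */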
2117
2118 /*
2119 * Disassociate a buffer from a vnode.
2120 *
2121 * Manipulates vnode buffer queues. Must be called at splbio().
2122 */
2123 void
2124 brelvp(struct buf *bp)
2125 {
2126 struct vnode *vp;
2127
2128 splassert(IPL_BIO);
2129
2130 if ((vp = bp->b_vp) == NULL)
2131 panic("brelvp: NULL");
2132 /*
2133 * Delete from old vnode list, if on one.
2134 */
2135 if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
2136 bufremvn(bp);
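/*
 * If the vnode has no dirty buffers left, it no longer needs the
 * syncer's attention; take it off the syncer worklist.
 */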
2137 if ((vp->v_bioflag & VBIOONSYNCLIST) &&
2138 LIST_EMPTY(&vp->v_dirtyblkhd)) {
2139 vp->v_bioflag &= ~VBIOONSYNCLIST;
2140 LIST_REMOVE(vp, v_synclist);
2141 }
2142 bp->b_vp = NULL;
2143
2144 vdrop(vp);
2145 }
2146
2147 /*
2148 * Replaces the current vnode associated with the buffer, if any,
2149 * with a new vnode.
2150 *
2151 * If an output I/O is pending on the buffer, the old vnode
2152 * I/O count is adjusted.
2153 *
2154 * Ignores vnode buffer queues. Must be called at splbio().
2155 */
2156 void
2157 buf_replacevnode(struct buf *bp, struct vnode *newvp)
2158 {
2159 struct vnode *oldvp = bp->b_vp;
2160
2161 splassert(IPL_BIO);
2162
2163 if (oldvp)
2164 brelvp(bp);
2165
2166 if ((bp->b_flags & (B_READ | B_DONE)) == 0) {
2167 newvp->v_numoutput++; /* put it on swapdev */
2168 vwakeup(oldvp);
2169 }
2170
2171 bgetvp(newvp, bp);
2172 bufremvn(bp);
2173 }
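/*
 * Illustrative sketch only: the typical use is to redirect pending
 * output to the swap device vnode (the "swapdev" case noted above),
 * roughly:
 *
 *	s = splbio();
 *	buf_replacevnode(bp, swapdev_vp);
 *	splx(s);
 *
 * where swapdev_vp stands for the vnode of the underlying swap
 * device.
 */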
2174
2175 /*
2176 * Used to assign buffers to the appropriate clean or dirty list on
2177 * the vnode and to add newly dirty vnodes to the appropriate
2178 * filesystem syncer list.
2179 *
2180 * Manipulates vnode buffer queues. Must be called at splbio().
2181 */
2182 void
2183 reassignbuf(struct buf *bp)
2184 {
2185 struct buflists *listheadp;
2186 int delay;
2187 struct vnode *vp = bp->b_vp;
2188
2189 splassert(IPL_BIO);
2190
2191 /*
2192 * Delete from old vnode list, if on one.
2193 */
2194 if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
2195 bufremvn(bp);
2196
2197 /*
2198 * If dirty, put on list of dirty buffers;
2199 * otherwise insert onto list of clean buffers.
2200 */
2201 if ((bp->b_flags & B_DELWRI) == 0) {
2202 listheadp = &vp->v_cleanblkhd;
2203 if ((vp->v_bioflag & VBIOONSYNCLIST) &&
2204 LIST_EMPTY(&vp->v_dirtyblkhd)) {
2205 vp->v_bioflag &= ~VBIOONSYNCLIST;
2206 LIST_REMOVE(vp, v_synclist);
2207 }
2208 } else {
2209 listheadp = &vp->v_dirtyblkhd;
2210 if ((vp->v_bioflag & VBIOONSYNCLIST) == 0) {
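/*
 * Choose how soon the syncer should get to this vnode: directories
 * and block devices backing mounted file systems are flushed on a
 * shorter delay than regular files.
 */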
2211 switch (vp->v_type) {
2212 case VDIR:
2213 delay = syncdelay / 2;
2214 break;
2215 case VBLK:
2216 if (vp->v_specmountpoint != NULL) {
2217 delay = syncdelay / 3;
2218 break;
2219 }
2220 /* FALLTHROUGH */
2221 default:
2222 delay = syncdelay;
2223 }
2224 vn_syncer_add_to_worklist(vp, delay);
2225 }
2226 }
2227 bufinsvn(bp, listheadp);
2228 }
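/*
 * Sketch of the usual call pattern (illustrative only), as in
 * delayed-write paths that dirty a buffer:
 *
 *	s = splbio();
 *	bp->b_flags |= B_DELWRI;
 *	reassignbuf(bp);
 *	splx(s);
 *
 * which moves the buffer onto v_dirtyblkhd and, if needed, puts the
 * vnode on the syncer worklist.
 */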
2229
2230 #ifdef DDB
2231 #include <machine/db_machdep.h>
2232 #include <ddb/db_interface.h>
2233
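/*
 * ddb(4) helper: print the interesting fields of a struct buf.
 */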
2234 void
2235 vfs_buf_print(void *b, int full,
2236 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2237 {
2238 struct buf *bp = b;
2239
2240 (*pr)(" vp %p lblkno 0x%llx blkno 0x%llx dev 0x%x\n"
2241 " proc %p error %d flags %lb\n",
2242 bp->b_vp, (int64_t)bp->b_lblkno, (int64_t)bp->b_blkno, bp->b_dev,
2243 bp->b_proc, bp->b_error, bp->b_flags, B_BITS);
2244
2245 (*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n"
2246 " data %p saveaddr %p iodone %p\n",
2247 bp->b_bufsize, bp->b_bcount, (long)bp->b_resid,
2248 bp->b_data, bp->b_saveaddr,
2249 bp->b_iodone);
2250
2251 (*pr)(" dirty {off 0x%x end 0x%x} valid {off 0x%x end 0x%x}\n",
2252 bp->b_dirtyoff, bp->b_dirtyend, bp->b_validoff, bp->b_validend);
2253
2254 }
2255
2256 const char *vtypes[] = { VTYPE_NAMES };
2257 const char *vtags[] = { VTAG_NAMES };
2258
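/*
 * ddb(4) helper: print a vnode and, if "full" is set, the buffers
 * on its clean and dirty lists.
 */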
2259 void
2260 vfs_vnode_print(void *v, int full,
2261 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2262 {
2263 struct vnode *vp = v;
2264
2265 (*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
2266 (u_int)vp->v_tag >= nitems(vtags)? "<unk>":vtags[vp->v_tag],
2267 vp->v_tag,
2268 (u_int)vp->v_type >= nitems(vtypes)? "<unk>":vtypes[vp->v_type],
2269 vp->v_type, vp->v_mount, vp->v_mountedhere);
2270
2271 (*pr)("data %p usecount %d writecount %d holdcnt %d numoutput %d\n",
2272 vp->v_data, vp->v_usecount, vp->v_writecount,
2273 vp->v_holdcnt, vp->v_numoutput);
2274
2275 /* uvm_object_printit(&vp->v_uobj, full, pr); */
2276
2277 if (full) {
2278 struct buf *bp;
2279
2280 (*pr)("clean bufs:\n");
2281 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
2282 (*pr)(" bp %p\n", bp);
2283 vfs_buf_print(bp, full, pr);
2284 }
2285
2286 (*pr)("dirty bufs:\n");
2287 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
2288 (*pr)(" bp %p\n", bp);
2289 vfs_buf_print(bp, full, pr);
2290 }
2291 }
2292 }
2293
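/*
 * ddb(4) helper: print a mount point, its cached statfs information
 * and its locked (and, if "full" is set, all) vnodes.
 */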
2294 void
2295 vfs_mount_print(struct mount *mp, int full,
2296 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2297 {
2298 struct vfsconf *vfc = mp->mnt_vfc;
2299 struct vnode *vp;
2300 int cnt;
2301
2302 (*pr)("flags %b\nvnodecovered %p syncer %p data %p\n",
2303 mp->mnt_flag, MNT_BITS,
2304 mp->mnt_vnodecovered, mp->mnt_syncer, mp->mnt_data);
2305
2306 (*pr)("vfsconf: ops %p name \"%s\" num %d ref %u flags 0x%x\n",
2307 vfc->vfc_vfsops, vfc->vfc_name, vfc->vfc_typenum,
2308 vfc->vfc_refcount, vfc->vfc_flags);
2309
2310 (*pr)("statvfs cache: bsize %x iosize %x\n"
2311 "blocks %llu free %llu avail %lld\n",
2312 mp->mnt_stat.f_bsize, mp->mnt_stat.f_iosize, mp->mnt_stat.f_blocks,
2313 mp->mnt_stat.f_bfree, mp->mnt_stat.f_bavail);
2314
2315 (*pr)(" files %llu ffiles %llu favail %lld\n", mp->mnt_stat.f_files,
2316 mp->mnt_stat.f_ffree, mp->mnt_stat.f_favail);
2317
2318 (*pr)(" f_fsidx {0x%x, 0x%x} owner %u ctime 0x%llx\n",
2319 mp->mnt_stat.f_fsid.val[0], mp->mnt_stat.f_fsid.val[1],
2320 mp->mnt_stat.f_owner, mp->mnt_stat.f_ctime);
2321
2322 (*pr)(" syncwrites %llu asyncwrites = %llu\n",
2323 mp->mnt_stat.f_syncwrites, mp->mnt_stat.f_asyncwrites);
2324
2325 (*pr)(" syncreads %llu asyncreads = %llu\n",
2326 mp->mnt_stat.f_syncreads, mp->mnt_stat.f_asyncreads);
2327
2328 (*pr)(" fstype \"%s\" mnton \"%s\" mntfrom \"%s\" mntspec \"%s\"\n",
2329 mp->mnt_stat.f_fstypename, mp->mnt_stat.f_mntonname,
2330 mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntfromspec);
2331
2332 (*pr)("locked vnodes:");
2333 /* XXX would take mountlist lock, except ddb has no context */
2334 cnt = 0;
2335 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2336 if (VOP_ISLOCKED(vp)) {
2337 if (cnt == 0)
2338 (*pr)("\n %p", vp);
2339 else if ((cnt % (72 / (sizeof(void *) * 2 + 4))) == 0)
2340 (*pr)(",\n %p", vp);
2341 else
2342 (*pr)(", %p", vp);
2343 cnt++;
2344 }
2345 }
2346 (*pr)("\n");
2347
2348 if (full) {
2349 (*pr)("all vnodes:");
2350 /* XXX would take mountlist lock, except ddb has no context */
2351 cnt = 0;
2352 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2353 if (cnt == 0)
2354 (*pr)("\n %p", vp);
2355 else if ((cnt % (72 / (sizeof(void *) * 2 + 4))) == 0)
2356 (*pr)(",\n %p", vp);
2357 else
2358 (*pr)(", %p", vp);
2359 cnt++;
2360 }
2361 (*pr)("\n");
2362 }
2363 }
2364 #endif /* DDB */
2365
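/*
 * Fill in a statfs structure from the statistics cached in the
 * mount point.  The file system type name is always refreshed; the
 * remaining fields are copied only when the destination is not the
 * mount point's own mnt_stat.
 */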
2366 void
2367 copy_statfs_info(struct statfs *sbp, const struct mount *mp)
2368 {
2369 const struct statfs *mbp;
2370
2371 strncpy(sbp->f_fstypename, mp->mnt_vfc->vfc_name, MFSNAMELEN);
2372
2373 if (sbp == (mbp = &mp->mnt_stat))
2374 return;
2375
2376 sbp->f_fsid = mbp->f_fsid;
2377 sbp->f_owner = mbp->f_owner;
2378 sbp->f_flags = mbp->f_flags;
2379 sbp->f_syncwrites = mbp->f_syncwrites;
2380 sbp->f_asyncwrites = mbp->f_asyncwrites;
2381 sbp->f_syncreads = mbp->f_syncreads;
2382 sbp->f_asyncreads = mbp->f_asyncreads;
2383 sbp->f_namemax = mbp->f_namemax;
2384 memcpy(sbp->f_mntonname, mp->mnt_stat.f_mntonname, MNAMELEN);
2385 memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, MNAMELEN);
2386 memcpy(sbp->f_mntfromspec, mp->mnt_stat.f_mntfromspec, MNAMELEN);
2387 memcpy(&sbp->mount_info, &mp->mnt_stat.mount_info,
2388 sizeof(union mount_info));
2389 }
2390