1 /* $NetBSD: vfs_subr.c,v 1.449 2016/05/26 11:07:33 hannken Exp $ */
2
3 /*-
4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, by Andrew Doran,
10 * by Marshall Kirk McKusick and Greg Ganger at the University of Michigan.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Copyright (c) 1989, 1993
36 * The Regents of the University of California. All rights reserved.
37 * (c) UNIX System Laboratories, Inc.
38 * All or some portions of this file are derived from material licensed
39 * to the University of California by American Telephone and Telegraph
40 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
41 * the permission of UNIX System Laboratories, Inc.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * 3. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
68 */
69
70 #include <sys/cdefs.h>
71 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.449 2016/05/26 11:07:33 hannken Exp $");
72
73 #ifdef _KERNEL_OPT
74 #include "opt_ddb.h"
75 #include "opt_compat_netbsd.h"
76 #include "opt_compat_43.h"
77 #endif
78
79 #define _VFS_VNODE_PRIVATE /* for vcache_print(). */
80
81 #include <sys/param.h>
82 #include <sys/systm.h>
83 #include <sys/conf.h>
84 #include <sys/dirent.h>
85 #include <sys/filedesc.h>
86 #include <sys/kernel.h>
87 #include <sys/mount.h>
88 #include <sys/vnode.h>
89 #include <sys/stat.h>
90 #include <sys/sysctl.h>
91 #include <sys/namei.h>
92 #include <sys/buf.h>
93 #include <sys/errno.h>
94 #include <sys/kmem.h>
95 #include <sys/syscallargs.h>
96 #include <sys/kauth.h>
97 #include <sys/module.h>
98
99 #include <miscfs/genfs/genfs.h>
100 #include <miscfs/specfs/specdev.h>
101 #include <uvm/uvm_ddb.h>
102
103 const enum vtype iftovt_tab[16] = {
104 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
105 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
106 };
107 const int vttoif_tab[9] = {
108 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
109 S_IFSOCK, S_IFIFO, S_IFMT,
110 };
111
112 /*
113 * Insq/Remq for the vnode usage lists.
114 */
115 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
116 #define bufremvn(bp) { \
117 LIST_REMOVE(bp, b_vnbufs); \
118 (bp)->b_vnbufs.le_next = NOLIST; \
119 }
120
121 int doforce = 1; /* 1 => permit forcible unmounting */
122 int prtactive = 0; /* 1 => print out reclaim of active vnodes */
123
124 extern struct mount *dead_rootmount;
125
126 /*
127 * Local declarations.
128 */
129
130 static void vn_initialize_syncerd(void);
131
132 /*
133 * Initialize the vnode management data structures.
134 */
135 void
136 vntblinit(void)
137 {
138
139 vn_initialize_syncerd();
140 vfs_mount_sysinit();
141 vfs_vnode_sysinit();
142 }
143
144 /*
145 * Flush out and invalidate all buffers associated with a vnode.
146 * Called with the underlying vnode locked, which should prevent new dirty
147 * buffers from being queued.
148 */
149 int
150 vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
151 bool catch_p, int slptimeo)
152 {
153 struct buf *bp, *nbp;
154 int error;
155 int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
156 (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);
157
158 /* XXXUBC this doesn't look at flags or slp* */
159 mutex_enter(vp->v_interlock);
160 error = VOP_PUTPAGES(vp, 0, 0, flushflags);
161 if (error) {
162 return error;
163 }
164
165 if (flags & V_SAVE) {
166 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
167 if (error)
168 return (error);
169 KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
170 }
171
172 mutex_enter(&bufcache_lock);
173 restart:
174 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
175 KASSERT(bp->b_vp == vp);
176 nbp = LIST_NEXT(bp, b_vnbufs);
177 error = bbusy(bp, catch_p, slptimeo, NULL);
178 if (error != 0) {
179 if (error == EPASSTHROUGH)
180 goto restart;
181 mutex_exit(&bufcache_lock);
182 return (error);
183 }
184 brelsel(bp, BC_INVAL | BC_VFLUSH);
185 }
186
187 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
188 KASSERT(bp->b_vp == vp);
189 nbp = LIST_NEXT(bp, b_vnbufs);
190 error = bbusy(bp, catch_p, slptimeo, NULL);
191 if (error != 0) {
192 if (error == EPASSTHROUGH)
193 goto restart;
194 mutex_exit(&bufcache_lock);
195 return (error);
196 }
197 /*
198 * XXX Since there are no node locks for NFS, I believe
199 * there is a slight chance that a delayed write will
200 * occur while sleeping just above, so check for it.
201 */
202 if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
203 #ifdef DEBUG
204 printf("buffer still DELWRI\n");
205 #endif
206 bp->b_cflags |= BC_BUSY | BC_VFLUSH;
207 mutex_exit(&bufcache_lock);
208 VOP_BWRITE(bp->b_vp, bp);
209 mutex_enter(&bufcache_lock);
210 goto restart;
211 }
212 brelsel(bp, BC_INVAL | BC_VFLUSH);
213 }
214
215 #ifdef DIAGNOSTIC
216 if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
217 panic("vinvalbuf: flush failed, vp %p", vp);
218 #endif
219
220 mutex_exit(&bufcache_lock);
221
222 return (0);
223 }
224
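/*
 * Usage sketch (illustrative only): a file system about to reclaim or
 * revoke "vp" would typically write back modified data and then discard
 * every cached buffer, with the vnode already locked as required above
 * and "cred" being whatever credential the caller has in hand:
 *
 *      error = vinvalbuf(vp, V_SAVE, cred, curlwp, false, 0);
 *      if (error)
 *              return error;
 *
 * Passing 0 instead of V_SAVE throws dirty buffers away without writing
 * them first, as a forced revoke would.
 */
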
225 /*
226 * Destroy any in core blocks past the truncation length.
227 * Called with the underlying vnode locked, which should prevent new dirty
228 * buffers from being queued.
229 */
230 int
231 vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch_p, int slptimeo)
232 {
233 struct buf *bp, *nbp;
234 int error;
235 voff_t off;
236
237 off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
238 mutex_enter(vp->v_interlock);
239 error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
240 if (error) {
241 return error;
242 }
243
244 mutex_enter(&bufcache_lock);
245 restart:
246 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
247 KASSERT(bp->b_vp == vp);
248 nbp = LIST_NEXT(bp, b_vnbufs);
249 if (bp->b_lblkno < lbn)
250 continue;
251 error = bbusy(bp, catch_p, slptimeo, NULL);
252 if (error != 0) {
253 if (error == EPASSTHROUGH)
254 goto restart;
255 mutex_exit(&bufcache_lock);
256 return (error);
257 }
258 brelsel(bp, BC_INVAL | BC_VFLUSH);
259 }
260
261 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
262 KASSERT(bp->b_vp == vp);
263 nbp = LIST_NEXT(bp, b_vnbufs);
264 if (bp->b_lblkno < lbn)
265 continue;
266 error = bbusy(bp, catch_p, slptimeo, NULL);
267 if (error != 0) {
268 if (error == EPASSTHROUGH)
269 goto restart;
270 mutex_exit(&bufcache_lock);
271 return (error);
272 }
273 brelsel(bp, BC_INVAL | BC_VFLUSH);
274 }
275 mutex_exit(&bufcache_lock);
276
277 return (0);
278 }
279
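/*
 * Usage sketch (illustrative only): truncating a file to length zero
 * discards every cached block of the vnode:
 *
 *      error = vtruncbuf(vp, 0, false, 0);
 *
 * For a partial truncation the caller passes the first logical block
 * number that lies entirely beyond the new end of file; buffers with
 * b_lblkno below that value are left untouched.
 */
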
280 /*
281 * Flush all dirty buffers from a vnode.
282 * Called with the underlying vnode locked, which should prevent new dirty
283 * buffers from being queued.
284 */
285 int
286 vflushbuf(struct vnode *vp, int flags)
287 {
288 struct buf *bp, *nbp;
289 int error, pflags;
290 bool dirty, sync;
291
292 sync = (flags & FSYNC_WAIT) != 0;
293 pflags = PGO_CLEANIT | PGO_ALLPAGES |
294 (sync ? PGO_SYNCIO : 0) |
295 ((flags & FSYNC_LAZY) ? PGO_LAZY : 0);
296 mutex_enter(vp->v_interlock);
297 (void) VOP_PUTPAGES(vp, 0, 0, pflags);
298
299 loop:
300 mutex_enter(&bufcache_lock);
301 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
302 KASSERT(bp->b_vp == vp);
303 nbp = LIST_NEXT(bp, b_vnbufs);
304 if ((bp->b_cflags & BC_BUSY))
305 continue;
306 if ((bp->b_oflags & BO_DELWRI) == 0)
307 panic("vflushbuf: not dirty, bp %p", bp);
308 bp->b_cflags |= BC_BUSY | BC_VFLUSH;
309 mutex_exit(&bufcache_lock);
310 /*
311 * Wait for I/O associated with indirect blocks to complete,
312 * since there is no way to quickly wait for them below.
313 */
314 if (bp->b_vp == vp || !sync)
315 (void) bawrite(bp);
316 else {
317 error = bwrite(bp);
318 if (error)
319 return error;
320 }
321 goto loop;
322 }
323 mutex_exit(&bufcache_lock);
324
325 if (!sync)
326 return 0;
327
328 mutex_enter(vp->v_interlock);
329 while (vp->v_numoutput != 0)
330 cv_wait(&vp->v_cv, vp->v_interlock);
331 dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
332 mutex_exit(vp->v_interlock);
333
334 if (dirty) {
335 vprint("vflushbuf: dirty", vp);
336 goto loop;
337 }
338
339 return 0;
340 }
341
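/*
 * Usage sketch (illustrative only): a VOP_FSYNC implementation that only
 * has buffer cache data to worry about can flush a vnode synchronously
 * with
 *
 *      error = vflushbuf(vp, FSYNC_WAIT);
 *
 * while passing FSYNC_LAZY instead requests lazy, syncer-driven
 * write-back and returns without waiting for the I/O to complete.
 */
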
342 /*
343 * Create a vnode for a block device.
344 * Used for root filesystem and swap areas.
345 * Also used for memory file system special devices.
346 */
347 int
348 bdevvp(dev_t dev, vnode_t **vpp)
349 {
350 struct vattr va;
351
352 vattr_null(&va);
353 va.va_type = VBLK;
354 va.va_rdev = dev;
355
356 return vcache_new(dead_rootmount, NULL, &va, NOCRED, vpp);
357 }
358
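/*
 * Usage sketch (illustrative only): this is roughly how the root device
 * obtains its vnode when the root file system is mounted:
 *
 *      if (bdevvp(rootdev, &rootvp) != 0)
 *              panic("cannot obtain vnode for root device");
 *
 * The vnode hangs off the dead mount and is released with vrele() once
 * it is no longer needed.
 */
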
359 /*
360 * Create a vnode for a character device.
361 * Used for kernfs and some console handling.
362 */
363 int
364 cdevvp(dev_t dev, vnode_t **vpp)
365 {
366 struct vattr va;
367
368 vattr_null(&va);
369 va.va_type = VCHR;
370 va.va_rdev = dev;
371
372 return vcache_new(dead_rootmount, NULL, &va, NOCRED, vpp);
373 }
374
375 /*
376 * Associate a buffer with a vnode. There must already be a hold on
377 * the vnode.
378 */
379 void
380 bgetvp(struct vnode *vp, struct buf *bp)
381 {
382
383 KASSERT(bp->b_vp == NULL);
384 KASSERT(bp->b_objlock == &buffer_lock);
385 KASSERT(mutex_owned(vp->v_interlock));
386 KASSERT(mutex_owned(&bufcache_lock));
387 KASSERT((bp->b_cflags & BC_BUSY) != 0);
388 KASSERT(!cv_has_waiters(&bp->b_done));
389
390 vholdl(vp);
391 bp->b_vp = vp;
392 if (vp->v_type == VBLK || vp->v_type == VCHR)
393 bp->b_dev = vp->v_rdev;
394 else
395 bp->b_dev = NODEV;
396
397 /*
398 * Insert onto list for new vnode.
399 */
400 bufinsvn(bp, &vp->v_cleanblkhd);
401 bp->b_objlock = vp->v_interlock;
402 }
403
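/*
 * Locking sketch (illustrative only) matching the assertions above: the
 * caller already owns the busy buffer and takes the buffer cache lock
 * and then the vnode's interlock before attaching it:
 *
 *      mutex_enter(&bufcache_lock);
 *      mutex_enter(vp->v_interlock);
 *      bgetvp(vp, bp);
 *      mutex_exit(vp->v_interlock);
 *      mutex_exit(&bufcache_lock);
 */
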
404 /*
405 * Disassociate a buffer from a vnode.
406 */
407 void
408 brelvp(struct buf *bp)
409 {
410 struct vnode *vp = bp->b_vp;
411
412 KASSERT(vp != NULL);
413 KASSERT(bp->b_objlock == vp->v_interlock);
414 KASSERT(mutex_owned(vp->v_interlock));
415 KASSERT(mutex_owned(&bufcache_lock));
416 KASSERT((bp->b_cflags & BC_BUSY) != 0);
417 KASSERT(!cv_has_waiters(&bp->b_done));
418
419 /*
420 * Delete from old vnode list, if on one.
421 */
422 if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
423 bufremvn(bp);
424
425 if (vp->v_uobj.uo_npages == 0 && (vp->v_iflag & VI_ONWORKLST) &&
426 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
427 vp->v_iflag &= ~VI_WRMAPDIRTY;
428 vn_syncer_remove_from_worklist(vp);
429 }
430
431 bp->b_objlock = &buffer_lock;
432 bp->b_vp = NULL;
433 holdrelel(vp);
434 }
435
436 /*
437 * Reassign a buffer from one vnode list to another.
438 * The list reassignment must be within the same vnode.
439 * Used to assign file specific control information
440 * (indirect blocks) to the list to which they belong.
441 */
442 void
443 reassignbuf(struct buf *bp, struct vnode *vp)
444 {
445 struct buflists *listheadp;
446 int delayx;
447
448 KASSERT(mutex_owned(&bufcache_lock));
449 KASSERT(bp->b_objlock == vp->v_interlock);
450 KASSERT(mutex_owned(vp->v_interlock));
451 KASSERT((bp->b_cflags & BC_BUSY) != 0);
452
453 /*
454 * Delete from old vnode list, if on one.
455 */
456 if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
457 bufremvn(bp);
458
459 /*
460 * If dirty, put on list of dirty buffers;
461 * otherwise insert onto list of clean buffers.
462 */
463 if ((bp->b_oflags & BO_DELWRI) == 0) {
464 listheadp = &vp->v_cleanblkhd;
465 if (vp->v_uobj.uo_npages == 0 &&
466 (vp->v_iflag & VI_ONWORKLST) &&
467 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
468 vp->v_iflag &= ~VI_WRMAPDIRTY;
469 vn_syncer_remove_from_worklist(vp);
470 }
471 } else {
472 listheadp = &vp->v_dirtyblkhd;
473 if ((vp->v_iflag & VI_ONWORKLST) == 0) {
474 switch (vp->v_type) {
475 case VDIR:
476 delayx = dirdelay;
477 break;
478 case VBLK:
479 if (spec_node_getmountedfs(vp) != NULL) {
480 delayx = metadelay;
481 break;
482 }
483 /* fall through */
484 default:
485 delayx = filedelay;
486 break;
487 }
488 if (!vp->v_mount ||
489 (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
490 vn_syncer_add_to_worklist(vp, delayx);
491 }
492 }
493 bufinsvn(bp, listheadp);
494 }
495
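/*
 * Usage sketch (illustrative only): bdwrite()-style code that has just
 * marked a busy buffer delayed-write moves it to its vnode's dirty list
 * roughly like this (b_objlock is the vnode's interlock here):
 *
 *      mutex_enter(&bufcache_lock);
 *      mutex_enter(bp->b_objlock);
 *      bp->b_oflags |= BO_DELWRI;
 *      reassignbuf(bp, bp->b_vp);
 *      mutex_exit(bp->b_objlock);
 *      mutex_exit(&bufcache_lock);
 */
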
496 /*
497 * Lookup a vnode by device number and return it referenced.
498 */
499 int
500 vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
501 {
502
503 return (spec_node_lookup_by_dev(type, dev, vpp) == 0);
504 }
505
506 /*
507 * Revoke all the vnodes corresponding to the specified minor number
508 * range (endpoints inclusive) of the specified major.
509 */
510 void
511 vdevgone(int maj, int minl, int minh, enum vtype type)
512 {
513 vnode_t *vp;
514 dev_t dev;
515 int mn;
516
517 for (mn = minl; mn <= minh; mn++) {
518 dev = makedev(maj, mn);
519 while (spec_node_lookup_by_dev(type, dev, &vp) == 0) {
520 VOP_REVOKE(vp, REVOKEALL);
521 vrele(vp);
522 }
523 }
524 }
525
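/*
 * Usage sketch (illustrative only): a disk driver's detach routine
 * revokes every vnode that may still reference the departing unit, once
 * per device type it exposes (the xx_bdevsw/xx_cdevsw names stand in for
 * the driver's real switch structures):
 *
 *      bmaj = bdevsw_lookup_major(&xx_bdevsw);
 *      cmaj = cdevsw_lookup_major(&xx_cdevsw);
 *      mn = DISKMINOR(unit, 0);
 *      vdevgone(bmaj, mn, mn + MAXPARTITIONS - 1, VBLK);
 *      vdevgone(cmaj, mn, mn + MAXPARTITIONS - 1, VCHR);
 */
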
526 /*
527 * The filesystem synchronizer mechanism - syncer.
528 *
529 * It is useful to delay writes of file data and filesystem metadata for
530 * a certain amount of time so that quickly created and deleted files need
531 * not waste disk bandwidth being created and removed. To implement this,
532 * vnodes are appended to a "workitem" queue.
533 *
534 * Most pending metadata should not wait for more than ten seconds. Thus,
535 * metadata on file systems mounted on block devices is delayed only about
536 * a third as long as file data. Similarly, directory updates are more
537 * critical, so they are delayed only about half as long as file data.
538 *
539 * There are SYNCER_MAXDELAY queues that are processed in a round-robin
540 * manner at a rate of one each second (driven off the filesystem syncer
541 * thread). The syncer_delayno variable indicates the next queue that is
542 * to be processed. Items that need to be processed soon are placed in
543 * this queue:
544 *
545 * syncer_workitem_pending[syncer_delayno]
546 *
547 * A delay of e.g. fifteen seconds is done by placing the request fifteen
548 * entries later in the queue:
549 *
550 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
551 *
552 * The VI_ONWORKLST flag indicates that the vnode is on one of these queues.
553 */
554
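/*
 * Worked example of the slot arithmetic (values picked for illustration):
 * with syncer_last == 34 (SYNCER_MAXDELAY + 2) and the sweep currently at
 * syncer_delayno == 20, asking for a 15 second delay in
 * vn_syncer_add_to_worklist() files the vnode under
 *
 *      (20 + 15) % 34 == 1
 *
 * i.e. syncer_workitem_pending[1], which the once-a-second sweep in
 * sched_sync() reaches 15 iterations later.  Delays larger than
 * syncer_maxdelay - 2 are clamped first; see sync_delay_slot() below.
 */
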
555 #define SYNCER_MAXDELAY 32
556
557 typedef TAILQ_HEAD(synclist, vnode) synclist_t;
558
559 static void vn_syncer_add1(struct vnode *, int);
560 static void sysctl_vfs_syncfs_setup(struct sysctllog **);
561
562 /*
563 * Defines and variables for the syncer process.
564 */
565 int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
566 time_t syncdelay = 30; /* max time to delay syncing data */
567 time_t filedelay = 30; /* time to delay syncing files */
568 time_t dirdelay = 15; /* time to delay syncing directories */
569 time_t metadelay = 10; /* time to delay syncing metadata */
570 time_t lockdelay = 1; /* time to delay if locking fails */
571
572 kmutex_t syncer_mutex; /* used to freeze syncer, long term */
573 static kmutex_t syncer_data_lock; /* short term lock on data structs */
574
575 static int syncer_delayno = 0;
576 static long syncer_last;
577 static synclist_t * syncer_workitem_pending;
578
579 static void
580 vn_initialize_syncerd(void)
581 {
582 int i;
583
584 syncer_last = SYNCER_MAXDELAY + 2;
585
586 sysctl_vfs_syncfs_setup(NULL);
587
588 syncer_workitem_pending =
589 kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP);
590
591 for (i = 0; i < syncer_last; i++)
592 TAILQ_INIT(&syncer_workitem_pending[i]);
593
594 mutex_init(&syncer_mutex, MUTEX_DEFAULT, IPL_NONE);
595 mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE);
596 }
597
598 /*
599 * Return delay factor appropriate for the given file system. For
600 * WAPBL we use the sync vnode to burst out metadata updates: sync
601 * those file systems more frequently.
602 */
603 static inline int
604 sync_delay(struct mount *mp)
605 {
606
607 return mp->mnt_wapbl != NULL ? metadelay : syncdelay;
608 }
609
610 /*
611 * Compute the next slot index from delay.
612 */
613 static inline int
614 sync_delay_slot(int delayx)
615 {
616
617 if (delayx > syncer_maxdelay - 2)
618 delayx = syncer_maxdelay - 2;
619 return (syncer_delayno + delayx) % syncer_last;
620 }
621
622 /*
623 * Add an item to the syncer work queue.
624 */
625 static void
626 vn_syncer_add1(struct vnode *vp, int delayx)
627 {
628 synclist_t *slp;
629
630 KASSERT(mutex_owned(&syncer_data_lock));
631
632 if (vp->v_iflag & VI_ONWORKLST) {
633 /*
634 * Remove in order to adjust the position of the vnode.
635 * Note: called from sched_sync(), which will not hold
636 * interlock, therefore we cannot modify v_iflag here.
637 */
638 slp = &syncer_workitem_pending[vp->v_synclist_slot];
639 TAILQ_REMOVE(slp, vp, v_synclist);
640 } else {
641 KASSERT(mutex_owned(vp->v_interlock));
642 vp->v_iflag |= VI_ONWORKLST;
643 }
644
645 vp->v_synclist_slot = sync_delay_slot(delayx);
646
647 slp = &syncer_workitem_pending[vp->v_synclist_slot];
648 TAILQ_INSERT_TAIL(slp, vp, v_synclist);
649 }
650
651 void
652 vn_syncer_add_to_worklist(struct vnode *vp, int delayx)
653 {
654
655 KASSERT(mutex_owned(vp->v_interlock));
656
657 mutex_enter(&syncer_data_lock);
658 vn_syncer_add1(vp, delayx);
659 mutex_exit(&syncer_data_lock);
660 }
661
662 /*
663 * Remove an item from the syncer work queue.
664 */
665 void
666 vn_syncer_remove_from_worklist(struct vnode *vp)
667 {
668 synclist_t *slp;
669
670 KASSERT(mutex_owned(vp->v_interlock));
671
672 mutex_enter(&syncer_data_lock);
673 if (vp->v_iflag & VI_ONWORKLST) {
674 vp->v_iflag &= ~VI_ONWORKLST;
675 slp = &syncer_workitem_pending[vp->v_synclist_slot];
676 TAILQ_REMOVE(slp, vp, v_synclist);
677 }
678 mutex_exit(&syncer_data_lock);
679 }
680
681 /*
682 * Add this mount point to the syncer.
683 */
684 void
685 vfs_syncer_add_to_worklist(struct mount *mp)
686 {
687 static int start, incr, next;
688 int vdelay;
689
690 KASSERT(mutex_owned(&mp->mnt_updating));
691 KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) == 0);
692
693 /*
694 * We attempt to scatter the mount points on the list
695 * so that they will go off at evenly distributed times
696 * even if all the filesystems are mounted at once.
697 */
698
699 next += incr;
700 if (next == 0 || next > syncer_maxdelay) {
701 start /= 2;
702 incr /= 2;
703 if (start == 0) {
704 start = syncer_maxdelay / 2;
705 incr = syncer_maxdelay;
706 }
707 next = start;
708 }
709 mp->mnt_iflag |= IMNT_ONWORKLIST;
710 vdelay = sync_delay(mp);
711 mp->mnt_synclist_slot = vdelay > 0 ? next % vdelay : 0;
712 }
713
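/*
 * Worked example of the scattering above (syncer_maxdelay == 32):
 * successive calls see "next" take the values
 *
 *      16, 8, 24, 4, 12, 20, 28, 2, 6, 10, ...
 *
 * so the interval keeps being bisected and even file systems mounted back
 * to back at boot end up with well separated mnt_synclist_slot values
 * (next % sync_delay(mp)) rather than all being synced in the same second.
 */
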
714 /*
715 * Remove the mount point from the syncer.
716 */
717 void
718 vfs_syncer_remove_from_worklist(struct mount *mp)
719 {
720
721 KASSERT(mutex_owned(&mp->mnt_updating));
722 KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) != 0);
723
724 mp->mnt_iflag &= ~IMNT_ONWORKLIST;
725 }
726
727 /*
728 * Try lazy sync, return true on success.
729 */
730 static bool
731 lazy_sync_vnode(struct vnode *vp)
732 {
733 bool synced;
734
735 KASSERT(mutex_owned(&syncer_data_lock));
736
737 synced = false;
738 /* We are locking in the wrong direction. */
739 if (mutex_tryenter(vp->v_interlock)) {
740 mutex_exit(&syncer_data_lock);
741 if (vget(vp, LK_NOWAIT, false /* !wait */) == 0) {
742 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
743 synced = true;
744 (void) VOP_FSYNC(vp, curlwp->l_cred,
745 FSYNC_LAZY, 0, 0);
746 vput(vp);
747 } else
748 vrele(vp);
749 }
750 mutex_enter(&syncer_data_lock);
751 }
752 return synced;
753 }
754
755 /*
756 * System filesystem synchronizer daemon.
757 */
758 void
759 sched_sync(void *arg)
760 {
761 synclist_t *slp;
762 struct vnode *vp;
763 struct mount *mp, *nmp;
764 time_t starttime;
765 bool synced;
766
767 for (;;) {
768 mutex_enter(&syncer_mutex);
769
770 starttime = time_second;
771
772 /*
773 * Sync mounts whose dirty time has expired.
774 */
775 mutex_enter(&mountlist_lock);
776 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
777 if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0 ||
778 mp->mnt_synclist_slot != syncer_delayno) {
779 nmp = TAILQ_NEXT(mp, mnt_list);
780 continue;
781 }
782 mp->mnt_synclist_slot = sync_delay_slot(sync_delay(mp));
783 if (vfs_busy(mp, &nmp))
784 continue;
785 VFS_SYNC(mp, MNT_LAZY, curlwp->l_cred);
786 vfs_unbusy(mp, false, &nmp);
787 }
788 mutex_exit(&mountlist_lock);
789
790 mutex_enter(&syncer_data_lock);
791
792 /*
793 * Push files whose dirty time has expired.
794 */
795 slp = &syncer_workitem_pending[syncer_delayno];
796 syncer_delayno += 1;
797 if (syncer_delayno >= syncer_last)
798 syncer_delayno = 0;
799
800 while ((vp = TAILQ_FIRST(slp)) != NULL) {
801 synced = lazy_sync_vnode(vp);
802
803 /*
804 * XXX The vnode may have been recycled, in which
805 * case it may have a new identity.
806 */
807 if (TAILQ_FIRST(slp) == vp) {
808 /*
809 * Put us back on the worklist. The worklist
810 * routine will remove us from our current
811 * position and then add us back in at a later
812 * position.
813 *
814 * Try again sooner rather than later if
815 * we were unable to lock the vnode. Lock
816 * failure should not prevent us from doing
817 * the sync "soon".
818 *
819 * If we locked it yet arrive here, it's
820 * likely that lazy sync is in progress and
821 * so the vnode still has dirty metadata.
822 * syncdelay is mainly to get this vnode out
823 * of the way so we do not consider it again
824 * "soon" in this loop, so the delay time is
825 * not critical as long as it is not "soon".
826 * While write-back strategy is the file
827 * system's domain, we expect write-back to
828 * occur no later than syncdelay seconds
829 * into the future.
830 */
831 vn_syncer_add1(vp,
832 synced ? syncdelay : lockdelay);
833 }
834 }
835 mutex_exit(&syncer_mutex);
836
837 /*
838 * If it has taken us less than a second to process the
839 * current work, then wait. Otherwise start right over
840 * again. We can still lose time if any single round
841 * takes more than two seconds, but it does not really
842 * matter as we are just trying to generally pace the
843 * filesystem activity.
844 */
845 if (time_second == starttime) {
846 kpause("syncer", false, hz, &syncer_data_lock);
847 }
848 mutex_exit(&syncer_data_lock);
849 }
850 }
851
852 static void
853 sysctl_vfs_syncfs_setup(struct sysctllog **clog)
854 {
855 const struct sysctlnode *rnode, *cnode;
856
857 sysctl_createv(clog, 0, NULL, &rnode,
858 CTLFLAG_PERMANENT,
859 CTLTYPE_NODE, "sync",
860 SYSCTL_DESCR("syncer options"),
861 NULL, 0, NULL, 0,
862 CTL_VFS, CTL_CREATE, CTL_EOL);
863
864 sysctl_createv(clog, 0, &rnode, &cnode,
865 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
866 CTLTYPE_QUAD, "delay",
867 SYSCTL_DESCR("max time to delay syncing data"),
868 NULL, 0, &syncdelay, 0,
869 CTL_CREATE, CTL_EOL);
870
871 sysctl_createv(clog, 0, &rnode, &cnode,
872 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
873 CTLTYPE_QUAD, "filedelay",
874 SYSCTL_DESCR("time to delay syncing files"),
875 NULL, 0, &filedelay, 0,
876 CTL_CREATE, CTL_EOL);
877
878 sysctl_createv(clog, 0, &rnode, &cnode,
879 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
880 CTLTYPE_QUAD, "dirdelay",
881 SYSCTL_DESCR("time to delay syncing directories"),
882 NULL, 0, &dirdelay, 0,
883 CTL_CREATE, CTL_EOL);
884
885 sysctl_createv(clog, 0, &rnode, &cnode,
886 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
887 CTLTYPE_QUAD, "metadelay",
888 SYSCTL_DESCR("time to delay syncing metadata"),
889 NULL, 0, &metadelay, 0,
890 CTL_CREATE, CTL_EOL);
891 }
892
893 /*
894 * sysctl helper routine to return list of supported fstypes
895 */
896 int
897 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
898 {
899 char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
900 char *where = oldp;
901 struct vfsops *v;
902 size_t needed, left, slen;
903 int error, first;
904
905 if (newp != NULL)
906 return (EPERM);
907 if (namelen != 0)
908 return (EINVAL);
909
910 first = 1;
911 error = 0;
912 needed = 0;
913 left = *oldlenp;
914
915 sysctl_unlock();
916 mutex_enter(&vfs_list_lock);
917 LIST_FOREACH(v, &vfs_list, vfs_list) {
918 if (where == NULL)
919 needed += strlen(v->vfs_name) + 1;
920 else {
921 memset(bf, 0, sizeof(bf));
922 if (first) {
923 strncpy(bf, v->vfs_name, sizeof(bf));
924 first = 0;
925 } else {
926 bf[0] = ' ';
927 strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
928 }
929 bf[sizeof(bf)-1] = '\0';
930 slen = strlen(bf);
931 if (left < slen + 1)
932 break;
933 v->vfs_refcount++;
934 mutex_exit(&vfs_list_lock);
935 /* +1 to copy out the trailing NUL byte */
936 error = copyout(bf, where, slen + 1);
937 mutex_enter(&vfs_list_lock);
938 v->vfs_refcount--;
939 if (error)
940 break;
941 where += slen;
942 needed += slen;
943 left -= slen;
944 }
945 }
946 mutex_exit(&vfs_list_lock);
947 sysctl_relock();
948 *oldlenp = needed;
949 return (error);
950 }
951
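/*
 * Usage sketch (illustrative only): userland reads this node, registered
 * as vfs.generic.fstypes, with the usual two step sysctl dance: first
 * with a NULL buffer to learn the size, then again to fetch the space
 * separated list:
 *
 *      size_t len;
 *      sysctlbyname("vfs.generic.fstypes", NULL, &len, NULL, 0);
 *      char *buf = malloc(len);
 *      sysctlbyname("vfs.generic.fstypes", buf, &len, NULL, 0);
 */
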
952 int kinfo_vdebug = 1;
953 int kinfo_vgetfailed;
954
955 #define KINFO_VNODESLOP 10
956
957 /*
958 * Dump vnode list (via sysctl).
959 * Copyout address of vnode followed by vnode.
960 */
961 int
962 sysctl_kern_vnode(SYSCTLFN_ARGS)
963 {
964 char *where = oldp;
965 size_t *sizep = oldlenp;
966 struct mount *mp, *nmp;
967 vnode_t *vp, vbuf;
968 struct vnode_iterator *marker;
969 char *bp = where;
970 char *ewhere;
971 int error;
972
973 if (namelen != 0)
974 return (EOPNOTSUPP);
975 if (newp != NULL)
976 return (EPERM);
977
978 #define VPTRSZ sizeof(vnode_t *)
979 #define VNODESZ sizeof(vnode_t)
980 if (where == NULL) {
981 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
982 return (0);
983 }
984 ewhere = where + *sizep;
985
986 sysctl_unlock();
987 mutex_enter(&mountlist_lock);
988 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
989 if (vfs_busy(mp, &nmp)) {
990 continue;
991 }
992 vfs_vnode_iterator_init(mp, &marker);
993 while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) {
994 if (bp + VPTRSZ + VNODESZ > ewhere) {
995 vrele(vp);
996 vfs_vnode_iterator_destroy(marker);
997 vfs_unbusy(mp, false, NULL);
998 sysctl_relock();
999 *sizep = bp - where;
1000 return (ENOMEM);
1001 }
1002 memcpy(&vbuf, vp, VNODESZ);
1003 if ((error = copyout(&vp, bp, VPTRSZ)) ||
1004 (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
1005 vrele(vp);
1006 vfs_vnode_iterator_destroy(marker);
1007 vfs_unbusy(mp, false, NULL);
1008 sysctl_relock();
1009 return (error);
1010 }
1011 vrele(vp);
1012 bp += VPTRSZ + VNODESZ;
1013 }
1014 vfs_vnode_iterator_destroy(marker);
1015 vfs_unbusy(mp, false, &nmp);
1016 }
1017 mutex_exit(&mountlist_lock);
1018 sysctl_relock();
1019
1020 *sizep = bp - where;
1021 return (0);
1022 }
1023
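/*
 * Consumer sketch (illustrative only, "buf" and "len" assumed from the
 * caller): the buffer filled in above is a packed sequence of records,
 * each a kernel pointer immediately followed by a copy of the vnode it
 * points at, so a reader walks it as
 *
 *      for (char *p = buf; p + VPTRSZ + VNODESZ <= buf + len;
 *          p += VPTRSZ + VNODESZ) {
 *              vnode_t *kaddr;
 *              vnode_t vn;
 *              memcpy(&kaddr, p, VPTRSZ);
 *              memcpy(&vn, p + VPTRSZ, VNODESZ);
 *              ...
 *      }
 */
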
1024 /*
1025 * Set vnode attributes to VNOVAL
1026 */
1027 void
1028 vattr_null(struct vattr *vap)
1029 {
1030
1031 memset(vap, 0, sizeof(*vap));
1032
1033 vap->va_type = VNON;
1034
1035 /*
1036 * Assign individually so that it is safe even if the size and
1037 * sign of each member vary.
1038 */
1039 vap->va_mode = VNOVAL;
1040 vap->va_nlink = VNOVAL;
1041 vap->va_uid = VNOVAL;
1042 vap->va_gid = VNOVAL;
1043 vap->va_fsid = VNOVAL;
1044 vap->va_fileid = VNOVAL;
1045 vap->va_size = VNOVAL;
1046 vap->va_blocksize = VNOVAL;
1047 vap->va_atime.tv_sec =
1048 vap->va_mtime.tv_sec =
1049 vap->va_ctime.tv_sec =
1050 vap->va_birthtime.tv_sec = VNOVAL;
1051 vap->va_atime.tv_nsec =
1052 vap->va_mtime.tv_nsec =
1053 vap->va_ctime.tv_nsec =
1054 vap->va_birthtime.tv_nsec = VNOVAL;
1055 vap->va_gen = VNOVAL;
1056 vap->va_flags = VNOVAL;
1057 vap->va_rdev = VNOVAL;
1058 vap->va_bytes = VNOVAL;
1059 }
1060
1061 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
1062 #define ARRAY_PRINT(idx, arr) \
1063 ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")
1064
1065 const char * const vnode_tags[] = { VNODE_TAGS };
1066 const char * const vnode_types[] = { VNODE_TYPES };
1067 const char vnode_flagbits[] = VNODE_FLAGBITS;
1068
1069 /*
1070 * Print out a description of a vnode.
1071 */
1072 void
1073 vprint(const char *label, struct vnode *vp)
1074 {
1075 char bf[96];
1076 int flag;
1077
1078 flag = vp->v_iflag | vp->v_vflag | vp->v_uflag;
1079 snprintb(bf, sizeof(bf), vnode_flagbits, flag);
1080
1081 if (label != NULL)
1082 printf("%s: ", label);
1083 printf("vnode @ %p, flags (%s)\n\ttag %s(%d), type %s(%d), "
1084 "usecount %d, writecount %d, holdcount %d\n"
1085 "\tfreelisthd %p, mount %p, data %p lock %p\n",
1086 vp, bf, ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
1087 ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
1088 vp->v_usecount, vp->v_writecount, vp->v_holdcnt,
1089 vp->v_freelisthd, vp->v_mount, vp->v_data, &vp->v_lock);
1090 vcache_print(vp, "\t", printf);
1091 if (vp->v_data != NULL) {
1092 printf("\t");
1093 VOP_PRINT(vp);
1094 }
1095 }
1096
1097 /* Deprecated. Kept for KPI compatibility. */
1098 int
1099 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
1100 mode_t acc_mode, kauth_cred_t cred)
1101 {
1102
1103 #ifdef DIAGNOSTIC
1104 printf("vaccess: deprecated interface used.\n");
1105 #endif /* DIAGNOSTIC */
1106
1107 return kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(acc_mode,
1108 type, file_mode), NULL /* This may panic. */, NULL,
1109 genfs_can_access(type, file_mode, uid, gid, acc_mode, cred));
1110 }
1111
1112 /*
1113 * Given a file system name, look up the vfsops for that
1114 * file system, or return NULL if file system isn't present
1115 * in the kernel.
1116 */
1117 struct vfsops *
1118 vfs_getopsbyname(const char *name)
1119 {
1120 struct vfsops *v;
1121
1122 mutex_enter(&vfs_list_lock);
1123 LIST_FOREACH(v, &vfs_list, vfs_list) {
1124 if (strcmp(v->vfs_name, name) == 0)
1125 break;
1126 }
1127 if (v != NULL)
1128 v->vfs_refcount++;
1129 mutex_exit(&vfs_list_lock);
1130
1131 return (v);
1132 }
1133
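/*
 * Usage sketch (illustrative only): the vfsops returned above carries a
 * reference (vfs_refcount was bumped), so a caller probing for a file
 * system type must drop it again when done:
 *
 *      struct vfsops *v = vfs_getopsbyname("ffs");
 *      if (v == NULL)
 *              return ENODEV;
 *      ...
 *      vfs_delref(v);
 */
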
1134 void
1135 copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
1136 {
1137 const struct statvfs *mbp;
1138
1139 if (sbp == (mbp = &mp->mnt_stat))
1140 return;
1141
1142 (void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
1143 sbp->f_fsid = mbp->f_fsid;
1144 sbp->f_owner = mbp->f_owner;
1145 sbp->f_flag = mbp->f_flag;
1146 sbp->f_syncwrites = mbp->f_syncwrites;
1147 sbp->f_asyncwrites = mbp->f_asyncwrites;
1148 sbp->f_syncreads = mbp->f_syncreads;
1149 sbp->f_asyncreads = mbp->f_asyncreads;
1150 (void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
1151 (void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
1152 sizeof(sbp->f_fstypename));
1153 (void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
1154 sizeof(sbp->f_mntonname));
1155 (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
1156 sizeof(sbp->f_mntfromname));
1157 sbp->f_namemax = mbp->f_namemax;
1158 }
1159
1160 int
1161 set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
1162 const char *vfsname, struct mount *mp, struct lwp *l)
1163 {
1164 int error;
1165 size_t size;
1166 struct statvfs *sfs = &mp->mnt_stat;
1167 int (*fun)(const void *, void *, size_t, size_t *);
1168
1169 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsname,
1170 sizeof(mp->mnt_stat.f_fstypename));
1171
1172 if (onp) {
1173 struct cwdinfo *cwdi = l->l_proc->p_cwdi;
1174 fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
1175 if (cwdi->cwdi_rdir != NULL) {
1176 size_t len;
1177 char *bp;
1178 char *path = PNBUF_GET();
1179
1180 bp = path + MAXPATHLEN;
1181 *--bp = '\0';
1182 rw_enter(&cwdi->cwdi_lock, RW_READER);
1183 error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
1184 path, MAXPATHLEN / 2, 0, l);
1185 rw_exit(&cwdi->cwdi_lock);
1186 if (error) {
1187 PNBUF_PUT(path);
1188 return error;
1189 }
1190
1191 len = strlen(bp);
1192 if (len > sizeof(sfs->f_mntonname) - 1)
1193 len = sizeof(sfs->f_mntonname) - 1;
1194 (void)strncpy(sfs->f_mntonname, bp, len);
1195 PNBUF_PUT(path);
1196
1197 if (len < sizeof(sfs->f_mntonname) - 1) {
1198 error = (*fun)(onp, &sfs->f_mntonname[len],
1199 sizeof(sfs->f_mntonname) - len - 1, &size);
1200 if (error)
1201 return error;
1202 size += len;
1203 } else {
1204 size = len;
1205 }
1206 } else {
1207 error = (*fun)(onp, &sfs->f_mntonname,
1208 sizeof(sfs->f_mntonname) - 1, &size);
1209 if (error)
1210 return error;
1211 }
1212 (void)memset(sfs->f_mntonname + size, 0,
1213 sizeof(sfs->f_mntonname) - size);
1214 }
1215
1216 if (fromp) {
1217 fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
1218 error = (*fun)(fromp, sfs->f_mntfromname,
1219 sizeof(sfs->f_mntfromname) - 1, &size);
1220 if (error)
1221 return error;
1222 (void)memset(sfs->f_mntfromname + size, 0,
1223 sizeof(sfs->f_mntfromname) - size);
1224 }
1225 return 0;
1226 }
1227
1228 void
1229 vfs_timestamp(struct timespec *ts)
1230 {
1231
1232 nanotime(ts);
1233 }
1234
1235 time_t rootfstime; /* recorded root fs time, if known */
1236 void
1237 setrootfstime(time_t t)
1238 {
1239 rootfstime = t;
1240 }
1241
1242 static const uint8_t vttodt_tab[ ] = {
1243 [VNON] = DT_UNKNOWN,
1244 [VREG] = DT_REG,
1245 [VDIR] = DT_DIR,
1246 [VBLK] = DT_BLK,
1247 [VCHR] = DT_CHR,
1248 [VLNK] = DT_LNK,
1249 [VSOCK] = DT_SOCK,
1250 [VFIFO] = DT_FIFO,
1251 [VBAD] = DT_UNKNOWN
1252 };
1253
1254 uint8_t
1255 vtype2dt(enum vtype vt)
1256 {
1257
1258 CTASSERT(VBAD == __arraycount(vttodt_tab) - 1);
1259 return vttodt_tab[vt];
1260 }
1261
1262 int
1263 VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c)
1264 {
1265 int error;
1266
1267 KERNEL_LOCK(1, NULL);
1268 error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c);
1269 KERNEL_UNLOCK_ONE(NULL);
1270
1271 return error;
1272 }
1273
1274 int
1275 VFS_START(struct mount *mp, int a)
1276 {
1277 int error;
1278
1279 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1280 KERNEL_LOCK(1, NULL);
1281 }
1282 error = (*(mp->mnt_op->vfs_start))(mp, a);
1283 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1284 KERNEL_UNLOCK_ONE(NULL);
1285 }
1286
1287 return error;
1288 }
1289
1290 int
1291 VFS_UNMOUNT(struct mount *mp, int a)
1292 {
1293 int error;
1294
1295 KERNEL_LOCK(1, NULL);
1296 error = (*(mp->mnt_op->vfs_unmount))(mp, a);
1297 KERNEL_UNLOCK_ONE(NULL);
1298
1299 return error;
1300 }
1301
1302 int
1303 VFS_ROOT(struct mount *mp, struct vnode **a)
1304 {
1305 int error;
1306
1307 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1308 KERNEL_LOCK(1, NULL);
1309 }
1310 error = (*(mp->mnt_op->vfs_root))(mp, a);
1311 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1312 KERNEL_UNLOCK_ONE(NULL);
1313 }
1314
1315 return error;
1316 }
1317
1318 int
1319 VFS_QUOTACTL(struct mount *mp, struct quotactl_args *args)
1320 {
1321 int error;
1322
1323 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1324 KERNEL_LOCK(1, NULL);
1325 }
1326 error = (*(mp->mnt_op->vfs_quotactl))(mp, args);
1327 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1328 KERNEL_UNLOCK_ONE(NULL);
1329 }
1330
1331 return error;
1332 }
1333
1334 int
1335 VFS_STATVFS(struct mount *mp, struct statvfs *a)
1336 {
1337 int error;
1338
1339 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1340 KERNEL_LOCK(1, NULL);
1341 }
1342 error = (*(mp->mnt_op->vfs_statvfs))(mp, a);
1343 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1344 KERNEL_UNLOCK_ONE(NULL);
1345 }
1346
1347 return error;
1348 }
1349
1350 int
1351 VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b)
1352 {
1353 int error;
1354
1355 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1356 KERNEL_LOCK(1, NULL);
1357 }
1358 error = (*(mp->mnt_op->vfs_sync))(mp, a, b);
1359 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1360 KERNEL_UNLOCK_ONE(NULL);
1361 }
1362
1363 return error;
1364 }
1365
1366 int
1367 VFS_FHTOVP(struct mount *mp, struct fid *a, struct vnode **b)
1368 {
1369 int error;
1370
1371 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1372 KERNEL_LOCK(1, NULL);
1373 }
1374 error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b);
1375 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1376 KERNEL_UNLOCK_ONE(NULL);
1377 }
1378
1379 return error;
1380 }
1381
1382 int
1383 VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b)
1384 {
1385 int error;
1386
1387 if ((vp->v_vflag & VV_MPSAFE) == 0) {
1388 KERNEL_LOCK(1, NULL);
1389 }
1390 error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b);
1391 if ((vp->v_vflag & VV_MPSAFE) == 0) {
1392 KERNEL_UNLOCK_ONE(NULL);
1393 }
1394
1395 return error;
1396 }
1397
1398 int
1399 VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b)
1400 {
1401 int error;
1402
1403 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1404 KERNEL_LOCK(1, NULL);
1405 }
1406 error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b);
1407 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1408 KERNEL_UNLOCK_ONE(NULL);
1409 }
1410
1411 return error;
1412 }
1413
1414 int
1415 VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d)
1416 {
1417 int error;
1418
1419 KERNEL_LOCK(1, NULL); /* XXXSMP check ffs */
1420 error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d);
1421 KERNEL_UNLOCK_ONE(NULL); /* XXX */
1422
1423 return error;
1424 }
1425
1426 int
1427 VFS_SUSPENDCTL(struct mount *mp, int a)
1428 {
1429 int error;
1430
1431 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1432 KERNEL_LOCK(1, NULL);
1433 }
1434 error = (*(mp->mnt_op->vfs_suspendctl))(mp, a);
1435 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
1436 KERNEL_UNLOCK_ONE(NULL);
1437 }
1438
1439 return error;
1440 }
1441
1442 #if defined(DDB) || defined(DEBUGPRINT)
1443 static const char buf_flagbits[] = BUF_FLAGBITS;
1444
1445 void
1446 vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...))
1447 {
1448 char bf[1024];
1449
1450 (*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%"
1451 PRIx64 " dev 0x%x\n",
1452 bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev);
1453
1454 snprintb(bf, sizeof(bf),
1455 buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags);
1456 (*pr)(" error %d flags 0x%s\n", bp->b_error, bf);
1457
1458 (*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
1459 bp->b_bufsize, bp->b_bcount, bp->b_resid);
1460 (*pr)(" data %p saveaddr %p\n",
1461 bp->b_data, bp->b_saveaddr);
1462 (*pr)(" iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock);
1463 }
1464
1465 void
1466 vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...))
1467 {
1468 char bf[256];
1469
1470 uvm_object_printit(&vp->v_uobj, full, pr);
1471 snprintb(bf, sizeof(bf),
1472 vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag);
1473 (*pr)("\nVNODE flags %s\n", bf);
1474 (*pr)("mp %p numoutput %d size 0x%llx writesize 0x%llx\n",
1475 vp->v_mount, vp->v_numoutput, vp->v_size, vp->v_writesize);
1476
1477 (*pr)("data %p writecount %ld holdcnt %ld\n",
1478 vp->v_data, vp->v_writecount, vp->v_holdcnt);
1479
1480 (*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
1481 ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
1482 ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
1483 vp->v_mount, vp->v_mountedhere);
1484
1485 (*pr)("v_lock %p\n", &vp->v_lock);
1486
1487 vcache_print(vp, "", pr);
1488
1489 if (full) {
1490 struct buf *bp;
1491
1492 (*pr)("clean bufs:\n");
1493 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
1494 (*pr)(" bp %p\n", bp);
1495 vfs_buf_print(bp, full, pr);
1496 }
1497
1498 (*pr)("dirty bufs:\n");
1499 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
1500 (*pr)(" bp %p\n", bp);
1501 vfs_buf_print(bp, full, pr);
1502 }
1503 }
1504 }
1505
1506 void
1507 vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...))
1508 {
1509 char sbuf[256];
1510
1511 (*pr)("vnodecovered = %p data = %p\n",
1512 mp->mnt_vnodecovered,mp->mnt_data);
1513
1514 (*pr)("fs_bshift %d dev_bshift = %d\n",
1515 mp->mnt_fs_bshift,mp->mnt_dev_bshift);
1516
1517 snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag);
1518 (*pr)("flag = %s\n", sbuf);
1519
1520 snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag);
1521 (*pr)("iflag = %s\n", sbuf);
1522
1523 (*pr)("refcnt = %d unmounting @ %p updating @ %p\n", mp->mnt_refcnt,
1524 &mp->mnt_unmounting, &mp->mnt_updating);
1525
1526 (*pr)("statvfs cache:\n");
1527 (*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize);
1528 (*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize);
1529 (*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize);
1530
1531 (*pr)("\tblocks = %"PRIu64"\n",mp->mnt_stat.f_blocks);
1532 (*pr)("\tbfree = %"PRIu64"\n",mp->mnt_stat.f_bfree);
1533 (*pr)("\tbavail = %"PRIu64"\n",mp->mnt_stat.f_bavail);
1534 (*pr)("\tbresvd = %"PRIu64"\n",mp->mnt_stat.f_bresvd);
1535
1536 (*pr)("\tfiles = %"PRIu64"\n",mp->mnt_stat.f_files);
1537 (*pr)("\tffree = %"PRIu64"\n",mp->mnt_stat.f_ffree);
1538 (*pr)("\tfavail = %"PRIu64"\n",mp->mnt_stat.f_favail);
1539 (*pr)("\tfresvd = %"PRIu64"\n",mp->mnt_stat.f_fresvd);
1540
1541 (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
1542 mp->mnt_stat.f_fsidx.__fsid_val[0],
1543 mp->mnt_stat.f_fsidx.__fsid_val[1]);
1544
1545 (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner);
1546 (*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax);
1547
1548 snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag);
1549
1550 (*pr)("\tflag = %s\n",sbuf);
1551 (*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites);
1552 (*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites);
1553 (*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads);
1554 (*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads);
1555 (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename);
1556 (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname);
1557 (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname);
1558
1559 {
1560 int cnt = 0;
1561 struct vnode *vp;
1562 (*pr)("locked vnodes =");
1563 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
1564 if (VOP_ISLOCKED(vp)) {
1565 if ((++cnt % 6) == 0) {
1566 (*pr)(" %p,\n\t", vp);
1567 } else {
1568 (*pr)(" %p,", vp);
1569 }
1570 }
1571 }
1572 (*pr)("\n");
1573 }
1574
1575 if (full) {
1576 int cnt = 0;
1577 struct vnode *vp;
1578 (*pr)("all vnodes =");
1579 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
1580 if (!TAILQ_NEXT(vp, v_mntvnodes)) {
1581 (*pr)(" %p", vp);
1582 } else if ((++cnt % 6) == 0) {
1583 (*pr)(" %p,\n\t", vp);
1584 } else {
1585 (*pr)(" %p,", vp);
1586 }
1587 }
1588 (*pr)("\n", vp);
1589 }
1590 }
1591
1592 /*
1593 * List all of the locked vnodes in the system.
1594 */
1595 void printlockedvnodes(void);
1596
1597 void
1598 printlockedvnodes(void)
1599 {
1600 struct mount *mp, *nmp;
1601 struct vnode *vp;
1602
1603 printf("Locked vnodes\n");
1604 mutex_enter(&mountlist_lock);
1605 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
1606 if (vfs_busy(mp, &nmp)) {
1607 continue;
1608 }
1609 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
1610 if (VOP_ISLOCKED(vp))
1611 vprint(NULL, vp);
1612 }
1613 mutex_enter(&mountlist_lock);
1614 vfs_unbusy(mp, false, &nmp);
1615 }
1616 mutex_exit(&mountlist_lock);
1617 }
1618
1619 #endif /* DDB || DEBUGPRINT */
1620