xref: /dragonfly/sys/kern/vfs_vnops.c (revision 1e45dd8c)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
35  * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $
36  */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/uio.h>
41 #include <sys/fcntl.h>
42 #include <sys/file.h>
43 #include <sys/stat.h>
44 #include <sys/proc.h>
45 #include <sys/priv.h>
46 #include <sys/mount.h>
47 #include <sys/nlookup.h>
48 #include <sys/vnode.h>
49 #include <sys/buf.h>
50 #include <sys/filio.h>
51 #include <sys/ttycom.h>
52 #include <sys/conf.h>
53 #include <sys/sysctl.h>
54 #include <sys/syslog.h>
55 
56 #include <sys/mplock2.h>
57 
58 static int vn_closefile (struct file *fp);
59 static int vn_ioctl (struct file *fp, u_long com, caddr_t data,
60 		struct ucred *cred, struct sysmsg *msg);
61 static int vn_read (struct file *fp, struct uio *uio,
62 		struct ucred *cred, int flags);
63 static int vn_kqfilter (struct file *fp, struct knote *kn);
64 static int vn_statfile (struct file *fp, struct stat *sb, struct ucred *cred);
65 static int vn_write (struct file *fp, struct uio *uio,
66 		struct ucred *cred, int flags);
67 
68 struct fileops vnode_fileops = {
69 	.fo_read = vn_read,
70 	.fo_write = vn_write,
71 	.fo_ioctl = vn_ioctl,
72 	.fo_kqfilter = vn_kqfilter,
73 	.fo_stat = vn_statfile,
74 	.fo_close = vn_closefile,
75 	.fo_shutdown = nofo_shutdown
76 };
77 
78 /*
79  * Common code for vnode open operations.  Check permissions, and call
80  * the VOP_OPEN or VOP_NCREATE routine.
81  *
82  * The caller is responsible for setting up nd with nlookup_init() and
83  * for cleaning it up with nlookup_done(), whether we return an error
84  * or not.
85  *
86  * On success nd->nl_open_vp will hold a referenced and, if requested,
87  * locked vnode.  A locked vnode is requested via NLC_LOCKVP.  If fp
88  * is non-NULL the vnode will be installed in the file pointer.
89  *
90  * NOTE: If the caller wishes the namecache entry to be operated with
91  *	 a shared lock it must use NLC_SHAREDLOCK.  If NLC_LOCKVP is set
92  *	 then the vnode lock will also be shared.
93  *
94  * NOTE: The vnode is referenced just once on return whether or not it
95  *	 is also installed in the file pointer.
96  */
97 int
98 vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode)
99 {
100 	struct vnode *vp;
101 	struct ucred *cred = nd->nl_cred;
102 	struct vattr vat;
103 	struct vattr *vap = &vat;
104 	int error;
105 	int vpexcl;
106 	u_int flags;
107 	uint64_t osize;
108 	struct mount *mp;
109 
110 	/*
111 	 * Certain combinations are illegal
112 	 */
113 	if ((fmode & (FWRITE | O_TRUNC)) == O_TRUNC)
114 		return(EACCES);
115 
116 	/*
117 	 * Lookup the path and create or obtain the vnode.  After a
118 	 * successful lookup a locked nd->nl_nch will be returned.
119 	 *
120 	 * The result of this section should be a locked vnode.
121 	 *
122 	 * XXX with only a little work we should be able to avoid locking
123 	 * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set.
124 	 */
125 	nd->nl_flags |= NLC_OPEN;
126 	if (fmode & O_APPEND)
127 		nd->nl_flags |= NLC_APPEND;
128 	if (fmode & O_TRUNC)
129 		nd->nl_flags |= NLC_TRUNCATE;
130 	if (fmode & FREAD)
131 		nd->nl_flags |= NLC_READ;
132 	if (fmode & FWRITE)
133 		nd->nl_flags |= NLC_WRITE;
134 	if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
135 		nd->nl_flags |= NLC_FOLLOW;
136 
137 	if (fmode & O_CREAT) {
138 		/*
139 		 * CONDITIONAL CREATE FILE CASE
140 		 *
141 		 * Setting NLC_CREATE causes a negative hit to store
142 		 * the negative hit ncp and not return an error.  Then
143 		 * nc_error or nc_vp may be checked to see if the ncp
144 		 * represents a negative hit.  NLC_CREATE also requires
145 		 * write permission on the governing directory or EPERM
146 		 * is returned.
147 		 * If the file exists but is missing write permission,
148 		 * nlookup() returns EACCES. This has to be handled specially
149 		 * when combined with O_EXCL.
150 		 */
151 		nd->nl_flags |= NLC_CREATE;
152 		nd->nl_flags |= NLC_REFDVP;
153 		bwillinode(1);
154 		error = nlookup(nd);
155 		if (error == EACCES && nd->nl_nch.ncp->nc_vp != NULL &&
156 			(fmode & O_EXCL))
157 			error = EEXIST;
158 	} else {
159 		/*
160 		 * NORMAL OPEN FILE CASE
161 		 */
162 		error = nlookup(nd);
163 	}
164 
165 	if (error)
166 		return (error);
167 
168 	/*
169 	 * split case to allow us to re-resolve and retry the ncp in case
170 	 * we get ESTALE.
171 	 *
172 	 * (error is 0 on entry / retry)
173 	 */
174 again:
175 	/*
176 	 * Checks for (likely) filesystem-modifying cases and allows
177 	 * the filesystem to stall the front-end.
178 	 */
179 	if ((fmode & (FWRITE | O_TRUNC)) ||
180 	    ((fmode & O_CREAT) && nd->nl_nch.ncp->nc_vp == NULL)) {
181 		error = ncp_writechk(&nd->nl_nch);
182 		if (error)
183 			return error;
184 	}
185 
186 	vpexcl = 1;
187 	if (fmode & O_CREAT) {
188 		if (nd->nl_nch.ncp->nc_vp == NULL) {
189 			VATTR_NULL(vap);
190 			vap->va_type = VREG;
191 			vap->va_mode = cmode;
192 			vap->va_fuseflags = fmode; /* FUSE */
193 			if (fmode & O_EXCL)
194 				vap->va_vaflags |= VA_EXCLUSIVE;
195 			error = VOP_NCREATE(&nd->nl_nch, nd->nl_dvp, &vp,
196 					    nd->nl_cred, vap);
197 			if (error)
198 				return (error);
199 			fmode &= ~O_TRUNC;
200 			/* locked vnode is returned */
201 		} else {
202 			if (fmode & O_EXCL) {
203 				error = EEXIST;
204 			} else {
205 				error = cache_vget(&nd->nl_nch, cred,
206 						    LK_EXCLUSIVE, &vp);
207 			}
208 			if (error)
209 				return (error);
210 			fmode &= ~O_CREAT;
211 		}
212 	} else {
213 		/*
214 		 * In most other cases a shared lock on the vnode is
215 		 * sufficient.  However, the O_RDWR case needs an
216 		 * exclusive lock if the vnode is executable.  The
217 		 * NLC_EXCLLOCK_IFEXEC and NCF_NOTX flags help resolve
218 		 * this.
219 		 *
220 		 * NOTE: If NCF_NOTX is not set, we do not know the
221 		 *	 state of the 'x' bits and have to get
222 		 *	 an exclusive lock for the EXCLLOCK_IFEXEC case.
223 		 */
224 		if ((nd->nl_flags & NLC_SHAREDLOCK) &&
225 		    ((nd->nl_flags & NLC_EXCLLOCK_IFEXEC) == 0 ||
226 		     nd->nl_nch.ncp->nc_flag & NCF_NOTX)) {
227 			error = cache_vget(&nd->nl_nch, cred, LK_SHARED, &vp);
228 			vpexcl = 0;
229 		} else {
230 			error = cache_vget(&nd->nl_nch, cred,
231 					   LK_EXCLUSIVE, &vp);
232 		}
233 		if (error)
234 			return (error);
235 	}
236 
237 	/*
238 	 * We have a locked vnode and ncp now.  Note that the ncp will
239 	 * be cleaned up by the caller if nd->nl_nch is left intact.
240 	 */
241 	if (vp->v_type == VLNK) {
242 		error = EMLINK;
243 		goto bad;
244 	}
245 	if (vp->v_type == VSOCK) {
246 		error = EOPNOTSUPP;
247 		goto bad;
248 	}
249 	if (vp->v_type != VDIR && (fmode & O_DIRECTORY)) {
250 		error = ENOTDIR;
251 		goto bad;
252 	}
253 	if ((fmode & O_CREAT) == 0) {
254 		if (fmode & (FWRITE | O_TRUNC)) {
255 			if (vp->v_type == VDIR) {
256 				error = EISDIR;
257 				goto bad;
258 			}
259 
260 			/*
261 			 * Additional checks on vnode (does not substitute
262 			 * for ncp_writechk()).
263 			 */
264 			error = vn_writechk(vp);
265 			if (error) {
266 				/*
267 				 * Special stale handling, re-resolve the
268 				 * vnode.
269 				 */
270 				if (error == ESTALE) {
271 					vput(vp);
272 					vp = NULL;
273 					if (vpexcl == 0) {
274 						cache_unlock(&nd->nl_nch);
275 						cache_lock(&nd->nl_nch);
276 					}
277 					cache_setunresolved(&nd->nl_nch);
278 					error = cache_resolve(&nd->nl_nch,
279 							      cred);
280 					if (error == 0)
281 						goto again;
282 				}
283 				goto bad;
284 			}
285 		}
286 	}
287 	if (fmode & O_TRUNC) {
288 		vn_unlock(vp);				/* XXX */
289 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
290 		osize = vp->v_filesize;
291 		VATTR_NULL(vap);
292 		vap->va_size = 0;
293 		error = VOP_SETATTR_FP(vp, vap, cred, fp);
294 		if (error)
295 			goto bad;
296 		error = VOP_GETATTR(vp, vap);
297 		if (error)
298 			goto bad;
299 		mp = vq_vptomp(vp);
300 		VFS_ACCOUNT(mp, vap->va_uid, vap->va_gid, -osize);
301 	}
302 
303 	/*
304 	 * Set or clear VSWAPCACHE on the vp based on nd->nl_nch.ncp->nc_flag.
305 	 * These particular bits are tracked all the way from the root.
306 	 *
307 	 * NOTE: Might not work properly on NFS servers due to the
308 	 * disconnected namecache.
309 	 */
310 	flags = nd->nl_nch.ncp->nc_flag;
311 	if ((flags & (NCF_UF_CACHE | NCF_UF_PCACHE)) &&
312 	    (flags & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) == 0) {
313 		vsetflags(vp, VSWAPCACHE);
314 	} else {
315 		vclrflags(vp, VSWAPCACHE);
316 	}
317 
318 	/*
319 	 * Setup the fp so VOP_OPEN can override it.  No descriptor has been
320 	 * associated with the fp yet so we own it clean.
321 	 *
322 	 * f_nchandle inherits nl_nch.  This used to be necessary only for
323 	 * directories but now we do it unconditionally so f*() ops
324 	 * such as fchmod() can access the actual namespace that was
325 	 * used to open the file.
326 	 */
327 	if (fp) {
328 		if (nd->nl_flags & NLC_APPENDONLY)
329 			fmode |= FAPPENDONLY;
330 		fp->f_nchandle = nd->nl_nch;
331 		cache_zero(&nd->nl_nch);
332 		cache_unlock(&fp->f_nchandle);
333 	}
334 
335 	/*
336 	 * Get rid of nl_nch.  vn_open does not return it (it returns the
337 	 * vnode or the file pointer).  Note: we can't leave nl_nch locked
338 	 * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g.
339 	 * on /dev/ttyd0
340 	 */
341 	if (nd->nl_nch.ncp)
342 		cache_put(&nd->nl_nch);
343 
344 	error = VOP_OPEN(vp, fmode, cred, fp);
345 	if (error) {
346 		/*
347 		 * Setting f_ops to &badfileops will prevent the descriptor
348 		 * code from trying to close and release the vnode; since
349 		 * the open failed we do not want to call close.
350 		 */
351 		if (fp) {
352 			fp->f_data = NULL;
353 			fp->f_ops = &badfileops;
354 		}
355 		goto bad;
356 	}
357 
358 #if 0
359 	/*
360 	 * Assert that VREG files have been setup for vmio.
361 	 */
362 	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
363 		("vn_open: regular file was not VMIO enabled!"));
364 #endif
365 
366 	/*
367 	 * Return the vnode.  XXX needs some cleaning up.  The vnode is
368 	 * only returned in the fp == NULL case.
369 	 */
370 	if (fp == NULL) {
371 		nd->nl_open_vp = vp;
372 		nd->nl_vp_fmode = fmode;
373 		if ((nd->nl_flags & NLC_LOCKVP) == 0)
374 			vn_unlock(vp);
375 	} else {
376 		vput(vp);
377 	}
378 	return (0);
379 bad:
380 	if (vp)
381 		vput(vp);
382 	return (error);
383 }
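
/*
 * Illustrative sketch only (not part of the original file, kept under
 * #if 0): a typical in-kernel caller of vn_open() that wants the vnode
 * rather than a file pointer.  The helper name and path handling are
 * hypothetical; the nlookup_init()/nlookup_done() bracketing and the
 * nl_open_vp hand-off follow the contract documented above vn_open().
 * The referenced vnode would later be dropped via vn_close(vp, FREAD, NULL).
 */
#if 0
static int
example_open_vnode(const char *path, struct vnode **vpp)
{
	struct nlookupdata nd;
	int error;

	*vpp = NULL;
	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
	if (error == 0)
		error = vn_open(&nd, NULL, FREAD, 0);
	if (error == 0) {
		/* referenced and unlocked since NLC_LOCKVP was not set */
		*vpp = nd.nl_open_vp;
		nd.nl_open_vp = NULL;
	}
	nlookup_done(&nd);
	return (error);
}
#endif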
384 
385 int
386 vn_opendisk(const char *devname, int fmode, struct vnode **vpp)
387 {
388 	struct vnode *vp;
389 	int error;
390 
391 	if (strncmp(devname, "/dev/", 5) == 0)
392 		devname += 5;
393 	if ((vp = getsynthvnode(devname)) == NULL) {
394 		error = ENODEV;
395 	} else {
396 		error = VOP_OPEN(vp, fmode, proc0.p_ucred, NULL);
397 		vn_unlock(vp);
398 		if (error) {
399 			vrele(vp);
400 			vp = NULL;
401 		}
402 	}
403 	*vpp = vp;
404 	return (error);
405 }
406 
407 /*
408  * Checks for special conditions on the vnode which might prevent writing
409  * after the vnode has (likely) been locked.  The vnode might or might not
410  * be locked as of this call, but will be at least referenced.
411  *
412  * Also re-checks the mount RDONLY flag that ncp_writechk() checked prior
413  * to the vnode being locked.
414  */
415 int
416 vn_writechk(struct vnode *vp)
417 {
418 	/*
419 	 * If there is shared text associated with
420 	 * the vnode (the file is being executed),
421 	 * we cannot allow writing.
422 	 */
423 	if (vp->v_flag & VTEXT)
424 		return (ETXTBSY);
425 	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_RDONLY))
426 		return (EROFS);
427 	return 0;
428 }
429 
430 /*
431  * Check whether the underlying mount is read-only.  The mount point
432  * referenced by the namecache may be different from the mount point
433  * used by the underlying vnode in the case of NULLFS, so a separate
434  * check is needed.
435  *
436  * Must be called PRIOR to any vnodes being locked.
437  */
438 int
439 ncp_writechk(struct nchandle *nch)
440 {
441 	struct mount *mp;
442 
443 	if ((mp = nch->mount) != NULL) {
444 		if (mp->mnt_flag & MNT_RDONLY)
445 			return (EROFS);
446 		if (mp->mnt_op->vfs_modifying != vfs_stdmodifying)
447 			VFS_MODIFYING(mp);
448 	}
449 	return(0);
450 }
451 
452 /*
453  * Vnode close call
454  *
455  * MPSAFE
456  */
457 int
458 vn_close(struct vnode *vp, int flags, struct file *fp)
459 {
460 	int error;
461 
462 	error = vn_lock(vp, LK_SHARED | LK_RETRY | LK_FAILRECLAIM);
463 	if (error == 0) {
464 		error = VOP_CLOSE(vp, flags, fp);
465 		vn_unlock(vp);
466 	}
467 	vrele(vp);
468 	return (error);
469 }
470 
471 /*
472  * Sequential heuristic.
473  *
474  * MPSAFE (f_seqcount and f_nextoff are allowed to race)
475  */
476 static __inline
477 int
478 sequential_heuristic(struct uio *uio, struct file *fp)
479 {
480 	/*
481 	 * Sequential heuristic - detect sequential operation
482 	 *
483 	 * NOTE: SMP: We allow f_seqcount updates to race.
484 	 */
485 	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
486 	    uio->uio_offset == fp->f_nextoff) {
487 		int tmpseq = fp->f_seqcount;
488 
489 		tmpseq += howmany(uio->uio_resid, MAXBSIZE);
490 		if (tmpseq > IO_SEQMAX)
491 			tmpseq = IO_SEQMAX;
492 		fp->f_seqcount = tmpseq;
493 		return(fp->f_seqcount << IO_SEQSHIFT);
494 	}
495 
496 	/*
497 	 * Not sequential, quick draw-down of seqcount
498 	 *
499 	 * NOTE: SMP: We allow f_seqcount updates to race.
500 	 */
501 	if (fp->f_seqcount > 1)
502 		fp->f_seqcount = 1;
503 	else
504 		fp->f_seqcount = 0;
505 	return(0);
506 }
507 
508 /*
509  * get - lock and return the f_offset field.
510  * set - set and unlock the f_offset field.
511  *
512  * These routines serve the dual purpose of serializing access to the
513  * f_offset field (at least on x86) and guaranteeing operational integrity
514  * when multiple read()ers and write()ers are present on the same fp.
515  *
516  * MPSAFE
517  */
518 static __inline off_t
519 vn_get_fpf_offset(struct file *fp)
520 {
521 	u_int	flags;
522 	u_int	nflags;
523 
524 	/*
525 	 * Shortcut critical path.
526 	 */
527 	flags = fp->f_flag & ~FOFFSETLOCK;
528 	if (atomic_cmpset_int(&fp->f_flag, flags, flags | FOFFSETLOCK))
529 		return(fp->f_offset);
530 
531 	/*
532 	 * The hard way
533 	 */
534 	for (;;) {
535 		flags = fp->f_flag;
536 		if (flags & FOFFSETLOCK) {
537 			nflags = flags | FOFFSETWAKE;
538 			tsleep_interlock(&fp->f_flag, 0);
539 			if (atomic_cmpset_int(&fp->f_flag, flags, nflags))
540 				tsleep(&fp->f_flag, PINTERLOCKED, "fpoff", 0);
541 		} else {
542 			nflags = flags | FOFFSETLOCK;
543 			if (atomic_cmpset_int(&fp->f_flag, flags, nflags))
544 				break;
545 		}
546 	}
547 	return(fp->f_offset);
548 }
549 
550 /*
551  * MPSAFE
552  */
553 static __inline void
554 vn_set_fpf_offset(struct file *fp, off_t offset)
555 {
556 	u_int	flags;
557 	u_int	nflags;
558 
559 	/*
560 	 * We hold the lock so we can set the offset without interference.
561 	 */
562 	fp->f_offset = offset;
563 
564 	/*
565 	 * Normal release is already a reasonably critical path.
566 	 */
567 	for (;;) {
568 		flags = fp->f_flag;
569 		nflags = flags & ~(FOFFSETLOCK | FOFFSETWAKE);
570 		if (atomic_cmpset_int(&fp->f_flag, flags, nflags)) {
571 			if (flags & FOFFSETWAKE)
572 				wakeup(&fp->f_flag);
573 			break;
574 		}
575 	}
576 }
577 
578 /*
579  * MPSAFE
580  */
581 static __inline off_t
582 vn_poll_fpf_offset(struct file *fp)
583 {
584 #if defined(__x86_64__)
585 	return(fp->f_offset);
586 #else
587 	off_t off = vn_get_fpf_offset(fp);
588 	vn_set_fpf_offset(fp, off);
589 	return(off);
590 #endif
591 }
592 
593 /*
594  * Package up an I/O request on a vnode into a uio and do it.
595  *
596  * MPSAFE
597  */
598 int
599 vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
600 	off_t offset, enum uio_seg segflg, int ioflg,
601 	struct ucred *cred, int *aresid)
602 {
603 	struct uio auio;
604 	struct iovec aiov;
605 	int error;
606 
607 	if ((ioflg & IO_NODELOCKED) == 0)
608 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
609 	auio.uio_iov = &aiov;
610 	auio.uio_iovcnt = 1;
611 	aiov.iov_base = base;
612 	aiov.iov_len = len;
613 	auio.uio_resid = len;
614 	auio.uio_offset = offset;
615 	auio.uio_segflg = segflg;
616 	auio.uio_rw = rw;
617 	auio.uio_td = curthread;
618 	if (rw == UIO_READ) {
619 		error = VOP_READ(vp, &auio, ioflg, cred);
620 	} else {
621 		error = VOP_WRITE(vp, &auio, ioflg, cred);
622 	}
623 	if (aresid)
624 		*aresid = auio.uio_resid;
625 	else
626 		if (auio.uio_resid && error == 0)
627 			error = EIO;
628 	if ((ioflg & IO_NODELOCKED) == 0)
629 		vn_unlock(vp);
630 	return (error);
631 }
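
/*
 * Illustrative sketch (hypothetical helper, kept under #if 0): reading a
 * fixed-size block from a referenced but unlocked vnode.  With
 * IO_NODELOCKED absent, vn_rdwr() acquires and releases the vnode lock
 * itself; passing a non-NULL aresid lets the caller apply its own policy
 * to a short transfer instead of getting a blanket EIO.
 */
#if 0
static int
example_read_block(struct vnode *vp, void *buf, int len, off_t offset,
		   struct ucred *cred)
{
	int resid;
	int error;

	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, len, offset,
			UIO_SYSSPACE, 0, cred, &resid);
	if (error == 0 && resid != 0)
		error = EINVAL;		/* short read, caller's policy */
	return (error);
}
#endif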
632 
633 /*
634  * Package up an I/O request on a vnode into a uio and do it.  The I/O
635  * request is split up into smaller chunks and we try to avoid saturating
636  * the buffer cache while potentially holding a vnode locked, so we
637  * check bwillwrite() before calling vn_rdwr().  We also call lwkt_user_yield()
638  * to give other processes a chance to lock the vnode (either other processes
639  * core'ing the same binary, or unrelated processes scanning the directory).
640  *
641  * MPSAFE
642  */
643 int
644 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
645 		 off_t offset, enum uio_seg segflg, int ioflg,
646 		 struct ucred *cred, int *aresid)
647 {
648 	int error = 0;
649 
650 	do {
651 		int chunk;
652 
653 		/*
654 		 * Force `offset' to a multiple of MAXBSIZE except possibly
655 		 * for the first chunk, so that filesystems only need to
656 		 * write full blocks except possibly for the first and last
657 		 * chunks.
658 		 */
659 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
660 
661 		if (chunk > len)
662 			chunk = len;
663 		if (vp->v_type == VREG && (ioflg & IO_RECURSE) == 0) {
664 			switch(rw) {
665 			case UIO_READ:
666 				bwillread(chunk);
667 				break;
668 			case UIO_WRITE:
669 				bwillwrite(chunk);
670 				break;
671 			}
672 		}
673 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
674 				ioflg, cred, aresid);
675 		len -= chunk;	/* aresid calc already includes length */
676 		if (error)
677 			break;
678 		offset += chunk;
679 		base += chunk;
680 		lwkt_user_yield();
681 	} while (len);
682 	if (aresid)
683 		*aresid += len;
684 	return (error);
685 }
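
/*
 * Sketch (hypothetical helper, kept under #if 0): writing a large user
 * buffer, e.g. one segment of a core dump, via vn_rdwr_inchunks() so the
 * transfer is broken into MAXBSIZE-aligned pieces and the buffer cache
 * is throttled between chunks.  The names and flags are illustrative.
 */
#if 0
static int
example_write_big(struct vnode *vp, caddr_t uaddr, int nbytes, off_t offset,
		  struct ucred *cred)
{
	return (vn_rdwr_inchunks(UIO_WRITE, vp, uaddr, nbytes, offset,
				 UIO_USERSPACE, IO_UNIT, cred, NULL));
}
#endif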
686 
687 /*
688  * File pointers can no longer get ripped up by revoke so
689  * we don't need to lock access to the vp.
690  *
691  * f_offset updates are not guaranteed against multiple readers
692  */
693 static int
694 vn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
695 {
696 	struct vnode *vp;
697 	int error, ioflag;
698 
699 	KASSERT(uio->uio_td == curthread,
700 		("uio_td %p is not td %p", uio->uio_td, curthread));
701 	vp = (struct vnode *)fp->f_data;
702 
703 	ioflag = 0;
704 	if (flags & O_FBLOCKING) {
705 		/* ioflag &= ~IO_NDELAY; */
706 	} else if (flags & O_FNONBLOCKING) {
707 		ioflag |= IO_NDELAY;
708 	} else if (fp->f_flag & FNONBLOCK) {
709 		ioflag |= IO_NDELAY;
710 	}
711 	if (fp->f_flag & O_DIRECT) {
712 		ioflag |= IO_DIRECT;
713 	}
714 	if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0)
715 		uio->uio_offset = vn_get_fpf_offset(fp);
716 	vn_lock(vp, LK_SHARED | LK_RETRY);
717 	ioflag |= sequential_heuristic(uio, fp);
718 
719 	error = VOP_READ_FP(vp, uio, ioflag, cred, fp);
720 	fp->f_nextoff = uio->uio_offset;
721 	vn_unlock(vp);
722 	if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0)
723 		vn_set_fpf_offset(fp, uio->uio_offset);
724 	return (error);
725 }
726 
727 /*
728  * MPSAFE
729  */
730 static int
731 vn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
732 {
733 	struct vnode *vp;
734 	int error, ioflag;
735 
736 	KASSERT(uio->uio_td == curthread,
737 		("uio_td %p is not td %p", uio->uio_td, curthread));
738 	vp = (struct vnode *)fp->f_data;
739 
740 	ioflag = IO_UNIT;
741 	if (vp->v_type == VREG &&
742 	   ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
743 		ioflag |= IO_APPEND;
744 	}
745 
746 	if (flags & O_FBLOCKING) {
747 		/* ioflag &= ~IO_NDELAY; */
748 	} else if (flags & O_FNONBLOCKING) {
749 		ioflag |= IO_NDELAY;
750 	} else if (fp->f_flag & FNONBLOCK) {
751 		ioflag |= IO_NDELAY;
752 	}
753 	if (fp->f_flag & O_DIRECT) {
754 		ioflag |= IO_DIRECT;
755 	}
756 	if (flags & O_FASYNCWRITE) {
757 		/* ioflag &= ~IO_SYNC; */
758 	} else if (flags & O_FSYNCWRITE) {
759 		ioflag |= IO_SYNC;
760 	} else if (fp->f_flag & O_FSYNC) {
761 		ioflag |= IO_SYNC;
762 	}
763 
764 	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
765 		ioflag |= IO_SYNC;
766 	if ((flags & O_FOFFSET) == 0)
767 		uio->uio_offset = vn_get_fpf_offset(fp);
768 	if (vp->v_mount)
769 		VFS_MODIFYING(vp->v_mount);
770 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
771 	ioflag |= sequential_heuristic(uio, fp);
772 	error = VOP_WRITE_FP(vp, uio, ioflag, cred, fp);
773 	fp->f_nextoff = uio->uio_offset;
774 	vn_unlock(vp);
775 	if ((flags & O_FOFFSET) == 0)
776 		vn_set_fpf_offset(fp, uio->uio_offset);
777 	return (error);
778 }
779 
780 /*
781  * MPSAFE
782  */
783 static int
784 vn_statfile(struct file *fp, struct stat *sb, struct ucred *cred)
785 {
786 	struct vnode *vp;
787 	int error;
788 
789 	vp = (struct vnode *)fp->f_data;
790 	error = vn_stat(vp, sb, cred);
791 	return (error);
792 }
793 
794 /*
795  * MPSAFE
796  */
797 int
798 vn_stat(struct vnode *vp, struct stat *sb, struct ucred *cred)
799 {
800 	struct vattr vattr;
801 	struct vattr *vap;
802 	int error;
803 	u_short mode;
804 	cdev_t dev;
805 
806 	/*
807 	 * vp already has a ref and is validated, can call unlocked.
808 	 */
809 	vap = &vattr;
810 	error = VOP_GETATTR(vp, vap);
811 	if (error)
812 		return (error);
813 
814 	/*
815 	 * Zero the spare stat fields
816 	 */
817 	sb->st_lspare = 0;
818 	sb->st_qspare2 = 0;
819 
820 	/*
821 	 * Copy from vattr table
822 	 */
823 	if (vap->va_fsid != VNOVAL)
824 		sb->st_dev = vap->va_fsid;
825 	else
826 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
827 	sb->st_ino = vap->va_fileid;
828 	mode = vap->va_mode;
829 	switch (vap->va_type) {
830 	case VREG:
831 		mode |= S_IFREG;
832 		break;
833 	case VDATABASE:
834 		mode |= S_IFDB;
835 		break;
836 	case VDIR:
837 		mode |= S_IFDIR;
838 		break;
839 	case VBLK:
840 		mode |= S_IFBLK;
841 		break;
842 	case VCHR:
843 		mode |= S_IFCHR;
844 		break;
845 	case VLNK:
846 		mode |= S_IFLNK;
847 		/* This is cosmetic; symlinks do not have a mode */
848 		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
849 			mode &= ~ACCESSPERMS;	/* 0000 */
850 		else
851 			mode |= ACCESSPERMS;	/* 0777 */
852 		break;
853 	case VSOCK:
854 		mode |= S_IFSOCK;
855 		break;
856 	case VFIFO:
857 		mode |= S_IFIFO;
858 		break;
859 	default:
860 		return (EBADF);
861 	}
862 	sb->st_mode = mode;
863 	if (vap->va_nlink > (nlink_t)-1)
864 		sb->st_nlink = (nlink_t)-1;
865 	else
866 		sb->st_nlink = vap->va_nlink;
867 	sb->st_uid = vap->va_uid;
868 	sb->st_gid = vap->va_gid;
869 	sb->st_rdev = devid_from_dev(vp->v_rdev);
870 	sb->st_size = vap->va_size;
871 	sb->st_atimespec = vap->va_atime;
872 	sb->st_mtimespec = vap->va_mtime;
873 	sb->st_ctimespec = vap->va_ctime;
874 
875 	/*
876 	 * A VCHR and VBLK device may track the last access and last modified
877 	 * time independently of the filesystem.  This is particularly true
878 	 * because device read and write calls may bypass the filesystem.
879 	 */
880 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
881 		dev = vp->v_rdev;
882 		if (dev != NULL) {
883 			if (dev->si_lastread) {
884 				sb->st_atimespec.tv_sec = time_second +
885 							  (dev->si_lastread -
886 							   time_uptime);
887 				sb->st_atimespec.tv_nsec = 0;
888 			}
889 			if (dev->si_lastwrite) {
890 				sb->st_mtimespec.tv_sec = time_second +
891 							  (dev->si_lastwrite -
892 							   time_uptime);
893 				sb->st_mtimespec.tv_nsec = 0;
894 			}
895 		}
896 	}
897 
898 	/*
899 	 * According to www.opengroup.org, the meaning of st_blksize is
900 	 *   "a filesystem-specific preferred I/O block size for this
901 	 *    object.  In some filesystem types, this may vary from file
902 	 *    to file"
903 	 * Default to PAGE_SIZE after much discussion.
904 	 */
905 
906 	if (vap->va_type == VREG) {
907 		sb->st_blksize = vap->va_blocksize;
908 	} else if (vn_isdisk(vp, NULL)) {
909 		/*
910 		 * XXX this is broken.  If the device is not yet open (aka
911 		 * stat() call, aka v_rdev == NULL), how are we supposed
912 		 * to get a valid block size out of it?
913 		 */
914 		dev = vp->v_rdev;
915 
916 		sb->st_blksize = dev->si_bsize_best;
917 		if (sb->st_blksize < dev->si_bsize_phys)
918 			sb->st_blksize = dev->si_bsize_phys;
919 		if (sb->st_blksize < BLKDEV_IOSIZE)
920 			sb->st_blksize = BLKDEV_IOSIZE;
921 	} else {
922 		sb->st_blksize = PAGE_SIZE;
923 	}
924 
925 	sb->st_flags = vap->va_flags;
926 
927 	error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0);
928 	if (error)
929 		sb->st_gen = 0;
930 	else
931 		sb->st_gen = (u_int32_t)vap->va_gen;
932 
933 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
934 
935 	/*
936 	 * This is for ABI compatibility <= 5.7 (for ABI change made in
937 	 * 5.7 master).
938 	 */
939 	sb->__old_st_blksize = sb->st_blksize;
940 
941 	return (0);
942 }
943 
944 /*
945  * MPALMOSTSAFE - acquires mplock
946  */
947 static int
948 vn_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *ucred,
949 	 struct sysmsg *msg)
950 {
951 	struct vnode *vp = ((struct vnode *)fp->f_data);
952 	struct vnode *ovp;
953 	struct vattr vattr;
954 	int error;
955 	off_t size;
956 
957 	switch (vp->v_type) {
958 	case VREG:
959 	case VDIR:
960 		if (com == FIONREAD) {
961 			error = VOP_GETATTR(vp, &vattr);
962 			if (error)
963 				break;
964 			size = vattr.va_size;
965 			if ((vp->v_flag & VNOTSEEKABLE) == 0)
966 				size -= vn_poll_fpf_offset(fp);
967 			if (size > 0x7FFFFFFF)
968 				size = 0x7FFFFFFF;
969 			*(int *)data = size;
970 			error = 0;
971 			break;
972 		}
973 		if (com == FIOASYNC) {				/* XXX */
974 			error = 0;				/* XXX */
975 			break;
976 		}
977 		/* fall into ... */
978 	default:
979 #if 0
980 		return (ENOTTY);
981 #endif
982 	case VFIFO:
983 	case VCHR:
984 	case VBLK:
985 		if (com == FIODTYPE) {
986 			if (vp->v_type != VCHR && vp->v_type != VBLK) {
987 				error = ENOTTY;
988 				break;
989 			}
990 			*(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK;
991 			error = 0;
992 			break;
993 		}
994 		error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred, msg);
995 		if (error == 0 && com == TIOCSCTTY) {
996 			struct proc *p = curthread->td_proc;
997 			struct session *sess;
998 
999 			if (p == NULL) {
1000 				error = ENOTTY;
1001 				break;
1002 			}
1003 
1004 			get_mplock();
1005 			sess = p->p_session;
1006 			/* Do nothing if reassigning same control tty */
1007 			if (sess->s_ttyvp == vp) {
1008 				error = 0;
1009 				rel_mplock();
1010 				break;
1011 			}
1012 
1013 			/* Get rid of reference to old control tty */
1014 			ovp = sess->s_ttyvp;
1015 			vref(vp);
1016 			sess->s_ttyvp = vp;
1017 			if (ovp)
1018 				vrele(ovp);
1019 			rel_mplock();
1020 		}
1021 		break;
1022 	}
1023 	return (error);
1024 }
1025 
1026 /*
1027  * Obtain the requested vnode lock
1028  *
1029  *	LK_RETRY	Automatically retry on timeout
1030  *	LK_FAILRECLAIM	Fail if the vnode is being reclaimed
1031  *
1032  * Failures will occur if the vnode is being reclaimed, but not all
1033  * callers expect the function to fail, so the caller must pass
1034  * LK_FAILRECLAIM if it wants to receive and process an error code.
1035  *
1036  * Errors can occur for other reasons if you pass in other LK_ flags,
1037  * regardless of whether you pass in LK_FAILRECLAIM.
1038  */
1039 int
1040 vn_lock(struct vnode *vp, int flags)
1041 {
1042 	int error;
1043 
1044 	do {
1045 		error = lockmgr(&vp->v_lock, flags);
1046 		if (error == 0)
1047 			break;
1048 	} while (flags & LK_RETRY);
1049 
1050 	/*
1051 	 * Because we (had better!) have a ref on the vnode, once it
1052 	 * goes to VRECLAIMED state it will not be recycled until all
1053 	 * refs go away.  So we can just check the flag.
1054 	 */
1055 	if (error == 0 && (vp->v_flag & VRECLAIMED)) {
1056 		if (flags & LK_FAILRECLAIM) {
1057 			lockmgr(&vp->v_lock, LK_RELEASE);
1058 			error = ENOENT;
1059 		}
1060 	}
1061 	return (error);
1062 }
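
/*
 * Sketch (hypothetical caller, kept under #if 0): a caller that wants to
 * notice reclaimed vnodes adds LK_FAILRECLAIM and checks for an error;
 * with LK_RETRY alone the lock attempt is simply retried until it
 * succeeds.
 */
#if 0
static int
example_lock_modify(struct vnode *vp)
{
	int error;

	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
	if (error == 0) {
		/* ... operate on the exclusively locked vnode ... */
		vn_unlock(vp);
	}
	return (error);
}
#endif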
1063 
1064 int
1065 vn_relock(struct vnode *vp, int flags)
1066 {
1067 	int error;
1068 
1069 	do {
1070 		error = lockmgr(&vp->v_lock, flags);
1071 		if (error == 0)
1072 			break;
1073 	} while (flags & LK_RETRY);
1074 
1075 	return error;
1076 }
1077 
1078 #ifdef DEBUG_VN_UNLOCK
1079 
1080 void
1081 debug_vn_unlock(struct vnode *vp, const char *filename, int line)
1082 {
1083 	kprintf("vn_unlock from %s:%d\n", filename, line);
1084 	lockmgr(&vp->v_lock, LK_RELEASE);
1085 }
1086 
1087 #else
1088 
1089 void
1090 vn_unlock(struct vnode *vp)
1091 {
1092 	lockmgr(&vp->v_lock, LK_RELEASE);
1093 }
1094 
1095 #endif
1096 
1097 /*
1098  * MPSAFE
1099  */
1100 int
1101 vn_islocked(struct vnode *vp)
1102 {
1103 	return (lockstatus(&vp->v_lock, curthread));
1104 }
1105 
1106 /*
1107  * Return the lock status of a vnode and unlock the vnode
1108  * if we owned the lock.  This is not a boolean; if the
1109  * caller cares what the lock status is, the caller must
1110  * check the various possible values.
1111  *
1112  * This only unlocks exclusive locks held by the caller;
1113  * it will NOT unlock shared locks (there is no way to
1114  * tell who the shared lock belongs to).
1115  *
1116  * MPSAFE
1117  */
1118 int
1119 vn_islocked_unlock(struct vnode *vp)
1120 {
1121 	int vpls;
1122 
1123 	vpls = lockstatus(&vp->v_lock, curthread);
1124 	if (vpls == LK_EXCLUSIVE)
1125 		lockmgr(&vp->v_lock, LK_RELEASE);
1126 	return(vpls);
1127 }
1128 
1129 /*
1130  * Restore a vnode lock that we previously released via
1131  * vn_islocked_unlock().  This is a NOP if we did not
1132  * own the original lock.
1133  *
1134  * MPSAFE
1135  */
1136 void
1137 vn_islocked_relock(struct vnode *vp, int vpls)
1138 {
1139 	int error;
1140 
1141 	if (vpls == LK_EXCLUSIVE)
1142 		error = lockmgr(&vp->v_lock, vpls);
1143 }
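
/*
 * Sketch (hypothetical usage, kept under #if 0): temporarily dropping an
 * exclusive vnode lock, if we own one, around a potentially blocking
 * operation and then restoring it.  Both calls are NOPs when the lock
 * was shared or not held by the caller.
 */
#if 0
static void
example_drop_and_restore(struct vnode *vp)
{
	int vpls;

	vpls = vn_islocked_unlock(vp);
	/* ... blocking work that must not hold the vnode lock ... */
	vn_islocked_relock(vp, vpls);
}
#endif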
1144 
1145 /*
1146  * MPSAFE
1147  */
1148 static int
1149 vn_closefile(struct file *fp)
1150 {
1151 	int error;
1152 
1153 	fp->f_ops = &badfileops;
1154 	error = vn_close(((struct vnode *)fp->f_data), fp->f_flag, fp);
1155 	return (error);
1156 }
1157 
1158 /*
1159  * MPSAFE
1160  */
1161 static int
1162 vn_kqfilter(struct file *fp, struct knote *kn)
1163 {
1164 	int error;
1165 
1166 	error = VOP_KQFILTER(((struct vnode *)fp->f_data), kn);
1167 	return (error);
1168 }
1169