xref: /dragonfly/sys/kern/vfs_vnops.c (revision 8a7bdfea)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
39  * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $
40  * $DragonFly: src/sys/kern/vfs_vnops.c,v 1.54 2007/11/02 19:52:25 dillon Exp $
41  */
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/fcntl.h>
46 #include <sys/file.h>
47 #include <sys/stat.h>
48 #include <sys/proc.h>
49 #include <sys/mount.h>
50 #include <sys/nlookup.h>
51 #include <sys/vnode.h>
52 #include <sys/buf.h>
53 #include <sys/filio.h>
54 #include <sys/ttycom.h>
55 #include <sys/conf.h>
56 #include <sys/syslog.h>
57 
58 static int vn_closefile (struct file *fp);
59 static int vn_ioctl (struct file *fp, u_long com, caddr_t data,
60 		struct ucred *cred);
61 static int vn_read (struct file *fp, struct uio *uio,
62 		struct ucred *cred, int flags);
63 static int svn_read (struct file *fp, struct uio *uio,
64 		struct ucred *cred, int flags);
65 static int vn_poll (struct file *fp, int events, struct ucred *cred);
66 static int vn_kqfilter (struct file *fp, struct knote *kn);
67 static int vn_statfile (struct file *fp, struct stat *sb, struct ucred *cred);
68 static int vn_write (struct file *fp, struct uio *uio,
69 		struct ucred *cred, int flags);
70 static int svn_write (struct file *fp, struct uio *uio,
71 		struct ucred *cred, int flags);
72 
73 struct fileops vnode_fileops = {
74 	.fo_read = vn_read,
75 	.fo_write = vn_write,
76 	.fo_ioctl = vn_ioctl,
77 	.fo_poll = vn_poll,
78 	.fo_kqfilter = vn_kqfilter,
79 	.fo_stat = vn_statfile,
80 	.fo_close = vn_closefile,
81 	.fo_shutdown = nofo_shutdown
82 };
83 
84 struct fileops specvnode_fileops = {
85 	.fo_read = svn_read,
86 	.fo_write = svn_write,
87 	.fo_ioctl = vn_ioctl,
88 	.fo_poll = vn_poll,
89 	.fo_kqfilter = vn_kqfilter,
90 	.fo_stat = vn_statfile,
91 	.fo_close = vn_closefile,
92 	.fo_shutdown = nofo_shutdown
93 };
94 
95 /*
96  * Shortcut the device read/write.  This avoids a lot of vnode junk.
97  * Basically the specfs vnops for read and write take the locked vnode,
98  * unlock it (because we can't hold the vnode locked while reading or writing
99  * a device which may block indefinitely), issue the device operation, then
100  * relock the vnode before returning, plus other junk.  This bypasses all
101  * of that and just does the device operation.
102  */
103 void
104 vn_setspecops(struct file *fp)
105 {
106 	if (vfs_fastdev && fp->f_ops == &vnode_fileops) {
107 		fp->f_ops = &specvnode_fileops;
108 	}
109 }
110 
111 /*
112  * Common code for vnode open operations.  Check permissions, and call
113  * the VOP_OPEN or VOP_NCREATE routine.
114  *
115  * The caller is responsible for setting up nd with nlookup_init() and
116  * for cleaning it up with nlookup_done(), whether we return an error
117  * or not.
118  *
119  * On success nd->nl_open_vp will hold a referenced and, if requested,
120  * locked vnode.  A locked vnode is requested via NLC_LOCKVP.  If fp
121  * is non-NULL the vnode will be installed in the file pointer.
122  *
123  * NOTE: The vnode is referenced just once on return whether or not it
124  * is also installed in the file pointer.
125  */
126 int
127 vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode)
128 {
129 	struct vnode *vp;
130 	struct vnode *dvp;
131 	struct ucred *cred = nd->nl_cred;
132 	struct vattr vat;
133 	struct vattr *vap = &vat;
134 	int mode, error;
135 
136 	/*
137 	 * Lookup the path and create or obtain the vnode.  After a
138 	 * successful lookup a locked nd->nl_nch will be returned.
139 	 *
140 	 * The result of this section should be a locked vnode.
141 	 *
142 	 * XXX with only a little work we should be able to avoid locking
143 	 * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set.
144 	 */
145 	if (fmode & O_CREAT) {
146 		/*
147 		 * CONDITIONAL CREATE FILE CASE
148 		 *
149 		 * Setting NLC_CREATE causes nlookup to store the ncp
150 		 * for a negative hit instead of returning an error.
151 		 * nc_error or nc_vp may then be checked to see if the
152 		 * ncp represents a negative hit.  NLC_CREATE also
153 		 * requires write permission on the governing directory
154 		 * or EPERM is returned.
155 		 */
156 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
157 			nd->nl_flags |= NLC_FOLLOW;
158 		nd->nl_flags |= NLC_CREATE;
159 		bwillwrite();
160 		error = nlookup(nd);
161 	} else {
162 		/*
163 		 * NORMAL OPEN FILE CASE
164 		 */
165 		error = nlookup(nd);
166 	}
167 
168 	if (error)
169 		return (error);
170 
171 	/*
172 	 * split case to allow us to re-resolve and retry the ncp in case
173 	 * we get ESTALE.
174 	 */
175 again:
176 	if (fmode & O_CREAT) {
177 		if (nd->nl_nch.ncp->nc_vp == NULL) {
178 			if ((error = ncp_writechk(&nd->nl_nch)) != 0)
179 				return (error);
180 			if ((dvp = nd->nl_nch.ncp->nc_parent->nc_vp) == NULL)
181 				return (EPERM);
182 			/* vhold(dvp); - dvp can't go away */
183 			VATTR_NULL(vap);
184 			vap->va_type = VREG;
185 			vap->va_mode = cmode;
186 			if (fmode & O_EXCL)
187 				vap->va_vaflags |= VA_EXCLUSIVE;
188 			error = VOP_NCREATE(&nd->nl_nch, dvp, &vp,
189 					    nd->nl_cred, vap);
190 			/* vdrop(dvp); */
191 			if (error)
192 				return (error);
193 			fmode &= ~O_TRUNC;
194 			/* locked vnode is returned */
195 		} else {
196 			if (fmode & O_EXCL) {
197 				error = EEXIST;
198 			} else {
199 				error = cache_vget(&nd->nl_nch, cred,
200 						    LK_EXCLUSIVE, &vp);
201 			}
202 			if (error)
203 				return (error);
204 			fmode &= ~O_CREAT;
205 		}
206 	} else {
207 		error = cache_vget(&nd->nl_nch, cred, LK_EXCLUSIVE, &vp);
208 		if (error)
209 			return (error);
210 	}
211 
212 	/*
213 	 * We have a locked vnode and ncp now.  Note that the ncp will
214 	 * be cleaned up by the caller if nd->nl_nch is left intact.
215 	 */
216 	if (vp->v_type == VLNK) {
217 		error = EMLINK;
218 		goto bad;
219 	}
220 	if (vp->v_type == VSOCK) {
221 		error = EOPNOTSUPP;
222 		goto bad;
223 	}
224 	if ((fmode & O_CREAT) == 0) {
225 		mode = 0;
226 		if (fmode & (FWRITE | O_TRUNC)) {
227 			if (vp->v_type == VDIR) {
228 				error = EISDIR;
229 				goto bad;
230 			}
231 			error = vn_writechk(vp, &nd->nl_nch);
232 			if (error) {
233 				/*
234 				 * Special stale handling, re-resolve the
235 				 * vnode.
236 				 */
237 				if (error == ESTALE) {
238 					vput(vp);
239 					vp = NULL;
240 					cache_setunresolved(&nd->nl_nch);
241 					error = cache_resolve(&nd->nl_nch, cred);
242 					if (error == 0)
243 						goto again;
244 				}
245 				goto bad;
246 			}
247 			mode |= VWRITE;
248 		}
249 		if (fmode & FREAD)
250 			mode |= VREAD;
251 		if (mode) {
252 			error = VOP_ACCESS(vp, mode, cred);
253 			if (error) {
254 				/*
255 				 * Special stale handling, re-resolve the
256 				 * vnode.
257 				 */
258 				if (error == ESTALE) {
259 					vput(vp);
260 					vp = NULL;
261 					cache_setunresolved(&nd->nl_nch);
262 					error = cache_resolve(&nd->nl_nch, cred);
263 					if (error == 0)
264 						goto again;
265 				}
266 				goto bad;
267 			}
268 		}
269 	}
270 	if (fmode & O_TRUNC) {
271 		vn_unlock(vp);				/* XXX */
272 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
273 		VATTR_NULL(vap);
274 		vap->va_size = 0;
275 		error = VOP_SETATTR(vp, vap, cred);
276 		if (error)
277 			goto bad;
278 	}
279 
280 	/*
281 	 * Setup the fp so VOP_OPEN can override it.  No descriptor has been
282 	 * associated with the fp yet so we own it clean.
283 	 *
284 	 * f_nchandle inherits nl_nch.  This used to be necessary only for
285 	 * directories but now we do it unconditionally so f*() ops
286 	 * such as fchmod() can access the actual namespace that was
287 	 * used to open the file.
288 	 */
289 	if (fp) {
290 		fp->f_nchandle = nd->nl_nch;
291 		cache_zero(&nd->nl_nch);
292 		cache_unlock(&fp->f_nchandle);
293 	}
294 
295 	/*
296 	 * Get rid of nl_nch.  vn_open does not return it (it returns the
297 	 * vnode or the file pointer).  Note: we can't leave nl_nch locked
298 	 * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g.
299 	 * on /dev/ttyd0
300 	 */
301 	if (nd->nl_nch.ncp)
302 		cache_put(&nd->nl_nch);
303 
304 	error = VOP_OPEN(vp, fmode, cred, fp);
305 	if (error) {
306 		/*
307 		 * setting f_ops to &badfileops will prevent the descriptor
308 		 * code from trying to close and release the vnode, since
309 		 * the open failed we do not want to call close.
310 		 */
311 		if (fp) {
312 			fp->f_data = NULL;
313 			fp->f_ops = &badfileops;
314 		}
315 		goto bad;
316 	}
317 
318 #if 0
319 	/*
320 	 * Assert that VREG files have been setup for vmio.
321 	 */
322 	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
323 		("vn_open: regular file was not VMIO enabled!"));
324 #endif
325 
326 	/*
327 	 * Return the vnode.  XXX needs some cleaning up.  The vnode is
328 	 * only returned in the fp == NULL case.
329 	 */
330 	if (fp == NULL) {
331 		nd->nl_open_vp = vp;
332 		nd->nl_vp_fmode = fmode;
333 		if ((nd->nl_flags & NLC_LOCKVP) == 0)
334 			vn_unlock(vp);
335 	} else {
336 		vput(vp);
337 	}
338 	return (0);
339 bad:
340 	if (vp)
341 		vput(vp);
342 	return (error);
343 }
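
/*
 * Illustrative sketch (not compiled in): a typical in-kernel caller of
 * vn_open() following the contract described above.  The helper name and
 * the details of the error handling are hypothetical; the nlookup_init()/
 * nlookup_done() pairing and the use of nd.nl_open_vp when fp is NULL are
 * the points being demonstrated.
 */
#if 0
static int
example_open_for_read(const char *path, struct vnode **vpp)
{
	struct nlookupdata nd;
	int error;

	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW | NLC_LOCKVP);
	if (error == 0)
		error = vn_open(&nd, NULL, FREAD, 0);
	if (error == 0) {
		*vpp = nd.nl_open_vp;	/* referenced and locked (NLC_LOCKVP) */
		nd.nl_open_vp = NULL;	/* detach from nd before cleanup */
	}
	nlookup_done(&nd);		/* always clean up, error or not */
	return (error);
}
#endif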
344 
345 int
346 vn_opendisk(const char *devname, int fmode, struct vnode **vpp)
347 {
348 	struct vnode *vp;
349 	int error;
350 
351 	if (strncmp(devname, "/dev/", 5) == 0)
352 		devname += 5;
353 	if ((vp = getsynthvnode(devname)) == NULL) {
354 		error = ENODEV;
355 	} else {
356 		error = VOP_OPEN(vp, fmode, proc0.p_ucred, NULL);
357 		vn_unlock(vp);
358 		if (error) {
359 			vrele(vp);
360 			vp = NULL;
361 		}
362 	}
363 	*vpp = vp;
364 	return (error);
365 }
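
/*
 * Illustrative sketch (not compiled in): opening a disk device by name for
 * in-kernel use.  The device name is only an example; on success the
 * returned vnode is referenced, open and unlocked, and is normally torn
 * down again with vn_close().
 */
#if 0
static int
example_open_disk(void)
{
	struct vnode *devvp;
	int error;

	error = vn_opendisk("da0s1a", FREAD | FWRITE, &devvp);
	if (error == 0) {
		/* ... issue I/O against devvp ... */
		error = vn_close(devvp, FREAD | FWRITE);
	}
	return (error);
}
#endif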
366 
367 /*
368  * Check for write permissions on the specified vnode.  nch may be NULL.
369  */
370 int
371 vn_writechk(struct vnode *vp, struct nchandle *nch)
372 {
373 	/*
374 	 * If there's shared text associated with
375 	 * the vnode, try to free it up once.  If
376 	 * we fail, we can't allow writing.
377 	 */
378 	if (vp->v_flag & VTEXT)
379 		return (ETXTBSY);
380 
381 	/*
382 	 * If the vnode represents a regular file, check the mount
383 	 * point via the nch.  This may be a different mount point
384  * than the one embedded in the vnode (e.g. nullfs).
385 	 *
386 	 * We can still write to non-regular files (e.g. devices)
387 	 * via read-only mounts.
388 	 */
389 	if (nch && nch->ncp && vp->v_type == VREG)
390 		return (ncp_writechk(nch));
391 	return (0);
392 }
393 
394 /*
395  * Check whether the underlying mount is read-only.  The mount point
396  * referenced by the namecache may be different from the mount point
397  * used by the underlying vnode in the case of NULLFS, so a separate
398  * check is needed.
399  */
400 int
401 ncp_writechk(struct nchandle *nch)
402 {
403 	if (nch->mount && (nch->mount->mnt_flag & MNT_RDONLY))
404 		return (EROFS);
405 	return(0);
406 }
407 
408 /*
409  * Vnode close call
410  */
411 int
412 vn_close(struct vnode *vp, int flags)
413 {
414 	int error;
415 
416 	if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) == 0) {
417 		error = VOP_CLOSE(vp, flags);
418 		vn_unlock(vp);
419 	}
420 	vrele(vp);
421 	return (error);
422 }
423 
424 static __inline
425 int
426 sequential_heuristic(struct uio *uio, struct file *fp)
427 {
428 	/*
429 	 * Sequential heuristic - detect sequential operation
430 	 */
431 	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
432 	    uio->uio_offset == fp->f_nextoff) {
433 		int tmpseq = fp->f_seqcount;
434 		/*
435 		 * XXX we assume that the filesystem block size is
436 		 * the default.  Not true, but still gives us a pretty
437 		 * good indicator of how sequential the read operations
438 		 * are.
439 		 */
440 		tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
441 		if (tmpseq > IO_SEQMAX)
442 			tmpseq = IO_SEQMAX;
443 		fp->f_seqcount = tmpseq;
444 		return(fp->f_seqcount << IO_SEQSHIFT);
445 	}
446 
447 	/*
448 	 * Not sequential, quick draw-down of seqcount
449 	 */
450 	if (fp->f_seqcount > 1)
451 		fp->f_seqcount = 1;
452 	else
453 		fp->f_seqcount = 0;
454 	return(0);
455 }
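
/*
 * Worked example (values are illustrative; BKVASIZE, IO_SEQMAX and
 * IO_SEQSHIFT are compile-time constants):  assuming BKVASIZE is 16KB,
 * each sequential 64KB read adds (65536 + 16383) / 16384 = 4 to
 * f_seqcount until it saturates at IO_SEQMAX.  The return value,
 * f_seqcount << IO_SEQSHIFT, is OR'd into ioflag by the read/write
 * routines below so the filesystem can scale read-ahead or clustering
 * to how sequential the access pattern appears to be.
 */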
456 
457 /*
458  * Package up an I/O request on a vnode into a uio and do it.
459  */
460 int
461 vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
462 	off_t offset, enum uio_seg segflg, int ioflg,
463 	struct ucred *cred, int *aresid)
464 {
465 	struct uio auio;
466 	struct iovec aiov;
467 	struct ccms_lock ccms_lock;
468 	int error;
469 
470 	if ((ioflg & IO_NODELOCKED) == 0)
471 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
472 	auio.uio_iov = &aiov;
473 	auio.uio_iovcnt = 1;
474 	aiov.iov_base = base;
475 	aiov.iov_len = len;
476 	auio.uio_resid = len;
477 	auio.uio_offset = offset;
478 	auio.uio_segflg = segflg;
479 	auio.uio_rw = rw;
480 	auio.uio_td = curthread;
481 	ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, &auio);
482 	if (rw == UIO_READ) {
483 		error = VOP_READ(vp, &auio, ioflg, cred);
484 	} else {
485 		error = VOP_WRITE(vp, &auio, ioflg, cred);
486 	}
487 	ccms_lock_put(&vp->v_ccms, &ccms_lock);
488 	if (aresid)
489 		*aresid = auio.uio_resid;
490 	else
491 		if (auio.uio_resid && error == 0)
492 			error = EIO;
493 	if ((ioflg & IO_NODELOCKED) == 0)
494 		vn_unlock(vp);
495 	return (error);
496 }
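
/*
 * Illustrative sketch (not compiled in): reading a fixed-size chunk from
 * an already-referenced vnode with vn_rdwr().  The helper name and buffer
 * handling are hypothetical; IO_NODELOCKED is not passed, so vn_rdwr()
 * locks and unlocks the vnode around the operation itself.
 */
#if 0
static int
example_read_chunk(struct vnode *vp, off_t offset, void *buf, int len)
{
	int resid;
	int error;

	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, len, offset,
			UIO_SYSSPACE, 0, proc0.p_ucred, &resid);
	if (error == 0 && resid != 0)
		error = EIO;		/* short read */
	return (error);
}
#endif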
497 
498 /*
499  * Package up an I/O request on a vnode into a uio and do it.  The I/O
500  * request is split up into smaller chunks and we try to avoid saturating
501  * the buffer cache while potentially holding a vnode locked, so we
502  * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
503  * to give other processes a chance to lock the vnode (either other processes
504  * core'ing the same binary, or unrelated processes scanning the directory).
505  */
506 int
507 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
508 		 off_t offset, enum uio_seg segflg, int ioflg,
509 		 struct ucred *cred, int *aresid)
510 {
511 	int error = 0;
512 
513 	do {
514 		int chunk;
515 
516 		/*
517 		 * Force `offset' to a multiple of MAXBSIZE except possibly
518 		 * for the first chunk, so that filesystems only need to
519 		 * write full blocks except possibly for the first and last
520 		 * chunks.
521 		 */
522 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
523 
524 		if (chunk > len)
525 			chunk = len;
526 		if (rw != UIO_READ && vp->v_type == VREG)
527 			bwillwrite();
528 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
529 			    ioflg, cred, aresid);
530 		len -= chunk;	/* aresid calc already includes length */
531 		if (error)
532 			break;
533 		offset += chunk;
534 		base += chunk;
535 		uio_yield();
536 	} while (len);
537 	if (aresid)
538 		*aresid += len;
539 	return (error);
540 }
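
/*
 * Worked example of the chunking above (assuming the usual 64KB MAXBSIZE):
 * a 200000 byte transfer starting at offset 100000 is issued as chunks of
 * 31072 bytes (65536 - 100000 % 65536, which aligns the next chunk to a
 * MAXBSIZE boundary), then 65536, 65536 and finally 37856 bytes, with
 * bwillwrite() (for writes to regular files) and uio_yield() giving other
 * consumers of the buffer cache and the vnode lock a chance to run
 * between chunks.
 */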
541 
542 /*
543  * MPALMOSTSAFE - acquires mplock
544  */
545 static int
546 vn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
547 {
548 	struct ccms_lock ccms_lock;
549 	struct vnode *vp;
550 	int error, ioflag;
551 
552 	get_mplock();
553 	KASSERT(uio->uio_td == curthread,
554 		("uio_td %p is not td %p", uio->uio_td, curthread));
555 	vp = (struct vnode *)fp->f_data;
556 
557 	ioflag = 0;
558 	if (flags & O_FBLOCKING) {
559 		/* ioflag &= ~IO_NDELAY; */
560 	} else if (flags & O_FNONBLOCKING) {
561 		ioflag |= IO_NDELAY;
562 	} else if (fp->f_flag & FNONBLOCK) {
563 		ioflag |= IO_NDELAY;
564 	}
565 	if (flags & O_FBUFFERED) {
566 		/* ioflag &= ~IO_DIRECT; */
567 	} else if (flags & O_FUNBUFFERED) {
568 		ioflag |= IO_DIRECT;
569 	} else if (fp->f_flag & O_DIRECT) {
570 		ioflag |= IO_DIRECT;
571 	}
572 	vn_lock(vp, LK_SHARED | LK_RETRY);
573 	if ((flags & O_FOFFSET) == 0)
574 		uio->uio_offset = fp->f_offset;
575 	ioflag |= sequential_heuristic(uio, fp);
576 
577 	ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
578 	error = VOP_READ(vp, uio, ioflag, cred);
579 	ccms_lock_put(&vp->v_ccms, &ccms_lock);
580 	if ((flags & O_FOFFSET) == 0)
581 		fp->f_offset = uio->uio_offset;
582 	fp->f_nextoff = uio->uio_offset;
583 	vn_unlock(vp);
584 	rel_mplock();
585 	return (error);
586 }
587 
588 /*
589  * Device-optimized file table vnode read routine.
590  *
591  * This bypasses the VOP table and talks directly to the device.  Most
592  * filesystems just route to specfs and can make this optimization.
593  *
594  * MPALMOSTSAFE - acquires mplock
595  */
596 static int
597 svn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
598 {
599 	struct vnode *vp;
600 	int ioflag;
601 	int error;
602 	cdev_t dev;
603 
604 	get_mplock();
605 	KASSERT(uio->uio_td == curthread,
606 		("uio_td %p is not td %p", uio->uio_td, curthread));
607 
608 	vp = (struct vnode *)fp->f_data;
609 	if (vp == NULL || vp->v_type == VBAD) {
610 		error = EBADF;
611 		goto done;
612 	}
613 
614 	if ((dev = vp->v_rdev) == NULL) {
615 		error = EBADF;
616 		goto done;
617 	}
618 	if (uio->uio_resid == 0) {
619 		error = 0;
620 		goto done;
621 	}
622 	reference_dev(dev);
623 
624 	if ((flags & O_FOFFSET) == 0)
625 		uio->uio_offset = fp->f_offset;
626 
627 	ioflag = 0;
628 	if (flags & O_FBLOCKING) {
629 		/* ioflag &= ~IO_NDELAY; */
630 	} else if (flags & O_FNONBLOCKING) {
631 		ioflag |= IO_NDELAY;
632 	} else if (fp->f_flag & FNONBLOCK) {
633 		ioflag |= IO_NDELAY;
634 	}
635 	if (flags & O_FBUFFERED) {
636 		/* ioflag &= ~IO_DIRECT; */
637 	} else if (flags & O_FUNBUFFERED) {
638 		ioflag |= IO_DIRECT;
639 	} else if (fp->f_flag & O_DIRECT) {
640 		ioflag |= IO_DIRECT;
641 	}
642 	ioflag |= sequential_heuristic(uio, fp);
643 
644 	error = dev_dread(dev, uio, ioflag);
645 
646 	release_dev(dev);
647 	if ((flags & O_FOFFSET) == 0)
648 		fp->f_offset = uio->uio_offset;
649 	fp->f_nextoff = uio->uio_offset;
650 done:
651 	rel_mplock();
652 	return (error);
653 }
654 
655 /*
656  * MPALMOSTSAFE - acquires mplock
657  */
658 static int
659 vn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
660 {
661 	struct ccms_lock ccms_lock;
662 	struct vnode *vp;
663 	int error, ioflag;
664 
665 	get_mplock();
666 	KASSERT(uio->uio_td == curthread,
667 		("uio_td %p is not td %p", uio->uio_td, curthread));
668 	vp = (struct vnode *)fp->f_data;
669 	if (vp->v_type == VREG)
670 		bwillwrite();
671 	vp = (struct vnode *)fp->f_data;	/* XXX needed? */
672 
673 	ioflag = IO_UNIT;
674 	if (vp->v_type == VREG &&
675 	   ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
676 		ioflag |= IO_APPEND;
677 	}
678 
679 	if (flags & O_FBLOCKING) {
680 		/* ioflag &= ~IO_NDELAY; */
681 	} else if (flags & O_FNONBLOCKING) {
682 		ioflag |= IO_NDELAY;
683 	} else if (fp->f_flag & FNONBLOCK) {
684 		ioflag |= IO_NDELAY;
685 	}
686 	if (flags & O_FBUFFERED) {
687 		/* ioflag &= ~IO_DIRECT; */
688 	} else if (flags & O_FUNBUFFERED) {
689 		ioflag |= IO_DIRECT;
690 	} else if (fp->f_flag & O_DIRECT) {
691 		ioflag |= IO_DIRECT;
692 	}
693 	if (flags & O_FASYNCWRITE) {
694 		/* ioflag &= ~IO_SYNC; */
695 	} else if (flags & O_FSYNCWRITE) {
696 		ioflag |= IO_SYNC;
697 	} else if (fp->f_flag & O_FSYNC) {
698 		ioflag |= IO_SYNC;
699 	}
700 
701 	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
702 		ioflag |= IO_SYNC;
703 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
704 	if ((flags & O_FOFFSET) == 0)
705 		uio->uio_offset = fp->f_offset;
706 	ioflag |= sequential_heuristic(uio, fp);
707 	ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
708 	error = VOP_WRITE(vp, uio, ioflag, cred);
709 	ccms_lock_put(&vp->v_ccms, &ccms_lock);
710 	if ((flags & O_FOFFSET) == 0)
711 		fp->f_offset = uio->uio_offset;
712 	fp->f_nextoff = uio->uio_offset;
713 	vn_unlock(vp);
714 	rel_mplock();
715 	return (error);
716 }
717 
718 /*
719  * Device-optimized file table vnode write routine.
720  *
721  * This bypasses the VOP table and talks directly to the device.  Most
722  * filesystems just route to specfs and can make this optimization.
723  *
724  * MPALMOSTSAFE - acquires mplock
725  */
726 static int
727 svn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
728 {
729 	struct vnode *vp;
730 	int ioflag;
731 	int error;
732 	cdev_t dev;
733 
734 	get_mplock();
735 	KASSERT(uio->uio_td == curthread,
736 		("uio_td %p is not td %p", uio->uio_td, curthread));
737 
738 	vp = (struct vnode *)fp->f_data;
739 	if (vp == NULL || vp->v_type == VBAD) {
740 		error = EBADF;
741 		goto done;
742 	}
743 	if (vp->v_type == VREG)
744 		bwillwrite();
745 	vp = (struct vnode *)fp->f_data;	/* XXX needed? */
746 
747 	if ((dev = vp->v_rdev) == NULL) {
748 		error = EBADF;
749 		goto done;
750 	}
751 	reference_dev(dev);
752 
753 	if ((flags & O_FOFFSET) == 0)
754 		uio->uio_offset = fp->f_offset;
755 
756 	ioflag = IO_UNIT;
757 	if (vp->v_type == VREG &&
758 	   ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
759 		ioflag |= IO_APPEND;
760 	}
761 
762 	if (flags & O_FBLOCKING) {
763 		/* ioflag &= ~IO_NDELAY; */
764 	} else if (flags & O_FNONBLOCKING) {
765 		ioflag |= IO_NDELAY;
766 	} else if (fp->f_flag & FNONBLOCK) {
767 		ioflag |= IO_NDELAY;
768 	}
769 	if (flags & O_FBUFFERED) {
770 		/* ioflag &= ~IO_DIRECT; */
771 	} else if (flags & O_FUNBUFFERED) {
772 		ioflag |= IO_DIRECT;
773 	} else if (fp->f_flag & O_DIRECT) {
774 		ioflag |= IO_DIRECT;
775 	}
776 	if (flags & O_FASYNCWRITE) {
777 		/* ioflag &= ~IO_SYNC; */
778 	} else if (flags & O_FSYNCWRITE) {
779 		ioflag |= IO_SYNC;
780 	} else if (fp->f_flag & O_FSYNC) {
781 		ioflag |= IO_SYNC;
782 	}
783 
784 	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
785 		ioflag |= IO_SYNC;
786 	ioflag |= sequential_heuristic(uio, fp);
787 
788 	error = dev_dwrite(dev, uio, ioflag);
789 
790 	release_dev(dev);
791 	if ((flags & O_FOFFSET) == 0)
792 		fp->f_offset = uio->uio_offset;
793 	fp->f_nextoff = uio->uio_offset;
794 done:
795 	rel_mplock();
796 	return (error);
797 }
798 
799 /*
800  * MPALMOSTSAFE - acquires mplock
801  */
802 static int
803 vn_statfile(struct file *fp, struct stat *sb, struct ucred *cred)
804 {
805 	struct vnode *vp;
806 	int error;
807 
808 	get_mplock();
809 	vp = (struct vnode *)fp->f_data;
810 	error = vn_stat(vp, sb, cred);
811 	rel_mplock();
812 	return (error);
813 }
814 
815 int
816 vn_stat(struct vnode *vp, struct stat *sb, struct ucred *cred)
817 {
818 	struct vattr vattr;
819 	struct vattr *vap;
820 	int error;
821 	u_short mode;
822 	cdev_t dev;
823 
824 	vap = &vattr;
825 	error = VOP_GETATTR(vp, vap);
826 	if (error)
827 		return (error);
828 
829 	/*
830 	 * Zero the spare stat fields
831 	 */
832 	sb->st_lspare = 0;
833 	sb->st_qspare = 0;
834 
835 	/*
836 	 * Copy from vattr table
837 	 */
838 	if (vap->va_fsid != VNOVAL)
839 		sb->st_dev = vap->va_fsid;
840 	else
841 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
842 	sb->st_ino = vap->va_fileid;
843 	mode = vap->va_mode;
844 	switch (vap->va_type) {
845 	case VREG:
846 		mode |= S_IFREG;
847 		break;
848 	case VDATABASE:
849 		mode |= S_IFDB;
850 		break;
851 	case VDIR:
852 		mode |= S_IFDIR;
853 		break;
854 	case VBLK:
855 		mode |= S_IFBLK;
856 		break;
857 	case VCHR:
858 		mode |= S_IFCHR;
859 		break;
860 	case VLNK:
861 		mode |= S_IFLNK;
862 		/* This is a cosmetic change; symlinks do not have a mode. */
863 		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
864 			mode &= ~ACCESSPERMS;	/* 0000 */
865 		else
866 			mode |= ACCESSPERMS;	/* 0777 */
867 		break;
868 	case VSOCK:
869 		mode |= S_IFSOCK;
870 		break;
871 	case VFIFO:
872 		mode |= S_IFIFO;
873 		break;
874 	default:
875 		return (EBADF);
876 	}
877 	sb->st_mode = mode;
878 	if (vap->va_nlink > (nlink_t)-1)
879 		sb->st_nlink = (nlink_t)-1;
880 	else
881 		sb->st_nlink = vap->va_nlink;
882 	sb->st_uid = vap->va_uid;
883 	sb->st_gid = vap->va_gid;
884 	sb->st_rdev = makeudev(vap->va_rmajor, vap->va_rminor);
885 	sb->st_size = vap->va_size;
886 	sb->st_atimespec = vap->va_atime;
887 	sb->st_mtimespec = vap->va_mtime;
888 	sb->st_ctimespec = vap->va_ctime;
889 
890 	/*
891 	 * A VCHR and VBLK device may track the last access and last modified
892  * time independently of the filesystem.  This is particularly true
893 	 * because device read and write calls may bypass the filesystem.
894 	 */
895 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
896 		if ((dev = vp->v_rdev) != NULL) {
897 			if (dev->si_lastread) {
898 				sb->st_atimespec.tv_sec = dev->si_lastread;
899 				sb->st_atimespec.tv_nsec = 0;
900 			}
901 			if (dev->si_lastwrite) {
902 				sb->st_mtimespec.tv_sec = dev->si_lastwrite;
903 				sb->st_mtimespec.tv_nsec = 0;
904 			}
905 		}
906 	}
907 
908         /*
909 	 * According to www.opengroup.org, the meaning of st_blksize is
910 	 *   "a filesystem-specific preferred I/O block size for this
911 	 *    object.  In some filesystem types, this may vary from file
912 	 *    to file"
913 	 * Default to PAGE_SIZE after much discussion.
914 	 */
915 
916 	if (vap->va_type == VREG) {
917 		sb->st_blksize = vap->va_blocksize;
918 	} else if (vn_isdisk(vp, NULL)) {
919 		/*
920 		 * XXX if the device is not yet open (aka a plain stat()
921 		 * call, aka v_rdev == NULL), there may be no way to get a
922 		 * valid block size; fall back to BLKDEV_IOSIZE in that case.
923 		 */
924 		cdev_t dev;
925 
926 		if ((dev = vp->v_rdev) == NULL && vp->v_type == VCHR)
927 			dev = get_dev(vp->v_umajor, vp->v_uminor);
928 		if (dev != NULL) {
929 			sb->st_blksize = dev->si_bsize_best;
930 			if (sb->st_blksize < dev->si_bsize_phys)
931 				sb->st_blksize = dev->si_bsize_phys;
932 		}
933 		if (dev == NULL || sb->st_blksize < BLKDEV_IOSIZE)
934 			sb->st_blksize = BLKDEV_IOSIZE;
935 	} else {
936 		sb->st_blksize = PAGE_SIZE;
937 	}
938 
939 	sb->st_flags = vap->va_flags;
940 	if (suser_cred(cred, 0))
941 		sb->st_gen = 0;
942 	else
943 		sb->st_gen = (u_int32_t)vap->va_gen;
944 
945 #if (S_BLKSIZE == 512)
946 	/* Optimize this case */
947 	sb->st_blocks = vap->va_bytes >> 9;
948 #else
949 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
950 #endif
951 	sb->st_fsmid = vap->va_fsmid;
952 	return (0);
953 }
954 
955 /*
956  * MPALMOSTSAFE - acquires mplock
957  */
958 static int
959 vn_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *ucred)
960 {
961 	struct vnode *vp = ((struct vnode *)fp->f_data);
962 	struct vnode *ovp;
963 	struct vattr vattr;
964 	int error;
965 
966 	get_mplock();
967 
968 	switch (vp->v_type) {
969 	case VREG:
970 	case VDIR:
971 		if (com == FIONREAD) {
972 			if ((error = VOP_GETATTR(vp, &vattr)) != 0)
973 				break;
974 			*(int *)data = vattr.va_size - fp->f_offset;
975 			error = 0;
976 			break;
977 		}
978 		if (com == FIOASYNC) {				/* XXX */
979 			error = 0;				/* XXX */
980 			break;
981 		}
982 		/* fall into ... */
983 	default:
984 #if 0
985 		return (ENOTTY);
986 #endif
987 	case VFIFO:
988 	case VCHR:
989 	case VBLK:
990 		if (com == FIODTYPE) {
991 			if (vp->v_type != VCHR && vp->v_type != VBLK) {
992 				error = ENOTTY;
993 				break;
994 			}
995 			*(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK;
996 			error = 0;
997 			break;
998 		}
999 		error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred);
1000 		if (error == 0 && com == TIOCSCTTY) {
1001 			struct proc *p = curthread->td_proc;
1002 			struct session *sess;
1003 
1004 			if (p == NULL) {
1005 				error = ENOTTY;
1006 				break;
1007 			}
1008 
1009 			sess = p->p_session;
1010 			/* Do nothing if reassigning same control tty */
1011 			if (sess->s_ttyvp == vp) {
1012 				error = 0;
1013 				break;
1014 			}
1015 
1016 			/* Get rid of reference to old control tty */
1017 			ovp = sess->s_ttyvp;
1018 			vref(vp);
1019 			sess->s_ttyvp = vp;
1020 			if (ovp)
1021 				vrele(ovp);
1022 		}
1023 		break;
1024 	}
1025 	rel_mplock();
1026 	return (error);
1027 }
1028 
1029 /*
1030  * MPALMOSTSAFE - acquires mplock
1031  */
1032 static int
1033 vn_poll(struct file *fp, int events, struct ucred *cred)
1034 {
1035 	int error;
1036 
1037 	get_mplock();
1038 	error = VOP_POLL(((struct vnode *)fp->f_data), events, cred);
1039 	rel_mplock();
1040 	return (error);
1041 }
1042 
1043 /*
1044  * Check that the vnode is still valid, and if so
1045  * acquire requested lock.
1046  */
1047 int
1048 #ifndef	DEBUG_LOCKS
1049 vn_lock(struct vnode *vp, int flags)
1050 #else
1051 debug_vn_lock(struct vnode *vp, int flags, const char *filename, int line)
1052 #endif
1053 {
1054 	int error;
1055 
1056 	do {
1057 #ifdef	DEBUG_LOCKS
1058 		vp->filename = filename;
1059 		vp->line = line;
1060 		error = debuglockmgr(&vp->v_lock, flags,
1061 				     "vn_lock", filename, line);
1062 #else
1063 		error = lockmgr(&vp->v_lock, flags);
1064 #endif
1065 		if (error == 0)
1066 			break;
1067 	} while (flags & LK_RETRY);
1068 
1069 	/*
1070 	 * Because we (had better!) have a ref on the vnode, once it
1071 	 * goes to VRECLAIMED state it will not be recycled until all
1072 	 * refs go away.  So we can just check the flag.
1073 	 */
1074 	if (error == 0 && (vp->v_flag & VRECLAIMED)) {
1075 		lockmgr(&vp->v_lock, LK_RELEASE);
1076 		error = ENOENT;
1077 	}
1078 	return (error);
1079 }
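
/*
 * Illustrative sketch (not compiled in): the usual pattern for the locking
 * helpers above.  With LK_RETRY the lock attempt is repeated until it
 * succeeds, so the only error a caller needs to handle is ENOENT, meaning
 * the vnode was reclaimed while we slept.  The helper name is hypothetical.
 */
#if 0
static int
example_with_locked_vnode(struct vnode *vp)
{
	int error;

	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error)
		return (error);		/* ENOENT: vnode was reclaimed */
	/* ... operate on the exclusively locked vnode ... */
	vn_unlock(vp);
	return (0);
}
#endif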
1080 
1081 void
1082 vn_unlock(struct vnode *vp)
1083 {
1084 	lockmgr(&vp->v_lock, LK_RELEASE);
1085 }
1086 
1087 int
1088 vn_islocked(struct vnode *vp)
1089 {
1090 	return (lockstatus(&vp->v_lock, curthread));
1091 }
1092 
1093 /*
1094  * MPALMOSTSAFE - acquires mplock
1095  */
1096 static int
1097 vn_closefile(struct file *fp)
1098 {
1099 	int error;
1100 
1101 	get_mplock();
1102 	fp->f_ops = &badfileops;
1103 	error = vn_close(((struct vnode *)fp->f_data), fp->f_flag);
1104 	rel_mplock();
1105 	return(error);
1106 }
1107 
1108 /*
1109  * MPALMOSTSAFE - acquires mplock
1110  */
1111 static int
1112 vn_kqfilter(struct file *fp, struct knote *kn)
1113 {
1114 	int error;
1115 
1116 	get_mplock();
1117 	error = VOP_KQFILTER(((struct vnode *)fp->f_data), kn);
1118 	rel_mplock();
1119 	return (error);
1120 }
1121