/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $
 * $DragonFly: src/sys/kern/vfs_vnops.c,v 1.28 2004/11/30 18:59:52 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>

static int vn_closefile (struct file *fp, struct thread *td);
static int vn_ioctl (struct file *fp, u_long com, caddr_t data,
		struct thread *td);
static int vn_read (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct thread *td);
static int svn_read (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct thread *td);
static int vn_poll (struct file *fp, int events, struct ucred *cred,
		struct thread *td);
static int vn_kqfilter (struct file *fp, struct knote *kn);
static int vn_statfile (struct file *fp, struct stat *sb, struct thread *td);
static int vn_write (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct thread *td);
static int svn_write (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct thread *td);

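/*
 * File ops vectors for vnode-backed file descriptors.  vn_open() installs
 * vnode_fileops; vn_setspecops() below switches an eligible descriptor to
 * the device-optimized specvnode_fileops table when vfs_fastdev is enabled.
 */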
struct fileops vnode_fileops = {
	NULL,	/* port */
	NULL,	/* clone */
	vn_read, vn_write, vn_ioctl, vn_poll, vn_kqfilter,
	vn_statfile, vn_closefile
};

struct fileops specvnode_fileops = {
	NULL,	/* port */
	NULL,	/* clone */
	svn_read, svn_write, vn_ioctl, vn_poll, vn_kqfilter,
	vn_statfile, vn_closefile
};

/*
 * Shortcut the device read/write.  This avoids a lot of vnode junk.
 * Basically the specfs vnops for read and write take the locked vnode,
 * unlock it (because we can't hold the vnode locked while reading or
 * writing a device which may block indefinitely), issue the device
 * operation, then relock the vnode before returning, plus other junk.
 * This bypasses all of that and just does the device operation.
 */
void
vn_setspecops(struct file *fp)
{
	if (vfs_fastdev && fp->f_ops == &vnode_fileops) {
		fp->f_ops = &specvnode_fileops;
	}
}

/*
 * Common code for vnode open operations.  Check permissions, and call
 * the VOP_OPEN or VOP_NCREATE routine.
 *
 * The caller is responsible for setting up nd with nlookup_init() and
 * for cleaning it up with nlookup_done(), whether we return an error
 * or not.
 *
 * On success nd->nl_open_vp will hold a referenced and, if requested,
 * locked vnode.  A locked vnode is requested via NLC_LOCKVP.  If fp
 * is non-NULL the vnode will be installed in the file pointer.
 *
 * NOTE: The vnode is referenced just once on return whether or not it
 * is also installed in the file pointer.
 */
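/*
 * Illustrative caller sketch (hypothetical, for reference only), showing
 * the contract described above: nlookup_init() prepares nd, vn_open()
 * consumes it, and nlookup_done() must run whether or not we succeeded:
 *
 *	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
 *	if (error == 0)
 *		error = vn_open(&nd, fp, FREAD, 0);
 *	nlookup_done(&nd);
 */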
int
vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode)
{
	struct vnode *vp;
	struct thread *td = nd->nl_td;
	struct ucred *cred = nd->nl_cred;
	struct vattr vat;
	struct vattr *vap = &vat;
	struct namecache *ncp;
	int mode, error;

	/*
	 * Lookup the path and create or obtain the vnode.  After a
	 * successful lookup a locked nd->nl_ncp will be returned.
	 *
	 * The result of this section should be a locked vnode.
	 *
	 * XXX with only a little work we should be able to avoid locking
	 * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set.
	 */
	if (fmode & O_CREAT) {
		/*
		 * CONDITIONAL CREATE FILE CASE
		 *
		 * Setting NLC_CREATE causes a negative hit to be returned
		 * as the ncp rather than being treated as an error.
		 * nc_error or nc_vp may then be checked to see if the ncp
		 * represents a negative hit.  NLC_CREATE also requires
		 * write permission on the governing directory or EPERM
		 * is returned.
		 */
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			nd->nl_flags |= NLC_FOLLOW;
		nd->nl_flags |= NLC_CREATE;
		bwillwrite();
		error = nlookup(nd);
	} else {
		/*
		 * NORMAL OPEN FILE CASE
		 */
		error = nlookup(nd);
	}

	if (error)
		return (error);
	ncp = nd->nl_ncp;

	/*
	 * split case to allow us to re-resolve and retry the ncp in case
	 * we get ESTALE.
	 */
again:
	if (fmode & O_CREAT) {
		if (ncp->nc_vp == NULL) {
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			error = VOP_NCREATE(ncp, &vp, nd->nl_cred, vap);
			if (error)
				return (error);
			fmode &= ~O_TRUNC;
			ASSERT_VOP_LOCKED(vp, "create");
			/* locked vnode is returned */
		} else {
			if (fmode & O_EXCL) {
				error = EEXIST;
			} else {
				error = cache_vget(ncp, cred,
						    LK_EXCLUSIVE, &vp);
			}
			if (error)
				return (error);
			fmode &= ~O_CREAT;
		}
	} else {
		error = cache_vget(ncp, cred, LK_EXCLUSIVE, &vp);
		if (error)
			return (error);
	}

	/*
	 * We have a locked vnode and ncp now.  Note that the ncp will
	 * be cleaned up by the caller if nd->nl_ncp is left intact.
	 */
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if ((fmode & O_CREAT) == 0) {
		mode = 0;
		if (fmode & (FWRITE | O_TRUNC)) {
			if (vp->v_type == VDIR) {
				error = EISDIR;
				goto bad;
			}
			error = vn_writechk(vp);
			if (error) {
				/*
				 * Special stale handling, re-resolve the
				 * vnode.
				 */
				if (error == ESTALE) {
					vput(vp);
					vp = NULL;
					cache_setunresolved(ncp);
					error = cache_resolve(ncp, cred);
					if (error == 0)
						goto again;
				}
				goto bad;
			}
			mode |= VWRITE;
		}
		if (fmode & FREAD)
			mode |= VREAD;
		if (mode) {
			error = VOP_ACCESS(vp, mode, cred, td);
			if (error) {
				/*
				 * Special stale handling, re-resolve the
				 * vnode.
				 */
				if (error == ESTALE) {
					vput(vp);
					vp = NULL;
					cache_setunresolved(ncp);
					error = cache_resolve(ncp, cred);
					if (error == 0)
						goto again;
				}
				goto bad;
			}
		}
	}
	if (fmode & O_TRUNC) {
		VOP_UNLOCK(vp, 0, td);			/* XXX */
		VOP_LEASE(vp, td, cred, LEASE_WRITE);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);	/* XXX */
		VATTR_NULL(vap);
		vap->va_size = 0;
		error = VOP_SETATTR(vp, vap, cred, td);
		if (error)
			goto bad;
	}

	/*
	 * Set up the fp so VOP_OPEN can override it.  No descriptor has been
	 * associated with the fp yet so we own it clean.  f_data will inherit
	 * our vp reference as long as we do not shift f_ops to &badfileops.
	 * f_ncp inherits nl_ncp.
	 */
	if (fp) {
		fp->f_data = (caddr_t)vp;
		fp->f_flag = fmode & FMASK;
		fp->f_ops = &vnode_fileops;
		fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);
		if (vp->v_type == VDIR) {
			fp->f_ncp = nd->nl_ncp;
			nd->nl_ncp = NULL;
			cache_unlock(fp->f_ncp);
		}
	}

	/*
	 * Get rid of nl_ncp.  vn_open does not return it (it returns the
	 * vnode or the file pointer).  Note: we can't leave nl_ncp locked
	 * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g.
	 * on /dev/ttyd0.
	 */
	if (nd->nl_ncp) {
		cache_put(nd->nl_ncp);
		nd->nl_ncp = NULL;
	}

	error = VOP_OPEN(vp, fmode, cred, fp, td);
	if (error) {
		/*
		 * Setting f_ops to &badfileops will prevent the descriptor
		 * code from trying to close and release the vnode.  Since
		 * the open failed we do not want to call close.
		 */
		if (fp) {
			fp->f_data = NULL;
			fp->f_ops = &badfileops;
		}
		goto bad;
	}
	if (fmode & FWRITE)
		vp->v_writecount++;

	/*
	 * Make sure that a VM object is created for VMIO support.  If this
	 * fails we have to be sure to match VOP_CLOSE's with VOP_OPEN's.
	 * Clean up the fp so we can just vput() the vp in 'bad'.
	 */
	if (vn_canvmio(vp) == TRUE) {
		if ((error = vfs_object_create(vp, td)) != 0) {
			if (fp) {
				fp->f_data = NULL;
				fp->f_ops = &badfileops;
			}
			VOP_CLOSE(vp, fmode, td);
			goto bad;
		}
	}

	/*
	 * Return the vnode.  XXX needs some cleaning up.  The vnode is
	 * only returned in the fp == NULL case, otherwise the vnode ref
	 * is inherited by the fp and we unconditionally unlock it.
	 */
	if (fp == NULL) {
		nd->nl_open_vp = vp;
		nd->nl_vp_fmode = fmode;
		if ((nd->nl_flags & NLC_LOCKVP) == 0)
			VOP_UNLOCK(vp, 0, td);
	} else {
		VOP_UNLOCK(vp, 0, td);
	}
	return (0);
bad:
	if (vp)
		vput(vp);
	return (error);
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(struct vnode *vp)
{
	/*
	 * If there's shared text associated with the vnode,
	 * we can't allow writing.
	 */
	if (vp->v_flag & VTEXT)
		return (ETXTBSY);
	return (0);
}

/*
 * Vnode close call
 */
int
vn_close(struct vnode *vp, int flags, struct thread *td)
{
	int error;

	if (flags & FWRITE)
		vp->v_writecount--;
	if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td)) == 0) {
		error = VOP_CLOSE(vp, flags, td);
		VOP_UNLOCK(vp, 0, td);
	}
	vrele(vp);
	return (error);
}

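/*
 * Estimate how sequential the I/O stream on this file pointer is.  Returns
 * a read-ahead hint (the updated f_seqcount shifted into the high ioflag
 * bits via IO_SEQSHIFT) for the caller to OR into its ioflag, or 0 if the
 * access pattern does not look sequential.
 */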
static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{
	/*
	 * Sequential heuristic - detect sequential operation
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		int tmpseq = fp->f_seqcount;
		/*
		 * XXX we assume that the filesystem block size is
		 * the default.  Not true, but still gives us a pretty
		 * good indicator of how sequential the read operations
		 * are.
		 */
		tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
		if (tmpseq > IO_SEQMAX)
			tmpseq = IO_SEQMAX;
		fp->f_seqcount = tmpseq;
		return(fp->f_seqcount << IO_SEQSHIFT);
	}

	/*
	 * Not sequential, quick draw-down of seqcount
	 */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return(0);
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 *
 * We are going to assume the caller has done the appropriate
 * VOP_LEASE() call before calling vn_rdwr().
 */
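/*
 * Illustrative use (hypothetical): read 'len' bytes at byte offset 'offset'
 * from an unlocked vnode into a kernel buffer, with a short transfer
 * converted to EIO because no residual count is requested:
 *
 *	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, len, offset,
 *			UIO_SYSSPACE, 0, cred, NULL, td);
 */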
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len, off_t offset,
	enum uio_seg segflg, int ioflg, struct ucred *cred, int *aresid,
	struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else if (auio.uio_resid && error == 0)
		error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp, 0, td);
	return (error);
}

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * call bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
	off_t offset, enum uio_seg segflg, int ioflg, struct ucred *cred,
	int *aresid, struct thread *td)
{
	int error = 0;

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
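		/*
		 * For example, with a MAXBSIZE of 65536 and a starting
		 * offset of 70000, the first chunk is
		 * 65536 - (70000 % 65536) = 61072 bytes, which advances
		 * offset to 131072, a multiple of MAXBSIZE; every later
		 * chunk is then block aligned.
		 */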
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
			    ioflg, cred, aresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base += chunk;
		uio_yield();
	} while (len);
	if (aresid)
		*aresid += len;
	return (error);
}

/*
 * File table vnode read routine.
 */
static int
vn_read(struct file *fp, struct uio *uio, struct ucred *cred,
	int flags, struct thread *td)
{
	struct vnode *vp;
	int error, ioflag;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td));
	vp = (struct vnode *)fp->f_data;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	VOP_LEASE(vp, td, cred, LEASE_READ);
	vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;

	ioflag |= sequential_heuristic(uio, fp);

	error = VOP_READ(vp, uio, ioflag, cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, td);
	return (error);
}

/*
 * Device-optimized file table vnode read routine.
 *
 * This bypasses the VOP table and talks directly to the device.  Most
 * filesystems just route to specfs and can make this optimization.
 */
static int
svn_read(struct file *fp, struct uio *uio, struct ucred *cred,
	int flags, struct thread *td)
{
	struct vnode *vp;
	int ioflag;
	int error;
	dev_t dev;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td));

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD)
		return (EBADF);

	/*
	 * Check uio_resid before referencing the device so the early
	 * return cannot leak a device reference.
	 */
	if (uio->uio_resid == 0)
		return (0);

	if ((dev = vp->v_rdev) == NULL)
		return (EBADF);
	reference_dev(dev);

	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;

	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	ioflag |= sequential_heuristic(uio, fp);

	error = dev_dread(dev, uio, ioflag);

	release_dev(dev);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(struct file *fp, struct uio *uio, struct ucred *cred,
	int flags, struct thread *td)
{
	struct vnode *vp;
	int error, ioflag;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = (struct vnode *)fp->f_data;
	if (vp->v_type == VREG)
		bwillwrite();
	vp = (struct vnode *)fp->f_data;	/* XXX needed? */
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	VOP_LEASE(vp, td, cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
	error = VOP_WRITE(vp, uio, ioflag, cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, td);
	return (error);
}

/*
 * Device-optimized file table vnode write routine.
 *
 * This bypasses the VOP table and talks directly to the device.  Most
 * filesystems just route to specfs and can make this optimization.
 */
static int
svn_write(struct file *fp, struct uio *uio, struct ucred *cred,
	int flags, struct thread *td)
{
	struct vnode *vp;
	int ioflag;
	int error;
	dev_t dev;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD)
		return (EBADF);
	if (vp->v_type == VREG)
		bwillwrite();
	vp = (struct vnode *)fp->f_data;	/* XXX needed? */

	if ((dev = vp->v_rdev) == NULL)
		return (EBADF);
	reference_dev(dev);

	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;

	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	ioflag |= sequential_heuristic(uio, fp);

	error = dev_dwrite(dev, uio, ioflag);

	release_dev(dev);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;

	return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(struct file *fp, struct stat *sb, struct thread *td)
{
	struct vnode *vp = (struct vnode *)fp->f_data;

	return vn_stat(vp, sb, td);
}

int
vn_stat(struct vnode *vp, struct stat *sb, struct thread *td)
{
	struct vattr vattr;
	struct vattr *vap;
	int error;
	u_short mode;

	vap = &vattr;
	error = VOP_GETATTR(vp, vap, td);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	sb->st_lspare = 0;
	sb->st_qspare[0] = 0;
	sb->st_qspare[1] = 0;

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		/* This is a cosmetic change, symlinks do not have a mode. */
		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
			mode &= ~ACCESSPERMS;	/* 0000 */
		else
			mode |= ACCESSPERMS;	/* 0777 */
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 *   "a filesystem-specific preferred I/O block size for this
	 *    object.  In some filesystem types, this may vary from file
	 *    to file"
	 * Default to PAGE_SIZE after much discussion.
	 */

	if (vap->va_type == VREG) {
		sb->st_blksize = vap->va_blocksize;
	} else if (vn_isdisk(vp, NULL)) {
		/*
		 * XXX this is broken.  If the device is not yet open (aka
		 * stat() call, aka v_rdev == NULL), how are we supposed
		 * to get a valid block size out of it?
		 */
		dev_t dev;

		if ((dev = vp->v_rdev) == NULL)
			dev = udev2dev(vp->v_udev, vp->v_type == VBLK);
		sb->st_blksize = dev->si_bsize_best;
		if (sb->st_blksize < dev->si_bsize_phys)
			sb->st_blksize = dev->si_bsize_phys;
		if (sb->st_blksize < BLKDEV_IOSIZE)
			sb->st_blksize = BLKDEV_IOSIZE;
	} else {
		sb->st_blksize = PAGE_SIZE;
	}

	sb->st_flags = vap->va_flags;
	if (suser(td))
		sb->st_gen = 0;
	else
		sb->st_gen = vap->va_gen;

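	/* st_blocks is expressed in S_BLKSIZE (normally 512 byte) units. */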
#if (S_BLKSIZE == 512)
	/* Optimize this case */
	sb->st_blocks = vap->va_bytes >> 9;
#else
	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
#endif
	return (0);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(struct file *fp, u_long com, caddr_t data, struct thread *td)
{
	struct vnode *vp = ((struct vnode *)fp->f_data);
	struct vnode *ovp;
	struct ucred *ucred;
	struct vattr vattr;
	int error;

	KKASSERT(td->td_proc != NULL);
	ucred = td->td_proc->p_ucred;

	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			error = VOP_GETATTR(vp, &vattr, td);
			if (error)
				return (error);
			*(int *)data = vattr.va_size - fp->f_offset;
			return (0);
		}
		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			return (0);			/* XXX */
		/* fall into ... */
	default:
#if 0
		return (ENOTTY);
#endif
	case VFIFO:
	case VCHR:
	case VBLK:
		if (com == FIODTYPE) {
			if (vp->v_type != VCHR && vp->v_type != VBLK)
				return (ENOTTY);
			*(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK;
			return (0);
		}
		error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred, td);
		if (error == 0 && com == TIOCSCTTY) {
			struct session *sess = td->td_proc->p_session;

			/* Do nothing if reassigning same control tty */
			if (sess->s_ttyvp == vp)
				return (0);

			/* Get rid of reference to old control tty */
			ovp = sess->s_ttyvp;
			vref(vp);
			sess->s_ttyvp = vp;
			if (ovp)
				vrele(ovp);
		}
		return (error);
	}
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
{
	return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, td));
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
#ifndef	DEBUG_LOCKS
vn_lock(struct vnode *vp, int flags, struct thread *td)
#else
debug_vn_lock(struct vnode *vp, int flags, struct thread *td,
		const char *filename, int line)
#endif
{
	int error;

	do {
#ifdef	DEBUG_LOCKS
		vp->filename = filename;
		vp->line = line;
#endif
		error = VOP_LOCK(vp, flags | LK_NOPAUSE, td);
		if (error == 0)
			break;
	} while (flags & LK_RETRY);

	/*
	 * Because we (had better!) have a ref on the vnode, once it
	 * goes to VRECLAIMED state it will not be recycled until all
	 * refs go away.  So we can just check the flag.
	 */
	if (error == 0 && (vp->v_flag & VRECLAIMED)) {
		VOP_UNLOCK(vp, 0, td);
		error = ENOENT;
	}
	return (error);
}

/*
 * File table vnode close routine.
 */
static int
vn_closefile(struct file *fp, struct thread *td)
{
	int err;

	fp->f_ops = &badfileops;
	err = vn_close(((struct vnode *)fp->f_data), fp->f_flag, td);
	return(err);
}

static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
	return (VOP_KQFILTER(((struct vnode *)fp->f_data), kn));
}
957