xref: /freebsd/sys/kern/vfs_vnops.c (revision aa0a1e58)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/fcntl.h>
43 #include <sys/file.h>
44 #include <sys/kdb.h>
45 #include <sys/stat.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/limits.h>
49 #include <sys/lock.h>
50 #include <sys/mount.h>
51 #include <sys/mutex.h>
52 #include <sys/namei.h>
53 #include <sys/vnode.h>
54 #include <sys/bio.h>
55 #include <sys/buf.h>
56 #include <sys/filio.h>
57 #include <sys/resourcevar.h>
58 #include <sys/sx.h>
59 #include <sys/ttycom.h>
60 #include <sys/conf.h>
61 #include <sys/syslog.h>
62 #include <sys/unistd.h>
63 
64 #include <security/mac/mac_framework.h>
65 
66 static fo_rdwr_t	vn_read;
67 static fo_rdwr_t	vn_write;
68 static fo_truncate_t	vn_truncate;
69 static fo_ioctl_t	vn_ioctl;
70 static fo_poll_t	vn_poll;
71 static fo_kqfilter_t	vn_kqfilter;
72 static fo_stat_t	vn_statfile;
73 static fo_close_t	vn_closefile;
74 
75 struct 	fileops vnops = {
76 	.fo_read = vn_read,
77 	.fo_write = vn_write,
78 	.fo_truncate = vn_truncate,
79 	.fo_ioctl = vn_ioctl,
80 	.fo_poll = vn_poll,
81 	.fo_kqfilter = vn_kqfilter,
82 	.fo_stat = vn_statfile,
83 	.fo_close = vn_closefile,
84 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
85 };
86 
87 int
88 vn_open(ndp, flagp, cmode, fp)
89 	struct nameidata *ndp;
90 	int *flagp, cmode;
91 	struct file *fp;
92 {
93 	struct thread *td = ndp->ni_cnd.cn_thread;
94 
95 	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
96 }
97 
98 /*
99  * Common code for vnode open operations.
100  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
101  *
102  * Note that this does NOT free nameidata for the successful case,
103  * due to the NDINIT being done elsewhere.
104  */
105 int
106 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
107     struct ucred *cred, struct file *fp)
108 {
109 	struct vnode *vp;
110 	struct mount *mp;
111 	struct thread *td = ndp->ni_cnd.cn_thread;
112 	struct vattr vat;
113 	struct vattr *vap = &vat;
114 	int fmode, error;
115 	accmode_t accmode;
116 	int vfslocked, mpsafe;
117 
118 	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
119 restart:
120 	vfslocked = 0;
121 	fmode = *flagp;
122 	if (fmode & O_CREAT) {
123 		ndp->ni_cnd.cn_nameiop = CREATE;
124 		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
125 		    MPSAFE;
126 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
127 			ndp->ni_cnd.cn_flags |= FOLLOW;
128 		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
129 			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
130 		bwillwrite();
131 		if ((error = namei(ndp)) != 0)
132 			return (error);
133 		vfslocked = NDHASGIANT(ndp);
134 		if (!mpsafe)
135 			ndp->ni_cnd.cn_flags &= ~MPSAFE;
136 		if (ndp->ni_vp == NULL) {
137 			VATTR_NULL(vap);
138 			vap->va_type = VREG;
139 			vap->va_mode = cmode;
140 			if (fmode & O_EXCL)
141 				vap->va_vaflags |= VA_EXCLUSIVE;
142 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
143 				NDFREE(ndp, NDF_ONLY_PNBUF);
144 				vput(ndp->ni_dvp);
145 				VFS_UNLOCK_GIANT(vfslocked);
146 				if ((error = vn_start_write(NULL, &mp,
147 				    V_XSLEEP | PCATCH)) != 0)
148 					return (error);
149 				goto restart;
150 			}
151 #ifdef MAC
152 			error = mac_vnode_check_create(cred, ndp->ni_dvp,
153 			    &ndp->ni_cnd, vap);
154 			if (error == 0)
155 #endif
156 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
157 						   &ndp->ni_cnd, vap);
158 			vput(ndp->ni_dvp);
159 			vn_finished_write(mp);
160 			if (error) {
161 				VFS_UNLOCK_GIANT(vfslocked);
162 				NDFREE(ndp, NDF_ONLY_PNBUF);
163 				return (error);
164 			}
165 			fmode &= ~O_TRUNC;
166 			vp = ndp->ni_vp;
167 		} else {
168 			if (ndp->ni_dvp == ndp->ni_vp)
169 				vrele(ndp->ni_dvp);
170 			else
171 				vput(ndp->ni_dvp);
172 			ndp->ni_dvp = NULL;
173 			vp = ndp->ni_vp;
174 			if (fmode & O_EXCL) {
175 				error = EEXIST;
176 				goto bad;
177 			}
178 			fmode &= ~O_CREAT;
179 		}
180 	} else {
181 		ndp->ni_cnd.cn_nameiop = LOOKUP;
182 		ndp->ni_cnd.cn_flags = ISOPEN |
183 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
184 		    LOCKLEAF | MPSAFE;
185 		if (!(fmode & FWRITE))
186 			ndp->ni_cnd.cn_flags |= LOCKSHARED;
187 		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
188 			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
189 		if ((error = namei(ndp)) != 0)
190 			return (error);
191 		if (!mpsafe)
192 			ndp->ni_cnd.cn_flags &= ~MPSAFE;
193 		vfslocked = NDHASGIANT(ndp);
194 		vp = ndp->ni_vp;
195 	}
196 	if (vp->v_type == VLNK) {
197 		error = EMLINK;
198 		goto bad;
199 	}
200 	if (vp->v_type == VSOCK) {
201 		error = EOPNOTSUPP;
202 		goto bad;
203 	}
204 	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
205 		error = ENOTDIR;
206 		goto bad;
207 	}
208 	accmode = 0;
209 	if (fmode & (FWRITE | O_TRUNC)) {
210 		if (vp->v_type == VDIR) {
211 			error = EISDIR;
212 			goto bad;
213 		}
214 		accmode |= VWRITE;
215 	}
216 	if (fmode & FREAD)
217 		accmode |= VREAD;
218 	if (fmode & FEXEC)
219 		accmode |= VEXEC;
220 	if ((fmode & O_APPEND) && (fmode & FWRITE))
221 		accmode |= VAPPEND;
222 #ifdef MAC
223 	error = mac_vnode_check_open(cred, vp, accmode);
224 	if (error)
225 		goto bad;
226 #endif
227 	if ((fmode & O_CREAT) == 0) {
228 		if (accmode & VWRITE) {
229 			error = vn_writechk(vp);
230 			if (error)
231 				goto bad;
232 		}
233 		if (accmode) {
234 		        error = VOP_ACCESS(vp, accmode, cred, td);
235 			if (error)
236 				goto bad;
237 		}
238 	}
239 	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
240 		goto bad;
241 
242 	if (fmode & FWRITE)
243 		vp->v_writecount++;
244 	*flagp = fmode;
245 	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
246 	if (!mpsafe)
247 		VFS_UNLOCK_GIANT(vfslocked);
248 	return (0);
249 bad:
250 	NDFREE(ndp, NDF_ONLY_PNBUF);
251 	vput(vp);
252 	VFS_UNLOCK_GIANT(vfslocked);
253 	*flagp = fmode;
254 	ndp->ni_vp = NULL;
255 	return (error);
256 }
257 
258 /*
259  * Check for write permissions on the specified vnode.
260  * Prototype text segments cannot be written.
261  */
262 int
263 vn_writechk(vp)
264 	register struct vnode *vp;
265 {
266 
267 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
268 	/*
269 	 * If there's shared text associated with
270 	 * the vnode, try to free it up once.  If
271 	 * we fail, we can't allow writing.
272 	 */
273 	if (vp->v_vflag & VV_TEXT)
274 		return (ETXTBSY);
275 
276 	return (0);
277 }
278 
279 /*
280  * Vnode close call
281  */
282 int
283 vn_close(vp, flags, file_cred, td)
284 	register struct vnode *vp;
285 	int flags;
286 	struct ucred *file_cred;
287 	struct thread *td;
288 {
289 	struct mount *mp;
290 	int error, lock_flags;
291 
292 	if (!(flags & FWRITE) && vp->v_mount != NULL &&
293 	    vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
294 		lock_flags = LK_SHARED;
295 	else
296 		lock_flags = LK_EXCLUSIVE;
297 
298 	VFS_ASSERT_GIANT(vp->v_mount);
299 
300 	vn_start_write(vp, &mp, V_WAIT);
301 	vn_lock(vp, lock_flags | LK_RETRY);
302 	if (flags & FWRITE) {
303 		VNASSERT(vp->v_writecount > 0, vp,
304 		    ("vn_close: negative writecount"));
305 		vp->v_writecount--;
306 	}
307 	error = VOP_CLOSE(vp, flags, file_cred, td);
308 	vput(vp);
309 	vn_finished_write(mp);
310 	return (error);
311 }
312 
313 /*
314  * Heuristic to detect sequential operation.
315  */
316 static int
317 sequential_heuristic(struct uio *uio, struct file *fp)
318 {
319 
320 	if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
321 		return (fp->f_seqcount << IO_SEQSHIFT);
322 
323 	/*
324 	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
325 	 * that the first I/O is normally considered to be slightly
326 	 * sequential.  Seeking to offset 0 doesn't change sequentiality
327 	 * unless previous seeks have reduced f_seqcount to 0, in which
328 	 * case offset 0 is not special.
329 	 */
330 	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
331 	    uio->uio_offset == fp->f_nextoff) {
332 		/*
333 		 * f_seqcount is in units of fixed-size blocks so that it
334 		 * depends mainly on the amount of sequential I/O and not
335 		 * much on the number of sequential I/O's.  The fixed size
336 		 * of 16384 is hard-coded here since it is (not quite) just
337 		 * a magic size that works well here.  This size is more
338 		 * closely related to the best I/O size for real disks than
339 		 * to any block size used by software.
340 		 */
341 		fp->f_seqcount += howmany(uio->uio_resid, 16384);
342 		if (fp->f_seqcount > IO_SEQMAX)
343 			fp->f_seqcount = IO_SEQMAX;
344 		return (fp->f_seqcount << IO_SEQSHIFT);
345 	}
346 
347 	/* Not sequential.  Quickly draw-down sequentiality. */
348 	if (fp->f_seqcount > 1)
349 		fp->f_seqcount = 1;
350 	else
351 		fp->f_seqcount = 0;
352 	return (0);
353 }
354 
355 /*
356  * Package up an I/O request on a vnode into a uio and do it.
357  */
358 int
359 vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
360     aresid, td)
361 	enum uio_rw rw;
362 	struct vnode *vp;
363 	void *base;
364 	int len;
365 	off_t offset;
366 	enum uio_seg segflg;
367 	int ioflg;
368 	struct ucred *active_cred;
369 	struct ucred *file_cred;
370 	int *aresid;
371 	struct thread *td;
372 {
373 	struct uio auio;
374 	struct iovec aiov;
375 	struct mount *mp;
376 	struct ucred *cred;
377 	int error, lock_flags;
378 
379 	VFS_ASSERT_GIANT(vp->v_mount);
380 
381 	if ((ioflg & IO_NODELOCKED) == 0) {
382 		mp = NULL;
383 		if (rw == UIO_WRITE) {
384 			if (vp->v_type != VCHR &&
385 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
386 			    != 0)
387 				return (error);
388 			if (MNT_SHARED_WRITES(mp) ||
389 			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
390 				lock_flags = LK_SHARED;
391 			} else {
392 				lock_flags = LK_EXCLUSIVE;
393 			}
394 			vn_lock(vp, lock_flags | LK_RETRY);
395 		} else
396 			vn_lock(vp, LK_SHARED | LK_RETRY);
397 
398 	}
399 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
400 	auio.uio_iov = &aiov;
401 	auio.uio_iovcnt = 1;
402 	aiov.iov_base = base;
403 	aiov.iov_len = len;
404 	auio.uio_resid = len;
405 	auio.uio_offset = offset;
406 	auio.uio_segflg = segflg;
407 	auio.uio_rw = rw;
408 	auio.uio_td = td;
409 	error = 0;
410 #ifdef MAC
411 	if ((ioflg & IO_NOMACCHECK) == 0) {
412 		if (rw == UIO_READ)
413 			error = mac_vnode_check_read(active_cred, file_cred,
414 			    vp);
415 		else
416 			error = mac_vnode_check_write(active_cred, file_cred,
417 			    vp);
418 	}
419 #endif
420 	if (error == 0) {
421 		if (file_cred)
422 			cred = file_cred;
423 		else
424 			cred = active_cred;
425 		if (rw == UIO_READ)
426 			error = VOP_READ(vp, &auio, ioflg, cred);
427 		else
428 			error = VOP_WRITE(vp, &auio, ioflg, cred);
429 	}
430 	if (aresid)
431 		*aresid = auio.uio_resid;
432 	else
433 		if (auio.uio_resid && error == 0)
434 			error = EIO;
435 	if ((ioflg & IO_NODELOCKED) == 0) {
436 		if (rw == UIO_WRITE && vp->v_type != VCHR)
437 			vn_finished_write(mp);
438 		VOP_UNLOCK(vp, 0);
439 	}
440 	return (error);
441 }
442 
443 /*
444  * Package up an I/O request on a vnode into a uio and do it.  The I/O
445  * request is split up into smaller chunks and we try to avoid saturating
446  * the buffer cache while potentially holding a vnode locked, so we
447  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
448  * to give other processes a chance to lock the vnode (either other processes
449  * core'ing the same binary, or unrelated processes scanning the directory).
450  */
451 int
452 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
453     file_cred, aresid, td)
454 	enum uio_rw rw;
455 	struct vnode *vp;
456 	void *base;
457 	size_t len;
458 	off_t offset;
459 	enum uio_seg segflg;
460 	int ioflg;
461 	struct ucred *active_cred;
462 	struct ucred *file_cred;
463 	size_t *aresid;
464 	struct thread *td;
465 {
466 	int error = 0;
467 	int iaresid;
468 
469 	VFS_ASSERT_GIANT(vp->v_mount);
470 
471 	do {
472 		int chunk;
473 
474 		/*
475 		 * Force `offset' to a multiple of MAXBSIZE except possibly
476 		 * for the first chunk, so that filesystems only need to
477 		 * write full blocks except possibly for the first and last
478 		 * chunks.
479 		 */
480 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
481 
482 		if (chunk > len)
483 			chunk = len;
484 		if (rw != UIO_READ && vp->v_type == VREG)
485 			bwillwrite();
486 		iaresid = 0;
487 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
488 		    ioflg, active_cred, file_cred, &iaresid, td);
489 		len -= chunk;	/* aresid calc already includes length */
490 		if (error)
491 			break;
492 		offset += chunk;
493 		base = (char *)base + chunk;
494 		kern_yield(curthread->td_user_pri);
495 	} while (len);
496 	if (aresid)
497 		*aresid = len + iaresid;
498 	return (error);
499 }
500 
501 /*
502  * File table vnode read routine.
503  */
504 static int
505 vn_read(fp, uio, active_cred, flags, td)
506 	struct file *fp;
507 	struct uio *uio;
508 	struct ucred *active_cred;
509 	int flags;
510 	struct thread *td;
511 {
512 	struct vnode *vp;
513 	int error, ioflag;
514 	struct mtx *mtxp;
515 	int vfslocked;
516 
517 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
518 	    uio->uio_td, td));
519 	mtxp = NULL;
520 	vp = fp->f_vnode;
521 	ioflag = 0;
522 	if (fp->f_flag & FNONBLOCK)
523 		ioflag |= IO_NDELAY;
524 	if (fp->f_flag & O_DIRECT)
525 		ioflag |= IO_DIRECT;
526 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
527 	/*
528 	 * According to McKusick the vn lock was protecting f_offset here.
529 	 * It is now protected by the FOFFSET_LOCKED flag.
530 	 */
531 	if ((flags & FOF_OFFSET) == 0) {
532 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
533 		mtx_lock(mtxp);
534 		while(fp->f_vnread_flags & FOFFSET_LOCKED) {
535 			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
536 			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
537 			    "vnread offlock", 0);
538 		}
539 		fp->f_vnread_flags |= FOFFSET_LOCKED;
540 		mtx_unlock(mtxp);
541 		vn_lock(vp, LK_SHARED | LK_RETRY);
542 		uio->uio_offset = fp->f_offset;
543 	} else
544 		vn_lock(vp, LK_SHARED | LK_RETRY);
545 
546 	ioflag |= sequential_heuristic(uio, fp);
547 
548 #ifdef MAC
549 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
550 	if (error == 0)
551 #endif
552 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
553 	if ((flags & FOF_OFFSET) == 0) {
554 		fp->f_offset = uio->uio_offset;
555 		mtx_lock(mtxp);
556 		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
557 			wakeup(&fp->f_vnread_flags);
558 		fp->f_vnread_flags = 0;
559 		mtx_unlock(mtxp);
560 	}
561 	fp->f_nextoff = uio->uio_offset;
562 	VOP_UNLOCK(vp, 0);
563 	VFS_UNLOCK_GIANT(vfslocked);
564 	return (error);
565 }
566 
567 /*
568  * File table vnode write routine.
569  */
570 static int
571 vn_write(fp, uio, active_cred, flags, td)
572 	struct file *fp;
573 	struct uio *uio;
574 	struct ucred *active_cred;
575 	int flags;
576 	struct thread *td;
577 {
578 	struct vnode *vp;
579 	struct mount *mp;
580 	int error, ioflag, lock_flags;
581 	int vfslocked;
582 
583 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
584 	    uio->uio_td, td));
585 	vp = fp->f_vnode;
586 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
587 	if (vp->v_type == VREG)
588 		bwillwrite();
589 	ioflag = IO_UNIT;
590 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
591 		ioflag |= IO_APPEND;
592 	if (fp->f_flag & FNONBLOCK)
593 		ioflag |= IO_NDELAY;
594 	if (fp->f_flag & O_DIRECT)
595 		ioflag |= IO_DIRECT;
596 	if ((fp->f_flag & O_FSYNC) ||
597 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
598 		ioflag |= IO_SYNC;
599 	mp = NULL;
600 	if (vp->v_type != VCHR &&
601 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
602 		goto unlock;
603 
604 	if ((MNT_SHARED_WRITES(mp) ||
605 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
606 	    (flags & FOF_OFFSET) != 0) {
607 		lock_flags = LK_SHARED;
608 	} else {
609 		lock_flags = LK_EXCLUSIVE;
610 	}
611 
612 	vn_lock(vp, lock_flags | LK_RETRY);
613 	if ((flags & FOF_OFFSET) == 0)
614 		uio->uio_offset = fp->f_offset;
615 	ioflag |= sequential_heuristic(uio, fp);
616 #ifdef MAC
617 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
618 	if (error == 0)
619 #endif
620 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
621 	if ((flags & FOF_OFFSET) == 0)
622 		fp->f_offset = uio->uio_offset;
623 	fp->f_nextoff = uio->uio_offset;
624 	VOP_UNLOCK(vp, 0);
625 	if (vp->v_type != VCHR)
626 		vn_finished_write(mp);
627 unlock:
628 	VFS_UNLOCK_GIANT(vfslocked);
629 	return (error);
630 }
631 
632 /*
633  * File table truncate routine.
634  */
635 static int
636 vn_truncate(fp, length, active_cred, td)
637 	struct file *fp;
638 	off_t length;
639 	struct ucred *active_cred;
640 	struct thread *td;
641 {
642 	struct vattr vattr;
643 	struct mount *mp;
644 	struct vnode *vp;
645 	int vfslocked;
646 	int error;
647 
648 	vp = fp->f_vnode;
649 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
650 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
651 	if (error) {
652 		VFS_UNLOCK_GIANT(vfslocked);
653 		return (error);
654 	}
655 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
656 	if (vp->v_type == VDIR) {
657 		error = EISDIR;
658 		goto out;
659 	}
660 #ifdef MAC
661 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
662 	if (error)
663 		goto out;
664 #endif
665 	error = vn_writechk(vp);
666 	if (error == 0) {
667 		VATTR_NULL(&vattr);
668 		vattr.va_size = length;
669 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
670 	}
671 out:
672 	VOP_UNLOCK(vp, 0);
673 	vn_finished_write(mp);
674 	VFS_UNLOCK_GIANT(vfslocked);
675 	return (error);
676 }
677 
678 /*
679  * File table vnode stat routine.
680  */
681 static int
682 vn_statfile(fp, sb, active_cred, td)
683 	struct file *fp;
684 	struct stat *sb;
685 	struct ucred *active_cred;
686 	struct thread *td;
687 {
688 	struct vnode *vp = fp->f_vnode;
689 	int vfslocked;
690 	int error;
691 
692 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
693 	vn_lock(vp, LK_SHARED | LK_RETRY);
694 	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
695 	VOP_UNLOCK(vp, 0);
696 	VFS_UNLOCK_GIANT(vfslocked);
697 
698 	return (error);
699 }
700 
701 /*
702  * Stat a vnode; implementation for the stat syscall
703  */
704 int
705 vn_stat(vp, sb, active_cred, file_cred, td)
706 	struct vnode *vp;
707 	register struct stat *sb;
708 	struct ucred *active_cred;
709 	struct ucred *file_cred;
710 	struct thread *td;
711 {
712 	struct vattr vattr;
713 	register struct vattr *vap;
714 	int error;
715 	u_short mode;
716 
717 #ifdef MAC
718 	error = mac_vnode_check_stat(active_cred, file_cred, vp);
719 	if (error)
720 		return (error);
721 #endif
722 
723 	vap = &vattr;
724 
725 	/*
726 	 * Initialize defaults for new and unusual fields, so that file
727 	 * systems which don't support these fields don't need to know
728 	 * about them.
729 	 */
730 	vap->va_birthtime.tv_sec = -1;
731 	vap->va_birthtime.tv_nsec = 0;
732 	vap->va_fsid = VNOVAL;
733 	vap->va_rdev = NODEV;
734 
735 	error = VOP_GETATTR(vp, vap, active_cred);
736 	if (error)
737 		return (error);
738 
739 	/*
740 	 * Zero the spare stat fields
741 	 */
742 	bzero(sb, sizeof *sb);
743 
744 	/*
745 	 * Copy from vattr table
746 	 */
747 	if (vap->va_fsid != VNOVAL)
748 		sb->st_dev = vap->va_fsid;
749 	else
750 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
751 	sb->st_ino = vap->va_fileid;
752 	mode = vap->va_mode;
753 	switch (vap->va_type) {
754 	case VREG:
755 		mode |= S_IFREG;
756 		break;
757 	case VDIR:
758 		mode |= S_IFDIR;
759 		break;
760 	case VBLK:
761 		mode |= S_IFBLK;
762 		break;
763 	case VCHR:
764 		mode |= S_IFCHR;
765 		break;
766 	case VLNK:
767 		mode |= S_IFLNK;
768 		break;
769 	case VSOCK:
770 		mode |= S_IFSOCK;
771 		break;
772 	case VFIFO:
773 		mode |= S_IFIFO;
774 		break;
775 	default:
776 		return (EBADF);
777 	};
778 	sb->st_mode = mode;
779 	sb->st_nlink = vap->va_nlink;
780 	sb->st_uid = vap->va_uid;
781 	sb->st_gid = vap->va_gid;
782 	sb->st_rdev = vap->va_rdev;
783 	if (vap->va_size > OFF_MAX)
784 		return (EOVERFLOW);
785 	sb->st_size = vap->va_size;
786 	sb->st_atim = vap->va_atime;
787 	sb->st_mtim = vap->va_mtime;
788 	sb->st_ctim = vap->va_ctime;
789 	sb->st_birthtim = vap->va_birthtime;
790 
791         /*
792 	 * According to www.opengroup.org, the meaning of st_blksize is
793 	 *   "a filesystem-specific preferred I/O block size for this
794 	 *    object.  In some filesystem types, this may vary from file
795 	 *    to file"
796 	 * Use miminum/default of PAGE_SIZE (e.g. for VCHR).
797 	 */
798 
799 	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
800 
801 	sb->st_flags = vap->va_flags;
802 	if (priv_check(td, PRIV_VFS_GENERATION))
803 		sb->st_gen = 0;
804 	else
805 		sb->st_gen = vap->va_gen;
806 
807 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
808 	return (0);
809 }
810 
811 /*
812  * File table vnode ioctl routine.
813  */
814 static int
815 vn_ioctl(fp, com, data, active_cred, td)
816 	struct file *fp;
817 	u_long com;
818 	void *data;
819 	struct ucred *active_cred;
820 	struct thread *td;
821 {
822 	struct vnode *vp = fp->f_vnode;
823 	struct vattr vattr;
824 	int vfslocked;
825 	int error;
826 
827 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
828 	error = ENOTTY;
829 	switch (vp->v_type) {
830 	case VREG:
831 	case VDIR:
832 		if (com == FIONREAD) {
833 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
834 			error = VOP_GETATTR(vp, &vattr, active_cred);
835 			VOP_UNLOCK(vp, 0);
836 			if (!error)
837 				*(int *)data = vattr.va_size - fp->f_offset;
838 		}
839 		if (com == FIONBIO || com == FIOASYNC)	/* XXX */
840 			error = 0;
841 		else
842 			error = VOP_IOCTL(vp, com, data, fp->f_flag,
843 			    active_cred, td);
844 		break;
845 
846 	default:
847 		break;
848 	}
849 	VFS_UNLOCK_GIANT(vfslocked);
850 	return (error);
851 }
852 
853 /*
854  * File table vnode poll routine.
855  */
856 static int
857 vn_poll(fp, events, active_cred, td)
858 	struct file *fp;
859 	int events;
860 	struct ucred *active_cred;
861 	struct thread *td;
862 {
863 	struct vnode *vp;
864 	int vfslocked;
865 	int error;
866 
867 	vp = fp->f_vnode;
868 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
869 #ifdef MAC
870 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
871 	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
872 	VOP_UNLOCK(vp, 0);
873 	if (!error)
874 #endif
875 
876 	error = VOP_POLL(vp, events, fp->f_cred, td);
877 	VFS_UNLOCK_GIANT(vfslocked);
878 	return (error);
879 }
880 
881 /*
882  * Acquire the requested lock and then check for validity.  LK_RETRY
883  * permits vn_lock to return doomed vnodes.
884  */
885 int
886 _vn_lock(struct vnode *vp, int flags, char *file, int line)
887 {
888 	int error;
889 
890 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
891 	    ("vn_lock called with no locktype."));
892 	do {
893 #ifdef DEBUG_VFS_LOCKS
894 		KASSERT(vp->v_holdcnt != 0,
895 		    ("vn_lock %p: zero hold count", vp));
896 #endif
897 		error = VOP_LOCK1(vp, flags, file, line);
898 		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
899 		KASSERT((flags & LK_RETRY) == 0 || error == 0,
900 		    ("LK_RETRY set with incompatible flags (0x%x) or an error occured (%d)",
901 		    flags, error));
902 		/*
903 		 * Callers specify LK_RETRY if they wish to get dead vnodes.
904 		 * If RETRY is not set, we return ENOENT instead.
905 		 */
906 		if (error == 0 && vp->v_iflag & VI_DOOMED &&
907 		    (flags & LK_RETRY) == 0) {
908 			VOP_UNLOCK(vp, 0);
909 			error = ENOENT;
910 			break;
911 		}
912 	} while (flags & LK_RETRY && error != 0);
913 	return (error);
914 }
915 
916 /*
917  * File table vnode close routine.
918  */
919 static int
920 vn_closefile(fp, td)
921 	struct file *fp;
922 	struct thread *td;
923 {
924 	struct vnode *vp;
925 	struct flock lf;
926 	int vfslocked;
927 	int error;
928 
929 	vp = fp->f_vnode;
930 
931 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
932 	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
933 		lf.l_whence = SEEK_SET;
934 		lf.l_start = 0;
935 		lf.l_len = 0;
936 		lf.l_type = F_UNLCK;
937 		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
938 	}
939 
940 	fp->f_ops = &badfileops;
941 
942 	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
943 	VFS_UNLOCK_GIANT(vfslocked);
944 	return (error);
945 }
946 
947 /*
948  * Preparing to start a filesystem write operation. If the operation is
949  * permitted, then we bump the count of operations in progress and
950  * proceed. If a suspend request is in progress, we wait until the
951  * suspension is over, and then proceed.
952  */
953 int
954 vn_start_write(vp, mpp, flags)
955 	struct vnode *vp;
956 	struct mount **mpp;
957 	int flags;
958 {
959 	struct mount *mp;
960 	int error;
961 
962 	error = 0;
963 	/*
964 	 * If a vnode is provided, get and return the mount point that
965 	 * to which it will write.
966 	 */
967 	if (vp != NULL) {
968 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
969 			*mpp = NULL;
970 			if (error != EOPNOTSUPP)
971 				return (error);
972 			return (0);
973 		}
974 	}
975 	if ((mp = *mpp) == NULL)
976 		return (0);
977 
978 	/*
979 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
980 	 * a vfs_ref().
981 	 * As long as a vnode is not provided we need to acquire a
982 	 * refcount for the provided mountpoint too, in order to
983 	 * emulate a vfs_ref().
984 	 */
985 	MNT_ILOCK(mp);
986 	if (vp == NULL)
987 		MNT_REF(mp);
988 
989 	/*
990 	 * Check on status of suspension.
991 	 */
992 	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
993 	    mp->mnt_susp_owner != curthread) {
994 		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
995 			if (flags & V_NOWAIT) {
996 				error = EWOULDBLOCK;
997 				goto unlock;
998 			}
999 			error = msleep(&mp->mnt_flag, MNT_MTX(mp),
1000 			    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
1001 			if (error)
1002 				goto unlock;
1003 		}
1004 	}
1005 	if (flags & V_XSLEEP)
1006 		goto unlock;
1007 	mp->mnt_writeopcount++;
1008 unlock:
1009 	if (error != 0 || (flags & V_XSLEEP) != 0)
1010 		MNT_REL(mp);
1011 	MNT_IUNLOCK(mp);
1012 	return (error);
1013 }
1014 
1015 /*
1016  * Secondary suspension. Used by operations such as vop_inactive
1017  * routines that are needed by the higher level functions. These
1018  * are allowed to proceed until all the higher level functions have
1019  * completed (indicated by mnt_writeopcount dropping to zero). At that
1020  * time, these operations are halted until the suspension is over.
1021  */
1022 int
1023 vn_start_secondary_write(vp, mpp, flags)
1024 	struct vnode *vp;
1025 	struct mount **mpp;
1026 	int flags;
1027 {
1028 	struct mount *mp;
1029 	int error;
1030 
1031  retry:
1032 	if (vp != NULL) {
1033 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1034 			*mpp = NULL;
1035 			if (error != EOPNOTSUPP)
1036 				return (error);
1037 			return (0);
1038 		}
1039 	}
1040 	/*
1041 	 * If we are not suspended or have not yet reached suspended
1042 	 * mode, then let the operation proceed.
1043 	 */
1044 	if ((mp = *mpp) == NULL)
1045 		return (0);
1046 
1047 	/*
1048 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1049 	 * a vfs_ref().
1050 	 * As long as a vnode is not provided we need to acquire a
1051 	 * refcount for the provided mountpoint too, in order to
1052 	 * emulate a vfs_ref().
1053 	 */
1054 	MNT_ILOCK(mp);
1055 	if (vp == NULL)
1056 		MNT_REF(mp);
1057 	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
1058 		mp->mnt_secondary_writes++;
1059 		mp->mnt_secondary_accwrites++;
1060 		MNT_IUNLOCK(mp);
1061 		return (0);
1062 	}
1063 	if (flags & V_NOWAIT) {
1064 		MNT_REL(mp);
1065 		MNT_IUNLOCK(mp);
1066 		return (EWOULDBLOCK);
1067 	}
1068 	/*
1069 	 * Wait for the suspension to finish.
1070 	 */
1071 	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
1072 		       (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
1073 	vfs_rel(mp);
1074 	if (error == 0)
1075 		goto retry;
1076 	return (error);
1077 }
1078 
1079 /*
1080  * Filesystem write operation has completed. If we are suspending and this
1081  * operation is the last one, notify the suspender that the suspension is
1082  * now in effect.
1083  */
1084 void
1085 vn_finished_write(mp)
1086 	struct mount *mp;
1087 {
1088 	if (mp == NULL)
1089 		return;
1090 	MNT_ILOCK(mp);
1091 	MNT_REL(mp);
1092 	mp->mnt_writeopcount--;
1093 	if (mp->mnt_writeopcount < 0)
1094 		panic("vn_finished_write: neg cnt");
1095 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1096 	    mp->mnt_writeopcount <= 0)
1097 		wakeup(&mp->mnt_writeopcount);
1098 	MNT_IUNLOCK(mp);
1099 }
1100 
1101 
1102 /*
1103  * Filesystem secondary write operation has completed. If we are
1104  * suspending and this operation is the last one, notify the suspender
1105  * that the suspension is now in effect.
1106  */
1107 void
1108 vn_finished_secondary_write(mp)
1109 	struct mount *mp;
1110 {
1111 	if (mp == NULL)
1112 		return;
1113 	MNT_ILOCK(mp);
1114 	MNT_REL(mp);
1115 	mp->mnt_secondary_writes--;
1116 	if (mp->mnt_secondary_writes < 0)
1117 		panic("vn_finished_secondary_write: neg cnt");
1118 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1119 	    mp->mnt_secondary_writes <= 0)
1120 		wakeup(&mp->mnt_secondary_writes);
1121 	MNT_IUNLOCK(mp);
1122 }
1123 
1124 
1125 
1126 /*
1127  * Request a filesystem to suspend write operations.
1128  */
1129 int
1130 vfs_write_suspend(mp)
1131 	struct mount *mp;
1132 {
1133 	int error;
1134 
1135 	MNT_ILOCK(mp);
1136 	if (mp->mnt_susp_owner == curthread) {
1137 		MNT_IUNLOCK(mp);
1138 		return (EALREADY);
1139 	}
1140 	while (mp->mnt_kern_flag & MNTK_SUSPEND)
1141 		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
1142 	mp->mnt_kern_flag |= MNTK_SUSPEND;
1143 	mp->mnt_susp_owner = curthread;
1144 	if (mp->mnt_writeopcount > 0)
1145 		(void) msleep(&mp->mnt_writeopcount,
1146 		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
1147 	else
1148 		MNT_IUNLOCK(mp);
1149 	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
1150 		vfs_write_resume(mp);
1151 	return (error);
1152 }
1153 
1154 /*
1155  * Request a filesystem to resume write operations.
1156  */
1157 void
1158 vfs_write_resume(mp)
1159 	struct mount *mp;
1160 {
1161 
1162 	MNT_ILOCK(mp);
1163 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1164 		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
1165 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
1166 				       MNTK_SUSPENDED);
1167 		mp->mnt_susp_owner = NULL;
1168 		wakeup(&mp->mnt_writeopcount);
1169 		wakeup(&mp->mnt_flag);
1170 		curthread->td_pflags &= ~TDP_IGNSUSP;
1171 		MNT_IUNLOCK(mp);
1172 		VFS_SUSP_CLEAN(mp);
1173 	} else
1174 		MNT_IUNLOCK(mp);
1175 }
1176 
1177 /*
1178  * Implement kqueues for files by translating it to vnode operation.
1179  */
1180 static int
1181 vn_kqfilter(struct file *fp, struct knote *kn)
1182 {
1183 	int vfslocked;
1184 	int error;
1185 
1186 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
1187 	error = VOP_KQFILTER(fp->f_vnode, kn);
1188 	VFS_UNLOCK_GIANT(vfslocked);
1189 
1190 	return error;
1191 }
1192 
1193 /*
1194  * Simplified in-kernel wrapper calls for extended attribute access.
1195  * Both calls pass in a NULL credential, authorizing as "kernel" access.
1196  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1197  */
1198 int
1199 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1200     const char *attrname, int *buflen, char *buf, struct thread *td)
1201 {
1202 	struct uio	auio;
1203 	struct iovec	iov;
1204 	int	error;
1205 
1206 	iov.iov_len = *buflen;
1207 	iov.iov_base = buf;
1208 
1209 	auio.uio_iov = &iov;
1210 	auio.uio_iovcnt = 1;
1211 	auio.uio_rw = UIO_READ;
1212 	auio.uio_segflg = UIO_SYSSPACE;
1213 	auio.uio_td = td;
1214 	auio.uio_offset = 0;
1215 	auio.uio_resid = *buflen;
1216 
1217 	if ((ioflg & IO_NODELOCKED) == 0)
1218 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1219 
1220 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1221 
1222 	/* authorize attribute retrieval as kernel */
1223 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1224 	    td);
1225 
1226 	if ((ioflg & IO_NODELOCKED) == 0)
1227 		VOP_UNLOCK(vp, 0);
1228 
1229 	if (error == 0) {
1230 		*buflen = *buflen - auio.uio_resid;
1231 	}
1232 
1233 	return (error);
1234 }
1235 
1236 /*
1237  * XXX failure mode if partially written?
1238  */
1239 int
1240 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1241     const char *attrname, int buflen, char *buf, struct thread *td)
1242 {
1243 	struct uio	auio;
1244 	struct iovec	iov;
1245 	struct mount	*mp;
1246 	int	error;
1247 
1248 	iov.iov_len = buflen;
1249 	iov.iov_base = buf;
1250 
1251 	auio.uio_iov = &iov;
1252 	auio.uio_iovcnt = 1;
1253 	auio.uio_rw = UIO_WRITE;
1254 	auio.uio_segflg = UIO_SYSSPACE;
1255 	auio.uio_td = td;
1256 	auio.uio_offset = 0;
1257 	auio.uio_resid = buflen;
1258 
1259 	if ((ioflg & IO_NODELOCKED) == 0) {
1260 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1261 			return (error);
1262 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1263 	}
1264 
1265 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1266 
1267 	/* authorize attribute setting as kernel */
1268 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
1269 
1270 	if ((ioflg & IO_NODELOCKED) == 0) {
1271 		vn_finished_write(mp);
1272 		VOP_UNLOCK(vp, 0);
1273 	}
1274 
1275 	return (error);
1276 }
1277 
1278 int
1279 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1280     const char *attrname, struct thread *td)
1281 {
1282 	struct mount	*mp;
1283 	int	error;
1284 
1285 	if ((ioflg & IO_NODELOCKED) == 0) {
1286 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1287 			return (error);
1288 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1289 	}
1290 
1291 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1292 
1293 	/* authorize attribute removal as kernel */
1294 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
1295 	if (error == EOPNOTSUPP)
1296 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
1297 		    NULL, td);
1298 
1299 	if ((ioflg & IO_NODELOCKED) == 0) {
1300 		vn_finished_write(mp);
1301 		VOP_UNLOCK(vp, 0);
1302 	}
1303 
1304 	return (error);
1305 }
1306 
1307 int
1308 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
1309 {
1310 	struct mount *mp;
1311 	int ltype, error;
1312 
1313 	mp = vp->v_mount;
1314 	ltype = VOP_ISLOCKED(vp);
1315 	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
1316 	    ("vn_vget_ino: vp not locked"));
1317 	error = vfs_busy(mp, MBF_NOWAIT);
1318 	if (error != 0) {
1319 		vfs_ref(mp);
1320 		VOP_UNLOCK(vp, 0);
1321 		error = vfs_busy(mp, 0);
1322 		vn_lock(vp, ltype | LK_RETRY);
1323 		vfs_rel(mp);
1324 		if (error != 0)
1325 			return (ENOENT);
1326 		if (vp->v_iflag & VI_DOOMED) {
1327 			vfs_unbusy(mp);
1328 			return (ENOENT);
1329 		}
1330 	}
1331 	VOP_UNLOCK(vp, 0);
1332 	error = VFS_VGET(mp, ino, lkflags, rvp);
1333 	vfs_unbusy(mp);
1334 	vn_lock(vp, ltype | LK_RETRY);
1335 	if (vp->v_iflag & VI_DOOMED) {
1336 		if (error == 0)
1337 			vput(*rvp);
1338 		error = ENOENT;
1339 	}
1340 	return (error);
1341 }
1342 
1343 int
1344 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
1345     const struct thread *td)
1346 {
1347 
1348 	if (vp->v_type != VREG || td == NULL)
1349 		return (0);
1350 	PROC_LOCK(td->td_proc);
1351 	if ((uoff_t)uio->uio_offset + uio->uio_resid >
1352 	    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
1353 		psignal(td->td_proc, SIGXFSZ);
1354 		PROC_UNLOCK(td->td_proc);
1355 		return (EFBIG);
1356 	}
1357 	PROC_UNLOCK(td->td_proc);
1358 	return (0);
1359 }
1360