xref: /freebsd/sys/ufs/ffs/ffs_vnops.c (revision 5b9c547c)
1 /*-
2  * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  * Copyright (c) 1982, 1986, 1989, 1993
33  *	The Regents of the University of California.  All rights reserved.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 4. Neither the name of the University nor the names of its contributors
44  *    may be used to endorse or promote products derived from this software
45  *    without specific prior written permission.
46  *
47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57  * SUCH DAMAGE.
58  *
59  *	from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
60  * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
61  *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
62  */
63 
64 #include <sys/cdefs.h>
65 __FBSDID("$FreeBSD$");
66 
67 #include <sys/param.h>
68 #include <sys/bio.h>
69 #include <sys/systm.h>
70 #include <sys/buf.h>
71 #include <sys/conf.h>
72 #include <sys/extattr.h>
73 #include <sys/kernel.h>
74 #include <sys/limits.h>
75 #include <sys/malloc.h>
76 #include <sys/mount.h>
77 #include <sys/priv.h>
78 #include <sys/rwlock.h>
79 #include <sys/stat.h>
80 #include <sys/vmmeter.h>
81 #include <sys/vnode.h>
82 
83 #include <vm/vm.h>
84 #include <vm/vm_param.h>
85 #include <vm/vm_extern.h>
86 #include <vm/vm_object.h>
87 #include <vm/vm_page.h>
88 #include <vm/vm_pager.h>
89 #include <vm/vnode_pager.h>
90 
91 #include <ufs/ufs/extattr.h>
92 #include <ufs/ufs/quota.h>
93 #include <ufs/ufs/inode.h>
94 #include <ufs/ufs/ufs_extern.h>
95 #include <ufs/ufs/ufsmount.h>
96 
97 #include <ufs/ffs/fs.h>
98 #include <ufs/ffs/ffs_extern.h>
99 #include "opt_directio.h"
100 #include "opt_ffs.h"
101 
/*
 * ffs_rawread() is defined outside this file and is only declared when
 * the kernel is configured with DIRECTIO; ffs_read() calls it for
 * IO_DIRECT requests.
 */
#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
/*
 * Forward declarations of the file-local vnode operations and helpers so
 * that the vop_vector tables below can refer to them before they are
 * defined.  The vop_*_t names are function typedefs for the corresponding
 * vnode operation signatures.
 */
static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
/* Readers/writers for the extended attribute area of an inode. */
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;
120 
121 
/*
 * Global vfs data structures for ufs.
 *
 * Vnode operations vector that overrides the fsync, getpages, lock1,
 * read, reallocblks, write and vptofh entry points with the FFS
 * implementations.  Entries not named here are left to the table
 * referenced by .vop_default (ufs_vnodeops).  This vector installs no
 * extended attribute operations; compare ffs_vnodeops2.
 *
 * NOTE(review): the "1"/"2" suffixes presumably distinguish the UFS1 and
 * UFS2 on-disk formats -- confirm against the code that selects a vector.
 */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		vnode_pager_local_getpages,
	.vop_getpages_async =	vnode_pager_local_getpages_async,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};
134 
/*
 * Fifo counterpart of ffs_vnodeops1: only fsync, reallocblks and vptofh
 * are overridden, everything else is left to .vop_default (ufs_fifoops).
 *
 * NOTE(review): unlike ffs_fifoops2 this table does not install
 * ffs_lock as vop_lock1 -- confirm that the difference is intentional.
 */
struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};
141 
/*
 * Global vfs data structures for ufs.
 *
 * Same overrides as ffs_vnodeops1, plus the extended attribute entry
 * points (open/close/get/set/delete/list) implemented in this file.
 * Entries not named here are left to .vop_default (ufs_vnodeops).
 */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		vnode_pager_local_getpages,
	.vop_getpages_async =	vnode_pager_local_getpages_async,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
160 
/*
 * Fifo counterpart of ffs_vnodeops2.  Besides fsync, lock1, reallocblks
 * and vptofh it installs the extended attribute entry points and routes
 * vop_strategy to ffsext_strategy.  Entries not named here are left to
 * .vop_default (ufs_fifoops).
 */
struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
175 
176 /*
177  * Synch an open file.
178  */
179 /* ARGSUSED */
180 static int
181 ffs_fsync(struct vop_fsync_args *ap)
182 {
183 	struct vnode *vp;
184 	struct bufobj *bo;
185 	int error;
186 
187 	vp = ap->a_vp;
188 	bo = &vp->v_bufobj;
189 retry:
190 	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
191 	if (error)
192 		return (error);
193 	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
194 		error = softdep_fsync(vp);
195 		if (error)
196 			return (error);
197 
198 		/*
199 		 * The softdep_fsync() function may drop vp lock,
200 		 * allowing for dirty buffers to reappear on the
201 		 * bo_dirty list. Recheck and resync as needed.
202 		 */
203 		BO_LOCK(bo);
204 		if (vp->v_type == VREG && (bo->bo_numoutput > 0 ||
205 		    bo->bo_dirty.bv_cnt > 0)) {
206 			BO_UNLOCK(bo);
207 			goto retry;
208 		}
209 		BO_UNLOCK(bo);
210 	}
211 	return (0);
212 }
213 
/*
 * Flush the dirty buffers of a vnode and, unless suppressed, its inode.
 *
 * vp      - vnode to sync; IN_NEEDSYNC is cleared on its inode.
 * waitfor - MNT_WAIT requests a full flush: when DOINGSOFTDEP(vp) the
 *           dependencies are synced first with softdep_sync_metadata(),
 *           the dirty list is then scanned in alternating async/sync
 *           passes, outstanding writes are drained with bufobj_wwait()
 *           and the inode is written with ffs_update(vp, 1).  Any other
 *           value performs a single asynchronous pass followed by
 *           ffs_update(vp, 0).
 * flags   - NO_INO_UPDT skips the ffs_update() calls.
 *
 * Returns 0, or the error reported by softdep_sync_metadata(),
 * softdep_sync_buf() (other than EBUSY), bwrite(), or the closing
 * ffs_update().  Panics if a dirty buffer lies beyond the block computed
 * from the file size or is found on the dirty list without B_DELWRI.
 */
int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp;
	struct buf *nbp;
	ufs_lbn_t lbn;
	int error, wait, passes;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = 0;	/* Always do an async pass first. */
	/*
	 * Block number derived from the file size plus one block minus one
	 * byte; any dirty buffer numbered above it is treated as data that
	 * should already have been truncated (see the panic below).
	 */
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	/* Begin every pass with all dirty buffers marked as not scanned. */
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/* Flush indirects in order. */
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
		    lbn_level(bp->b_lblkno) >= passes)
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		/*
		 * First try to take the buffer lock without sleeping and, on
		 * success, drop the bufobj lock.  On a sync pass fall back
		 * to a sleeping request that hands the bufobj lock over as
		 * interlock; if that request fails the buffer is unmarked
		 * and the scan restarts from the head of the list.  On an
		 * async pass a busy buffer is simply skipped.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
			BO_UNLOCK(bo);
		} else if (wait != 0) {
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) != 0) {
				bp->b_vflags &= ~BV_SCANNED;
				goto next;
			}
		} else
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
		    	if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		/*
		 * Sync pass: write and wait, returning any write error.
		 * Async pass: start the write (clustered when the buffer
		 * allows it) and ignore the result.
		 */
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	/* Callers that do not wait are done after the single async pass. */
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		if ((flags & NO_INO_UPDT) != 0)
			return (0);
		else
			return (ffs_update(vp, 0));
	}
	/* Drain IO to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
	if (bo->bo_dirty.bv_cnt > 0) {
		/* Write the inode after sync passes to flush deps. */
		if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) {
			BO_UNLOCK(bo);
			ffs_update(vp, 1);
			BO_LOCK(bo);
		}
		/* switch between sync/async. */
		wait = !wait;
		/*
		 * A sync pass always follows an async one; "passes" only
		 * advances when switching back to async, which bounds the
		 * number of async/sync pairs by NIADDR + 2.
		 */
		if (wait == 1 || ++passes < NIADDR + 2)
			goto loop;
#ifdef INVARIANTS
		if (!vn_isdisk(vp, NULL))
			vprint("ffs_fsync: dirty", vp);
#endif
	}
	BO_UNLOCK(bo);
	error = 0;
	if ((flags & NO_INO_UPDT) == 0)
		error = ffs_update(vp, 1);
	if (DOINGSUJ(vp))
		softdep_journal_fsync(VTOI(vp));
	return (error);
}
356 
357 static int
358 ffs_lock(ap)
359 	struct vop_lock1_args /* {
360 		struct vnode *a_vp;
361 		int a_flags;
362 		struct thread *a_td;
363 		char *file;
364 		int line;
365 	} */ *ap;
366 {
367 #ifndef NO_FFS_SNAPSHOT
368 	struct vnode *vp;
369 	int flags;
370 	struct lock *lkp;
371 	int result;
372 
373 	switch (ap->a_flags & LK_TYPE_MASK) {
374 	case LK_SHARED:
375 	case LK_UPGRADE:
376 	case LK_EXCLUSIVE:
377 		vp = ap->a_vp;
378 		flags = ap->a_flags;
379 		for (;;) {
380 #ifdef DEBUG_VFS_LOCKS
381 			KASSERT(vp->v_holdcnt != 0,
382 			    ("ffs_lock %p: zero hold count", vp));
383 #endif
384 			lkp = vp->v_vnlock;
385 			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
386 			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
387 			    ap->a_file, ap->a_line);
388 			if (lkp == vp->v_vnlock || result != 0)
389 				break;
390 			/*
391 			 * Apparent success, except that the vnode
392 			 * mutated between snapshot file vnode and
393 			 * regular file vnode while this process
394 			 * slept.  The lock currently held is not the
395 			 * right lock.  Release it, and try to get the
396 			 * new lock.
397 			 */
398 			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
399 			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
400 			    ap->a_file, ap->a_line);
401 			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
402 			    (LK_INTERLOCK | LK_NOWAIT))
403 				return (EBUSY);
404 			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
405 				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
406 			flags &= ~LK_INTERLOCK;
407 		}
408 		break;
409 	default:
410 		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
411 	}
412 	return (result);
413 #else
414 	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
415 #endif
416 }
417 
418 /*
419  * Vnode op for reading.
420  */
421 static int
422 ffs_read(ap)
423 	struct vop_read_args /* {
424 		struct vnode *a_vp;
425 		struct uio *a_uio;
426 		int a_ioflag;
427 		struct ucred *a_cred;
428 	} */ *ap;
429 {
430 	struct vnode *vp;
431 	struct inode *ip;
432 	struct uio *uio;
433 	struct fs *fs;
434 	struct buf *bp;
435 	ufs_lbn_t lbn, nextlbn;
436 	off_t bytesinfile;
437 	long size, xfersize, blkoffset;
438 	ssize_t orig_resid;
439 	int error;
440 	int seqcount;
441 	int ioflag;
442 
443 	vp = ap->a_vp;
444 	uio = ap->a_uio;
445 	ioflag = ap->a_ioflag;
446 	if (ap->a_ioflag & IO_EXT)
447 #ifdef notyet
448 		return (ffs_extread(vp, uio, ioflag));
449 #else
450 		panic("ffs_read+IO_EXT");
451 #endif
452 #ifdef DIRECTIO
453 	if ((ioflag & IO_DIRECT) != 0) {
454 		int workdone;
455 
456 		error = ffs_rawread(vp, uio, &workdone);
457 		if (error != 0 || workdone != 0)
458 			return error;
459 	}
460 #endif
461 
462 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
463 	ip = VTOI(vp);
464 
465 #ifdef INVARIANTS
466 	if (uio->uio_rw != UIO_READ)
467 		panic("ffs_read: mode");
468 
469 	if (vp->v_type == VLNK) {
470 		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
471 			panic("ffs_read: short symlink");
472 	} else if (vp->v_type != VREG && vp->v_type != VDIR)
473 		panic("ffs_read: type %d",  vp->v_type);
474 #endif
475 	orig_resid = uio->uio_resid;
476 	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
477 	if (orig_resid == 0)
478 		return (0);
479 	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
480 	fs = ip->i_fs;
481 	if (uio->uio_offset < ip->i_size &&
482 	    uio->uio_offset >= fs->fs_maxfilesize)
483 		return (EOVERFLOW);
484 
485 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
486 		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
487 			break;
488 		lbn = lblkno(fs, uio->uio_offset);
489 		nextlbn = lbn + 1;
490 
491 		/*
492 		 * size of buffer.  The buffer representing the
493 		 * end of the file is rounded up to the size of
494 		 * the block type ( fragment or full block,
495 		 * depending ).
496 		 */
497 		size = blksize(fs, ip, lbn);
498 		blkoffset = blkoff(fs, uio->uio_offset);
499 
500 		/*
501 		 * The amount we want to transfer in this iteration is
502 		 * one FS block less the amount of the data before
503 		 * our startpoint (duh!)
504 		 */
505 		xfersize = fs->fs_bsize - blkoffset;
506 
507 		/*
508 		 * But if we actually want less than the block,
509 		 * or the file doesn't have a whole block more of data,
510 		 * then use the lesser number.
511 		 */
512 		if (uio->uio_resid < xfersize)
513 			xfersize = uio->uio_resid;
514 		if (bytesinfile < xfersize)
515 			xfersize = bytesinfile;
516 
517 		if (lblktosize(fs, nextlbn) >= ip->i_size) {
518 			/*
519 			 * Don't do readahead if this is the end of the file.
520 			 */
521 			error = bread_gb(vp, lbn, size, NOCRED,
522 			    GB_UNMAPPED, &bp);
523 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
524 			/*
525 			 * Otherwise if we are allowed to cluster,
526 			 * grab as much as we can.
527 			 *
528 			 * XXX  This may not be a win if we are not
529 			 * doing sequential access.
530 			 */
531 			error = cluster_read(vp, ip->i_size, lbn,
532 			    size, NOCRED, blkoffset + uio->uio_resid,
533 			    seqcount, GB_UNMAPPED, &bp);
534 		} else if (seqcount > 1) {
535 			/*
536 			 * If we are NOT allowed to cluster, then
537 			 * if we appear to be acting sequentially,
538 			 * fire off a request for a readahead
539 			 * as well as a read. Note that the 4th and 5th
540 			 * arguments point to arrays of the size specified in
541 			 * the 6th argument.
542 			 */
543 			u_int nextsize = blksize(fs, ip, nextlbn);
544 			error = breadn_flags(vp, lbn, size, &nextlbn,
545 			    &nextsize, 1, NOCRED, GB_UNMAPPED, &bp);
546 		} else {
547 			/*
548 			 * Failing all of the above, just read what the
549 			 * user asked for. Interestingly, the same as
550 			 * the first option above.
551 			 */
552 			error = bread_gb(vp, lbn, size, NOCRED,
553 			    GB_UNMAPPED, &bp);
554 		}
555 		if (error) {
556 			brelse(bp);
557 			bp = NULL;
558 			break;
559 		}
560 
561 		/*
562 		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
563 		 * will cause us to attempt to release the buffer later on
564 		 * and will cause the buffer cache to attempt to free the
565 		 * underlying pages.
566 		 */
567 		if (ioflag & IO_DIRECT)
568 			bp->b_flags |= B_DIRECT;
569 
570 		/*
571 		 * We should only get non-zero b_resid when an I/O error
572 		 * has occurred, which should cause us to break above.
573 		 * However, if the short read did not cause an error,
574 		 * then we want to ensure that we do not uiomove bad
575 		 * or uninitialized data.
576 		 */
577 		size -= bp->b_resid;
578 		if (size < xfersize) {
579 			if (size == 0)
580 				break;
581 			xfersize = size;
582 		}
583 
584 		if ((bp->b_flags & B_UNMAPPED) == 0) {
585 			error = vn_io_fault_uiomove((char *)bp->b_data +
586 			    blkoffset, (int)xfersize, uio);
587 		} else {
588 			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
589 			    (int)xfersize, uio);
590 		}
591 		if (error)
592 			break;
593 
594 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
595 		   (LIST_EMPTY(&bp->b_dep))) {
596 			/*
597 			 * If there are no dependencies, and it's VMIO,
598 			 * then we don't need the buf, mark it available
599 			 * for freeing.  For non-direct VMIO reads, the VM
600 			 * has the data.
601 			 */
602 			bp->b_flags |= B_RELBUF;
603 			brelse(bp);
604 		} else {
605 			/*
606 			 * Otherwise let whoever
607 			 * made the request take care of
608 			 * freeing it. We just queue
609 			 * it onto another list.
610 			 */
611 			bqrelse(bp);
612 		}
613 	}
614 
615 	/*
616 	 * This can only happen in the case of an error
617 	 * because the loop above resets bp to NULL on each iteration
618 	 * and on normal completion has not set a new value into it.
619 	 * so it must have come from a 'break' statement
620 	 */
621 	if (bp != NULL) {
622 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
623 		   (LIST_EMPTY(&bp->b_dep))) {
624 			bp->b_flags |= B_RELBUF;
625 			brelse(bp);
626 		} else {
627 			bqrelse(bp);
628 		}
629 	}
630 
631 	if ((error == 0 || uio->uio_resid != orig_resid) &&
632 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
633 	    (ip->i_flag & IN_ACCESS) == 0) {
634 		VI_LOCK(vp);
635 		ip->i_flag |= IN_ACCESS;
636 		VI_UNLOCK(vp);
637 	}
638 	return (error);
639 }
640 
641 /*
642  * Vnode op for writing.
643  */
644 static int
645 ffs_write(ap)
646 	struct vop_write_args /* {
647 		struct vnode *a_vp;
648 		struct uio *a_uio;
649 		int a_ioflag;
650 		struct ucred *a_cred;
651 	} */ *ap;
652 {
653 	struct vnode *vp;
654 	struct uio *uio;
655 	struct inode *ip;
656 	struct fs *fs;
657 	struct buf *bp;
658 	ufs_lbn_t lbn;
659 	off_t osize;
660 	ssize_t resid;
661 	int seqcount;
662 	int blkoffset, error, flags, ioflag, size, xfersize;
663 
664 	vp = ap->a_vp;
665 	uio = ap->a_uio;
666 	ioflag = ap->a_ioflag;
667 	if (ap->a_ioflag & IO_EXT)
668 #ifdef notyet
669 		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
670 #else
671 		panic("ffs_write+IO_EXT");
672 #endif
673 
674 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
675 	ip = VTOI(vp);
676 
677 #ifdef INVARIANTS
678 	if (uio->uio_rw != UIO_WRITE)
679 		panic("ffs_write: mode");
680 #endif
681 
682 	switch (vp->v_type) {
683 	case VREG:
684 		if (ioflag & IO_APPEND)
685 			uio->uio_offset = ip->i_size;
686 		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
687 			return (EPERM);
688 		/* FALLTHROUGH */
689 	case VLNK:
690 		break;
691 	case VDIR:
692 		panic("ffs_write: dir write");
693 		break;
694 	default:
695 		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
696 			(int)uio->uio_offset,
697 			(int)uio->uio_resid
698 		);
699 	}
700 
701 	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
702 	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
703 	fs = ip->i_fs;
704 	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
705 		return (EFBIG);
706 	/*
707 	 * Maybe this should be above the vnode op call, but so long as
708 	 * file servers have no limits, I don't think it matters.
709 	 */
710 	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
711 		return (EFBIG);
712 
713 	resid = uio->uio_resid;
714 	osize = ip->i_size;
715 	if (seqcount > BA_SEQMAX)
716 		flags = BA_SEQMAX << BA_SEQSHIFT;
717 	else
718 		flags = seqcount << BA_SEQSHIFT;
719 	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
720 		flags |= IO_SYNC;
721 	flags |= BA_UNMAPPED;
722 
723 	for (error = 0; uio->uio_resid > 0;) {
724 		lbn = lblkno(fs, uio->uio_offset);
725 		blkoffset = blkoff(fs, uio->uio_offset);
726 		xfersize = fs->fs_bsize - blkoffset;
727 		if (uio->uio_resid < xfersize)
728 			xfersize = uio->uio_resid;
729 		if (uio->uio_offset + xfersize > ip->i_size)
730 			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
731 
732 		/*
733 		 * We must perform a read-before-write if the transfer size
734 		 * does not cover the entire buffer.
735 		 */
736 		if (fs->fs_bsize > xfersize)
737 			flags |= BA_CLRBUF;
738 		else
739 			flags &= ~BA_CLRBUF;
740 /* XXX is uio->uio_offset the right thing here? */
741 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
742 		    ap->a_cred, flags, &bp);
743 		if (error != 0) {
744 			vnode_pager_setsize(vp, ip->i_size);
745 			break;
746 		}
747 		if (ioflag & IO_DIRECT)
748 			bp->b_flags |= B_DIRECT;
749 		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
750 			bp->b_flags |= B_NOCACHE;
751 
752 		if (uio->uio_offset + xfersize > ip->i_size) {
753 			ip->i_size = uio->uio_offset + xfersize;
754 			DIP_SET(ip, i_size, ip->i_size);
755 		}
756 
757 		size = blksize(fs, ip, lbn) - bp->b_resid;
758 		if (size < xfersize)
759 			xfersize = size;
760 
761 		if ((bp->b_flags & B_UNMAPPED) == 0) {
762 			error = vn_io_fault_uiomove((char *)bp->b_data +
763 			    blkoffset, (int)xfersize, uio);
764 		} else {
765 			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
766 			    (int)xfersize, uio);
767 		}
768 		/*
769 		 * If the buffer is not already filled and we encounter an
770 		 * error while trying to fill it, we have to clear out any
771 		 * garbage data from the pages instantiated for the buffer.
772 		 * If we do not, a failed uiomove() during a write can leave
773 		 * the prior contents of the pages exposed to a userland mmap.
774 		 *
775 		 * Note that we need only clear buffers with a transfer size
776 		 * equal to the block size because buffers with a shorter
777 		 * transfer size were cleared above by the call to UFS_BALLOC()
778 		 * with the BA_CLRBUF flag set.
779 		 *
780 		 * If the source region for uiomove identically mmaps the
781 		 * buffer, uiomove() performed the NOP copy, and the buffer
782 		 * content remains valid because the page fault handler
783 		 * validated the pages.
784 		 */
785 		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
786 		    fs->fs_bsize == xfersize)
787 			vfs_bio_clrbuf(bp);
788 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
789 		   (LIST_EMPTY(&bp->b_dep))) {
790 			bp->b_flags |= B_RELBUF;
791 		}
792 
793 		/*
794 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
795 		 * if we have a severe page deficiency write the buffer
796 		 * asynchronously.  Otherwise try to cluster, and if that
797 		 * doesn't do it then either do an async write (if O_DIRECT),
798 		 * or a delayed write (if not).
799 		 */
800 		if (ioflag & IO_SYNC) {
801 			(void)bwrite(bp);
802 		} else if (vm_page_count_severe() ||
803 			    buf_dirty_count_severe() ||
804 			    (ioflag & IO_ASYNC)) {
805 			bp->b_flags |= B_CLUSTEROK;
806 			bawrite(bp);
807 		} else if (xfersize + blkoffset == fs->fs_bsize) {
808 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
809 				bp->b_flags |= B_CLUSTEROK;
810 				cluster_write(vp, bp, ip->i_size, seqcount,
811 				    GB_UNMAPPED);
812 			} else {
813 				bawrite(bp);
814 			}
815 		} else if (ioflag & IO_DIRECT) {
816 			bp->b_flags |= B_CLUSTEROK;
817 			bawrite(bp);
818 		} else {
819 			bp->b_flags |= B_CLUSTEROK;
820 			bdwrite(bp);
821 		}
822 		if (error || xfersize == 0)
823 			break;
824 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
825 	}
826 	/*
827 	 * If we successfully wrote any data, and we are not the superuser
828 	 * we clear the setuid and setgid bits as a precaution against
829 	 * tampering.
830 	 */
831 	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
832 	    ap->a_cred) {
833 		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
834 			ip->i_mode &= ~(ISUID | ISGID);
835 			DIP_SET(ip, i_mode, ip->i_mode);
836 		}
837 	}
838 	if (error) {
839 		if (ioflag & IO_UNIT) {
840 			(void)ffs_truncate(vp, osize,
841 			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
842 			uio->uio_offset -= resid - uio->uio_resid;
843 			uio->uio_resid = resid;
844 		}
845 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
846 		error = ffs_update(vp, 1);
847 	return (error);
848 }
849 
850 /*
851  * Extended attribute area reading.
852  */
853 static int
854 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
855 {
856 	struct inode *ip;
857 	struct ufs2_dinode *dp;
858 	struct fs *fs;
859 	struct buf *bp;
860 	ufs_lbn_t lbn, nextlbn;
861 	off_t bytesinfile;
862 	long size, xfersize, blkoffset;
863 	ssize_t orig_resid;
864 	int error;
865 
866 	ip = VTOI(vp);
867 	fs = ip->i_fs;
868 	dp = ip->i_din2;
869 
870 #ifdef INVARIANTS
871 	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
872 		panic("ffs_extread: mode");
873 
874 #endif
875 	orig_resid = uio->uio_resid;
876 	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
877 	if (orig_resid == 0)
878 		return (0);
879 	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
880 
881 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
882 		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
883 			break;
884 		lbn = lblkno(fs, uio->uio_offset);
885 		nextlbn = lbn + 1;
886 
887 		/*
888 		 * size of buffer.  The buffer representing the
889 		 * end of the file is rounded up to the size of
890 		 * the block type ( fragment or full block,
891 		 * depending ).
892 		 */
893 		size = sblksize(fs, dp->di_extsize, lbn);
894 		blkoffset = blkoff(fs, uio->uio_offset);
895 
896 		/*
897 		 * The amount we want to transfer in this iteration is
898 		 * one FS block less the amount of the data before
899 		 * our startpoint (duh!)
900 		 */
901 		xfersize = fs->fs_bsize - blkoffset;
902 
903 		/*
904 		 * But if we actually want less than the block,
905 		 * or the file doesn't have a whole block more of data,
906 		 * then use the lesser number.
907 		 */
908 		if (uio->uio_resid < xfersize)
909 			xfersize = uio->uio_resid;
910 		if (bytesinfile < xfersize)
911 			xfersize = bytesinfile;
912 
913 		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
914 			/*
915 			 * Don't do readahead if this is the end of the info.
916 			 */
917 			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
918 		} else {
919 			/*
920 			 * If we have a second block, then
921 			 * fire off a request for a readahead
922 			 * as well as a read. Note that the 4th and 5th
923 			 * arguments point to arrays of the size specified in
924 			 * the 6th argument.
925 			 */
926 			u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
927 
928 			nextlbn = -1 - nextlbn;
929 			error = breadn(vp, -1 - lbn,
930 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
931 		}
932 		if (error) {
933 			brelse(bp);
934 			bp = NULL;
935 			break;
936 		}
937 
938 		/*
939 		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
940 		 * will cause us to attempt to release the buffer later on
941 		 * and will cause the buffer cache to attempt to free the
942 		 * underlying pages.
943 		 */
944 		if (ioflag & IO_DIRECT)
945 			bp->b_flags |= B_DIRECT;
946 
947 		/*
948 		 * We should only get non-zero b_resid when an I/O error
949 		 * has occurred, which should cause us to break above.
950 		 * However, if the short read did not cause an error,
951 		 * then we want to ensure that we do not uiomove bad
952 		 * or uninitialized data.
953 		 */
954 		size -= bp->b_resid;
955 		if (size < xfersize) {
956 			if (size == 0)
957 				break;
958 			xfersize = size;
959 		}
960 
961 		error = uiomove((char *)bp->b_data + blkoffset,
962 					(int)xfersize, uio);
963 		if (error)
964 			break;
965 
966 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
967 		   (LIST_EMPTY(&bp->b_dep))) {
968 			/*
969 			 * If there are no dependencies, and it's VMIO,
970 			 * then we don't need the buf, mark it available
971 			 * for freeing.  For non-direct VMIO reads, the VM
972 			 * has the data.
973 			 */
974 			bp->b_flags |= B_RELBUF;
975 			brelse(bp);
976 		} else {
977 			/*
978 			 * Otherwise let whoever
979 			 * made the request take care of
980 			 * freeing it. We just queue
981 			 * it onto another list.
982 			 */
983 			bqrelse(bp);
984 		}
985 	}
986 
987 	/*
988 	 * This can only happen in the case of an error
989 	 * because the loop above resets bp to NULL on each iteration
990 	 * and on normal completion has not set a new value into it.
991 	 * so it must have come from a 'break' statement
992 	 */
993 	if (bp != NULL) {
994 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
995 		   (LIST_EMPTY(&bp->b_dep))) {
996 			bp->b_flags |= B_RELBUF;
997 			brelse(bp);
998 		} else {
999 			bqrelse(bp);
1000 		}
1001 	}
1002 	return (error);
1003 }
1004 
/*
 * Extended attribute area writing.
 *
 * Copy the data described by uio into the extended attribute area of vp,
 * one filesystem block per loop iteration.  Blocks are obtained through
 * UFS_BALLOC() with IO_EXT set, and di_extsize is grown as the transfer
 * moves past the current end of the area.
 *
 * ioflag bits examined here: IO_APPEND, IO_SYNC, IO_DIRECT, IO_VMIO,
 * IO_ASYNC and IO_UNIT.  ucred may be NULL, in which case the
 * setuid/setgid clearing step is skipped.
 *
 * Returns 0 on success or an errno value.  EFBIG is returned before any
 * work is done if the request would reach past NXADDR blocks.  When the
 * transfer fails and IO_UNIT is set, the area is truncated back to its
 * original size and the uio offset/resid are restored.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int blkoffset, error, flags, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	/* An append starts at the current end of the attribute area. */
	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	/* The attribute area is limited to NXADDR filesystem blocks. */
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	/*
	 * Remember the starting residual and area size; they are needed
	 * to detect progress and to undo a failed IO_UNIT write.
	 */
	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		/* Split the request at filesystem block boundaries. */
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/* Grow the recorded area size when writing past its end. */
		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		/* Never move more bytes than the buffer actually holds. */
		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		/*
		 * The uiomove() status is only examined after the buffer
		 * has been handed to one of the write routines below.
		 */
		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		   (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			/* The bwrite() status is discarded by the cast. */
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		/* Stop on a copy error or when no progress was possible. */
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		/*
		 * IO_UNIT: undo the partial write by truncating back to the
		 * original size and rewinding the uio.  The ffs_truncate()
		 * status is discarded; the original error is returned.
		 */
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag&IO_SYNC), ucred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		/* Synchronous write that moved data: push the inode too. */
		error = ffs_update(vp, 1);
	return (error);
}
1129 
1130 
/*
 * Locate a named extended attribute in an in-memory image of the EA area.
 *
 * The area (ptr:length) is scanned as a sequence of records, each parsed as:
 *	uint32_t	total record length, used to step to the next record
 *	u_char		namespace
 *	u_char		pad2, the number of pad bytes following the content
 *	u_char		name length
 *	name bytes (not NUL terminated), padded so the content starts on an
 *	8 byte boundary relative to the start of the record
 *	content, followed by pad2 pad bytes
 *
 * On a match the length of the content is returned; if eap is not NULL it
 * receives a pointer to the start of the record, and if eac is not NULL it
 * receives a pointer to the content.  -1 is returned when no record matches.
 *
 * Record lengths come from disk and are not trusted: a length that is too
 * small to hold the fixed header (including 0, which would otherwise keep
 * the scan on the same record forever) or that reaches past the end of the
 * area terminates the scan, and a record too short for its own name and
 * padding is skipped.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		/* the length word and the three header bytes must be there */
		if ((size_t)(pe - p) < sizeof(ul) + 3)
			break;
		bcopy(p, &ul, sizeof(ul));
		/*
		 * make sure this entry is complete and that it advances the
		 * scan; sizes are compared so that p + ul cannot wrap
		 */
		if (ul < sizeof(ul) + 3 || ul > (size_t)(pe - p))
			break;
		pn = p + ul;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		/* header, name and both pads must fit inside the record */
		if (ul < (uint32_t)(ealength + eapad2))
			continue;
		if (bcmp(p, name, nlen))
			continue;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}
1179 
1180 static int
1181 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1182 {
1183 	struct inode *ip;
1184 	struct ufs2_dinode *dp;
1185 	struct fs *fs;
1186 	struct uio luio;
1187 	struct iovec liovec;
1188 	u_int easize;
1189 	int error;
1190 	u_char *eae;
1191 
1192 	ip = VTOI(vp);
1193 	fs = ip->i_fs;
1194 	dp = ip->i_din2;
1195 	easize = dp->di_extsize;
1196 	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
1197 		return (EFBIG);
1198 
1199 	eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1200 
1201 	liovec.iov_base = eae;
1202 	liovec.iov_len = easize;
1203 	luio.uio_iov = &liovec;
1204 	luio.uio_iovcnt = 1;
1205 	luio.uio_offset = 0;
1206 	luio.uio_resid = easize;
1207 	luio.uio_segflg = UIO_SYSSPACE;
1208 	luio.uio_rw = UIO_READ;
1209 	luio.uio_td = td;
1210 
1211 	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1212 	if (error) {
1213 		free(eae, M_TEMP);
1214 		return(error);
1215 	}
1216 	*p = eae;
1217 	return (0);
1218 }
1219 
1220 static void
1221 ffs_lock_ea(struct vnode *vp)
1222 {
1223 	struct inode *ip;
1224 
1225 	ip = VTOI(vp);
1226 	VI_LOCK(vp);
1227 	while (ip->i_flag & IN_EA_LOCKED) {
1228 		ip->i_flag |= IN_EA_LOCKWAIT;
1229 		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
1230 		    0);
1231 	}
1232 	ip->i_flag |= IN_EA_LOCKED;
1233 	VI_UNLOCK(vp);
1234 }
1235 
1236 static void
1237 ffs_unlock_ea(struct vnode *vp)
1238 {
1239 	struct inode *ip;
1240 
1241 	ip = VTOI(vp);
1242 	VI_LOCK(vp);
1243 	if (ip->i_flag & IN_EA_LOCKWAIT)
1244 		wakeup(&ip->i_ea_refs);
1245 	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
1246 	VI_UNLOCK(vp);
1247 }
1248 
1249 static int
1250 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1251 {
1252 	struct inode *ip;
1253 	struct ufs2_dinode *dp;
1254 	int error;
1255 
1256 	ip = VTOI(vp);
1257 
1258 	ffs_lock_ea(vp);
1259 	if (ip->i_ea_area != NULL) {
1260 		ip->i_ea_refs++;
1261 		ffs_unlock_ea(vp);
1262 		return (0);
1263 	}
1264 	dp = ip->i_din2;
1265 	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1266 	if (error) {
1267 		ffs_unlock_ea(vp);
1268 		return (error);
1269 	}
1270 	ip->i_ea_len = dp->di_extsize;
1271 	ip->i_ea_error = 0;
1272 	ip->i_ea_refs++;
1273 	ffs_unlock_ea(vp);
1274 	return (0);
1275 }
1276 
1277 /*
1278  * Vnode extattr transaction commit/abort
1279  */
1280 static int
1281 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1282 {
1283 	struct inode *ip;
1284 	struct uio luio;
1285 	struct iovec liovec;
1286 	int error;
1287 	struct ufs2_dinode *dp;
1288 
1289 	ip = VTOI(vp);
1290 
1291 	ffs_lock_ea(vp);
1292 	if (ip->i_ea_area == NULL) {
1293 		ffs_unlock_ea(vp);
1294 		return (EINVAL);
1295 	}
1296 	dp = ip->i_din2;
1297 	error = ip->i_ea_error;
1298 	if (commit && error == 0) {
1299 		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
1300 		if (cred == NOCRED)
1301 			cred =  vp->v_mount->mnt_cred;
1302 		liovec.iov_base = ip->i_ea_area;
1303 		liovec.iov_len = ip->i_ea_len;
1304 		luio.uio_iov = &liovec;
1305 		luio.uio_iovcnt = 1;
1306 		luio.uio_offset = 0;
1307 		luio.uio_resid = ip->i_ea_len;
1308 		luio.uio_segflg = UIO_SYSSPACE;
1309 		luio.uio_rw = UIO_WRITE;
1310 		luio.uio_td = td;
1311 		/* XXX: I'm not happy about truncating to zero size */
1312 		if (ip->i_ea_len < dp->di_extsize)
1313 			error = ffs_truncate(vp, 0, IO_EXT, cred);
1314 		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1315 	}
1316 	if (--ip->i_ea_refs == 0) {
1317 		free(ip->i_ea_area, M_TEMP);
1318 		ip->i_ea_area = NULL;
1319 		ip->i_ea_len = 0;
1320 		ip->i_ea_error = 0;
1321 	}
1322 	ffs_unlock_ea(vp);
1323 	return (error);
1324 }
1325 
1326 /*
1327  * Vnode extattr strategy routine for fifos.
1328  *
1329  * We need to check for a read or write of the external attributes.
1330  * Otherwise we just fall through and do the usual thing.
1331  */
1332 static int
1333 ffsext_strategy(struct vop_strategy_args *ap)
1334 /*
1335 struct vop_strategy_args {
1336 	struct vnodeop_desc *a_desc;
1337 	struct vnode *a_vp;
1338 	struct buf *a_bp;
1339 };
1340 */
1341 {
1342 	struct vnode *vp;
1343 	daddr_t lbn;
1344 
1345 	vp = ap->a_vp;
1346 	lbn = ap->a_bp->b_lblkno;
1347 	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
1348 	    lbn < 0 && lbn >= -NXADDR)
1349 		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1350 	if (vp->v_type == VFIFO)
1351 		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1352 	panic("spec nodes went here");
1353 }
1354 
1355 /*
1356  * Vnode extattr transaction commit/abort
1357  */
1358 static int
1359 ffs_openextattr(struct vop_openextattr_args *ap)
1360 /*
1361 struct vop_openextattr_args {
1362 	struct vnodeop_desc *a_desc;
1363 	struct vnode *a_vp;
1364 	IN struct ucred *a_cred;
1365 	IN struct thread *a_td;
1366 };
1367 */
1368 {
1369 	struct inode *ip;
1370 	struct fs *fs;
1371 
1372 	ip = VTOI(ap->a_vp);
1373 	fs = ip->i_fs;
1374 
1375 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1376 		return (EOPNOTSUPP);
1377 
1378 	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1379 }
1380 
1381 
1382 /*
1383  * Vnode extattr transaction commit/abort
1384  */
1385 static int
1386 ffs_closeextattr(struct vop_closeextattr_args *ap)
1387 /*
1388 struct vop_closeextattr_args {
1389 	struct vnodeop_desc *a_desc;
1390 	struct vnode *a_vp;
1391 	int a_commit;
1392 	IN struct ucred *a_cred;
1393 	IN struct thread *a_td;
1394 };
1395 */
1396 {
1397 	struct inode *ip;
1398 	struct fs *fs;
1399 
1400 	ip = VTOI(ap->a_vp);
1401 	fs = ip->i_fs;
1402 
1403 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1404 		return (EOPNOTSUPP);
1405 
1406 	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
1407 		return (EROFS);
1408 
1409 	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1410 }
1411 
1412 /*
1413  * Vnode operation to remove a named attribute.
1414  */
1415 static int
1416 ffs_deleteextattr(struct vop_deleteextattr_args *ap)
1417 /*
1418 vop_deleteextattr {
1419 	IN struct vnode *a_vp;
1420 	IN int a_attrnamespace;
1421 	IN const char *a_name;
1422 	IN struct ucred *a_cred;
1423 	IN struct thread *a_td;
1424 };
1425 */
1426 {
1427 	struct inode *ip;
1428 	struct fs *fs;
1429 	uint32_t ealength, ul;
1430 	int ealen, olen, eapad1, eapad2, error, i, easize;
1431 	u_char *eae, *p;
1432 
1433 	ip = VTOI(ap->a_vp);
1434 	fs = ip->i_fs;
1435 
1436 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1437 		return (EOPNOTSUPP);
1438 
1439 	if (strlen(ap->a_name) == 0)
1440 		return (EINVAL);
1441 
1442 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1443 		return (EROFS);
1444 
1445 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1446 	    ap->a_cred, ap->a_td, VWRITE);
1447 	if (error) {
1448 
1449 		/*
1450 		 * ffs_lock_ea is not needed there, because the vnode
1451 		 * must be exclusively locked.
1452 		 */
1453 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1454 			ip->i_ea_error = error;
1455 		return (error);
1456 	}
1457 
1458 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1459 	if (error)
1460 		return (error);
1461 
1462 	ealength = eapad1 = ealen = eapad2 = 0;
1463 
1464 	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1465 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1466 	easize = ip->i_ea_len;
1467 
1468 	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1469 	    &p, NULL);
1470 	if (olen == -1) {
1471 		/* delete but nonexistent */
1472 		free(eae, M_TEMP);
1473 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1474 		return(ENOATTR);
1475 	}
1476 	bcopy(p, &ul, sizeof ul);
1477 	i = p - eae + ul;
1478 	if (ul != ealength) {
1479 		bcopy(p + ul, p + ealength, easize - i);
1480 		easize += (ealength - ul);
1481 	}
1482 	if (easize > NXADDR * fs->fs_bsize) {
1483 		free(eae, M_TEMP);
1484 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1485 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1486 			ip->i_ea_error = ENOSPC;
1487 		return(ENOSPC);
1488 	}
1489 	p = ip->i_ea_area;
1490 	ip->i_ea_area = eae;
1491 	ip->i_ea_len = easize;
1492 	free(p, M_TEMP);
1493 	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1494 	return(error);
1495 }
1496 
1497 /*
1498  * Vnode operation to retrieve a named extended attribute.
1499  */
1500 static int
1501 ffs_getextattr(struct vop_getextattr_args *ap)
1502 /*
1503 vop_getextattr {
1504 	IN struct vnode *a_vp;
1505 	IN int a_attrnamespace;
1506 	IN const char *a_name;
1507 	INOUT struct uio *a_uio;
1508 	OUT size_t *a_size;
1509 	IN struct ucred *a_cred;
1510 	IN struct thread *a_td;
1511 };
1512 */
1513 {
1514 	struct inode *ip;
1515 	struct fs *fs;
1516 	u_char *eae, *p;
1517 	unsigned easize;
1518 	int error, ealen;
1519 
1520 	ip = VTOI(ap->a_vp);
1521 	fs = ip->i_fs;
1522 
1523 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1524 		return (EOPNOTSUPP);
1525 
1526 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1527 	    ap->a_cred, ap->a_td, VREAD);
1528 	if (error)
1529 		return (error);
1530 
1531 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1532 	if (error)
1533 		return (error);
1534 
1535 	eae = ip->i_ea_area;
1536 	easize = ip->i_ea_len;
1537 
1538 	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1539 	    NULL, &p);
1540 	if (ealen >= 0) {
1541 		error = 0;
1542 		if (ap->a_size != NULL)
1543 			*ap->a_size = ealen;
1544 		else if (ap->a_uio != NULL)
1545 			error = uiomove(p, ealen, ap->a_uio);
1546 	} else
1547 		error = ENOATTR;
1548 
1549 	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1550 	return(error);
1551 }
1552 
1553 /*
1554  * Vnode operation to retrieve extended attributes on a vnode.
1555  */
1556 static int
1557 ffs_listextattr(struct vop_listextattr_args *ap)
1558 /*
1559 vop_listextattr {
1560 	IN struct vnode *a_vp;
1561 	IN int a_attrnamespace;
1562 	INOUT struct uio *a_uio;
1563 	OUT size_t *a_size;
1564 	IN struct ucred *a_cred;
1565 	IN struct thread *a_td;
1566 };
1567 */
1568 {
1569 	struct inode *ip;
1570 	struct fs *fs;
1571 	u_char *eae, *p, *pe, *pn;
1572 	unsigned easize;
1573 	uint32_t ul;
1574 	int error, ealen;
1575 
1576 	ip = VTOI(ap->a_vp);
1577 	fs = ip->i_fs;
1578 
1579 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1580 		return (EOPNOTSUPP);
1581 
1582 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1583 	    ap->a_cred, ap->a_td, VREAD);
1584 	if (error)
1585 		return (error);
1586 
1587 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1588 	if (error)
1589 		return (error);
1590 	eae = ip->i_ea_area;
1591 	easize = ip->i_ea_len;
1592 
1593 	error = 0;
1594 	if (ap->a_size != NULL)
1595 		*ap->a_size = 0;
1596 	pe = eae + easize;
1597 	for(p = eae; error == 0 && p < pe; p = pn) {
1598 		bcopy(p, &ul, sizeof(ul));
1599 		pn = p + ul;
1600 		if (pn > pe)
1601 			break;
1602 		p += sizeof(ul);
1603 		if (*p++ != ap->a_attrnamespace)
1604 			continue;
1605 		p++;	/* pad2 */
1606 		ealen = *p;
1607 		if (ap->a_size != NULL) {
1608 			*ap->a_size += ealen + 1;
1609 		} else if (ap->a_uio != NULL) {
1610 			error = uiomove(p, ealen + 1, ap->a_uio);
1611 		}
1612 	}
1613 	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1614 	return(error);
1615 }
1616 
1617 /*
1618  * Vnode operation to set a named attribute.
1619  */
1620 static int
1621 ffs_setextattr(struct vop_setextattr_args *ap)
1622 /*
1623 vop_setextattr {
1624 	IN struct vnode *a_vp;
1625 	IN int a_attrnamespace;
1626 	IN const char *a_name;
1627 	INOUT struct uio *a_uio;
1628 	IN struct ucred *a_cred;
1629 	IN struct thread *a_td;
1630 };
1631 */
1632 {
1633 	struct inode *ip;
1634 	struct fs *fs;
1635 	uint32_t ealength, ul;
1636 	ssize_t ealen;
1637 	int olen, eapad1, eapad2, error, i, easize;
1638 	u_char *eae, *p;
1639 
1640 	ip = VTOI(ap->a_vp);
1641 	fs = ip->i_fs;
1642 
1643 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1644 		return (EOPNOTSUPP);
1645 
1646 	if (strlen(ap->a_name) == 0)
1647 		return (EINVAL);
1648 
1649 	/* XXX Now unsupported API to delete EAs using NULL uio. */
1650 	if (ap->a_uio == NULL)
1651 		return (EOPNOTSUPP);
1652 
1653 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1654 		return (EROFS);
1655 
1656 	ealen = ap->a_uio->uio_resid;
1657 	if (ealen < 0 || ealen > lblktosize(fs, NXADDR))
1658 		return (EINVAL);
1659 
1660 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1661 	    ap->a_cred, ap->a_td, VWRITE);
1662 	if (error) {
1663 
1664 		/*
1665 		 * ffs_lock_ea is not needed there, because the vnode
1666 		 * must be exclusively locked.
1667 		 */
1668 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1669 			ip->i_ea_error = error;
1670 		return (error);
1671 	}
1672 
1673 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1674 	if (error)
1675 		return (error);
1676 
1677 	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1678 	eapad1 = 8 - (ealength % 8);
1679 	if (eapad1 == 8)
1680 		eapad1 = 0;
1681 	eapad2 = 8 - (ealen % 8);
1682 	if (eapad2 == 8)
1683 		eapad2 = 0;
1684 	ealength += eapad1 + ealen + eapad2;
1685 
1686 	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1687 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1688 	easize = ip->i_ea_len;
1689 
1690 	olen = ffs_findextattr(eae, easize,
1691 	    ap->a_attrnamespace, ap->a_name, &p, NULL);
1692         if (olen == -1) {
1693 		/* new, append at end */
1694 		p = eae + easize;
1695 		easize += ealength;
1696 	} else {
1697 		bcopy(p, &ul, sizeof ul);
1698 		i = p - eae + ul;
1699 		if (ul != ealength) {
1700 			bcopy(p + ul, p + ealength, easize - i);
1701 			easize += (ealength - ul);
1702 		}
1703 	}
1704 	if (easize > lblktosize(fs, NXADDR)) {
1705 		free(eae, M_TEMP);
1706 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1707 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1708 			ip->i_ea_error = ENOSPC;
1709 		return(ENOSPC);
1710 	}
1711 	bcopy(&ealength, p, sizeof(ealength));
1712 	p += sizeof(ealength);
1713 	*p++ = ap->a_attrnamespace;
1714 	*p++ = eapad2;
1715 	*p++ = strlen(ap->a_name);
1716 	strcpy(p, ap->a_name);
1717 	p += strlen(ap->a_name);
1718 	bzero(p, eapad1);
1719 	p += eapad1;
1720 	error = uiomove(p, ealen, ap->a_uio);
1721 	if (error) {
1722 		free(eae, M_TEMP);
1723 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1724 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1725 			ip->i_ea_error = error;
1726 		return(error);
1727 	}
1728 	p += ealen;
1729 	bzero(p, eapad2);
1730 
1731 	p = ip->i_ea_area;
1732 	ip->i_ea_area = eae;
1733 	ip->i_ea_len = easize;
1734 	free(p, M_TEMP);
1735 	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1736 	return(error);
1737 }
1738 
1739 /*
1740  * Vnode pointer to File handle
1741  */
1742 static int
1743 ffs_vptofh(struct vop_vptofh_args *ap)
1744 /*
1745 vop_vptofh {
1746 	IN struct vnode *a_vp;
1747 	IN struct fid *a_fhp;
1748 };
1749 */
1750 {
1751 	struct inode *ip;
1752 	struct ufid *ufhp;
1753 
1754 	ip = VTOI(ap->a_vp);
1755 	ufhp = (struct ufid *)ap->a_fhp;
1756 	ufhp->ufid_len = sizeof(struct ufid);
1757 	ufhp->ufid_ino = ip->i_number;
1758 	ufhp->ufid_gen = ip->i_gen;
1759 	return (0);
1760 }
1761