xref: /dragonfly/sys/vfs/nfs/nfs_bio.c (revision 8a0bcd56)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Rick Macklem at The University of Guelph.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
37  * $FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_bio.c,v 1.130 2004/04/14 23:23:55 peadar Exp $
38  * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.45 2008/07/18 00:09:39 dillon Exp $
39  */
40 
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/resourcevar.h>
45 #include <sys/signalvar.h>
46 #include <sys/proc.h>
47 #include <sys/buf.h>
48 #include <sys/vnode.h>
49 #include <sys/mount.h>
50 #include <sys/kernel.h>
51 #include <sys/mbuf.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_extern.h>
55 #include <vm/vm_page.h>
56 #include <vm/vm_object.h>
57 #include <vm/vm_pager.h>
58 #include <vm/vnode_pager.h>
59 
60 #include <sys/buf2.h>
61 #include <sys/thread2.h>
62 #include <vm/vm_page2.h>
63 
64 #include "rpcv2.h"
65 #include "nfsproto.h"
66 #include "nfs.h"
67 #include "nfsmount.h"
68 #include "nfsnode.h"
69 #include "xdr_subs.h"
70 #include "nfsm_subs.h"
71 
72 
73 static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset,
74 				   int size, struct thread *td);
75 static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen);
76 static void nfsiodone_sync(struct bio *bio);
77 static void nfs_readrpc_bio_done(nfsm_info_t info);
78 static void nfs_writerpc_bio_done(nfsm_info_t info);
79 static void nfs_commitrpc_bio_done(nfsm_info_t info);
80 
81 /*
82  * Vnode op for read using bio
83  */
84 int
85 nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag)
86 {
87 	struct nfsnode *np = VTONFS(vp);
88 	int biosize, i;
89 	struct buf *bp, *rabp;
90 	struct vattr vattr;
91 	struct thread *td;
92 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
93 	off_t lbn, rabn;
94 	off_t raoffset;
95 	off_t loffset;
96 	int seqcount;
97 	int nra, error = 0;
98 	int boff = 0;
99 	size_t n;
100 
101 #ifdef DIAGNOSTIC
102 	if (uio->uio_rw != UIO_READ)
103 		panic("nfs_read mode");
104 #endif
105 	if (uio->uio_resid == 0)
106 		return (0);
107 	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
108 		return (EINVAL);
109 	td = uio->uio_td;
110 
111 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
112 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
113 		(void)nfs_fsinfo(nmp, vp, td);
114 	if (vp->v_type != VDIR &&
115 	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
116 		return (EFBIG);
117 	biosize = vp->v_mount->mnt_stat.f_iosize;
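	/*
	 * Added note: the sequential access heuristic is passed in the
	 * upper bits of ioflag; scale it into the read-ahead block limit
	 * used by the loop below.
	 */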
118 	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
119 
120 	/*
121 	 * For nfs, cache consistency can only be maintained approximately.
122 	 * Although RFC1094 does not specify the criteria, the following is
123 	 * believed to be compatible with the reference port.
124 	 *
125 	 * NFS:		If local changes have been made and this is a
126 	 *		directory, the directory must be invalidated and
127 	 *		the attribute cache must be cleared.
128 	 *
129 	 *		GETATTR is called to synchronize the file size.
130 	 *
131 	 *		If remote changes are detected local data is flushed
132 	 *		and the cache is invalidated.
133 	 *
134 	 *		NOTE: In the normal case the attribute cache is not
135 	 *		cleared which means GETATTR may use cached data and
136 	 *		not immediately detect changes made on the server.
137 	 */
138 	if ((np->n_flag & NLMODIFIED) && vp->v_type == VDIR) {
139 		nfs_invaldir(vp);
140 		error = nfs_vinvalbuf(vp, V_SAVE, 1);
141 		if (error)
142 			return (error);
143 		np->n_attrstamp = 0;
144 	}
145 	error = VOP_GETATTR(vp, &vattr);
146 	if (error)
147 		return (error);
148 
149 	/*
150 	 * This can deadlock getpages/putpages for regular
151 	 * files.  Only do it for directories.
152 	 */
153 	if (np->n_flag & NRMODIFIED) {
154 		if (vp->v_type == VDIR) {
155 			nfs_invaldir(vp);
156 			error = nfs_vinvalbuf(vp, V_SAVE, 1);
157 			if (error)
158 				return (error);
159 			np->n_flag &= ~NRMODIFIED;
160 		}
161 	}
162 
163 	/*
164 	 * Loop until uio exhausted or we hit EOF
165 	 */
166 	do {
167 	    bp = NULL;
168 
169 	    switch (vp->v_type) {
170 	    case VREG:
171 		nfsstats.biocache_reads++;
172 		lbn = uio->uio_offset / biosize;
173 		boff = uio->uio_offset & (biosize - 1);
174 		loffset = (off_t)lbn * biosize;
175 
176 		/*
177 		 * Start the read ahead(s), as required.
178 		 */
179 		if (nmp->nm_readahead > 0 && nfs_asyncok(nmp)) {
180 		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
181 			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
182 			rabn = lbn + 1 + nra;
183 			raoffset = (off_t)rabn * biosize;
184 			if (findblk(vp, raoffset, FINDBLK_TEST) == NULL) {
185 			    rabp = nfs_getcacheblk(vp, raoffset, biosize, td);
186 			    if (!rabp)
187 				return (EINTR);
188 			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
189 				rabp->b_cmd = BUF_CMD_READ;
190 				vfs_busy_pages(vp, rabp);
191 				nfs_asyncio(vp, &rabp->b_bio2);
192 			    } else {
193 				brelse(rabp);
194 			    }
195 			}
196 		    }
197 		}
198 
199 		/*
200 		 * Obtain the buffer cache block.  Figure out the buffer size
201 		 * when we are at EOF.  If we are modifying the size of the
202 		 * buffer based on an EOF condition we need to hold
203 		 * nfs_rslock() through obtaining the buffer to prevent
204 		 * a potential writer-appender from messing with n_size.
205 		 * Otherwise we may accidentally truncate the buffer and
206 		 * lose dirty data.
207 		 *
208 		 * Note that bcount is *not* DEV_BSIZE aligned.
209 		 */
210 		if (loffset + boff >= np->n_size) {
211 			n = 0;
212 			break;
213 		}
214 		bp = nfs_getcacheblk(vp, loffset, biosize, td);
215 
216 		if (bp == NULL)
217 			return (EINTR);
218 
219 		/*
220 		 * If B_CACHE is not set, we must issue the read.  If this
221 		 * fails, we return an error.
222 		 */
223 		if ((bp->b_flags & B_CACHE) == 0) {
224 			bp->b_cmd = BUF_CMD_READ;
225 			bp->b_bio2.bio_done = nfsiodone_sync;
226 			bp->b_bio2.bio_flags |= BIO_SYNC;
227 			vfs_busy_pages(vp, bp);
228 			error = nfs_doio(vp, &bp->b_bio2, td);
229 			if (error) {
230 				brelse(bp);
231 				return (error);
232 			}
233 		}
234 
235 		/*
236 		 * on is the offset into the current bp.  Figure out how many
237 		 * bytes we can copy out of the bp.  Note that bcount is
238 		 * NOT DEV_BSIZE aligned.
239 		 *
240 		 * Then figure out how many bytes we can copy into the uio.
241 		 */
242 		n = biosize - boff;
243 		if (n > uio->uio_resid)
244 			n = uio->uio_resid;
245 		if (loffset + boff + n > np->n_size)
246 			n = np->n_size - loffset - boff;
247 		break;
248 	    case VLNK:
249 		biosize = min(NFS_MAXPATHLEN, np->n_size);
250 		nfsstats.biocache_readlinks++;
251 		bp = nfs_getcacheblk(vp, (off_t)0, biosize, td);
252 		if (bp == NULL)
253 			return (EINTR);
254 		if ((bp->b_flags & B_CACHE) == 0) {
255 			bp->b_cmd = BUF_CMD_READ;
256 			bp->b_bio2.bio_done = nfsiodone_sync;
257 			bp->b_bio2.bio_flags |= BIO_SYNC;
258 			vfs_busy_pages(vp, bp);
259 			error = nfs_doio(vp, &bp->b_bio2, td);
260 			if (error) {
261 				bp->b_flags |= B_ERROR | B_INVAL;
262 				brelse(bp);
263 				return (error);
264 			}
265 		}
266 		n = szmin(uio->uio_resid, (size_t)bp->b_bcount - bp->b_resid);
267 		boff = 0;
268 		break;
269 	    case VDIR:
270 		nfsstats.biocache_readdirs++;
271 		if (np->n_direofoffset &&
272 		    uio->uio_offset >= np->n_direofoffset
273 		) {
274 			return (0);
275 		}
276 		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
277 		boff = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
278 		loffset = uio->uio_offset - boff;
279 		bp = nfs_getcacheblk(vp, loffset, NFS_DIRBLKSIZ, td);
280 		if (bp == NULL)
281 			return (EINTR);
282 
283 		if ((bp->b_flags & B_CACHE) == 0) {
284 		    bp->b_cmd = BUF_CMD_READ;
285 		    bp->b_bio2.bio_done = nfsiodone_sync;
286 		    bp->b_bio2.bio_flags |= BIO_SYNC;
287 		    vfs_busy_pages(vp, bp);
288 		    error = nfs_doio(vp, &bp->b_bio2, td);
289 		    if (error)
290 			    brelse(bp);
291 		    while (error == NFSERR_BAD_COOKIE) {
292 			kprintf("got bad cookie vp %p bp %p\n", vp, bp);
293 			nfs_invaldir(vp);
294 			error = nfs_vinvalbuf(vp, 0, 1);
295 			/*
296 			 * Yuck! The directory has been modified on the
297 			 * server. The only way to get the block is by
298 			 * reading from the beginning to get all the
299 			 * offset cookies.
300 			 *
301 			 * Leave the last bp intact unless there is an error.
302 			 * Loop back up to the while if the error is another
303 			 * NFSERR_BAD_COOKIE (double yuch!).
304 			 */
305 			for (i = 0; i <= lbn && !error; i++) {
306 			    if (np->n_direofoffset
307 				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
308 				    return (0);
309 			    bp = nfs_getcacheblk(vp, (off_t)i * NFS_DIRBLKSIZ,
310 						 NFS_DIRBLKSIZ, td);
311 			    if (!bp)
312 				return (EINTR);
313 			    if ((bp->b_flags & B_CACHE) == 0) {
314 				    bp->b_cmd = BUF_CMD_READ;
315 				    bp->b_bio2.bio_done = nfsiodone_sync;
316 				    bp->b_bio2.bio_flags |= BIO_SYNC;
317 				    vfs_busy_pages(vp, bp);
318 				    error = nfs_doio(vp, &bp->b_bio2, td);
319 				    /*
320 				     * no error + B_INVAL == directory EOF,
321 				     * use the block.
322 				     */
323 				    if (error == 0 && (bp->b_flags & B_INVAL))
324 					    break;
325 			    }
326 			    /*
327 			     * An error will throw away the block and the
328 			     * for loop will break out.  If no error and this
329 			     * is not the block we want, we throw away the
330 			     * block and go for the next one via the for loop.
331 			     */
332 			    if (error || i < lbn)
333 				    brelse(bp);
334 			}
335 		    }
336 		    /*
337 		     * The above while is repeated if we hit another cookie
338 		     * error.  If we hit an error and it wasn't a cookie error,
339 		     * we give up.
340 		     */
341 		    if (error)
342 			    return (error);
343 		}
344 
345 		/*
346 		 * If not eof and read aheads are enabled, start one.
347 		 * (You need the current block first, so that you have the
348 		 *  directory offset cookie of the next block.)
349 		 */
350 		if (nmp->nm_readahead > 0 && nfs_asyncok(nmp) &&
351 		    (bp->b_flags & B_INVAL) == 0 &&
352 		    (np->n_direofoffset == 0 ||
353 		    loffset + NFS_DIRBLKSIZ < np->n_direofoffset) &&
354 		    findblk(vp, loffset + NFS_DIRBLKSIZ, FINDBLK_TEST) == NULL
355 		) {
356 			rabp = nfs_getcacheblk(vp, loffset + NFS_DIRBLKSIZ,
357 					       NFS_DIRBLKSIZ, td);
358 			if (rabp) {
359 			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
360 				rabp->b_cmd = BUF_CMD_READ;
361 				vfs_busy_pages(vp, rabp);
362 				nfs_asyncio(vp, &rabp->b_bio2);
363 			    } else {
364 				brelse(rabp);
365 			    }
366 			}
367 		}
368 		/*
369 		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
370 		 * chopped for the EOF condition, we cannot tell how large
371 		 * NFS directories are going to be until we hit EOF.  So
372 		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
373 		 * it just so happens that b_resid will effectively chop it
374 		 * to EOF.  *BUT* this information is lost if the buffer goes
375 		 * away and is reconstituted into a B_CACHE state ( due to
376 		 * being VMIO ) later.  So we keep track of the directory eof
377 		 * in np->n_direofoffset and chop it off as an extra step
378 		 * right here.
379 		 *
380 		 * NOTE: boff could already be beyond EOF.
381 		 */
382 		if ((size_t)boff > NFS_DIRBLKSIZ - bp->b_resid) {
383 			n = 0;
384 		} else {
385 			n = szmin(uio->uio_resid,
386 				  NFS_DIRBLKSIZ - bp->b_resid - (size_t)boff);
387 		}
388 		if (np->n_direofoffset &&
389 		    n > (size_t)(np->n_direofoffset - uio->uio_offset)) {
390 			n = (size_t)(np->n_direofoffset - uio->uio_offset);
391 		}
392 		break;
393 	    default:
394 		kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type);
395 		n = 0;
396 		break;
397 	    };
398 
399 	    switch (vp->v_type) {
400 	    case VREG:
401 		if (n > 0)
402 		    error = uiomove(bp->b_data + boff, n, uio);
403 		break;
404 	    case VLNK:
405 		if (n > 0)
406 		    error = uiomove(bp->b_data + boff, n, uio);
407 		n = 0;
408 		break;
409 	    case VDIR:
410 		if (n > 0) {
411 		    off_t old_off = uio->uio_offset;
412 		    caddr_t cpos, epos;
413 		    struct nfs_dirent *dp;
414 
415 		    /*
416 		     * We are casting cpos to nfs_dirent, it must be
417 		     * int-aligned.
418 		     */
419 		    if (boff & 3) {
420 			error = EINVAL;
421 			break;
422 		    }
423 
424 		    cpos = bp->b_data + boff;
425 		    epos = bp->b_data + boff + n;
426 		    while (cpos < epos && error == 0 && uio->uio_resid > 0) {
427 			    dp = (struct nfs_dirent *)cpos;
428 			    error = nfs_check_dirent(dp, (int)(epos - cpos));
429 			    if (error)
430 				    break;
431 			    if (vop_write_dirent(&error, uio, dp->nfs_ino,
432 				dp->nfs_type, dp->nfs_namlen, dp->nfs_name)) {
433 				    break;
434 			    }
435 			    cpos += dp->nfs_reclen;
436 		    }
437 		    n = 0;
438 		    if (error == 0) {
439 			    uio->uio_offset = old_off + cpos -
440 					      bp->b_data - boff;
441 		    }
442 		}
443 		break;
444 	    default:
445 		kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type);
446 	    }
447 	    if (bp)
448 		    brelse(bp);
449 	} while (error == 0 && uio->uio_resid > 0 && n > 0);
450 	return (error);
451 }
452 
453 /*
454  * Userland can supply any 'seek' offset when reading an NFS directory.
455  * Validate the structure so we don't panic the kernel.  Note that
456  * the element name is nul terminated and the nul is not included
457  * in nfs_namlen.
458  */
459 static
460 int
461 nfs_check_dirent(struct nfs_dirent *dp, int maxlen)
462 {
463 	int nfs_name_off = offsetof(struct nfs_dirent, nfs_name[0]);
464 
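	/*
	 * The record must be large enough to hold the fixed header, must
	 * not extend past the remaining buffer space, must leave room for
	 * the name plus its terminating nul, and must be 4-byte aligned.
	 */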
465 	if (nfs_name_off >= maxlen)
466 		return (EINVAL);
467 	if (dp->nfs_reclen < nfs_name_off || dp->nfs_reclen > maxlen)
468 		return (EINVAL);
469 	if (nfs_name_off + dp->nfs_namlen >= dp->nfs_reclen)
470 		return (EINVAL);
471 	if (dp->nfs_reclen & 3)
472 		return (EINVAL);
473 	return (0);
474 }
475 
476 /*
477  * Vnode op for write using bio
478  *
479  * nfs_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
480  *	     struct ucred *a_cred)
481  */
482 int
483 nfs_write(struct vop_write_args *ap)
484 {
485 	struct uio *uio = ap->a_uio;
486 	struct thread *td = uio->uio_td;
487 	struct vnode *vp = ap->a_vp;
488 	struct nfsnode *np = VTONFS(vp);
489 	int ioflag = ap->a_ioflag;
490 	struct buf *bp;
491 	struct vattr vattr;
492 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
493 	off_t loffset;
494 	int boff, bytes;
495 	int error = 0;
496 	int haverslock = 0;
497 	int bcount;
498 	int biosize;
499 	int trivial;
500 
501 #ifdef DIAGNOSTIC
502 	if (uio->uio_rw != UIO_WRITE)
503 		panic("nfs_write mode");
504 	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
505 		panic("nfs_write proc");
506 #endif
507 	if (vp->v_type != VREG)
508 		return (EIO);
509 
510 	lwkt_gettoken(&nmp->nm_token);
511 
512 	if (np->n_flag & NWRITEERR) {
513 		np->n_flag &= ~NWRITEERR;
514 		lwkt_reltoken(&nmp->nm_token);
515 		return (np->n_error);
516 	}
517 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
518 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
519 		(void)nfs_fsinfo(nmp, vp, td);
520 	}
521 
522 	/*
523 	 * Synchronously flush pending buffers if we are in synchronous
524 	 * mode or if we are appending.
525 	 */
526 	if (ioflag & (IO_APPEND | IO_SYNC)) {
527 		if (np->n_flag & NLMODIFIED) {
528 			np->n_attrstamp = 0;
529 			error = nfs_flush(vp, MNT_WAIT, td, 0);
530 			/* error = nfs_vinvalbuf(vp, V_SAVE, 1); */
531 			if (error)
532 				goto  done;
533 		}
534 	}
535 
536 	/*
537 	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
538 	 * get the append lock.
539 	 */
540 restart:
541 	if (ioflag & IO_APPEND) {
542 		np->n_attrstamp = 0;
543 		error = VOP_GETATTR(vp, &vattr);
544 		if (error)
545 			goto done;
546 		uio->uio_offset = np->n_size;
547 	}
548 
549 	if (uio->uio_offset < 0) {
550 		error = EINVAL;
551 		goto done;
552 	}
553 	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) {
554 		error = EFBIG;
555 		goto done;
556 	}
557 	if (uio->uio_resid == 0) {
558 		error = 0;
559 		goto done;
560 	}
561 
562 	/*
563 	 * We need to obtain the rslock if we intend to modify np->n_size
564 	 * in order to guarantee the append point with multiple contending
565 	 * writers, to guarantee that no other appenders modify n_size
566 	 * while we are trying to obtain a truncated buffer (i.e. to avoid
567 	 * accidentally truncating data written by another appender due to
568 	 * the race), and to ensure that the buffer is populated prior to
569 	 * our extending of the file.  We hold rslock through the entire
570 	 * operation.
571 	 *
572 	 * Note that we do not synchronize the case where someone truncates
573 	 * the file while we are appending to it because attempting to lock
574 	 * this case may deadlock other parts of the system unexpectedly.
575 	 */
576 	if ((ioflag & IO_APPEND) ||
577 	    uio->uio_offset + uio->uio_resid > np->n_size) {
578 		switch(nfs_rslock(np)) {
579 		case ENOLCK:
580 			goto restart;
581 			/* not reached */
582 		case EINTR:
583 		case ERESTART:
584 			error = EINTR;
585 			goto done;
586 			/* not reached */
587 		default:
588 			break;
589 		}
590 		haverslock = 1;
591 	}
592 
593 	/*
594 	 * Maybe this should be above the vnode op call, but so long as
595 	 * file servers have no limits, I don't think it matters
596 	 */
597 	if (td && td->td_proc && uio->uio_offset + uio->uio_resid >
598 	      td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
599 		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
600 		if (haverslock)
601 			nfs_rsunlock(np);
602 		error = EFBIG;
603 		goto done;
604 	}
605 
606 	biosize = vp->v_mount->mnt_stat.f_iosize;
607 
608 	do {
609 		nfsstats.biocache_writes++;
610 		boff = uio->uio_offset & (biosize-1);
611 		loffset = uio->uio_offset - boff;
612 		bytes = (int)szmin((unsigned)(biosize - boff), uio->uio_resid);
613 again:
614 		/*
615 		 * Handle direct append and file extension cases, calculate
616 		 * unaligned buffer size.  When extending B_CACHE will be
617 		 * set if possible.  See UIO_NOCOPY note below.
618 		 */
619 		if (uio->uio_offset + bytes > np->n_size) {
620 			np->n_flag |= NLMODIFIED;
621 			trivial = (uio->uio_segflg != UIO_NOCOPY &&
622 				   uio->uio_offset <= np->n_size);
623 			nfs_meta_setsize(vp, td, uio->uio_offset + bytes,
624 					 trivial);
625 		}
626 		bp = nfs_getcacheblk(vp, loffset, biosize, td);
627 		if (bp == NULL) {
628 			error = EINTR;
629 			break;
630 		}
631 
632 		/*
633 		 * Actual bytes in buffer which we care about
634 		 */
635 		if (loffset + biosize < np->n_size)
636 			bcount = biosize;
637 		else
638 			bcount = (int)(np->n_size - loffset);
639 
640 		/*
641 		 * Avoid a read by setting B_CACHE where the data we
642 		 * intend to write covers the entire buffer.  Note
643 		 * that the buffer may have been set to B_CACHE by
644 		 * nfs_meta_setsize() above or otherwise inherited the
645 		 * flag, but if B_CACHE isn't set the buffer may be
646 		 * uninitialized and must be zero'd to accommodate
647 		 * future seek+write's.
648 		 *
649 		 * See the comments in kern/vfs_bio.c's getblk() for
650 		 * more information.
651 		 *
652 		 * When doing a UIO_NOCOPY write the buffer is not
653 		 * overwritten and we cannot just set B_CACHE unconditionally
654 		 * for full-block writes.
655 		 */
656 		if (boff == 0 && bytes == biosize &&
657 		    uio->uio_segflg != UIO_NOCOPY) {
658 			bp->b_flags |= B_CACHE;
659 			bp->b_flags &= ~(B_ERROR | B_INVAL);
660 		}
661 
662 		/*
663 		 * b_resid may be set due to file EOF if we extended out.
664 		 * The NFS bio code will zero the difference anyway so
665 		 * just acknowledge the fact and set b_resid to 0.
666 		 */
667 		if ((bp->b_flags & B_CACHE) == 0) {
668 			bp->b_cmd = BUF_CMD_READ;
669 			bp->b_bio2.bio_done = nfsiodone_sync;
670 			bp->b_bio2.bio_flags |= BIO_SYNC;
671 			vfs_busy_pages(vp, bp);
672 			error = nfs_doio(vp, &bp->b_bio2, td);
673 			if (error) {
674 				brelse(bp);
675 				break;
676 			}
677 			bp->b_resid = 0;
678 		}
679 		np->n_flag |= NLMODIFIED;
680 
681 		/*
682 		 * If dirtyend exceeds file size, chop it down.  This should
683 		 * not normally occur but there is an append race where it
684 		 * might occur XXX, so we log it.
685 		 *
686 		 * If the chopping creates a reverse-indexed or degenerate
687 		 * situation with dirtyoff/end, we 0 both of them.
688 		 */
689 		if (bp->b_dirtyend > bcount) {
690 			kprintf("NFS append race @%08llx:%d\n",
691 			    (long long)bp->b_bio2.bio_offset,
692 			    bp->b_dirtyend - bcount);
693 			bp->b_dirtyend = bcount;
694 		}
695 
696 		if (bp->b_dirtyoff >= bp->b_dirtyend)
697 			bp->b_dirtyoff = bp->b_dirtyend = 0;
698 
699 		/*
700 		 * If the new write will leave a contiguous dirty
701 		 * area, just update the b_dirtyoff and b_dirtyend,
702 		 * otherwise force a write rpc of the old dirty area.
703 		 *
704 		 * While it is possible to merge discontiguous writes due to
705 		 * our having a B_CACHE buffer ( and thus valid read data
706 		 * for the hole), we don't because it could lead to
707 		 * significant cache coherency problems with multiple clients,
708 		 * especially if locking is implemented later on.
709 		 *
710 		 * as an optimization we could theoretically maintain
711 		 * a linked list of discontinuous areas, but we would still
712 		 * have to commit them separately so there isn't much
713 		 * advantage to it except perhaps a bit of asynchronization.
714 		 */
715 		if (bp->b_dirtyend > 0 &&
716 		    (boff > bp->b_dirtyend ||
717 		     (boff + bytes) < bp->b_dirtyoff)
718 		) {
719 			if (bwrite(bp) == EINTR) {
720 				error = EINTR;
721 				break;
722 			}
723 			goto again;
724 		}
725 
726 		error = uiomove(bp->b_data + boff, bytes, uio);
727 
728 		/*
729 		 * Since this block is being modified, it must be written
730 		 * again and not just committed.  Since write clustering does
731 		 * not work for the stage 1 data write, only the stage 2
732 		 * commit rpc, we have to clear B_CLUSTEROK as well.
733 		 */
734 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
735 
736 		if (error) {
737 			brelse(bp);
738 			break;
739 		}
740 
741 		/*
742 		 * Only update dirtyoff/dirtyend if not a degenerate
743 		 * condition.
744 		 *
745 		 * The underlying VM pages have been marked valid by
746 		 * virtue of acquiring the bp.  Because the entire buffer
747 		 * is marked dirty we do not have to worry about cleaning
748 		 * out the related dirty bits (and wouldn't really know
749 		 * how to deal with byte ranges anyway)
750 		 */
751 		if (bytes) {
752 			if (bp->b_dirtyend > 0) {
753 				bp->b_dirtyoff = imin(boff, bp->b_dirtyoff);
754 				bp->b_dirtyend = imax(boff + bytes,
755 						      bp->b_dirtyend);
756 			} else {
757 				bp->b_dirtyoff = boff;
758 				bp->b_dirtyend = boff + bytes;
759 			}
760 		}
761 
762 		/*
763 		 * If the lease is non-cachable or IO_SYNC do bwrite().
764 		 *
765 		 * IO_INVAL appears to be unused.  The idea appears to be
766 		 * to turn off caching in this case.  Very odd.  XXX
767 		 *
768 		 * If nfs_async is set bawrite() will use an unstable write
769 		 * (build dirty bufs on the server), so we might as well
770 		 * push it out with bawrite().  If nfs_async is not set we
771 		 * use bdwrite() to cache dirty bufs on the client.
772 		 */
773 		if (ioflag & IO_SYNC) {
774 			if (ioflag & IO_INVAL)
775 				bp->b_flags |= B_NOCACHE;
776 			error = bwrite(bp);
777 			if (error)
778 				break;
779 		} else if (boff + bytes == biosize && nfs_async) {
780 			bawrite(bp);
781 		} else {
782 			bdwrite(bp);
783 		}
784 	} while (uio->uio_resid > 0 && bytes > 0);
785 
786 	if (haverslock)
787 		nfs_rsunlock(np);
788 
789 done:
790 	lwkt_reltoken(&nmp->nm_token);
791 	return (error);
792 }
793 
794 /*
795  * Get an nfs cache block.
796  *
797  * Allocate a new one if the block isn't currently in the cache
798  * and return the block marked busy. If the calling process is
799  * interrupted by a signal for an interruptible mount point, return
800  * NULL.
801  *
802  * The caller must carefully deal with the possible B_INVAL state of
803  * the buffer.  nfs_startio() clears B_INVAL (and nfs_asyncio() clears it
804  * indirectly), so synchronous reads can be issued without worrying about
805  * the B_INVAL state.  We have to be a little more careful when dealing
806  * with writes (see comments in nfs_write()) when extending a file past
807  * its EOF.
808  */
809 static struct buf *
810 nfs_getcacheblk(struct vnode *vp, off_t loffset, int size, struct thread *td)
811 {
812 	struct buf *bp;
813 	struct mount *mp;
814 	struct nfsmount *nmp;
815 
816 	mp = vp->v_mount;
817 	nmp = VFSTONFS(mp);
818 
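	/*
	 * Added note: on interruptible mounts let the initial getblk()
	 * be broken by a signal (GETBLK_PCATCH), then poll nfs_sigintr()
	 * between timed retries so a hung server cannot leave the
	 * process stuck uninterruptibly.
	 */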
819 	if (nmp->nm_flag & NFSMNT_INT) {
820 		bp = getblk(vp, loffset, size, GETBLK_PCATCH, 0);
821 		while (bp == NULL) {
822 			if (nfs_sigintr(nmp, NULL, td))
823 				return (NULL);
824 			bp = getblk(vp, loffset, size, 0, 2 * hz);
825 		}
826 	} else {
827 		bp = getblk(vp, loffset, size, 0, 0);
828 	}
829 
830 	/*
831 	 * bio2, the 'device' layer.  Since BIOs use 64 bit byte offsets
832 	 * now, no translation is necessary.
833 	 */
834 	bp->b_bio2.bio_offset = loffset;
835 	return (bp);
836 }
837 
838 /*
839  * Flush and invalidate all dirty buffers. If another process is already
840  * doing the flush, just wait for completion.
841  */
842 int
843 nfs_vinvalbuf(struct vnode *vp, int flags, int intrflg)
844 {
845 	struct nfsnode *np = VTONFS(vp);
846 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
847 	int error = 0, slpflag, slptimeo;
848 	thread_t td = curthread;
849 
850 	if (vp->v_flag & VRECLAIMED)
851 		return (0);
852 
853 	if ((nmp->nm_flag & NFSMNT_INT) == 0)
854 		intrflg = 0;
855 	if (intrflg) {
856 		slpflag = PCATCH;
857 		slptimeo = 2 * hz;
858 	} else {
859 		slpflag = 0;
860 		slptimeo = 0;
861 	}
862 	/*
863 	 * First wait for any other process doing a flush to complete.
864 	 */
865 	while (np->n_flag & NFLUSHINPROG) {
866 		np->n_flag |= NFLUSHWANT;
867 		error = tsleep((caddr_t)&np->n_flag, 0, "nfsvinval", slptimeo);
868 		if (error && intrflg && nfs_sigintr(nmp, NULL, td))
869 			return (EINTR);
870 	}
871 
872 	/*
873 	 * Now, flush as required.
874 	 */
875 	np->n_flag |= NFLUSHINPROG;
876 	error = vinvalbuf(vp, flags, slpflag, 0);
877 	while (error) {
878 		if (intrflg && nfs_sigintr(nmp, NULL, td)) {
879 			np->n_flag &= ~NFLUSHINPROG;
880 			if (np->n_flag & NFLUSHWANT) {
881 				np->n_flag &= ~NFLUSHWANT;
882 				wakeup((caddr_t)&np->n_flag);
883 			}
884 			return (EINTR);
885 		}
886 		error = vinvalbuf(vp, flags, 0, slptimeo);
887 	}
888 	np->n_flag &= ~(NLMODIFIED | NFLUSHINPROG);
889 	if (np->n_flag & NFLUSHWANT) {
890 		np->n_flag &= ~NFLUSHWANT;
891 		wakeup((caddr_t)&np->n_flag);
892 	}
893 	return (0);
894 }
895 
896 /*
897  * Return true (non-zero) if the txthread and rxthread are operational
898  * and we do not already have too many not-yet-started BIO's built up.
899  */
900 int
901 nfs_asyncok(struct nfsmount *nmp)
902 {
903 	return (nmp->nm_bioqlen < nfs_maxasyncbio &&
904 		nmp->nm_bioqlen < nmp->nm_maxasync_scaled / NFS_ASYSCALE &&
905 		nmp->nm_rxstate <= NFSSVC_PENDING &&
906 		nmp->nm_txstate <= NFSSVC_PENDING);
907 }
908 
909 /*
910  * The read-ahead code calls this to queue a bio to the txthread.
911  *
912  * We don't touch the bio otherwise... that is, we do not even
913  * construct or send the initial rpc.  The txthread will do it
914  * for us.
915  *
916  * NOTE!  nm_bioqlen is not decremented until the request completes,
917  *	  so it does not reflect the number of bio's on bioq.
918  */
919 void
920 nfs_asyncio(struct vnode *vp, struct bio *bio)
921 {
922 	struct buf *bp = bio->bio_buf;
923 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
924 
925 	KKASSERT(vp->v_tag == VT_NFS);
926 	BUF_KERNPROC(bp);
927 
928 	/*
929 	 * Shortcut swap cache (not done automatically because we are not
930 	 * using bread()).
931 	 */
932 	if (vn_cache_strategy(vp, bio))
933 		return;
934 
935 	bio->bio_driver_info = vp;
936 	crit_enter();
937 	TAILQ_INSERT_TAIL(&nmp->nm_bioq, bio, bio_act);
938 	atomic_add_int(&nmp->nm_bioqlen, 1);
939 	crit_exit();
940 	nfssvc_iod_writer_wakeup(nmp);
941 }
942 
943 /*
944  * nfs_dio()	- Execute a BIO operation synchronously.  The BIO will be
945  *		  completed and its error returned.  The caller is responsible
946  *		  for brelse()ing it.  ONLY USE FOR BIO_SYNC IOs!  Otherwise
947  *		  our error probe will be against an invalid pointer.
948  *
949  * nfs_startio()- Execute a BIO operation asynchronously.
950  *
951  * NOTE: nfs_asyncio() is used to initiate an asynchronous BIO operation,
952  *	 which basically just queues it to the txthread.  nfs_startio()
953  *	 actually initiates the I/O AFTER it has gotten to the txthread.
954  *
955  * NOTE: td might be NULL.
956  *
957  * NOTE: Caller has already busied the I/O.
958  */
959 void
960 nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
961 {
962 	struct buf *bp = bio->bio_buf;
963 	struct nfsnode *np;
964 	struct nfsmount *nmp;
965 
966 	KKASSERT(vp->v_tag == VT_NFS);
967 	np = VTONFS(vp);
968 	nmp = VFSTONFS(vp->v_mount);
969 
970 	/*
971 	 * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
972 	 * do this here so we do not have to do it in all the code that
973 	 * calls us.
974 	 */
975 	bp->b_flags &= ~(B_ERROR | B_INVAL);
976 
977 	KASSERT(bp->b_cmd != BUF_CMD_DONE,
978 		("nfs_doio: bp %p already marked done!", bp));
979 
980 	if (bp->b_cmd == BUF_CMD_READ) {
981 	    switch (vp->v_type) {
982 	    case VREG:
983 		nfsstats.read_bios++;
984 		nfs_readrpc_bio(vp, bio);
985 		break;
986 	    case VLNK:
987 #if 0
988 		bio->bio_offset = 0;
989 		nfsstats.readlink_bios++;
990 		nfs_readlinkrpc_bio(vp, bio);
991 #else
992 		nfs_doio(vp, bio, td);
993 #endif
994 		break;
995 	    case VDIR:
996 		/*
997 		 * NOTE: If nfs_readdirplusrpc_bio() is requested but
998 		 *	 not supported, it will chain to
999 		 *	 nfs_readdirrpc_bio().
1000 		 */
1001 #if 0
1002 		nfsstats.readdir_bios++;
1003 		uiop->uio_offset = bio->bio_offset;
1004 		if (nmp->nm_flag & NFSMNT_RDIRPLUS)
1005 			nfs_readdirplusrpc_bio(vp, bio);
1006 		else
1007 			nfs_readdirrpc_bio(vp, bio);
1008 #else
1009 		nfs_doio(vp, bio, td);
1010 #endif
1011 		break;
1012 	    default:
1013 		kprintf("nfs_doio:  type %x unexpected\n",vp->v_type);
1014 		bp->b_flags |= B_ERROR;
1015 		bp->b_error = EINVAL;
1016 		biodone(bio);
1017 		break;
1018 	    }
1019 	} else {
1020 	    /*
1021 	     * If we only need to commit, try to commit.  If this fails
1022 	     * it will chain through to the write.  Basically all the logic
1023 	     * in nfs_doio() is replicated.
1024 	     */
1025 	    KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
1026 	    if (bp->b_flags & B_NEEDCOMMIT)
1027 		nfs_commitrpc_bio(vp, bio);
1028 	    else
1029 		nfs_writerpc_bio(vp, bio);
1030 	}
1031 }
1032 
1033 int
1034 nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
1035 {
1036 	struct buf *bp = bio->bio_buf;
1037 	struct uio *uiop;
1038 	struct nfsnode *np;
1039 	struct nfsmount *nmp;
1040 	int error = 0;
1041 	int iomode, must_commit;
1042 	size_t n;
1043 	struct uio uio;
1044 	struct iovec io;
1045 
1046 #if 0
1047 	/*
1048 	 * Shortcut swap cache (not done automatically because we are not
1049 	 * using bread()).
1050 	 *
1051 	 * XXX The biowait is a hack until we can figure out how to stop a
1052 	 * biodone chain when a middle element is BIO_SYNC.  BIO_SYNC is
1053 	 * set so the bp shouldn't get ripped out from under us.  The only
1054 	 * use-cases are fully synchronous I/O cases.
1055 	 *
1056 	 * XXX This is having problems, give up for now.
1057 	 */
1058 	if (vn_cache_strategy(vp, bio)) {
1059 		kprintf("X");
1060 		error = biowait(&bio->bio_buf->b_bio1, "nfsrsw");
1061 		return (error);
1062 	}
1063 #endif
1064 
1065 	KKASSERT(vp->v_tag == VT_NFS);
1066 	np = VTONFS(vp);
1067 	nmp = VFSTONFS(vp->v_mount);
1068 	uiop = &uio;
1069 	uiop->uio_iov = &io;
1070 	uiop->uio_iovcnt = 1;
1071 	uiop->uio_segflg = UIO_SYSSPACE;
1072 	uiop->uio_td = td;
1073 
1074 	/*
1075 	 * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
1076 	 * do this here so we do not have to do it in all the code that
1077 	 * calls us.
1078 	 */
1079 	bp->b_flags &= ~(B_ERROR | B_INVAL);
1080 
1081 	KASSERT(bp->b_cmd != BUF_CMD_DONE,
1082 		("nfs_doio: bp %p already marked done!", bp));
1083 
1084 	if (bp->b_cmd == BUF_CMD_READ) {
1085 	    io.iov_len = uiop->uio_resid = (size_t)bp->b_bcount;
1086 	    io.iov_base = bp->b_data;
1087 	    uiop->uio_rw = UIO_READ;
1088 
1089 	    switch (vp->v_type) {
1090 	    case VREG:
1091 		/*
1092 		 * When reading from a regular file zero-fill any residual.
1093 		 * Note that this residual has nothing to do with NFS short
1094 		 * reads, which nfs_readrpc_uio() will handle for us.
1095 		 *
1096 		 * We have to do this because when we are write extending
1097 		 * a file the server may not have the same notion of
1098 		 * filesize as we do.  Our BIOs should already be sized
1099 		 * (b_bcount) to account for the file EOF.
1100 		 */
1101 		nfsstats.read_bios++;
1102 		uiop->uio_offset = bio->bio_offset;
1103 		error = nfs_readrpc_uio(vp, uiop);
1104 		if (error == 0 && uiop->uio_resid) {
1105 			n = (size_t)bp->b_bcount - uiop->uio_resid;
1106 			bzero(bp->b_data + n, bp->b_bcount - n);
1107 			uiop->uio_resid = 0;
1108 		}
1109 		if (td && td->td_proc && (vp->v_flag & VTEXT) &&
1110 		    np->n_mtime != np->n_vattr.va_mtime.tv_sec) {
1111 			uprintf("Process killed due to text file modification\n");
1112 			ksignal(td->td_proc, SIGKILL);
1113 		}
1114 		break;
1115 	    case VLNK:
1116 		uiop->uio_offset = 0;
1117 		nfsstats.readlink_bios++;
1118 		error = nfs_readlinkrpc_uio(vp, uiop);
1119 		break;
1120 	    case VDIR:
1121 		nfsstats.readdir_bios++;
1122 		uiop->uio_offset = bio->bio_offset;
1123 		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1124 			error = nfs_readdirplusrpc_uio(vp, uiop);
1125 			if (error == NFSERR_NOTSUPP)
1126 				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1127 		}
1128 		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1129 			error = nfs_readdirrpc_uio(vp, uiop);
1130 		/*
1131 		 * end-of-directory sets B_INVAL but does not generate an
1132 		 * error.
1133 		 */
1134 		if (error == 0 && uiop->uio_resid == bp->b_bcount)
1135 			bp->b_flags |= B_INVAL;
1136 		break;
1137 	    default:
1138 		kprintf("nfs_doio:  type %x unexpected\n",vp->v_type);
1139 		break;
1140 	    };
1141 	    if (error) {
1142 		bp->b_flags |= B_ERROR;
1143 		bp->b_error = error;
1144 	    }
1145 	    bp->b_resid = uiop->uio_resid;
1146 	} else {
1147 	    /*
1148 	     * If we only need to commit, try to commit.
1149 	     *
1150 	     * NOTE: The I/O has already been staged for the write and
1151 	     *	     its pages busied, so b_dirtyoff/end is valid.
1152 	     */
1153 	    KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
1154 	    if (bp->b_flags & B_NEEDCOMMIT) {
1155 		    int retv;
1156 		    off_t off;
1157 
1158 		    off = bio->bio_offset + bp->b_dirtyoff;
1159 		    retv = nfs_commitrpc_uio(vp, off,
1160 					     bp->b_dirtyend - bp->b_dirtyoff,
1161 					     td);
1162 		    if (retv == 0) {
1163 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
1164 			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1165 			    bp->b_resid = 0;
1166 			    biodone(bio);
1167 			    return(0);
1168 		    }
1169 		    if (retv == NFSERR_STALEWRITEVERF) {
1170 			    nfs_clearcommit(vp->v_mount);
1171 		    }
1172 	    }
1173 
1174 	    /*
1175 	     * Setup for actual write
1176 	     */
1177 	    if (bio->bio_offset + bp->b_dirtyend > np->n_size)
1178 		bp->b_dirtyend = np->n_size - bio->bio_offset;
1179 
1180 	    if (bp->b_dirtyend > bp->b_dirtyoff) {
1181 		io.iov_len = uiop->uio_resid = bp->b_dirtyend
1182 		    - bp->b_dirtyoff;
1183 		uiop->uio_offset = bio->bio_offset + bp->b_dirtyoff;
1184 		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1185 		uiop->uio_rw = UIO_WRITE;
1186 		nfsstats.write_bios++;
1187 
1188 		if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0)
1189 		    iomode = NFSV3WRITE_UNSTABLE;
1190 		else
1191 		    iomode = NFSV3WRITE_FILESYNC;
1192 
1193 		must_commit = 0;
1194 		error = nfs_writerpc_uio(vp, uiop, &iomode, &must_commit);
1195 
1196 		/*
1197 		 * We no longer try to use kern/vfs_bio's cluster code to
1198 		 * cluster commits, so B_CLUSTEROK is no longer set with
1199 		 * B_NEEDCOMMIT.  The problem is that a vfs_busy_pages()
1200 		 * may have to clear B_NEEDCOMMIT if it finds underlying
1201 		 * pages have been redirtied through a memory mapping
1202 		 * and doing this on a clustered bp will probably cause
1203 		 * a panic, plus the flag in the underlying NFS bufs
1204 		 * making up the cluster bp will not be properly cleared.
1205 		 */
1206 		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1207 		    bp->b_flags |= B_NEEDCOMMIT;
1208 #if 0
1209 		    /* XXX do not enable commit clustering */
1210 		    if (bp->b_dirtyoff == 0
1211 			&& bp->b_dirtyend == bp->b_bcount)
1212 			bp->b_flags |= B_CLUSTEROK;
1213 #endif
1214 		} else {
1215 		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1216 		}
1217 
1218 		/*
1219 		 * For an interrupted write, the buffer is still valid
1220 		 * and the write hasn't been pushed to the server yet,
1221 		 * so we can't set B_ERROR and report the interruption
1222 		 * by setting B_EINTR. For the async case, B_EINTR
1223 		 * is not relevant, so the rpc attempt is essentially
1224 		 * a noop.  For the case of a V3 write rpc not being
1225 		 * committed to stable storage, the block is still
1226 		 * dirty and requires either a commit rpc or another
1227 		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1228 		 * the block is reused. This is indicated by setting
1229 		 * the B_DELWRI and B_NEEDCOMMIT flags.
1230 		 *
1231 		 * If the buffer is marked B_PAGING, it does not reside on
1232 		 * the vp's paging queues so we cannot call bdirty().  The
1233 		 * bp in this case is not an NFS cache block so we should
1234 		 * be safe. XXX
1235 		 */
1236     		if (error == EINTR
1237 		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1238 			crit_enter();
1239 			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1240 			if ((bp->b_flags & B_PAGING) == 0)
1241 			    bdirty(bp);
1242 			if (error)
1243 			    bp->b_flags |= B_EINTR;
1244 			crit_exit();
1245 	    	} else {
1246 		    if (error) {
1247 			bp->b_flags |= B_ERROR;
1248 			bp->b_error = np->n_error = error;
1249 			np->n_flag |= NWRITEERR;
1250 		    }
1251 		    bp->b_dirtyoff = bp->b_dirtyend = 0;
1252 		}
1253 		if (must_commit)
1254 		    nfs_clearcommit(vp->v_mount);
1255 		bp->b_resid = uiop->uio_resid;
1256 	    } else {
1257 		bp->b_resid = 0;
1258 	    }
1259 	}
1260 
1261 	/*
1262 	 * I/O was run synchronously, biodone() it and calculate the
1263 	 * error to return.
1264 	 */
1265 	biodone(bio);
1266 	KKASSERT(bp->b_cmd == BUF_CMD_DONE);
1267 	if (bp->b_flags & B_EINTR)
1268 		return (EINTR);
1269 	if (bp->b_flags & B_ERROR)
1270 		return (bp->b_error ? bp->b_error : EIO);
1271 	return (0);
1272 }
1273 
1274 /*
1275  * Handle all truncation, write-extend, and ftruncate()-extend operations
1276  * on the NFS client side.
1277  *
1278  * We use the new API in kern/vfs_vm.c to perform these operations in a
1279  * VM-friendly way.  With this API VM pages are properly zeroed and pages
1280  * still mapped into the buffer straddling EOF are not invalidated.
1281  */
1282 int
1283 nfs_meta_setsize(struct vnode *vp, struct thread *td, off_t nsize, int trivial)
1284 {
1285 	struct nfsnode *np = VTONFS(vp);
1286 	off_t osize;
1287 	int biosize = vp->v_mount->mnt_stat.f_iosize;
1288 	int error;
1289 
1290 	osize = np->n_size;
1291 	np->n_size = nsize;
1292 
1293 	if (nsize < osize) {
1294 		error = nvtruncbuf(vp, nsize, biosize, -1);
1295 	} else {
1296 		error = nvextendbuf(vp, osize, nsize,
1297 				    biosize, biosize, -1, -1,
1298 				    trivial);
1299 	}
1300 	return(error);
1301 }
1302 
1303 /*
1304  * Synchronous completion for nfs_doio.  Call bpdone() with elseit=FALSE.
1305  * Caller is responsible for brelse()'ing the bp.
1306  */
1307 static void
1308 nfsiodone_sync(struct bio *bio)
1309 {
1310 	bio->bio_flags = 0;
1311 	bpdone(bio->bio_buf, 0);
1312 }
1313 
1314 /*
1315  * nfs read rpc - BIO version
1316  */
1317 void
1318 nfs_readrpc_bio(struct vnode *vp, struct bio *bio)
1319 {
1320 	struct buf *bp = bio->bio_buf;
1321 	u_int32_t *tl;
1322 	struct nfsmount *nmp;
1323 	int error = 0, len, tsiz;
1324 	struct nfsm_info *info;
1325 
1326 	info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK);
1327 	info->mrep = NULL;
1328 	info->v3 = NFS_ISV3(vp);
1329 
1330 	nmp = VFSTONFS(vp->v_mount);
1331 	tsiz = bp->b_bcount;
1332 	KKASSERT(tsiz <= nmp->nm_rsize);
1333 	if (bio->bio_offset + tsiz > nmp->nm_maxfilesize) {
1334 		error = EFBIG;
1335 		goto nfsmout;
1336 	}
1337 	nfsstats.rpccnt[NFSPROC_READ]++;
1338 	len = tsiz;
1339 	nfsm_reqhead(info, vp, NFSPROC_READ,
1340 		     NFSX_FH(info->v3) + NFSX_UNSIGNED * 3);
1341 	ERROROUT(nfsm_fhtom(info, vp));
1342 	tl = nfsm_build(info, NFSX_UNSIGNED * 3);
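	/*
	 * Added note: v3 READ arguments are a 64 bit offset plus a 32 bit
	 * count.  v2 uses a 32 bit offset, a 32 bit count, and a trailing
	 * totalcount word which is simply zeroed here.
	 */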
1343 	if (info->v3) {
1344 		txdr_hyper(bio->bio_offset, tl);
1345 		*(tl + 2) = txdr_unsigned(len);
1346 	} else {
1347 		*tl++ = txdr_unsigned(bio->bio_offset);
1348 		*tl++ = txdr_unsigned(len);
1349 		*tl = 0;
1350 	}
1351 	info->bio = bio;
1352 	info->done = nfs_readrpc_bio_done;
1353 	nfsm_request_bio(info, vp, NFSPROC_READ, NULL,
1354 			 nfs_vpcred(vp, ND_READ));
1355 	return;
1356 nfsmout:
1357 	kfree(info, M_NFSREQ);
1358 	bp->b_error = error;
1359 	bp->b_flags |= B_ERROR;
1360 	biodone(bio);
1361 }
1362 
1363 static void
1364 nfs_readrpc_bio_done(nfsm_info_t info)
1365 {
1366 	struct nfsmount *nmp = VFSTONFS(info->vp->v_mount);
1367 	struct bio *bio = info->bio;
1368 	struct buf *bp = bio->bio_buf;
1369 	u_int32_t *tl;
1370 	int attrflag;
1371 	int retlen;
1372 	int eof;
1373 	int error = 0;
1374 
1375 	KKASSERT(info->state == NFSM_STATE_DONE);
1376 
1377 	lwkt_gettoken(&nmp->nm_token);
1378 
1379 	if (info->v3) {
1380 		ERROROUT(nfsm_postop_attr(info, info->vp, &attrflag,
1381 					 NFS_LATTR_NOSHRINK));
1382 		NULLOUT(tl = nfsm_dissect(info, 2 * NFSX_UNSIGNED));
1383 		eof = fxdr_unsigned(int, *(tl + 1));
1384 	} else {
1385 		ERROROUT(nfsm_loadattr(info, info->vp, NULL));
1386 		eof = 0;
1387 	}
1388 	NEGATIVEOUT(retlen = nfsm_strsiz(info, nmp->nm_rsize));
1389 	ERROROUT(nfsm_mtobio(info, bio, retlen));
1390 	m_freem(info->mrep);
1391 	info->mrep = NULL;
1392 
1393 	/*
1394 	 * If no error occurred and retlen is less than bcount with no EOF
1395 	 * indicated (NFSv3), a short read occurred and we zero-fill the rest.
1396 	 *
1397 	 * For NFSv2 a short-read indicates EOF.
1398 	 */
1399 	if (retlen < bp->b_bcount && info->v3 && eof == 0) {
1400 		bzero(bp->b_data + retlen, bp->b_bcount - retlen);
1401 		retlen = bp->b_bcount;
1402 	}
1403 
1404 	/*
1405 	 * If we hit an EOF we still zero-fill, but return the expected
1406 	 * BIOs are not used for the read-before-write case.  Races against
1407 	 * BIOs are not used for read-before-write case.  Races against
1408 	 * the server can cause it though and we don't want to leave
1409 	 * garbage in the buffer.
1410 	 */
1411 	if (retlen < bp->b_bcount) {
1412 		bzero(bp->b_data + retlen, bp->b_bcount - retlen);
1413 	}
1414 	bp->b_resid = 0;
1415 	/* bp->b_resid = bp->b_bcount - retlen; */
1416 nfsmout:
1417 	lwkt_reltoken(&nmp->nm_token);
1418 	kfree(info, M_NFSREQ);
1419 	if (error) {
1420 		bp->b_error = error;
1421 		bp->b_flags |= B_ERROR;
1422 	}
1423 	biodone(bio);
1424 }
1425 
1426 /*
1427  * nfs write call - BIO version
1428  *
1429  * NOTE: Caller has already busied the I/O.
1430  */
1431 void
1432 nfs_writerpc_bio(struct vnode *vp, struct bio *bio)
1433 {
1434 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1435 	struct nfsnode *np = VTONFS(vp);
1436 	struct buf *bp = bio->bio_buf;
1437 	u_int32_t *tl;
1438 	int len;
1439 	int iomode;
1440 	int error = 0;
1441 	struct nfsm_info *info;
1442 	off_t offset;
1443 
1444 	/*
1445 	 * Setup for actual write.  Just clean up the bio if there
1446 	 * is nothing to do.  b_dirtyoff/end have already been staged
1447 	 * by the bp's pages getting busied.
1448 	 */
1449 	if (bio->bio_offset + bp->b_dirtyend > np->n_size)
1450 		bp->b_dirtyend = np->n_size - bio->bio_offset;
1451 
1452 	if (bp->b_dirtyend <= bp->b_dirtyoff) {
1453 		bp->b_resid = 0;
1454 		biodone(bio);
1455 		return;
1456 	}
1457 	len = bp->b_dirtyend - bp->b_dirtyoff;
1458 	offset = bio->bio_offset + bp->b_dirtyoff;
1459 	if (offset + len > nmp->nm_maxfilesize) {
1460 		bp->b_flags |= B_ERROR;
1461 		bp->b_error = EFBIG;
1462 		biodone(bio);
1463 		return;
1464 	}
1465 	bp->b_resid = len;
1466 	nfsstats.write_bios++;
1467 
1468 	info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK);
1469 	info->mrep = NULL;
1470 	info->v3 = NFS_ISV3(vp);
1471 	info->info_writerpc.must_commit = 0;
1472 	if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0)
1473 		iomode = NFSV3WRITE_UNSTABLE;
1474 	else
1475 		iomode = NFSV3WRITE_FILESYNC;
1476 
1477 	KKASSERT(len <= nmp->nm_wsize);
1478 
1479 	nfsstats.rpccnt[NFSPROC_WRITE]++;
1480 	nfsm_reqhead(info, vp, NFSPROC_WRITE,
1481 		     NFSX_FH(info->v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
1482 	ERROROUT(nfsm_fhtom(info, vp));
1483 	if (info->v3) {
1484 		tl = nfsm_build(info, 5 * NFSX_UNSIGNED);
1485 		txdr_hyper(offset, tl);
1486 		tl += 2;
1487 		*tl++ = txdr_unsigned(len);
1488 		*tl++ = txdr_unsigned(iomode);
1489 		*tl = txdr_unsigned(len);
1490 	} else {
1491 		u_int32_t x;
1492 
1493 		tl = nfsm_build(info, 4 * NFSX_UNSIGNED);
1494 		/* Set both "begin" and "current" to non-garbage. */
1495 		x = txdr_unsigned((u_int32_t)offset);
1496 		*tl++ = x;	/* "begin offset" */
1497 		*tl++ = x;	/* "current offset" */
1498 		x = txdr_unsigned(len);
1499 		*tl++ = x;	/* total to this offset */
1500 		*tl = x;	/* size of this write */
1501 	}
1502 	ERROROUT(nfsm_biotom(info, bio, bp->b_dirtyoff, len));
1503 	info->bio = bio;
1504 	info->done = nfs_writerpc_bio_done;
1505 	nfsm_request_bio(info, vp, NFSPROC_WRITE, NULL,
1506 			 nfs_vpcred(vp, ND_WRITE));
1507 	return;
1508 nfsmout:
1509 	kfree(info, M_NFSREQ);
1510 	bp->b_error = error;
1511 	bp->b_flags |= B_ERROR;
1512 	biodone(bio);
1513 }
1514 
1515 static void
1516 nfs_writerpc_bio_done(nfsm_info_t info)
1517 {
1518 	struct nfsmount *nmp = VFSTONFS(info->vp->v_mount);
1519 	struct nfsnode *np = VTONFS(info->vp);
1520 	struct bio *bio = info->bio;
1521 	struct buf *bp = bio->bio_buf;
1522 	int wccflag = NFSV3_WCCRATTR;
1523 	int iomode = NFSV3WRITE_FILESYNC;
1524 	int commit;
1525 	int rlen;
1526 	int error;
1527 	int len = bp->b_resid;	/* b_resid was set to shortened length */
1528 	u_int32_t *tl;
1529 
1530 	lwkt_gettoken(&nmp->nm_token);
1531 
1532 	if (info->v3) {
1533 		/*
1534 		 * The write RPC returns a before and after mtime.  The
1535 		 * nfsm_wcc_data() macro checks the cached n_mtime
1536 		 * against the 'before' time and stores the 'after' time
1537 		 * in the nfsnode's cached vattr and n_mtime field.
1538 		 * The NRMODIFIED bit will be set if the before
1539 		 * time did not match the original mtime.
1540 		 */
1541 		wccflag = NFSV3_WCCCHK;
1542 		ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag));
1543 		if (error == 0) {
1544 			NULLOUT(tl = nfsm_dissect(info, 2 * NFSX_UNSIGNED + NFSX_V3WRITEVERF));
1545 			rlen = fxdr_unsigned(int, *tl++);
1546 			if (rlen == 0) {
1547 				error = NFSERR_IO;
1548 				m_freem(info->mrep);
1549 				info->mrep = NULL;
1550 				goto nfsmout;
1551 			} else if (rlen < len) {
1552 #if 0
1553 				/*
1554 				 * XXX what do we do here?
1555 				 */
1556 				backup = len - rlen;
1557 				uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup;
1558 				uiop->uio_iov->iov_len += backup;
1559 				uiop->uio_offset -= backup;
1560 				uiop->uio_resid += backup;
1561 				len = rlen;
1562 #endif
1563 			}
1564 			commit = fxdr_unsigned(int, *tl++);
1565 
1566 			/*
1567 			 * Return the lowest commitment level
1568 			 * obtained by any of the RPCs.
1569 			 */
1570 			if (iomode == NFSV3WRITE_FILESYNC)
1571 				iomode = commit;
1572 			else if (iomode == NFSV3WRITE_DATASYNC &&
1573 				commit == NFSV3WRITE_UNSTABLE)
1574 				iomode = commit;
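			/*
			 * Added note: record the server's write verifier the
			 * first time we see one.  If it changes later the
			 * server has most likely rebooted and discarded
			 * unstable writes, so flag must_commit to force the
			 * dirty data to be resent.
			 */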
1575 			if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
1576 			    bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF);
1577 			    nmp->nm_state |= NFSSTA_HASWRITEVERF;
1578 			} else if (bcmp(tl, nmp->nm_verf, NFSX_V3WRITEVERF)) {
1579 			    info->info_writerpc.must_commit = 1;
1580 			    bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF);
1581 			}
1582 		}
1583 	} else {
1584 		ERROROUT(nfsm_loadattr(info, info->vp, NULL));
1585 	}
1586 	m_freem(info->mrep);
1587 	info->mrep = NULL;
1588 	len = 0;
1589 nfsmout:
1590 	if (info->vp->v_mount->mnt_flag & MNT_ASYNC)
1591 		iomode = NFSV3WRITE_FILESYNC;
1592 	bp->b_resid = len;
1593 
1594 	/*
1595 	 * End of RPC.  Now clean up the bp.
1596 	 *
1597 	 * We no longer enable write clustering for commit operations;
1598 	 * see the more detailed comment in nfs_doio() above.
1599 	 */
1600 	if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1601 		bp->b_flags |= B_NEEDCOMMIT;
1602 #if 0
1603 		/* XXX do not enable commit clustering */
1604 		if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount)
1605 			bp->b_flags |= B_CLUSTEROK;
1606 #endif
1607 	} else {
1608 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1609 	}
1610 
1611 	/*
1612 	 * For an interrupted write, the buffer is still valid
1613 	 * and the write hasn't been pushed to the server yet,
1614 	 * so we can't set B_ERROR and report the interruption
1615 	 * by setting B_EINTR. For the async case, B_EINTR
1616 	 * is not relevant, so the rpc attempt is essentially
1617 	 * a noop.  For the case of a V3 write rpc not being
1618 	 * committed to stable storage, the block is still
1619 	 * dirty and requires either a commit rpc or another
1620 	 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1621 	 * the block is reused. This is indicated by setting
1622 	 * the B_DELWRI and B_NEEDCOMMIT flags.
1623 	 *
1624 	 * If the buffer is marked B_PAGING, it does not reside on
1625 	 * the vp's paging queues so we cannot call bdirty().  The
1626 	 * bp in this case is not an NFS cache block so we should
1627 	 * be safe. XXX
1628 	 */
1629 	if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1630 		crit_enter();
1631 		bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1632 		if ((bp->b_flags & B_PAGING) == 0)
1633 			bdirty(bp);
1634 		if (error)
1635 			bp->b_flags |= B_EINTR;
1636 		crit_exit();
1637 	} else {
1638 		if (error) {
1639 			bp->b_flags |= B_ERROR;
1640 			bp->b_error = np->n_error = error;
1641 			np->n_flag |= NWRITEERR;
1642 		}
1643 		bp->b_dirtyoff = bp->b_dirtyend = 0;
1644 	}
1645 	if (info->info_writerpc.must_commit)
1646 		nfs_clearcommit(info->vp->v_mount);
1647 	lwkt_reltoken(&nmp->nm_token);
1648 
1649 	kfree(info, M_NFSREQ);
1650 	if (error) {
1651 		bp->b_flags |= B_ERROR;
1652 		bp->b_error = error;
1653 	}
1654 	biodone(bio);
1655 }
1656 
1657 /*
1658  * Nfs Version 3 commit rpc - BIO version
1659  *
1660  * This function issues the commit rpc and will chain to a write
1661  * rpc if necessary.
1662  */
1663 void
1664 nfs_commitrpc_bio(struct vnode *vp, struct bio *bio)
1665 {
1666 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1667 	struct buf *bp = bio->bio_buf;
1668 	struct nfsm_info *info;
1669 	int error = 0;
1670 	u_int32_t *tl;
1671 
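	/*
	 * Added note: if we have never received a write verifier from the
	 * server there can be no unstable data outstanding, so there is
	 * nothing to commit and the bio can be completed immediately.
	 */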
1672 	if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
1673 		bp->b_dirtyoff = bp->b_dirtyend = 0;
1674 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1675 		bp->b_resid = 0;
1676 		biodone(bio);
1677 		return;
1678 	}
1679 
1680 	info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK);
1681 	info->mrep = NULL;
1682 	info->v3 = 1;
1683 
1684 	nfsstats.rpccnt[NFSPROC_COMMIT]++;
1685 	nfsm_reqhead(info, vp, NFSPROC_COMMIT, NFSX_FH(1));
1686 	ERROROUT(nfsm_fhtom(info, vp));
1687 	tl = nfsm_build(info, 3 * NFSX_UNSIGNED);
1688 	txdr_hyper(bio->bio_offset + bp->b_dirtyoff, tl);
1689 	tl += 2;
1690 	*tl = txdr_unsigned(bp->b_dirtyend - bp->b_dirtyoff);
1691 	info->bio = bio;
1692 	info->done = nfs_commitrpc_bio_done;
1693 	nfsm_request_bio(info, vp, NFSPROC_COMMIT, NULL,
1694 			 nfs_vpcred(vp, ND_WRITE));
1695 	return;
1696 nfsmout:
1697 	/*
1698 	 * Chain to write RPC on (early) error
1699 	 */
1700 	kfree(info, M_NFSREQ);
1701 	nfs_writerpc_bio(vp, bio);
1702 }
1703 
1704 static void
1705 nfs_commitrpc_bio_done(nfsm_info_t info)
1706 {
1707 	struct nfsmount *nmp = VFSTONFS(info->vp->v_mount);
1708 	struct bio *bio = info->bio;
1709 	struct buf *bp = bio->bio_buf;
1710 	u_int32_t *tl;
1711 	int wccflag = NFSV3_WCCRATTR;
1712 	int error = 0;
1713 
1714 	lwkt_gettoken(&nmp->nm_token);
1715 
1716 	ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag));
1717 	if (error == 0) {
1718 		NULLOUT(tl = nfsm_dissect(info, NFSX_V3WRITEVERF));
1719 		if (bcmp(nmp->nm_verf, tl, NFSX_V3WRITEVERF)) {
1720 			bcopy(tl, nmp->nm_verf, NFSX_V3WRITEVERF);
1721 			error = NFSERR_STALEWRITEVERF;
1722 		}
1723 	}
1724 	m_freem(info->mrep);
1725 	info->mrep = NULL;
1726 
1727 	/*
1728 	 * On completion we must chain to a write bio if an
1729 	 * error occurred.
1730 	 */
1731 nfsmout:
1732 	kfree(info, M_NFSREQ);
1733 	if (error == 0) {
1734 		bp->b_dirtyoff = bp->b_dirtyend = 0;
1735 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1736 		bp->b_resid = 0;
1737 		biodone(bio);
1738 	} else {
1739 		nfs_writerpc_bio(info->vp, bio);
1740 	}
1741 	lwkt_reltoken(&nmp->nm_token);
1742 }
1743 
1744