xref: /dragonfly/sys/vfs/nfs/nfs_bio.c (revision b71f52a9)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Rick Macklem at The University of Guelph.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
37  * $FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_bio.c,v 1.130 2004/04/14 23:23:55 peadar Exp $
38  * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.45 2008/07/18 00:09:39 dillon Exp $
39  */
40 
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/resourcevar.h>
45 #include <sys/signalvar.h>
46 #include <sys/proc.h>
47 #include <sys/buf.h>
48 #include <sys/vnode.h>
49 #include <sys/mount.h>
50 #include <sys/kernel.h>
51 #include <sys/buf2.h>
52 #include <sys/msfbuf.h>
53 
54 #include <vm/vm.h>
55 #include <vm/vm_extern.h>
56 #include <vm/vm_page.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_pager.h>
59 #include <vm/vnode_pager.h>
60 
61 #include <sys/thread2.h>
62 
63 #include "rpcv2.h"
64 #include "nfsproto.h"
65 #include "nfs.h"
66 #include "nfsmount.h"
67 #include "nfsnode.h"
68 
69 static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset,
70 				   int size, struct thread *td);
71 static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen);
72 static void nfsiodone_sync(struct bio *bio);
73 
74 extern int nfs_numasync;
75 extern int nfs_pbuf_freecnt;
76 extern struct nfsstats nfsstats;
77 
78 /*
79  * Vnode op for VM getpages.
80  *
81  * nfs_getpages(struct vnode *a_vp, vm_page_t *a_m, int a_count,
82  *		int a_reqpage, vm_ooffset_t a_offset)
83  */
84 int
85 nfs_getpages(struct vop_getpages_args *ap)
86 {
87 	struct thread *td = curthread;		/* XXX */
88 	int i, error, nextoff, size, toff, count, npages;
89 	struct uio uio;
90 	struct iovec iov;
91 	char *kva;
92 	struct vnode *vp;
93 	struct nfsmount *nmp;
94 	vm_page_t *pages;
95 	vm_page_t m;
96 	struct msf_buf *msf;
97 
98 	vp = ap->a_vp;
99 	nmp = VFSTONFS(vp->v_mount);
100 	pages = ap->a_m;
101 	count = ap->a_count;
102 
103 	if (vp->v_object == NULL) {
104 		kprintf("nfs_getpages: called with non-merged cache vnode??\n");
105 		return VM_PAGER_ERROR;
106 	}
107 
108 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
109 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
110 		(void)nfs_fsinfo(nmp, vp, td);
111 
112 	npages = btoc(count);
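	/*
	 * btoc() rounds the byte count up to whole pages; e.g. with
	 * 4 KB pages a count of 16384 yields npages = 4, while 16385
	 * would round up to 5.
	 */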
113 
114 	/*
115 	 * NOTE that partially valid pages may occur in cases other
116 	 * than file EOF, such as when a file is partially written and
117 	 * ftruncate()-extended to a larger size.   It is also possible
118 	 * for the valid bits to be set on garbage beyond the file EOF and
119 	 * clear in the area before EOF (e.g. m->valid == 0xfc), which can
120 	 * occur due to vtruncbuf() and the buffer cache's handling of
121 	 * pages which 'straddle' buffers or when b_bufsize is not a
122 	 * multiple of PAGE_SIZE.  The buffer cache cannot normally
123 	 * clear the extra bits.  This kind of situation occurs when you
124 	 * make a small write() (m->valid == 0x03) and then mmap() and
125 	 * fault in the buffer (m->valid = 0xFF).  When NFS flushes the
126 	 * buffer (vinvalbuf() m->valid = 0xFC) we are left with a mess.
127 	 *
128 	 * This is combined with the possibility that the pages are partially
129 	 * dirty or that there is a buffer backing the pages that is dirty
130 	 * (even if m->dirty is 0).
131 	 *
132 	 * To solve this problem several hacks have been made:  (1) NFS
133 	 * guarantees that the IO block size is a multiple of PAGE_SIZE and
134 	 * (2) the buffer cache, when invalidating an NFS buffer, will
135 	 * disregard the buffer's fragmentary b_bufsize and invalidate
136 	 * the whole page rather than just the piece the buffer owns.
137 	 *
138 	 * This allows us to assume that a partially valid page found here
139 	 * is fully valid (vm_fault will zero out areas of the page not
140 	 * marked as valid).
141 	 */
142 	m = pages[ap->a_reqpage];
143 	if (m->valid != 0) {
144 		for (i = 0; i < npages; ++i) {
145 			if (i != ap->a_reqpage)
146 				vnode_pager_freepage(pages[i]);
147 		}
148 		return(0);
149 	}
150 
151 	/*
152 	 * Use an MSF_BUF as a medium to retrieve data from the pages.
153 	 */
154 	msf_map_pagelist(&msf, pages, npages, 0);
155 	KKASSERT(msf);
156 	kva = msf_buf_kva(msf);
157 
158 	iov.iov_base = kva;
159 	iov.iov_len = count;
160 	uio.uio_iov = &iov;
161 	uio.uio_iovcnt = 1;
162 	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
163 	uio.uio_resid = count;
164 	uio.uio_segflg = UIO_SYSSPACE;
165 	uio.uio_rw = UIO_READ;
166 	uio.uio_td = td;
167 
168 	error = nfs_readrpc(vp, &uio);
169 	msf_buf_free(msf);
170 
171 	if (error && (uio.uio_resid == count)) {
172 		kprintf("nfs_getpages: error %d\n", error);
173 		for (i = 0; i < npages; ++i) {
174 			if (i != ap->a_reqpage)
175 				vnode_pager_freepage(pages[i]);
176 		}
177 		return VM_PAGER_ERROR;
178 	}
179 
180 	/*
181 	 * Calculate the number of bytes read and validate only that number
182 	 * of bytes.  Note that due to pending writes, size may be 0.  This
183 	 * does not mean that the remaining data is invalid!
184 	 */
185 
186 	size = count - uio.uio_resid;
187 
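	/*
	 * For example, with 4 KB pages and count = 16384, a short read
	 * that returned 10000 bytes gives size = 10000: pages 0 and 1
	 * become fully valid, page 2 is validated for its first 1808
	 * bytes, and page 3 is left untouched.
	 */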
188 	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
189 		nextoff = toff + PAGE_SIZE;
190 		m = pages[i];
191 
192 		m->flags &= ~PG_ZERO;
193 
194 		if (nextoff <= size) {
195 			/*
196 			 * Read operation filled an entire page
197 			 */
198 			m->valid = VM_PAGE_BITS_ALL;
199 			vm_page_undirty(m);
200 		} else if (size > toff) {
201 			/*
202 			 * Read operation filled a partial page.
203 			 */
204 			m->valid = 0;
205 			vm_page_set_validclean(m, 0, size - toff);
206 			/* handled by vm_fault now	  */
207 			/* vm_page_zero_invalid(m, TRUE); */
208 		} else {
209 			/*
210 			 * Read operation was short.  If no error occurred
211 			 * we may have hit a zero-fill section.   We simply
212 			 * leave valid set to 0.
213 			 */
214 			;
215 		}
216 		if (i != ap->a_reqpage) {
217 			/*
218 			 * Whether or not to leave the page activated is up in
219 			 * the air, but we should put the page on a page queue
220 			 * somewhere (it already is in the object).  Empirical
221 			 * results show that deactivating pages works best.
223 			 */
224 
225 			/*
226 			 * Just in case someone was asking for this page we
227 			 * now tell them that it is ok to use.
228 			 */
229 			if (!error) {
230 				if (m->flags & PG_WANTED)
231 					vm_page_activate(m);
232 				else
233 					vm_page_deactivate(m);
234 				vm_page_wakeup(m);
235 			} else {
236 				vnode_pager_freepage(m);
237 			}
238 		}
239 	}
240 	return 0;
241 }
242 
243 /*
244  * Vnode op for VM putpages.
245  *
246  * nfs_putpages(struct vnode *a_vp, vm_page_t *a_m, int a_count, int a_sync,
247  *		int *a_rtvals, vm_ooffset_t a_offset)
248  */
249 int
250 nfs_putpages(struct vop_putpages_args *ap)
251 {
252 	struct thread *td = curthread;
253 	struct uio uio;
254 	struct iovec iov;
255 	char *kva;
256 	int iomode, must_commit, i, error, npages, count;
257 	off_t offset;
258 	int *rtvals;
259 	struct vnode *vp;
260 	struct nfsmount *nmp;
261 	struct nfsnode *np;
262 	vm_page_t *pages;
263 	struct msf_buf *msf;
264 
265 	vp = ap->a_vp;
266 	np = VTONFS(vp);
267 	nmp = VFSTONFS(vp->v_mount);
268 	pages = ap->a_m;
269 	count = ap->a_count;
270 	rtvals = ap->a_rtvals;
271 	npages = btoc(count);
272 	offset = IDX_TO_OFF(pages[0]->pindex);
273 
274 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
275 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
276 		(void)nfs_fsinfo(nmp, vp, td);
277 
278 	for (i = 0; i < npages; i++) {
279 		rtvals[i] = VM_PAGER_AGAIN;
280 	}
281 
282 	/*
283 	 * When putting pages, do not extend file past EOF.
284 	 */
285 
286 	if (offset + count > np->n_size) {
287 		count = np->n_size - offset;
288 		if (count < 0)
289 			count = 0;
290 	}
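	/*
	 * e.g. if n_size is 10000 and the pages start at offset 8192
	 * with count = 16384, count is clamped to 1808 so the write rpc
	 * stops at EOF.
	 */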
291 
292 	/*
293 	 * Use an MSF_BUF as a medium to retrieve data from the pages.
294 	 */
295 	msf_map_pagelist(&msf, pages, npages, 0);
296 	KKASSERT(msf);
297 	kva = msf_buf_kva(msf);
298 
299 	iov.iov_base = kva;
300 	iov.iov_len = count;
301 	uio.uio_iov = &iov;
302 	uio.uio_iovcnt = 1;
303 	uio.uio_offset = offset;
304 	uio.uio_resid = count;
305 	uio.uio_segflg = UIO_SYSSPACE;
306 	uio.uio_rw = UIO_WRITE;
307 	uio.uio_td = td;
308 
309 	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
310 	    iomode = NFSV3WRITE_UNSTABLE;
311 	else
312 	    iomode = NFSV3WRITE_FILESYNC;
313 
314 	error = nfs_writerpc(vp, &uio, &iomode, &must_commit);
315 
316 	msf_buf_free(msf);
317 
318 	if (!error) {
319 		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
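		/*
		 * Assuming 4 KB pages, for example, a full write of
		 * count = 16384 (uio_resid 0) marks 4 pages clean, while
		 * a short write of 10000 bytes rounds up to 3 pages.
		 */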
320 		for (i = 0; i < nwritten; i++) {
321 			rtvals[i] = VM_PAGER_OK;
322 			vm_page_undirty(pages[i]);
323 		}
324 		if (must_commit)
325 			nfs_clearcommit(vp->v_mount);
326 	}
327 	return rtvals[0];
328 }
329 
330 /*
331  * Vnode op for read using bio
332  */
333 int
334 nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag)
335 {
336 	struct nfsnode *np = VTONFS(vp);
337 	int biosize, i;
338 	struct buf *bp = 0, *rabp;
339 	struct vattr vattr;
340 	struct thread *td;
341 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
342 	daddr_t lbn, rabn;
343 	off_t raoffset;
344 	off_t loffset;
345 	int bcount;
346 	int seqcount;
347 	int nra, error = 0, n = 0, on = 0;
348 
349 #ifdef DIAGNOSTIC
350 	if (uio->uio_rw != UIO_READ)
351 		panic("nfs_read mode");
352 #endif
353 	if (uio->uio_resid == 0)
354 		return (0);
355 	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
356 		return (EINVAL);
357 	td = uio->uio_td;
358 
359 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
360 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
361 		(void)nfs_fsinfo(nmp, vp, td);
362 	if (vp->v_type != VDIR &&
363 	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
364 		return (EFBIG);
365 	biosize = vp->v_mount->mnt_stat.f_iosize;
366 	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
367 
368 	/*
369 	 * For nfs, cache consistency can only be maintained approximately.
370 	 * Although RFC1094 does not specify the criteria, the following is
371 	 * believed to be compatible with the reference port.
372 	 *
373 	 * NFS:		If local changes have been made and this is a
374 	 *		directory, the directory must be invalidated and
375 	 *		the attribute cache must be cleared.
376 	 *
377 	 *		GETATTR is called to synchronize the file size.
378 	 *
379 	 *		If remote changes are detected local data is flushed
380 	 *		and the cache is invalidated.
381 	 *
382 	 *		NOTE: In the normal case the attribute cache is not
383 	 *		cleared which means GETATTR may use cached data and
384 	 *		not immediately detect changes made on the server.
385 	 */
386 	if ((np->n_flag & NLMODIFIED) && vp->v_type == VDIR) {
387 		nfs_invaldir(vp);
388 		error = nfs_vinvalbuf(vp, V_SAVE, 1);
389 		if (error)
390 			return (error);
391 		np->n_attrstamp = 0;
392 	}
393 	error = VOP_GETATTR(vp, &vattr);
394 	if (error)
395 		return (error);
396 	if (np->n_flag & NRMODIFIED) {
397 		if (vp->v_type == VDIR)
398 			nfs_invaldir(vp);
399 		error = nfs_vinvalbuf(vp, V_SAVE, 1);
400 		if (error)
401 			return (error);
402 		np->n_flag &= ~NRMODIFIED;
403 	}
404 	do {
405 	    if (np->n_flag & NDONTCACHE) {
406 		switch (vp->v_type) {
407 		case VREG:
408 			return (nfs_readrpc(vp, uio));
409 		case VLNK:
410 			return (nfs_readlinkrpc(vp, uio));
411 		case VDIR:
412 			break;
413 		default:
414 			kprintf(" NDONTCACHE: type %x unexpected\n", vp->v_type);
415 			break;
416 		}
417 	    }
418 	    switch (vp->v_type) {
419 	    case VREG:
420 		nfsstats.biocache_reads++;
421 		lbn = uio->uio_offset / biosize;
422 		on = uio->uio_offset & (biosize - 1);
423 		loffset = (off_t)lbn * biosize;
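		/*
		 * e.g. with an 8 KB biosize, a uio_offset of 20000 gives
		 * lbn = 2, on = 3616 and loffset = 16384.
		 */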
424 
425 		/*
426 		 * Start the read ahead(s), as required.
427 		 */
428 		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
429 		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
430 			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
431 			rabn = lbn + 1 + nra;
432 			raoffset = (off_t)rabn * biosize;
433 			if (findblk(vp, raoffset, FINDBLK_TEST) == NULL) {
434 			    rabp = nfs_getcacheblk(vp, raoffset, biosize, td);
435 			    if (!rabp)
436 				return (EINTR);
437 			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
438 				rabp->b_cmd = BUF_CMD_READ;
439 				vfs_busy_pages(vp, rabp);
440 				if (nfs_asyncio(vp, &rabp->b_bio2, td)) {
441 				    rabp->b_flags |= B_INVAL|B_ERROR;
442 				    vfs_unbusy_pages(rabp);
443 				    brelse(rabp);
444 				    break;
445 				}
446 			    } else {
447 				brelse(rabp);
448 			    }
449 			}
450 		    }
451 		}
452 
453 		/*
454 		 * Obtain the buffer cache block.  Figure out the buffer size
455 		 * when we are at EOF.  If we are modifying the size of the
456 		 * buffer based on an EOF condition we need to hold
457 		 * nfs_rslock() through obtaining the buffer to prevent
458 		 * a potential writer-appender from messing with n_size.
459 		 * Otherwise we may accidentally truncate the buffer and
460 		 * lose dirty data.
461 		 *
462 		 * Note that bcount is *not* DEV_BSIZE aligned.
463 		 */
464 
465 again:
466 		bcount = biosize;
467 		if (loffset >= np->n_size) {
468 			bcount = 0;
469 		} else if (loffset + biosize > np->n_size) {
470 			bcount = np->n_size - loffset;
471 		}
472 		if (bcount != biosize) {
473 			switch(nfs_rslock(np)) {
474 			case ENOLCK:
475 				goto again;
476 				/* not reached */
477 			case EINTR:
478 			case ERESTART:
479 				return(EINTR);
480 				/* not reached */
481 			default:
482 				break;
483 			}
484 		}
485 
486 		bp = nfs_getcacheblk(vp, loffset, bcount, td);
487 
488 		if (bcount != biosize)
489 			nfs_rsunlock(np);
490 		if (!bp)
491 			return (EINTR);
492 
493 		/*
494 		 * If B_CACHE is not set, we must issue the read.  If this
495 		 * fails, we return an error.
496 		 */
497 
498 		if ((bp->b_flags & B_CACHE) == 0) {
499 		    bp->b_cmd = BUF_CMD_READ;
500 		    bp->b_bio2.bio_done = nfsiodone_sync;
501 		    bp->b_bio2.bio_flags |= BIO_SYNC;
502 		    vfs_busy_pages(vp, bp);
503 		    error = nfs_doio(vp, &bp->b_bio2, td);
504 		    if (error) {
505 			brelse(bp);
506 			return (error);
507 		    }
508 		}
509 
510 		/*
511 		 * on is the offset into the current bp.  Figure out how many
512 		 * bytes we can copy out of the bp.  Note that bcount is
513 		 * NOT DEV_BSIZE aligned.
514 		 *
515 		 * Then figure out how many bytes we can copy into the uio.
516 		 */
517 
518 		n = 0;
519 		if (on < bcount)
520 			n = min((unsigned)(bcount - on), uio->uio_resid);
521 		break;
522 	    case VLNK:
523 		biosize = min(NFS_MAXPATHLEN, np->n_size);
524 		nfsstats.biocache_readlinks++;
525 		bp = nfs_getcacheblk(vp, (off_t)0, biosize, td);
526 		if (bp == NULL)
527 			return (EINTR);
528 		if ((bp->b_flags & B_CACHE) == 0) {
529 		    bp->b_cmd = BUF_CMD_READ;
530 		    bp->b_bio2.bio_done = nfsiodone_sync;
531 		    bp->b_bio2.bio_flags |= BIO_SYNC;
532 		    vfs_busy_pages(vp, bp);
533 		    error = nfs_doio(vp, &bp->b_bio2, td);
534 		    if (error) {
535 			bp->b_flags |= B_ERROR | B_INVAL;
536 			brelse(bp);
537 			return (error);
538 		    }
539 		}
540 		n = min(uio->uio_resid, bp->b_bcount - bp->b_resid);
541 		on = 0;
542 		break;
543 	    case VDIR:
544 		nfsstats.biocache_readdirs++;
545 		if (np->n_direofoffset
546 		    && uio->uio_offset >= np->n_direofoffset) {
547 		    return (0);
548 		}
549 		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
550 		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
551 		loffset = uio->uio_offset - on;
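		/*
		 * e.g. with 512-byte directory blocks a uio_offset of
		 * 1300 gives lbn = 2, on = 276 and loffset = 1024.
		 */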
552 		bp = nfs_getcacheblk(vp, loffset, NFS_DIRBLKSIZ, td);
553 		if (bp == NULL)
554 		    return (EINTR);
555 
556 		if ((bp->b_flags & B_CACHE) == 0) {
557 		    bp->b_cmd = BUF_CMD_READ;
558 		    bp->b_bio2.bio_done = nfsiodone_sync;
559 		    bp->b_bio2.bio_flags |= BIO_SYNC;
560 		    vfs_busy_pages(vp, bp);
561 		    error = nfs_doio(vp, &bp->b_bio2, td);
562 		    if (error) {
563 			    brelse(bp);
564 		    }
565 		    while (error == NFSERR_BAD_COOKIE) {
566 			kprintf("got bad cookie vp %p bp %p\n", vp, bp);
567 			nfs_invaldir(vp);
568 			error = nfs_vinvalbuf(vp, 0, 1);
569 			/*
570 			 * Yuck! The directory has been modified on the
571 			 * server. The only way to get the block is by
572 			 * reading from the beginning to get all the
573 			 * offset cookies.
574 			 *
575 			 * Leave the last bp intact unless there is an error.
576 			 * Loop back up to the while if the error is another
577 			 * NFSERR_BAD_COOKIE (double yuch!).
578 			 */
579 			for (i = 0; i <= lbn && !error; i++) {
580 			    if (np->n_direofoffset
581 				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
582 				    return (0);
583 			    bp = nfs_getcacheblk(vp, (off_t)i * NFS_DIRBLKSIZ,
584 						 NFS_DIRBLKSIZ, td);
585 			    if (!bp)
586 				return (EINTR);
587 			    if ((bp->b_flags & B_CACHE) == 0) {
588 				    bp->b_cmd = BUF_CMD_READ;
589 				    bp->b_bio2.bio_done = nfsiodone_sync;
590 				    bp->b_bio2.bio_flags |= BIO_SYNC;
591 				    vfs_busy_pages(vp, bp);
592 				    error = nfs_doio(vp, &bp->b_bio2, td);
593 				    /*
594 				     * no error + B_INVAL == directory EOF,
595 				     * use the block.
596 				     */
597 				    if (error == 0 && (bp->b_flags & B_INVAL))
598 					    break;
599 			    }
600 			    /*
601 			     * An error will throw away the block and the
602 			     * for loop will break out.  If no error and this
603 			     * is not the block we want, we throw away the
604 			     * block and go for the next one via the for loop.
605 			     */
606 			    if (error || i < lbn)
607 				    brelse(bp);
608 			}
609 		    }
610 		    /*
611 		     * The above while is repeated if we hit another cookie
612 		     * error.  If we hit an error and it wasn't a cookie error,
613 		     * we give up.
614 		     */
615 		    if (error)
616 			    return (error);
617 		}
618 
619 		/*
620 		 * If not eof and read aheads are enabled, start one.
621 		 * (You need the current block first, so that you have the
622 		 *  directory offset cookie of the next block.)
623 		 */
624 		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
625 		    (bp->b_flags & B_INVAL) == 0 &&
626 		    (np->n_direofoffset == 0 ||
627 		    loffset + NFS_DIRBLKSIZ < np->n_direofoffset) &&
628 		    (np->n_flag & NDONTCACHE) == 0 &&
629 		    findblk(vp, loffset + NFS_DIRBLKSIZ, FINDBLK_TEST) == NULL
630 		) {
631 			rabp = nfs_getcacheblk(vp, loffset + NFS_DIRBLKSIZ,
632 					       NFS_DIRBLKSIZ, td);
633 			if (rabp) {
634 			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
635 				rabp->b_cmd = BUF_CMD_READ;
636 				vfs_busy_pages(vp, rabp);
637 				if (nfs_asyncio(vp, &rabp->b_bio2, td)) {
638 				    rabp->b_flags |= B_INVAL|B_ERROR;
639 				    vfs_unbusy_pages(rabp);
640 				    brelse(rabp);
641 				}
642 			    } else {
643 				brelse(rabp);
644 			    }
645 			}
646 		}
647 		/*
648 		 * Unlike VREG files, whose buffer size (bp->b_bcount) is
649 		 * chopped for the EOF condition, we cannot tell how large
650 		 * NFS directories are going to be until we hit EOF.  So
651 		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
652 		 * it just so happens that b_resid will effectively chop it
653 		 * to EOF.  *BUT* this information is lost if the buffer goes
654 		 * away and is reconstituted into a B_CACHE state ( due to
655 		 * being VMIO ) later.  So we keep track of the directory eof
656 		 * in np->n_direofoffset and chop it off as an extra step
657 		 * right here.
658 		 */
659 		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
660 		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
661 			n = np->n_direofoffset - uio->uio_offset;
662 		break;
663 	    default:
664 		kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type);
665 		break;
666 	    }
667 
668 	    switch (vp->v_type) {
669 	    case VREG:
670 		if (n > 0)
671 		    error = uiomove(bp->b_data + on, (int)n, uio);
672 		break;
673 	    case VLNK:
674 		if (n > 0)
675 		    error = uiomove(bp->b_data + on, (int)n, uio);
676 		n = 0;
677 		break;
678 	    case VDIR:
679 		if (n > 0) {
680 		    off_t old_off = uio->uio_offset;
681 		    caddr_t cpos, epos;
682 		    struct nfs_dirent *dp;
683 
684 		    /*
685 		     * We are casting cpos to nfs_dirent, it must be
686 		     * int-aligned.
687 		     */
688 		    if (on & 3) {
689 			error = EINVAL;
690 			break;
691 		    }
692 
693 		    cpos = bp->b_data + on;
694 		    epos = bp->b_data + on + n;
695 		    while (cpos < epos && error == 0 && uio->uio_resid > 0) {
696 			    dp = (struct nfs_dirent *)cpos;
697 			    error = nfs_check_dirent(dp, (int)(epos - cpos));
698 			    if (error)
699 				    break;
700 			    if (vop_write_dirent(&error, uio, dp->nfs_ino,
701 				dp->nfs_type, dp->nfs_namlen, dp->nfs_name)) {
702 				    break;
703 			    }
704 			    cpos += dp->nfs_reclen;
705 		    }
706 		    n = 0;
707 		    if (error == 0)
708 			    uio->uio_offset = old_off + cpos - bp->b_data - on;
709 		}
710 		/*
711 		 * Invalidate buffer if caching is disabled, forcing a
712 		 * re-read from the remote later.
713 		 */
714 		if (np->n_flag & NDONTCACHE)
715 			bp->b_flags |= B_INVAL;
716 		break;
717 	    default:
718 		kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type);
719 	    }
720 	    brelse(bp);
721 	} while (error == 0 && uio->uio_resid > 0 && n > 0);
722 	return (error);
723 }
724 
725 /*
726  * Userland can supply any 'seek' offset when reading an NFS directory.
727  * Validate the structure so we don't panic the kernel.  Note that
728  * the element name is nul terminated and the nul is not included
729  * in nfs_namlen.
730  */
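/*
 * For example, a record whose fixed header (everything before nfs_name)
 * occupies N bytes and whose nfs_namlen is 3 must have an nfs_reclen of
 * at least N + 4 (the name plus its terminating nul) and nfs_reclen must
 * be a multiple of 4; records violating these constraints, or running
 * past the remaining buffer, are rejected with EINVAL.
 */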
731 static
732 int
733 nfs_check_dirent(struct nfs_dirent *dp, int maxlen)
734 {
735 	int nfs_name_off = offsetof(struct nfs_dirent, nfs_name[0]);
736 
737 	if (nfs_name_off >= maxlen)
738 		return (EINVAL);
739 	if (dp->nfs_reclen < nfs_name_off || dp->nfs_reclen > maxlen)
740 		return (EINVAL);
741 	if (nfs_name_off + dp->nfs_namlen >= dp->nfs_reclen)
742 		return (EINVAL);
743 	if (dp->nfs_reclen & 3)
744 		return (EINVAL);
745 	return (0);
746 }
747 
748 /*
749  * Vnode op for write using bio
750  *
751  * nfs_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
752  *	     struct ucred *a_cred)
753  */
754 int
755 nfs_write(struct vop_write_args *ap)
756 {
757 	struct uio *uio = ap->a_uio;
758 	struct thread *td = uio->uio_td;
759 	struct vnode *vp = ap->a_vp;
760 	struct nfsnode *np = VTONFS(vp);
761 	int ioflag = ap->a_ioflag;
762 	struct buf *bp;
763 	struct vattr vattr;
764 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
765 	daddr_t lbn;
766 	off_t loffset;
767 	int n, on, error = 0, iomode, must_commit;
768 	int haverslock = 0;
769 	int bcount;
770 	int biosize;
771 
772 #ifdef DIAGNOSTIC
773 	if (uio->uio_rw != UIO_WRITE)
774 		panic("nfs_write mode");
775 	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
776 		panic("nfs_write proc");
777 #endif
778 	if (vp->v_type != VREG)
779 		return (EIO);
780 	if (np->n_flag & NWRITEERR) {
781 		np->n_flag &= ~NWRITEERR;
782 		return (np->n_error);
783 	}
784 	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
785 	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
786 		(void)nfs_fsinfo(nmp, vp, td);
787 
788 	/*
789 	 * Synchronously flush pending buffers if we are in synchronous
790 	 * mode or if we are appending.
791 	 */
792 	if (ioflag & (IO_APPEND | IO_SYNC)) {
793 		if (np->n_flag & NLMODIFIED) {
794 			np->n_attrstamp = 0;
795 			error = nfs_flush(vp, MNT_WAIT, td, 0);
796 			/* error = nfs_vinvalbuf(vp, V_SAVE, 1); */
797 			if (error)
798 				return (error);
799 		}
800 	}
801 
802 	/*
803 	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
804 	 * get the append lock.
805 	 */
806 restart:
807 	if (ioflag & IO_APPEND) {
808 		np->n_attrstamp = 0;
809 		error = VOP_GETATTR(vp, &vattr);
810 		if (error)
811 			return (error);
812 		uio->uio_offset = np->n_size;
813 	}
814 
815 	if (uio->uio_offset < 0)
816 		return (EINVAL);
817 	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
818 		return (EFBIG);
819 	if (uio->uio_resid == 0)
820 		return (0);
821 
822 	/*
823 	 * We need to obtain the rslock if we intend to modify np->n_size
824 	 * in order to guarantee the append point with multiple contending
825 	 * writers, to guarantee that no other appenders modify n_size
826 	 * while we are trying to obtain a truncated buffer (i.e. to avoid
827 	 * accidentally truncating data written by another appender due to
828 	 * the race), and to ensure that the buffer is populated prior to
829 	 * our extending of the file.  We hold rslock through the entire
830 	 * operation.
831 	 *
832 	 * Note that we do not synchronize the case where someone truncates
833 	 * the file while we are appending to it because attempting to lock
834 	 * this case may deadlock other parts of the system unexpectedly.
835 	 */
836 	if ((ioflag & IO_APPEND) ||
837 	    uio->uio_offset + uio->uio_resid > np->n_size) {
838 		switch(nfs_rslock(np)) {
839 		case ENOLCK:
840 			goto restart;
841 			/* not reached */
842 		case EINTR:
843 		case ERESTART:
844 			return(EINTR);
845 			/* not reached */
846 		default:
847 			break;
848 		}
849 		haverslock = 1;
850 	}
851 
852 	/*
853 	 * Maybe this should be above the vnode op call, but so long as
854 	 * file servers have no limits, I don't think it matters.
855 	 */
856 	if (td->td_proc && uio->uio_offset + uio->uio_resid >
857 	      td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
858 		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
859 		if (haverslock)
860 			nfs_rsunlock(np);
861 		return (EFBIG);
862 	}
863 
864 	biosize = vp->v_mount->mnt_stat.f_iosize;
865 
866 	do {
867 		if ((np->n_flag & NDONTCACHE) && uio->uio_iovcnt == 1) {
868 		    iomode = NFSV3WRITE_FILESYNC;
869 		    error = nfs_writerpc(vp, uio, &iomode, &must_commit);
870 		    if (must_commit)
871 			    nfs_clearcommit(vp->v_mount);
872 		    break;
873 		}
874 		nfsstats.biocache_writes++;
875 		lbn = uio->uio_offset / biosize;
876 		on = uio->uio_offset & (biosize-1);
877 		loffset = uio->uio_offset - on;
878 		n = min((unsigned)(biosize - on), uio->uio_resid);
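		/*
		 * e.g. with an 8 KB biosize, a write of 10000 bytes at
		 * uio_offset 20000 gives on = 3616 and n = 4576; the
		 * remaining 5424 bytes are handled on the next pass of
		 * this loop.
		 */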
879 again:
880 		/*
881 		 * Handle direct append and file extension cases, calculate
882 		 * unaligned buffer size.
883 		 */
884 
885 		if (uio->uio_offset == np->n_size && n) {
886 			/*
887 			 * Get the buffer (in its pre-append state to maintain
888 			 * B_CACHE if it was previously set).  Resize the
889 			 * nfsnode after we have locked the buffer to prevent
890 			 * readers from reading garbage.
891 			 */
892 			bcount = on;
893 			bp = nfs_getcacheblk(vp, loffset, bcount, td);
894 
895 			if (bp != NULL) {
896 				long save;
897 
898 				np->n_size = uio->uio_offset + n;
899 				np->n_flag |= NLMODIFIED;
900 				vnode_pager_setsize(vp, np->n_size);
901 
902 				save = bp->b_flags & B_CACHE;
903 				bcount += n;
904 				allocbuf(bp, bcount);
905 				bp->b_flags |= save;
906 			}
907 		} else {
908 			/*
909 			 * Obtain the locked cache block first, and then
910 			 * adjust the file's size as appropriate.
911 			 */
912 			bcount = on + n;
913 			if (loffset + bcount < np->n_size) {
914 				if (loffset + biosize < np->n_size)
915 					bcount = biosize;
916 				else
917 					bcount = np->n_size - loffset;
918 			}
919 			bp = nfs_getcacheblk(vp, loffset, bcount, td);
920 			if (uio->uio_offset + n > np->n_size) {
921 				np->n_size = uio->uio_offset + n;
922 				np->n_flag |= NLMODIFIED;
923 				vnode_pager_setsize(vp, np->n_size);
924 			}
925 		}
926 
927 		if (bp == NULL) {
928 			error = EINTR;
929 			break;
930 		}
931 
932 		/*
933 		 * Issue a READ if B_CACHE is not set.  In special-append
934 		 * mode, B_CACHE is based on the buffer prior to the write
935 		 * op and is typically set, avoiding the read.  If a read
936 		 * is required in special append mode, the server will
937 		 * probably send us a short-read since we extended the file
938 		 * on our end, resulting in b_resid == 0 and, thus,
939 		 * B_CACHE getting set.
940 		 *
941 		 * We can also avoid issuing the read if the write covers
942 		 * the entire buffer.  We have to make sure the buffer state
943 		 * is reasonable in this case since we will not be initiating
944 		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
945 		 * more information.
946 		 *
947 		 * B_CACHE may also be set due to the buffer being cached
948 		 * normally.
949 		 *
950 		 * When doing a UIO_NOCOPY write the buffer is not
951 		 * overwritten and we cannot just set B_CACHE unconditionally
952 		 * for full-block writes.
953 		 */
954 
955 		if (on == 0 && n == bcount && uio->uio_segflg != UIO_NOCOPY) {
956 			bp->b_flags |= B_CACHE;
957 			bp->b_flags &= ~(B_ERROR | B_INVAL);
958 		}
959 
960 		if ((bp->b_flags & B_CACHE) == 0) {
961 			bp->b_cmd = BUF_CMD_READ;
962 			bp->b_bio2.bio_done = nfsiodone_sync;
963 			bp->b_bio2.bio_flags |= BIO_SYNC;
964 			vfs_busy_pages(vp, bp);
965 			error = nfs_doio(vp, &bp->b_bio2, td);
966 			if (error) {
967 				brelse(bp);
968 				break;
969 			}
970 		}
971 		if (!bp) {
972 			error = EINTR;
973 			break;
974 		}
975 		np->n_flag |= NLMODIFIED;
976 
977 		/*
978 		 * If dirtyend exceeds file size, chop it down.  This should
979 		 * not normally occur but there is an append race where it
980 		 * might occur XXX, so we log it.
981 		 *
982 		 * If the chopping creates a reverse-indexed or degenerate
983 		 * situation with dirtyoff/end, we 0 both of them.
984 		 */
985 
986 		if (bp->b_dirtyend > bcount) {
987 			kprintf("NFS append race @%08llx:%d\n",
988 			    (long long)bp->b_bio2.bio_offset,
989 			    bp->b_dirtyend - bcount);
990 			bp->b_dirtyend = bcount;
991 		}
992 
993 		if (bp->b_dirtyoff >= bp->b_dirtyend)
994 			bp->b_dirtyoff = bp->b_dirtyend = 0;
995 
996 		/*
997 		 * If the new write will leave a contiguous dirty
998 		 * area, just update the b_dirtyoff and b_dirtyend,
999 		 * otherwise force a write rpc of the old dirty area.
1000 		 *
1001 		 * While it is possible to merge discontiguous writes due to
1002 		 * our having a B_CACHE buffer (and thus valid read data
1003 		 * for the hole), we don't because it could lead to
1004 		 * significant cache coherency problems with multiple clients,
1005 		 * especially if locking is implemented later on.
1006 		 *
1007 		 * As an optimization we could theoretically maintain
1008 		 * a linked list of discontinuous areas, but we would still
1009 		 * have to commit them separately so there isn't much
1010 		 * advantage to it except perhaps a bit of asynchronization.
1011 		 */
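		/*
		 * e.g. if the buffer is already dirty over [100, 300) and
		 * the new write covers on = 250, n = 200, the ranges touch
		 * and are merged below into [100, 450); a new write at
		 * on = 600 would instead force a bwrite() of the old dirty
		 * region first.
		 */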
1012 
1013 		if (bp->b_dirtyend > 0 &&
1014 		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
1015 			if (bwrite(bp) == EINTR) {
1016 				error = EINTR;
1017 				break;
1018 			}
1019 			goto again;
1020 		}
1021 
1022 		error = uiomove((char *)bp->b_data + on, n, uio);
1023 
1024 		/*
1025 		 * Since this block is being modified, it must be written
1026 		 * again and not just committed.  Since write clustering does
1027 		 * not work for the stage 1 data write, only the stage 2
1028 		 * commit rpc, we have to clear B_CLUSTEROK as well.
1029 		 */
1030 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1031 
1032 		if (error) {
1033 			bp->b_flags |= B_ERROR;
1034 			brelse(bp);
1035 			break;
1036 		}
1037 
1038 		/*
1039 		 * Only update dirtyoff/dirtyend if not a degenerate
1040 		 * condition.
1041 		 */
1042 		if (n) {
1043 			if (bp->b_dirtyend > 0) {
1044 				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
1045 				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
1046 			} else {
1047 				bp->b_dirtyoff = on;
1048 				bp->b_dirtyend = on + n;
1049 			}
1050 			vfs_bio_set_validclean(bp, on, n);
1051 		}
1052 
1053 		/*
1054 		 * If the lease is non-cachable or IO_SYNC do bwrite().
1055 		 *
1056 		 * IO_INVAL appears to be unused.  The idea appears to be
1057 		 * to turn off caching in this case.  Very odd.  XXX
1058 		 *
1059 		 * If nfs_async is set bawrite() will use an unstable write
1060 		 * (build dirty bufs on the server), so we might as well
1061 		 * push it out with bawrite().  If nfs_async is not set we
1062 		 * use bdwrite() to cache dirty bufs on the client.
1063 		 */
1064 		if ((np->n_flag & NDONTCACHE) || (ioflag & IO_SYNC)) {
1065 			if (ioflag & IO_INVAL)
1066 				bp->b_flags |= B_NOCACHE;
1067 			error = bwrite(bp);
1068 			if (error)
1069 				break;
1070 			if (np->n_flag & NDONTCACHE) {
1071 				error = nfs_vinvalbuf(vp, V_SAVE, 1);
1072 				if (error)
1073 					break;
1074 			}
1075 		} else if ((n + on) == biosize && nfs_async) {
1076 			bawrite(bp);
1077 		} else {
1078 			bdwrite(bp);
1079 		}
1080 	} while (uio->uio_resid > 0 && n > 0);
1081 
1082 	if (haverslock)
1083 		nfs_rsunlock(np);
1084 
1085 	return (error);
1086 }
1087 
1088 /*
1089  * Get an nfs cache block.
1090  *
1091  * Allocate a new one if the block isn't currently in the cache
1092  * and return the block marked busy. If the calling process is
1093  * interrupted by a signal for an interruptible mount point, return
1094  * NULL.
1095  *
1096  * The caller must carefully deal with the possible B_INVAL state of
1097  * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
1098  * indirectly), so synchronous reads can be issued without worrying about
1099  * the B_INVAL state.  We have to be a little more careful when dealing
1100  * with writes (see comments in nfs_write()) when extending a file past
1101  * its EOF.
1102  */
1103 static struct buf *
1104 nfs_getcacheblk(struct vnode *vp, off_t loffset, int size, struct thread *td)
1105 {
1106 	struct buf *bp;
1107 	struct mount *mp;
1108 	struct nfsmount *nmp;
1109 
1110 	mp = vp->v_mount;
1111 	nmp = VFSTONFS(mp);
1112 
1113 	if (nmp->nm_flag & NFSMNT_INT) {
1114 		bp = getblk(vp, loffset, size, GETBLK_PCATCH, 0);
1115 		while (bp == NULL) {
1116 			if (nfs_sigintr(nmp, NULL, td))
1117 				return (NULL);
1118 			bp = getblk(vp, loffset, size, 0, 2 * hz);
1119 		}
1120 	} else {
1121 		bp = getblk(vp, loffset, size, 0, 0);
1122 	}
1123 
1124 	/*
1125 	 * bio2, the 'device' layer.  Since BIOs use 64 bit byte offsets
1126 	 * now, no translation is necessary.
1127 	 */
1128 	bp->b_bio2.bio_offset = loffset;
1129 	return (bp);
1130 }
1131 
1132 /*
1133  * Flush and invalidate all dirty buffers. If another process is already
1134  * doing the flush, just wait for completion.
1135  */
1136 int
1137 nfs_vinvalbuf(struct vnode *vp, int flags, int intrflg)
1138 {
1139 	struct nfsnode *np = VTONFS(vp);
1140 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1141 	int error = 0, slpflag, slptimeo;
1142 	thread_t td = curthread;
1143 
1144 	if (vp->v_flag & VRECLAIMED)
1145 		return (0);
1146 
1147 	if ((nmp->nm_flag & NFSMNT_INT) == 0)
1148 		intrflg = 0;
1149 	if (intrflg) {
1150 		slpflag = PCATCH;
1151 		slptimeo = 2 * hz;
1152 	} else {
1153 		slpflag = 0;
1154 		slptimeo = 0;
1155 	}
1156 	/*
1157 	 * First wait for any other process doing a flush to complete.
1158 	 */
1159 	while (np->n_flag & NFLUSHINPROG) {
1160 		np->n_flag |= NFLUSHWANT;
1161 		error = tsleep((caddr_t)&np->n_flag, 0, "nfsvinval", slptimeo);
1162 		if (error && intrflg && nfs_sigintr(nmp, NULL, td))
1163 			return (EINTR);
1164 	}
1165 
1166 	/*
1167 	 * Now, flush as required.
1168 	 */
1169 	np->n_flag |= NFLUSHINPROG;
1170 	error = vinvalbuf(vp, flags, slpflag, 0);
1171 	while (error) {
1172 		if (intrflg && nfs_sigintr(nmp, NULL, td)) {
1173 			np->n_flag &= ~NFLUSHINPROG;
1174 			if (np->n_flag & NFLUSHWANT) {
1175 				np->n_flag &= ~NFLUSHWANT;
1176 				wakeup((caddr_t)&np->n_flag);
1177 			}
1178 			return (EINTR);
1179 		}
1180 		error = vinvalbuf(vp, flags, 0, slptimeo);
1181 	}
1182 	np->n_flag &= ~(NLMODIFIED | NFLUSHINPROG);
1183 	if (np->n_flag & NFLUSHWANT) {
1184 		np->n_flag &= ~NFLUSHWANT;
1185 		wakeup((caddr_t)&np->n_flag);
1186 	}
1187 	return (0);
1188 }
1189 
1190 /*
1191  * Initiate asynchronous I/O. Return an error if no nfsiods are available.
1192  * This is mainly to avoid queueing async I/O requests when the nfsiods
1193  * are all hung on a dead server.
1194  *
1195  * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
1196  * is eventually dequeued by the async daemon, nfs_doio() *will*.
1197  */
1198 int
1199 nfs_asyncio(struct vnode *vp, struct bio *bio, struct thread *td)
1200 {
1201 	struct buf *bp = bio->bio_buf;
1202 	struct nfsmount *nmp;
1203 	int i;
1204 	int gotiod;
1205 	int slpflag = 0;
1206 	int slptimeo = 0;
1207 	int error;
1208 
1209 	/*
1210 	 * If no async daemons then return EIO to force caller to run the rpc
1211 	 * synchronously.
1212 	 */
1213 	if (nfs_numasync == 0)
1214 		return (EIO);
1215 
1216 	KKASSERT(vp->v_tag == VT_NFS);
1217 	nmp = VFSTONFS(vp->v_mount);
1218 
1219 	/*
1220 	 * Commits are usually short and sweet so let's save some CPU and
1221 	 * leave the async daemons for more important rpc's (such as reads
1222 	 * and writes).
1223 	 */
1224 	if (bp->b_cmd == BUF_CMD_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
1225 	    (nmp->nm_bioqiods > nfs_numasync / 2)) {
1226 		return(EIO);
1227 	}
1228 
1229 again:
1230 	if (nmp->nm_flag & NFSMNT_INT)
1231 		slpflag = PCATCH;
1232 	gotiod = FALSE;
1233 
1234 	/*
1235 	 * Find a free iod to process this request.
1236 	 */
1237 	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
1238 		if (nfs_iodwant[i]) {
1239 			/*
1240 			 * Found one, so wake it up and tell it which
1241 			 * mount to process.
1242 			 */
1243 			NFS_DPF(ASYNCIO,
1244 				("nfs_asyncio: waking iod %d for mount %p\n",
1245 				 i, nmp));
1246 			nfs_iodwant[i] = NULL;
1247 			nfs_iodmount[i] = nmp;
1248 			nmp->nm_bioqiods++;
1249 			wakeup((caddr_t)&nfs_iodwant[i]);
1250 			gotiod = TRUE;
1251 			break;
1252 		}
1253 
1254 	/*
1255 	 * If none are free, we may already have an iod working on this mount
1256 	 * point.  If so, it will process our request.
1257 	 */
1258 	if (!gotiod) {
1259 		if (nmp->nm_bioqiods > 0) {
1260 			NFS_DPF(ASYNCIO,
1261 				("nfs_asyncio: %d iods are already processing mount %p\n",
1262 				 nmp->nm_bioqiods, nmp));
1263 			gotiod = TRUE;
1264 		}
1265 	}
1266 
1267 	/*
1268 	 * If we have an iod which can process the request, then queue
1269 	 * the buffer.
1270 	 */
1271 	if (gotiod) {
1272 		/*
1273 		 * Ensure that the queue never grows too large.  We still want
1274 		 * to asynchronize so we block rather than return EIO.
1275 		 */
1276 		while (nmp->nm_bioqlen >= 2*nfs_numasync) {
1277 			NFS_DPF(ASYNCIO,
1278 				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1279 			nmp->nm_bioqwant = TRUE;
1280 			error = tsleep(&nmp->nm_bioq, slpflag,
1281 				       "nfsaio", slptimeo);
1282 			if (error) {
1283 				if (nfs_sigintr(nmp, NULL, td))
1284 					return (EINTR);
1285 				if (slpflag == PCATCH) {
1286 					slpflag = 0;
1287 					slptimeo = 2 * hz;
1288 				}
1289 			}
1290 			/*
1291 			 * We might have lost our iod while sleeping,
1292 			 * so check and loop if necessary.
1293 			 */
1294 			if (nmp->nm_bioqiods == 0) {
1295 				NFS_DPF(ASYNCIO,
1296 					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1297 				goto again;
1298 			}
1299 		}
1300 		BUF_KERNPROC(bp);
1301 
1302 		/*
1303 		 * The passed bio's buffer is not necessarily associated with
1304 		 * the NFS vnode it is being written to.  Store the NFS vnode
1305 		 * in the BIO driver info.
1306 		 */
1307 		bio->bio_driver_info = vp;
1308 		TAILQ_INSERT_TAIL(&nmp->nm_bioq, bio, bio_act);
1309 		nmp->nm_bioqlen++;
1310 		return (0);
1311 	}
1312 
1313 	/*
1314 	 * All the iods are busy on other mounts, so return EIO to
1315 	 * force the caller to process the i/o synchronously.
1316 	 */
1317 	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
1318 	return (EIO);
1319 }
1320 
1321 /*
1322  * Do an I/O operation to/from a cache block. This may be called
1323  * synchronously or from an nfsiod.  The BIO is normalized for DEV_BSIZE.
1324  *
1325  * A locked, completed I/O is returned and the caller is responsible for
1326  * brelse()'ing it.
1327  *
1328  * NOTE! TD MIGHT BE NULL
1329  */
1330 int
1331 nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
1332 {
1333 	struct buf *bp = bio->bio_buf;
1334 	struct uio *uiop;
1335 	struct nfsnode *np;
1336 	struct nfsmount *nmp;
1337 	int error = 0, iomode, must_commit = 0;
1338 	struct uio uio;
1339 	struct iovec io;
1340 
1341 	KKASSERT(vp->v_tag == VT_NFS);
1342 	np = VTONFS(vp);
1343 	nmp = VFSTONFS(vp->v_mount);
1344 	uiop = &uio;
1345 	uiop->uio_iov = &io;
1346 	uiop->uio_iovcnt = 1;
1347 	uiop->uio_segflg = UIO_SYSSPACE;
1348 	uiop->uio_td = td;
1349 
1350 	/*
1351 	 * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
1352 	 * do this here so we do not have to do it in all the code that
1353 	 * calls us.
1354 	 */
1355 	bp->b_flags &= ~(B_ERROR | B_INVAL);
1356 
1357 
1358 	KASSERT(bp->b_cmd != BUF_CMD_DONE,
1359 		("nfs_doio: bp %p already marked done!", bp));
1360 
1361 	if (bp->b_cmd == BUF_CMD_READ) {
1362 	    io.iov_len = uiop->uio_resid = bp->b_bcount;
1363 	    io.iov_base = bp->b_data;
1364 	    uiop->uio_rw = UIO_READ;
1365 
1366 	    switch (vp->v_type) {
1367 	    case VREG:
1368 		uiop->uio_offset = bio->bio_offset;
1369 		nfsstats.read_bios++;
1370 		error = nfs_readrpc(vp, uiop);
1371 
1372 		if (!error) {
1373 		    if (uiop->uio_resid) {
1374 			/*
1375 			 * If we had a short read with no error, we must have
1376 			 * hit a file hole.  We should zero-fill the remainder.
1377 			 * This can also occur if the server hits the file EOF.
1378 			 *
1379 			 * Holes used to be able to occur due to pending
1380 			 * writes, but that is not possible any longer.
1381 			 */
1382 			int nread = bp->b_bcount - uiop->uio_resid;
1383 			int left  = uiop->uio_resid;
1384 
1385 			if (left > 0)
1386 				bzero((char *)bp->b_data + nread, left);
1387 			uiop->uio_resid = 0;
1388 		    }
1389 		}
1390 		if (td && td->td_proc && (vp->v_flag & VTEXT) &&
1391 		    np->n_mtime != np->n_vattr.va_mtime.tv_sec) {
1392 			uprintf("Process killed due to text file modification\n");
1393 			ksignal(td->td_proc, SIGKILL);
1394 		}
1395 		break;
1396 	    case VLNK:
1397 		uiop->uio_offset = 0;
1398 		nfsstats.readlink_bios++;
1399 		error = nfs_readlinkrpc(vp, uiop);
1400 		break;
1401 	    case VDIR:
1402 		nfsstats.readdir_bios++;
1403 		uiop->uio_offset = bio->bio_offset;
1404 		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1405 			error = nfs_readdirplusrpc(vp, uiop);
1406 			if (error == NFSERR_NOTSUPP)
1407 				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1408 		}
1409 		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1410 			error = nfs_readdirrpc(vp, uiop);
1411 		/*
1412 		 * end-of-directory sets B_INVAL but does not generate an
1413 		 * error.
1414 		 */
1415 		if (error == 0 && uiop->uio_resid == bp->b_bcount)
1416 			bp->b_flags |= B_INVAL;
1417 		break;
1418 	    default:
1419 		kprintf("nfs_doio:  type %x unexpected\n",vp->v_type);
1420 		break;
1421 	    }
1422 	    if (error) {
1423 		bp->b_flags |= B_ERROR;
1424 		bp->b_error = error;
1425 	    }
1426 	} else {
1427 	    /*
1428 	     * If we only need to commit, try to commit
1429 	     */
1430 	    KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
1431 	    if (bp->b_flags & B_NEEDCOMMIT) {
1432 		    int retv;
1433 		    off_t off;
1434 
1435 		    off = bio->bio_offset + bp->b_dirtyoff;
1436 		    retv = nfs_commit(vp, off,
1437 				bp->b_dirtyend - bp->b_dirtyoff, td);
1438 		    if (retv == 0) {
1439 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
1440 			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1441 			    bp->b_resid = 0;
1442 			    biodone(bio);
1443 			    return (0);
1444 		    }
1445 		    if (retv == NFSERR_STALEWRITEVERF) {
1446 			    nfs_clearcommit(vp->v_mount);
1447 		    }
1448 	    }
1449 
1450 	    /*
1451 	     * Setup for actual write
1452 	     */
1453 
1454 	    if (bio->bio_offset + bp->b_dirtyend > np->n_size)
1455 		bp->b_dirtyend = np->n_size - bio->bio_offset;
1456 
1457 	    if (bp->b_dirtyend > bp->b_dirtyoff) {
1458 		io.iov_len = uiop->uio_resid = bp->b_dirtyend
1459 		    - bp->b_dirtyoff;
1460 		uiop->uio_offset = bio->bio_offset + bp->b_dirtyoff;
1461 		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1462 		uiop->uio_rw = UIO_WRITE;
1463 		nfsstats.write_bios++;
1464 
1465 		if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0)
1466 		    iomode = NFSV3WRITE_UNSTABLE;
1467 		else
1468 		    iomode = NFSV3WRITE_FILESYNC;
1469 
1470 		error = nfs_writerpc(vp, uiop, &iomode, &must_commit);
1471 
1472 		/*
1473 		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1474 		 * to cluster the buffers needing commit.  This will allow
1475 		 * the system to submit a single commit rpc for the whole
1476 		 * cluster.  We can do this even if the buffer is not 100%
1477 		 * dirty (relative to the NFS blocksize), so we optimize the
1478 		 * append-to-file-case.
1479 		 *
1480 		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1481 		 * cleared because write clustering only works for commit
1482 		 * rpc's, not for the data portion of the write).
1483 		 */
1484 
1485 		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1486 		    bp->b_flags |= B_NEEDCOMMIT;
1487 		    if (bp->b_dirtyoff == 0
1488 			&& bp->b_dirtyend == bp->b_bcount)
1489 			bp->b_flags |= B_CLUSTEROK;
1490 		} else {
1491 		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1492 		}
1493 
1494 		/*
1495 		 * For an interrupted write, the buffer is still valid
1496 		 * and the write hasn't been pushed to the server yet,
1497 		 * so we can't set B_ERROR and report the interruption
1498 		 * by setting B_EINTR. For the async case, B_EINTR
1499 		 * is not relevant, so the rpc attempt is essentially
1500 		 * a noop.  For the case of a V3 write rpc not being
1501 		 * committed to stable storage, the block is still
1502 		 * dirty and requires either a commit rpc or another
1503 		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1504 		 * the block is reused. This is indicated by setting
1505 		 * the B_DELWRI and B_NEEDCOMMIT flags.
1506 		 *
1507 		 * If the buffer is marked B_PAGING, it does not reside on
1508 		 * the vp's paging queues so we cannot call bdirty().  The
1509 		 * bp in this case is not an NFS cache block so we should
1510 		 * be safe. XXX
1511 		 */
1512     		if (error == EINTR
1513 		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1514 			crit_enter();
1515 			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1516 			if ((bp->b_flags & B_PAGING) == 0)
1517 			    bdirty(bp);
1518 			if (error)
1519 			    bp->b_flags |= B_EINTR;
1520 			crit_exit();
1521 	    	} else {
1522 		    if (error) {
1523 			bp->b_flags |= B_ERROR;
1524 			bp->b_error = np->n_error = error;
1525 			np->n_flag |= NWRITEERR;
1526 		    }
1527 		    bp->b_dirtyoff = bp->b_dirtyend = 0;
1528 		}
1529 	    } else {
1530 		bp->b_resid = 0;
1531 		biodone(bio);
1532 		return (0);
1533 	    }
1534 	}
1535 	bp->b_resid = uiop->uio_resid;
1536 	if (must_commit)
1537 	    nfs_clearcommit(vp->v_mount);
1538 	biodone(bio);
1539 	return (error);
1540 }
1541 
1542 /*
1543  * Used to aid in handling ftruncate() operations on the NFS client side.
1544  * Truncation creates a number of special problems for NFS.  We have to
1545  * throw away VM pages and buffer cache buffers that are beyond EOF, and
1546  * we have to properly handle VM pages or (potentially dirty) buffers
1547  * that straddle the truncation point.
1548  */
1549 
1550 int
1551 nfs_meta_setsize(struct vnode *vp, struct thread *td, u_quad_t nsize)
1552 {
1553 	struct nfsnode *np = VTONFS(vp);
1554 	u_quad_t tsize = np->n_size;
1555 	int biosize = vp->v_mount->mnt_stat.f_iosize;
1556 	int error = 0;
1557 
1558 	np->n_size = nsize;
1559 
1560 	if (np->n_size < tsize) {
1561 		struct buf *bp;
1562 		daddr_t lbn;
1563 		off_t loffset;
1564 		int bufsize;
1565 
1566 		/*
1567 		 * vtruncbuf() doesn't get the buffer overlapping the
1568 		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
1569 		 * buffer that now needs to be truncated.
1570 		 */
1571 		error = vtruncbuf(vp, nsize, biosize);
1572 		lbn = nsize / biosize;
1573 		bufsize = nsize & (biosize - 1);
1574 		loffset = nsize - bufsize;
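		/*
		 * e.g. with an 8 KB biosize, truncating to nsize = 20000
		 * gives bufsize = 3616 and loffset = 16384: the buffer
		 * straddling the new EOF is re-obtained at that size and
		 * its dirty range is clipped below.
		 */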
1575 		bp = nfs_getcacheblk(vp, loffset, bufsize, td);
1576 		if (bp->b_dirtyoff > bp->b_bcount)
1577 			bp->b_dirtyoff = bp->b_bcount;
1578 		if (bp->b_dirtyend > bp->b_bcount)
1579 			bp->b_dirtyend = bp->b_bcount;
1580 		bp->b_flags |= B_RELBUF;  /* don't leave garbage around */
1581 		brelse(bp);
1582 	} else {
1583 		vnode_pager_setsize(vp, nsize);
1584 	}
1585 	return(error);
1586 }
1587 
1588 /*
1589  * Synchronous completion for nfs_doio.  Call bpdone() with elseit=FALSE.
1590  * Caller is responsible for brelse()'ing the bp.
1591  */
1592 static void
1593 nfsiodone_sync(struct bio *bio)
1594 {
1595 	bio->bio_flags = 0;
1596 	bpdone(bio->bio_buf, 0);
1597 }
1598