/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $FreeBSD: src/sys/nfs/nfs_bio.c,v 1.83.2.4 2002/12/29 18:19:53 dillon Exp $
 * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.2 2003/06/17 04:28:54 dillon Exp $
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));

extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
extern struct nfsstats nfsstats;
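
/*
 * Both nfs_getpages() and nfs_putpages() below use the same pattern for
 * paging I/O: borrow a pageable buffer, map the VM pages into its KVA
 * window, and run a single read or write rpc across that mapping.  In
 * outline (error handling omitted):
 *
 *	bp = getpbuf(&nfs_pbuf_freecnt);
 *	kva = (vm_offset_t) bp->b_data;
 *	pmap_qenter(kva, pages, npages);
 *	... build a UIO_SYSSPACE uio over kva and issue the rpc ...
 *	pmap_qremove(kva, npages);
 *	relpbuf(bp, &nfs_pbuf_freecnt);
 */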

/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(ap)
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_reqpage;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_page_t *pages;

	vp = ap->a_vp;
	p = curproc;				/* XXX */
	cred = curproc->p_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if (vp->v_object == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */

	{
		vm_page_t m = pages[ap->a_reqpage];

		if (m->valid != 0) {
			/* handled by vm_fault now */
			/* vm_page_zero_invalid(m, TRUE); */
			for (i = 0; i < npages; ++i) {
				if (i != ap->a_reqpage)
					vnode_pager_freepage(pages[i]);
			}
			return(0);
		}
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_procp = p;

	error = nfs_readrpc(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

	if (error && (uio.uio_resid == count)) {
		printf("nfs_getpages: error %d\n", error);
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vnode_pager_freepage(pages[i]);
		}
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;
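
	/*
	 * For example: with PAGE_SIZE 4096, count 16384 (npages == 4) and
	 * uio.uio_resid 6384 left over from the rpc, size == 10000.  The
	 * loop below then fully validates pages 0 and 1, validates only the
	 * first 10000 - 8192 == 1808 bytes of page 2, and leaves page 3
	 * invalid.
	 */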

	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		m->flags &= ~PG_ZERO;

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now */
			/* vm_page_zero_invalid(m, TRUE); */
		} else {
			/*
			 * Read operation was short.  If no error occurred
			 * we may have hit a zero-fill section.  We simply
			 * leave valid set to 0.
			 */
			;
		}
		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Result:
			 * empirical results appear to show that deactivating
			 * pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vnode_pager_freepage(m);
			}
		}
	}
	return 0;
}

/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(ap)
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_sync;
		int *a_rtvals;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	int iomode, must_commit, i, error, npages, count;
	off_t offset;
	int *rtvals;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct nfsmount *nmp;
	struct nfsnode *np;
	vm_page_t *pages;

	vp = ap->a_vp;
	np = VTONFS(vp);
	p = curproc;				/* XXX */
	cred = curproc->p_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	for (i = 0; i < npages; i++) {
		rtvals[i] = VM_PAGER_AGAIN;
	}

	/*
	 * When putting pages, do not extend file past EOF.
	 */

	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_procp = p;
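
	/*
	 * NFSv3 write stability levels (RFC 1813): with NFSV3WRITE_UNSTABLE
	 * the server may reply before the data reaches stable storage and
	 * the client must later confirm it with a commit rpc (hence the
	 * must_commit/nfs_clearcommit handling below), while
	 * NFSV3WRITE_FILESYNC guarantees the data is stable by the time the
	 * reply arrives.
	 */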

	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
		iomode = NFSV3WRITE_UNSTABLE;
	else
		iomode = NFSV3WRITE_FILESYNC;

	error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);

	pmap_qremove(kva, npages);
	relpbuf(bp, &nfs_pbuf_freecnt);

	if (!error) {
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(pages[i]);
		}
		if (must_commit)
			nfs_clearcommit(vp->v_mount);
	}
	return rtvals[0];
}

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bcount;
	int seqcount;
	int nra, error = 0, n = 0, on = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	p = uio->uio_procp;

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date.  If you find that you need
	 * current attributes this could be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease.  If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		}
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
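
		/*
		 * For example: with biosize 8192 and uio_offset 20000,
		 * lbn == 2 and on == 3616 (20000 == 2 * 8192 + 3616),
		 * i.e. the read starts 3616 bytes into logical block 2.
		 */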

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred, p)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				    break;
				}
			    } else {
				brelse(rabp);
			    }
			}
		    }
		}

		/*
		 * Obtain the buffer cache block.  Figure out the buffer size
		 * when we are at EOF.  If we are modifying the size of the
		 * buffer based on an EOF condition we need to hold
		 * nfs_rslock() through obtaining the buffer to prevent
		 * a potential writer-appender from messing with n_size.
		 * Otherwise we may accidentally truncate the buffer and
		 * lose dirty data.
		 *
		 * Note that bcount is *not* DEV_BSIZE aligned.
		 */

again:
		bcount = biosize;
		if ((off_t)lbn * biosize >= np->n_size) {
			bcount = 0;
		} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
			bcount = np->n_size - (off_t)lbn * biosize;
		}
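
		/*
		 * For example: with biosize 8192 and n_size 10000, lbn 0
		 * yields a full bcount of 8192, lbn 1 yields bcount 1808
		 * (10000 - 8192), and lbn 2 or beyond yields bcount 0.
		 */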

		if (bcount != biosize) {
			switch(nfs_rslock(np, p)) {
			case ENOLCK:
				goto again;
				/* not reached */
			case EINTR:
			case ERESTART:
				return(EINTR);
				/* not reached */
			default:
				break;
			}
		}

		bp = nfs_getcacheblk(vp, lbn, bcount, p);

		if (bcount != biosize)
			nfs_rsunlock(np, p);
		if (!bp)
			return (EINTR);

		/*
		 * If B_CACHE is not set, we must issue the read.  If this
		 * fails, we return an error.
		 */

		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			brelse(bp);
			return (error);
		    }
		}

		/*
		 * on is the offset into the current bp.  Figure out how many
		 * bytes we can copy out of the bp.  Note that bcount is
		 * NOT DEV_BSIZE aligned.
		 *
		 * Then figure out how many bytes we can copy into the uio.
		 */

		n = 0;
		if (on < bcount)
			n = min((unsigned)(bcount - on), uio->uio_resid);
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		    }
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
			return (0);
		}
		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			    brelse(bp);
		    }
		    while (error == NFSERR_BAD_COOKIE) {
			printf("got bad cookie vp %p bp %p\n", vp, bp);
			nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			/*
			 * Yuck!  The directory has been modified on the
			 * server.  The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 *
			 * Leave the last bp intact unless there is an error.
			 * Loop back up to the while if the error is another
			 * NFSERR_BAD_COOKIE (double yuck!).
			 */
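			/*
			 * For example: if lbn == 3 when the bad cookie is
			 * hit, blocks 0, 1 and 2 are read and released below
			 * purely to regenerate their offset cookies, and
			 * only block 3 is kept for the caller.
			 */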
			for (i = 0; i <= lbn && !error; i++) {
			    if (np->n_direofoffset
				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
			    if (!bp)
				return (EINTR);
			    if ((bp->b_flags & B_CACHE) == 0) {
				    bp->b_flags |= B_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(bp, cred, p);
				    /*
				     * no error + B_INVAL == directory EOF,
				     * use the block.
				     */
				    if (error == 0 && (bp->b_flags & B_INVAL))
					    break;
			    }
			    /*
			     * An error will throw away the block and the
			     * for loop will break out.  If no error and this
			     * is not the block we want, we throw away the
			     * block and go for the next one via the for loop.
			     */
			    if (error || i < lbn)
				    brelse(bp);
			}
		    }
		    /*
		     * The above while is repeated if we hit another cookie
		     * error.  If we hit an error and it wasn't a cookie
		     * error, we give up.
		     */
		    if (error)
			    return (error);
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (bp->b_flags & B_INVAL) == 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred, p)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
		 * chopped for the EOF condition, we cannot tell how large
		 * NFS directories are going to be until we hit EOF.  So
		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state ( due to
		 * being VMIO ) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
		break;
	    }

	    if (n > 0) {
		    error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		/*
		 * Invalidate buffer if caching is disabled, forcing a
		 * re-read from the remote later.
		 */
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
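
/*
 * Except for the uncached nqnfs case, nfs_write() below does not issue
 * the write rpc itself.  It copies user data into buffer cache blocks,
 * tracks the dirty byte range in b_dirtyoff/b_dirtyend, and pushes the
 * buffer out with VOP_BWRITE(), an async nfs_writebp(), or bdwrite();
 * the rpc is performed when the buffer is eventually written through
 * nfs_doio().
 */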

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bcount;
	int n, on, error = 0, iomode, must_commit;
	int haverslock = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
restart:
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, p);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	/*
	 * We need to obtain the rslock if we intend to modify np->n_size
	 * in order to guarantee the append point with multiple contending
	 * writers, to guarantee that no other appenders modify n_size
	 * while we are trying to obtain a truncated buffer (i.e. to avoid
	 * accidentally truncating data written by another appender due to
	 * the race), and to ensure that the buffer is populated prior to
	 * our extending of the file.  We hold rslock through the entire
	 * operation.
	 *
	 * Note that we do not synchronize the case where someone truncates
	 * the file while we are appending to it because attempting to lock
	 * this case may deadlock other parts of the system unexpectedly.
	 */
	if ((ioflag & IO_APPEND) ||
	    uio->uio_offset + uio->uio_resid > np->n_size) {
		switch(nfs_rslock(np, p)) {
		case ENOLCK:
			goto restart;
			/* not reached */
		case EINTR:
		case ERESTART:
			return(EINTR);
			/* not reached */
		default:
			break;
		}
		haverslock = 1;
	}

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		if (haverslock)
			nfs_rsunlock(np, p);
		return (EFBIG);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;

	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				break;
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					break;
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			break;
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
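
		/*
		 * For example: with biosize 8192, uio_offset 20000 and
		 * uio_resid 10000, the first pass writes n == 4576 bytes
		 * at on == 3616 within block 2, and the second pass writes
		 * the remaining 5424 bytes at offset 0 of block 3.
		 */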
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */

		if (uio->uio_offset == np->n_size && n) {
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, p);

			if (bp != NULL) {
				long save;

				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);

				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}
			bp = nfs_getcacheblk(vp, lbn, bcount, p);
			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
		}

		if (!bp) {
			error = EINTR;
			break;
		}

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thus,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~(B_ERROR | B_INVAL);
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (!bp) {
			error = EINTR;
			break;
		}
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchrony.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (VOP_BWRITE(bp->b_vp, bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}
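
		/*
		 * For example: with an existing dirty region of [100, 500)
		 * a new write at on == 600 is discontiguous and forces the
		 * old region out via the bwrite above, while a new write
		 * covering [400, 700) is simply merged into [100, 700) by
		 * the dirtyoff/dirtyend update further below.
		 */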

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				break;
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					break;
				np->n_brev = np->n_lrev;
				goto again;
			}
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}
		/*
		 * If IO_NOWDRAIN then set B_NOWDRAIN (e.g. nfs-backed VN
		 * filesystem).  XXX also use for loopback NFS mounts.
		 */
		if (ioflag & IO_NOWDRAIN)
			bp->b_flags |= B_NOWDRAIN;

		/*
		 * If the lease is non-cacheable or IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = VOP_BWRITE(bp->b_vp, bp);
			if (error)
				break;
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					break;
			}
		} else if ((n + on) == biosize &&
		    (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0, 0);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	if (haverslock)
		nfs_rsunlock(np, p);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else {
		bp = getblk(vp, bn, size, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}
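
/*
 * The b_blkno assignment above keeps the device block index consistent
 * with the logical block index: with an 8192 byte biosize and a 512 byte
 * DEV_BSIZE, logical block bn maps to b_blkno == bn * 16.
 */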

/*
 * Flush and invalidate all dirty buffers.  If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if (vp->v_flag & VXLOCK) {
		return (0);
	}

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}
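
/*
 * The NFLUSHINPROG/NFLUSHWANT handshake above serializes flushers: the
 * first caller sets NFLUSHINPROG and does the work, later callers set
 * NFLUSHWANT and sleep on np->n_flag, and the active flusher issues the
 * wakeup when it clears NFLUSHINPROG.
 */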

/*
 * Initiate asynchronous I/O.  Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(bp, cred, procp)
	register struct buf *bp;
	struct ucred *cred;
	struct proc *procp;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	/*
	 * If no async daemons then return EIO to force caller to run the rpc
	 * synchronously.
	 */
	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);

	/*
	 * Commits are usually short and sweet so let's save some cpu and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	if ((bp->b_flags & (B_READ|B_NEEDCOMMIT)) == B_NEEDCOMMIT &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		return(EIO);
	}

again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather than return EIO.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, procp))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		BUF_KERNPROC(bp);
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}
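
/*
 * The 2 * nfs_numasync bound on nm_bufqlen above appears chosen to keep
 * every iod busy with one queued buffer in reserve apiece, while still
 * bounding how much i/o can pile up behind an unresponsive server.
 */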

/*
 * Do an I/O operation to/from a cache block.  This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	struct uio *uiop;
	struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~(B_ERROR | B_INVAL);

	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		int com;

		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;

	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);

		if (!error) {
		    if (uiop->uio_resid) {
			/*
			 * If we had a short read with no error, we must have
			 * hit a file hole.  We should zero-fill the remainder.
			 * This can also occur if the server hits the file EOF.
			 *
			 * Holes used to be able to occur due to pending
			 * writes, but that is not possible any longer.
			 */
			int nread = bp->b_bcount - uiop->uio_resid;
			int left = uiop->uio_resid;

			if (left > 0)
				bzero((char *)bp->b_data + nread, left);
			uiop->uio_resid = 0;
		    }
		}
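
		/*
		 * For example: with b_bcount 8192 and a 3000 byte uio_resid
		 * left by the rpc, nread == 5192 and the final 3000 bytes
		 * of the buffer are zeroed.
		 */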
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
			PHOLD(p);
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		/*
		 * end-of-directory sets B_INVAL but does not generate an
		 * error.
		 */
		if (error == 0 && uiop->uio_resid == bp->b_bcount)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf("nfs_doio: type %x unexpected\n",vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    /*
	     * If we only need to commit, try to commit
	     */
	    if (bp->b_flags & B_NEEDCOMMIT) {
		    int retv;
		    off_t off;

		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
		    bp->b_flags |= B_WRITEINPROG;
		    retv = nfs_commit(
				bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
				bp->b_wcred, p);
		    bp->b_flags &= ~B_WRITEINPROG;
		    if (retv == 0) {
			    bp->b_dirtyoff = bp->b_dirtyend = 0;
			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
			    bp->b_resid = 0;
			    biodone(bp);
			    return (0);
		    }
		    if (retv == NFSERR_STALEWRITEVERF) {
			    nfs_clearcommit(bp->b_vp->v_mount);
		    }
	    }

	    /*
	     * Setup for actual write
	     */

	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;

		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;

		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);

		/*
		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
		 * to cluster the buffers needing commit.  This will allow
		 * the system to submit a single commit rpc for the whole
		 * cluster.  We can do this even if the buffer is not 100%
		 * dirty (relative to the NFS blocksize), so we optimize the
		 * append-to-file-case.
		 *
		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
		 * cleared because write clustering only works for commit
		 * rpc's, not for the data portion of the write).
		 */

		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		    bp->b_flags |= B_NEEDCOMMIT;
		    if (bp->b_dirtyoff == 0
			&& bp->b_dirtyend == bp->b_bcount)
			bp->b_flags |= B_CLUSTEROK;
		} else {
		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
		}
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set B_ERROR and report the interruption
		 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
		 * is not relevant, so the rpc attempt is essentially
		 * a noop.  For the case of a V3 write rpc not being
		 * committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused.  This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 *
		 * If the buffer is marked B_PAGING, it does not reside on
		 * the vp's paging queues so we cannot call bdirty().  The
		 * bp in this case is not an NFS cache block so we should
		 * be safe. XXX
		 */
		if (error == EINTR
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			int s;

			s = splbio();
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			if ((bp->b_flags & B_PAGING) == 0) {
			    bdirty(bp);
			    bp->b_flags &= ~B_DONE;
			}
			if (error && (bp->b_flags & B_ASYNC) == 0)
			    bp->b_flags |= B_EINTR;
			splx(s);
		} else {
		    if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = np->n_error = error;
			np->n_flag |= NWRITEERR;
		    }
		    bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		biodone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
	    nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}

/*
 * Used to aid in handling ftruncate() operations on the NFS client side.
 * Truncation creates a number of special problems for NFS.  We have to
 * throw away VM pages and buffer cache buffers that are beyond EOF, and
 * we have to properly handle VM pages or (potentially dirty) buffers
 * that straddle the truncation point.
 */

int
nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct proc *p, u_quad_t nsize)
{
	struct nfsnode *np = VTONFS(vp);
	u_quad_t tsize = np->n_size;
	int biosize = vp->v_mount->mnt_stat.f_iosize;
	int error = 0;

	np->n_size = nsize;

	if (np->n_size < tsize) {
		struct buf *bp;
		daddr_t lbn;
		int bufsize;

		/*
		 * vtruncbuf() doesn't get the buffer overlapping the
		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
		 * buffer that now needs to be truncated.
		 */
		error = vtruncbuf(vp, cred, p, nsize, biosize);
		lbn = nsize / biosize;
		bufsize = nsize & (biosize - 1);
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (bp != NULL) {
			/*
			 * nfs_getcacheblk() returns NULL if interrupted on
			 * an interruptible mount, so guard the trim.
			 */
			if (bp->b_dirtyoff > bp->b_bcount)
				bp->b_dirtyoff = bp->b_bcount;
			if (bp->b_dirtyend > bp->b_bcount)
				bp->b_dirtyend = bp->b_bcount;
			bp->b_flags |= B_RELBUF;  /* don't leave garbage around */
			brelse(bp);
		}
	} else {
		vnode_pager_setsize(vp, nsize);
	}
	return(error);
}
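
/*
 * A worked example for nfs_meta_setsize(): truncating from an n_size of
 * 20000 to an nsize of 10000 with an 8192 byte biosize tosses the buffers
 * from logical block 2 upward via vtruncbuf(), then trims the straddling
 * block 1 (bufsize == 1808) so that no stale dirty bytes survive past the
 * new EOF.
 */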