/*	$NetBSD: nfs_bio.c,v 1.81 2002/05/06 03:20:54 enami Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: nfs_bio.c,v 1.81 2002/05/06 03:20:54 enami Exp $");

#include "opt_nfs.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/dirent.h>
#include <sys/malloc.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>

extern int nfs_numasync;
extern int nfs_commitsize;
extern struct nfsstats nfsstats;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, cflag)
	struct vnode *vp;
	struct uio *uio;
	int ioflag, cflag;
	struct ucred *cred;
{
	struct nfsnode *np = VTONFS(vp);
	struct buf *bp = NULL, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsdircache *ndp = NULL, *nndp = NULL;
	caddr_t baddr, ep, edp;
	int got_buf = 0, error = 0, n = 0, on = 0, en, enn;
	int enough = 0;
	struct dirent *dp, *pdp;
	off_t curoff = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (vp->v_type != VDIR && uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
#ifndef NFS_V2_ONLY
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
#endif
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);

	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need
	 * current attributes, this could be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */

	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldircache(vp, 0);
				np->n_direofoffset = 0;
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR) {
					nfs_invaldircache(vp, 0);
					np->n_direofoffset = 0;
				}
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}

	/*
	 * update the cached read creds for this node.
	 */

	if (np->n_rcred) {
		crfree(np->n_rcred);
	}
	np->n_rcred = cred;
	crhold(cred);

	do {
#ifndef NFS_V2_ONLY
		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ,
					    cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) &&
				     vp->v_type == VDIR)) {
					if (vp->v_type == VDIR) {
						nfs_invaldircache(vp, 0);
						np->n_direofoffset = 0;
					}
					error = nfs_vinvalbuf(vp, V_SAVE,
					    cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR &&
			    (np->n_flag & NMODIFIED)) {
				nfs_invaldircache(vp, 0);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				np->n_direofoffset = 0;
				if (error)
					return (error);
			}
		}
#endif
		/*
		 * Don't cache symlinks.
		 */
		if (np->n_flag & NQNFSNONCACHE
		    || ((vp->v_flag & VROOT) && vp->v_type == VLNK)) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
				    vp->v_type);
			}
		}
		baddr = (caddr_t)0;
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;

			error = 0;
			if (uio->uio_offset >= np->n_size) {
				break;
			}

			/*
			 * Map each window of the file with UBC and copy
			 * it out, until the request (or the file) is
			 * exhausted.
			 */
			while (uio->uio_resid > 0) {
				void *win;
				vsize_t bytelen =
				    MIN(np->n_size - uio->uio_offset,
				    uio->uio_resid);

				if (bytelen == 0)
					break;
				win = ubc_alloc(&vp->v_uobj, uio->uio_offset,
				    &bytelen, UBC_READ);
				error = uiomove(win, bytelen, uio);
				ubc_release(win, 0);
				if (error) {
					break;
				}
			}
			n = 0;
			break;

		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			got_buf = 1;
			on = 0;
			break;
		case VDIR:
diragain:
			nfsstats.biocache_readdirs++;
			ndp = nfs_searchdircache(vp, uio->uio_offset,
			    (nmp->nm_flag & NFSMNT_XLATECOOKIE), 0);
			if (!ndp) {
				/*
				 * We've been handed a cookie that is not
				 * in the cache.
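				 * (A cookie is the opaque per-entry offset
				 * the server hands out for resuming a
				 * READDIR.)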
				 * If we're not translating 32 <-> 64, it
				 * may be a value that was flushed out of
				 * the cache because it grew too big. Let
				 * the server judge if it's valid or not.
				 * In the translation case, we have no way
				 * of validating this value, so punt.
				 */
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE)
					return (EINVAL);
				ndp = nfs_enterdircache(vp, uio->uio_offset,
				    uio->uio_offset, 0, 0);
			}

			if (uio->uio_offset != 0 &&
			    ndp->dc_cookie == np->n_direofoffset) {
				nfsstats.direofcache_hits++;
				return (0);
			}

			bp = nfs_getcacheblk(vp, ndp->dc_blkno,
			    NFS_DIRBLKSIZ, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				bp->b_dcookie = ndp->dc_blkcookie;
				error = nfs_doio(bp, p);
				if (error) {
					/*
					 * Yuck! The directory has been
					 * modified on the server. Punt
					 * and let the userland code
					 * deal with it.
					 */
					brelse(bp);
					if (error == NFSERR_BAD_COOKIE) {
						nfs_invaldircache(vp, 0);
						nfs_vinvalbuf(vp, 0, cred,
						    p, 1);
						error = EINVAL;
					}
					return (error);
				}
			}

			/*
			 * Just return if we hit EOF right away with this
			 * block. Always check here, because direofoffset
			 * may have been set by an nfsiod since the last
			 * check.
			 */
			if (np->n_direofoffset != 0 &&
			    ndp->dc_blkcookie == np->n_direofoffset) {
				brelse(bp);
				return (0);
			}

			/*
			 * Find the entry we were looking for in the block.
			 */

			en = ndp->dc_entry;

			pdp = dp = (struct dirent *)bp->b_data;
			edp = bp->b_data + bp->b_bcount - bp->b_resid;
			enn = 0;
			while (enn < en && (caddr_t)dp < edp) {
				pdp = dp;
				dp = (struct dirent *)((caddr_t)dp +
				    dp->d_reclen);
				enn++;
			}

			/*
			 * If the entry number was bigger than the number of
			 * entries in the block, or the cookie of the previous
			 * entry doesn't match, the directory cache is
			 * stale. Flush it and try again (i.e. go to
			 * the server).
			 */
			if ((caddr_t)dp >= edp ||
			    (caddr_t)dp + dp->d_reclen > edp ||
			    (en > 0 && NFS_GETCOOKIE(pdp) != ndp->dc_cookie)) {
#ifdef DEBUG
				printf("invalid cache: %p %p %p off %lx %lx\n",
				    pdp, dp, edp,
				    (unsigned long)uio->uio_offset,
				    (unsigned long)NFS_GETCOOKIE(pdp));
#endif
				brelse(bp);
				nfs_invaldircache(vp, 0);
				nfs_vinvalbuf(vp, 0, cred, p, 0);
				goto diragain;
			}

			on = (caddr_t)dp - bp->b_data;

			/*
			 * Cache all entries that may be exported to the
			 * user, as they may be thrown back at us. The
			 * NFSBIO_CACHECOOKIES flag indicates that all
			 * entries are being 'exported', so cache them all.
			 */

			if (en == 0 && pdp == dp) {
				dp = (struct dirent *)
				    ((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			if (uio->uio_resid <
			    (bp->b_bcount - bp->b_resid - on)) {
				n = uio->uio_resid;
				enough = 1;
			} else
				n = bp->b_bcount - bp->b_resid - on;

			ep = bp->b_data + on + n;

			/*
			 * Find last complete entry to copy, caching entries
			 * (if requested) as we go.
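			 * (An entry's cookie resumes the directory at the
			 * entry that follows it, so the cache slot for dp
			 * is entered under the cookie of the previous
			 * entry, pdp.)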
			 */

			while ((caddr_t)dp < ep &&
			    (caddr_t)dp + dp->d_reclen <= ep) {
				if (cflag & NFSBIO_CACHECOOKIES) {
					nndp = nfs_enterdircache(vp,
					    NFS_GETCOOKIE(pdp),
					    ndp->dc_blkcookie, enn,
					    bp->b_lblkno);
					if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
						NFS_STASHCOOKIE32(pdp,
						    nndp->dc_cookie32);
					}
				}
				pdp = dp;
				dp = (struct dirent *)((caddr_t)dp +
				    dp->d_reclen);
				enn++;
			}

			/*
			 * If the last requested entry was not the last in the
			 * buffer (happens if NFS_DIRFRAGSIZ < NFS_DIRBLKSIZ),
			 * cache the cookie of the last requested one, and
			 * set the offset to it.
			 */

			if ((on + n) < bp->b_bcount - bp->b_resid) {
				curoff = NFS_GETCOOKIE(pdp);
				nndp = nfs_enterdircache(vp, curoff,
				    ndp->dc_blkcookie, enn, bp->b_lblkno);
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
					NFS_STASHCOOKIE32(pdp,
					    nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			} else
				curoff = bp->b_dcookie;

			/*
			 * Always cache the entry for the next block,
			 * so that readaheads can use it.
			 */
			nndp = nfs_enterdircache(vp, bp->b_dcookie,
			    bp->b_dcookie, 0, 0);
			if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
				if (curoff == bp->b_dcookie) {
					NFS_STASHCOOKIE32(pdp,
					    nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			}

			n = ((caddr_t)pdp + pdp->d_reclen) -
			    (bp->b_data + on);

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you
			 * have the directory offset cookie of the next
			 * block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    np->n_direofoffset == 0 &&
			    !(np->n_flag & NQNFSNONCACHE)) {
				rabp = nfs_getcacheblk(vp, nndp->dc_blkno,
				    NFS_DIRBLKSIZ, p);
				if (rabp) {
					if ((rabp->b_flags &
					    (B_DONE | B_DELWRI)) == 0) {
						rabp->b_dcookie =
						    nndp->dc_cookie;
						rabp->b_flags |=
						    (B_READ | B_ASYNC);
						if (nfs_asyncio(rabp)) {
							rabp->b_flags |=
							    B_INVAL;
							brelse(rabp);
						}
					} else
						brelse(rabp);
				}
			}
			got_buf = 1;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",
			    vp->v_type);
			break;
		}

		if (n > 0) {
			if (!baddr)
				baddr = bp->b_data;
			error = uiomove(baddr + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				bp->b_flags |= B_INVAL;
			uio->uio_offset = curoff;
			if (enough)
				n = 0;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",
			    vp->v_type);
		}
		if (got_buf)
			brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(v)
	void *v;
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap = v;
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	void *win;
	voff_t oldoff, origoff;
	vsize_t bytelen;
	int error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
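	/*
	 * If an earlier asynchronous write failed, hand the saved
	 * error back to this caller and clear it.
	 */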
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
#ifndef NFS_V2_ONLY
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
#endif
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}

	/*
	 * update the cached write creds for this node.
	 */

	if (np->n_wcred) {
		crfree(np->n_wcred);
	}
	np->n_wcred = cred;
	crhold(cred);

	if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		iomode = NFSV3WRITE_FILESYNC;
		error = nfs_writerpc(vp, uio, &iomode, &must_commit);
		if (must_commit)
			nfs_clearcommit(vp->v_mount);
		return (error);
	}

	origoff = uio->uio_offset;
	do {
		oldoff = uio->uio_offset;
		bytelen = uio->uio_resid;

#ifndef NFS_V2_ONLY
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
#endif
		nfsstats.biocache_writes++;

		np->n_flag |= NMODIFIED;
		if (np->n_size < uio->uio_offset + bytelen) {
			np->n_size = uio->uio_offset + bytelen;
		}

		/*
		 * If the write is page-aligned and extends the file,
		 * the pages will be overwritten in full, so have UBC
		 * fault them in busy instead of fetching their old
		 * contents first.
		 */
		if ((uio->uio_offset & PAGE_MASK) == 0 &&
		    (bytelen & PAGE_MASK) == 0 &&
		    uio->uio_offset >= vp->v_size) {
			win = ubc_alloc(&vp->v_uobj, uio->uio_offset, &bytelen,
			    UBC_WRITE | UBC_FAULTBUSY);
		} else {
			win = ubc_alloc(&vp->v_uobj, uio->uio_offset, &bytelen,
			    UBC_WRITE);
		}
		error = uiomove(win, bytelen, uio);
		ubc_release(win, 0);
		if (error) {
			break;
		}

		/*
		 * update UVM's notion of the size now that we've
		 * copied the data into the vnode's pages.
		 */

		if (vp->v_size < uio->uio_offset) {
			uvm_vnp_setsize(vp, uio->uio_offset);
		}

		if ((oldoff & ~(nmp->nm_wsize - 1)) !=
		    (uio->uio_offset & ~(nmp->nm_wsize - 1))) {
			simple_lock(&vp->v_interlock);
			error = VOP_PUTPAGES(vp,
			    trunc_page(oldoff & ~(nmp->nm_wsize - 1)),
			    round_page((uio->uio_offset + nmp->nm_wsize - 1) &
			    ~(nmp->nm_wsize - 1)), PGO_CLEANIT);
		}
	} while (uio->uio_resid > 0);
	if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
		simple_lock(&vp->v_interlock);
		error = VOP_PUTPAGES(vp,
		    trunc_page(origoff & ~(nmp->nm_wsize - 1)),
		    round_page((uio->uio_offset + nmp->nm_wsize - 1) &
		    ~(nmp->nm_wsize - 1)),
		    PGO_CLEANIT | PGO_SYNCIO);
	}
	return error;
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, NULL, p))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
		    slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, NULL, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, NULL, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
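 *
 * An illustrative caller pattern (a sketch; the readahead code above
 * and nfs_strategy do essentially this for async buffers):
 *
 *	bp->b_flags |= B_READ | B_ASYNC;
 *	if (nfs_asyncio(bp))
 *		error = nfs_doio(bp, p);	/* fall back to sync I/O */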
 */

int
nfs_asyncio(bp)
	struct buf *bp;
{
	int i;
	struct nfsmount *nmp;
	int gotiod, slpflag = 0, slptimeo = 0, error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */

	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			nfs_iodwant[i] = NULL;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point. If so, it will process our request.
	 */

	if (!gotiod && nmp->nm_bufqiods > 0)
		gotiod = TRUE;

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */

	if (gotiod) {

		/*
		 * Ensure that the queue never grows too large.
		 */

		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
			    "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, curproc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}

			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */

			if (nmp->nm_bufqiods == 0)
				goto again;
		}
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */

	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, p)
	struct buf *bp;
	struct proc *p;
{
	struct uio *uiop;
	struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	int pushedrange;
	struct uio uio;
	struct iovec io;
	off_t off, cnt;
	struct uvm_object *uobj;
	UVMHIST_FUNC("nfs_doio"); UVMHIST_CALLED(ubchist);

	vp = bp->b_vp;
	uobj = &vp->v_uobj;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more...
	 */
	if (bp->b_flags & B_PHYS) {
		/*
		 * ...though reading /dev/drum still gets us here.
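		 * (/dev/drum, the swap pseudo-device, is read via physio,
		 * so those requests arrive with B_PHYS set.)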
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT;
		if (bp->b_flags & B_READ) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop);
		} else {
			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, &iomode, &must_commit);
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else if (bp->b_flags & B_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop);
			if (!error && uiop->uio_resid) {

				/*
				 * If len > 0, there is a hole in the file and
				 * no writes after the hole have been pushed to
				 * the server yet.
				 * Just zero fill the rest of the valid area.
				 */

				diff = bp->b_bcount - uiop->uio_resid;
				len = np->n_size -
				    ((((off_t)bp->b_blkno) << DEV_BSHIFT) +
				    diff);
				if (len > 0) {
					len = MIN(len, uiop->uio_resid);
					memset((char *)bp->b_data + diff, 0,
					    len);
				}
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr->va_mtime.tv_sec))) {
				uprintf("Process killed due to "
				    "text file modification\n");
				psignal(p, SIGKILL);
				p->p_holdcnt++;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, curproc->p_ucred);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = bp->b_dcookie;
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop,
				    curproc->p_ucred);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop,
				    curproc->p_ucred);
			if (!error) {
				bp->b_dcookie = uiop->uio_offset;
			}
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else {
		int i, npages = bp->b_bufsize >> PAGE_SHIFT;
		struct vm_page *pgs[npages];
		boolean_t needcommit = TRUE;

		if ((bp->b_flags & B_ASYNC) != 0 && NFS_ISV3(vp)) {
			iomode = NFSV3WRITE_UNSTABLE;
		} else {
			iomode = NFSV3WRITE_FILESYNC;
		}

		for (i = 0; i < npages; i++) {
			pgs[i] = uvm_pageratop((vaddr_t)bp->b_data +
			    (i << PAGE_SHIFT));
			if ((pgs[i]->flags & PG_NEEDCOMMIT) == 0) {
				needcommit = FALSE;
			}
		}
		if (!needcommit && iomode == NFSV3WRITE_UNSTABLE) {
			for (i = 0; i < npages; i++) {
				pgs[i]->flags |= PG_NEEDCOMMIT | PG_RDONLY;
				pmap_page_protect(pgs[i], VM_PROT_READ);
			}
		}

		uiop->uio_offset = (((off_t)bp->b_blkno) << DEV_BSHIFT);
		off = uiop->uio_offset;
		cnt = bp->b_bcount;

		/*
		 * Send the data to the server if necessary,
		 * otherwise just send a commit rpc.
		 */

		if (needcommit) {

			/*
			 * If the buffer is in the range that we already
			 * committed, there's nothing to do.
			 *
			 * If it's in the range that we need to commit, push
			 * the whole range at once, otherwise only push the
			 * buffer. In both these cases, acquire the commit
			 * lock to avoid other processes modifying the range.
			 */

			lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL);
			if (!nfs_in_committed_range(vp, off, bp->b_bcount)) {
				if (nfs_in_tobecommitted_range(vp, off,
				    bp->b_bcount)) {
					pushedrange = 1;
					off = np->n_pushlo;
					cnt = np->n_pushhi - np->n_pushlo;
				} else {
					pushedrange = 0;
				}
				error = nfs_commit(vp, off, cnt, curproc);
				if (error == 0) {
					if (pushedrange) {
						nfs_merge_commit_ranges(vp);
					} else {
						nfs_add_committed_range(vp,
						    off, cnt);
					}
				}
			}
			lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
			if (!error) {
				bp->b_resid = 0;
				simple_lock(&uobj->vmobjlock);
				for (i = 0; i < npages; i++) {
					pgs[i]->flags &=
					    ~(PG_NEEDCOMMIT | PG_RDONLY);
				}
				simple_unlock(&uobj->vmobjlock);
				biodone(bp);
				return (0);
			} else if (error == NFSERR_STALEWRITEVERF) {
				nfs_clearcommit(bp->b_vp->v_mount);
			}
		}
		io.iov_base = bp->b_data;
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
		error = nfs_writerpc(vp, uiop, &iomode, &must_commit);
		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
			lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL);
			nfs_add_tobecommitted_range(vp, off, cnt);
			simple_lock(&uobj->vmobjlock);
			for (i = 0; i < npages; i++) {
				pgs[i]->flags &= ~PG_CLEAN;
			}
			simple_unlock(&uobj->vmobjlock);
			if (np->n_pushhi - np->n_pushlo > nfs_commitsize) {
				off = np->n_pushlo;
				cnt = nfs_commitsize >> 1;
				error = nfs_commit(vp, off, cnt, curproc);
				if (!error) {
					nfs_add_committed_range(vp, off, cnt);
					nfs_del_tobecommitted_range(vp, off,
					    cnt);
				}
			}
			lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
		} else if (!error && needcommit) {
			lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL);
			nfs_del_committed_range(vp, off, cnt);
			lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
			simple_lock(&uobj->vmobjlock);
			for (i = 0; i < npages; i++) {
				pgs[i]->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY);
			}
			simple_unlock(&uobj->vmobjlock);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit || (error == NFSERR_STALEWRITEVERF)) {
		nfs_clearcommit(vp->v_mount);
	}
	biodone(bp);
	return (error);
}

/*
 * Vnode op for VM getpages.
 */

int
nfs_getpages(v)
	void *v;
{
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		voff_t a_offset;
		struct vm_page **a_m;
		int *a_count;
		int a_centeridx;
		vm_prot_t a_access_type;
		int a_advice;
		int a_flags;
	} */ *ap = v;

	struct vnode *vp = ap->a_vp;
	struct uvm_object *uobj = &vp->v_uobj;
	struct nfsnode *np = VTONFS(vp);
	const int npages = *ap->a_count;
	struct vm_page *pg, **pgs, *opgs[npages];
	off_t origoffset, len;
	int i, error;
	boolean_t v3 = NFS_ISV3(vp);
	boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0;
	boolean_t locked = (ap->a_flags & PGO_LOCKED) != 0;

	/*
	 * update the cached read creds for this node.
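	 * (They are cached so that RPCs issued later from an nfsiod,
	 * which lacks the faulting process's context, still carry its
	 * credentials.)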
	 */

	if (np->n_rcred) {
		crfree(np->n_rcred);
	}
	np->n_rcred = curproc->p_ucred;
	crhold(np->n_rcred);

	/*
	 * call the genfs code to get the pages. `pgs' may be NULL
	 * when doing read-ahead.
	 */

	pgs = ap->a_m;
	if (write && locked && v3) {
		KASSERT(pgs != NULL);
#ifdef DEBUG

		/*
		 * If PGO_LOCKED is set, real pages shouldn't exist
		 * in the array.
		 */

		for (i = 0; i < npages; i++)
			KDASSERT(pgs[i] == NULL || pgs[i] == PGO_DONTCARE);
#endif
		memcpy(opgs, pgs, npages * sizeof(struct vm_page *));
	}
	error = genfs_getpages(v);
	if (error) {
		return (error);
	}

	/*
	 * for read faults where the nfs node is not yet marked NMODIFIED,
	 * set PG_RDONLY on the pages so that we come back here if someone
	 * tries to modify later via the mapping that will be entered for
	 * this fault.
	 */

	if (!write && (np->n_flag & NMODIFIED) == 0 && pgs != NULL) {
		if (!locked) {
			simple_lock(&uobj->vmobjlock);
		}
		for (i = 0; i < npages; i++) {
			pg = pgs[i];
			if (pg == NULL || pg == PGO_DONTCARE) {
				continue;
			}
			pg->flags |= PG_RDONLY;
		}
		if (!locked) {
			simple_unlock(&uobj->vmobjlock);
		}
	}
	if (!write) {
		return (0);
	}

	/*
	 * this is a write fault, update the commit info.
	 */

	origoffset = ap->a_offset;
	len = npages << PAGE_SHIFT;

	if (v3) {
		error = lockmgr(&np->n_commitlock,
		    LK_EXCLUSIVE | (locked ? LK_NOWAIT : 0), NULL);
		if (error) {
			KASSERT(locked != 0);

			/*
			 * Since PGO_LOCKED is set, we need to unbusy
			 * all pages fetched by genfs_getpages() above,
			 * tell the caller that there are no pages
			 * available and put back original pgs array.
			 */

			uvm_lock_pageq();
			uvm_page_unbusy(pgs, npages);
			uvm_unlock_pageq();
			*ap->a_count = 0;
			memcpy(pgs, opgs,
			    npages * sizeof(struct vm_page *));
			return (error);
		}
		nfs_del_committed_range(vp, origoffset, len);
		nfs_del_tobecommitted_range(vp, origoffset, len);
	}
	np->n_flag |= NMODIFIED;
	if (!locked) {
		simple_lock(&uobj->vmobjlock);
	}
	for (i = 0; i < npages; i++) {
		pg = pgs[i];
		if (pg == NULL || pg == PGO_DONTCARE) {
			continue;
		}
		pg->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY);
	}
	if (!locked) {
		simple_unlock(&uobj->vmobjlock);
	}
	if (v3) {
		lockmgr(&np->n_commitlock, LK_RELEASE, NULL);
	}
	return (0);
}