1 /* $NetBSD: nfs_bio.c,v 1.85 2002/10/29 10:15:16 yamt Exp $ */ 2 3 /* 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Rick Macklem at The University of Guelph. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: nfs_bio.c,v 1.85 2002/10/29 10:15:16 yamt Exp $");

#include "opt_nfs.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/dirent.h>
#include <sys/malloc.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>

extern int nfs_numasync;
extern int nfs_commitsize;
extern struct nfsstats nfsstats;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 *
 * vp:     the vnode being read (VREG, VLNK or VDIR; any other type is
 *         only reported via printf in the default cases below)
 * uio:    destination of the data; uio_offset/uio_resid drive the loop
 * ioflag: not referenced in this function
 * cred:   credentials for the RPCs; also cached in np->n_rcred
 * cflag:  NFSBIO_CACHECOOKIES asks the VDIR case to cache every
 *         directory cookie that is copied out (entry-scan loop below)
 *
 * Returns 0 or an errno.
 */
int
nfs_bioread(vp, uio, ioflag, cred, cflag)
	struct vnode *vp;
	struct uio *uio;
	int ioflag, cflag;
	struct ucred *cred;
{
	struct nfsnode *np = VTONFS(vp);
	struct buf *bp = NULL, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct nfsdircache *ndp = NULL, *nndp = NULL;
	caddr_t baddr, ep, edp;
	int got_buf = 0, error = 0, n = 0, on = 0, en, enn;
	int enough = 0;
	struct dirent *dp, *pdp;
	off_t curoff = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	/* Directory "offsets" are opaque cookies, so only check ranges
	 * for non-directories. */
	if (vp->v_type != VDIR && uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
#ifndef NFS_V2_ONLY
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
#endif
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);

	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */

	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldircache(vp, 0);
				np->n_direofoffset = 0;
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			/* Force a fresh getattr so n_mtime is current. */
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				/* Server-side modification: toss cached data. */
				if (vp->v_type == VDIR) {
					nfs_invaldircache(vp, 0);
					np->n_direofoffset = 0;
				}
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}

	/*
	 * update the cached read creds for this node.
	 */

	if (np->n_rcred) {
		crfree(np->n_rcred);
	}
	np->n_rcred = cred;
	crhold(cred);

	do {
#ifndef NFS_V2_ONLY
		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR) {
						nfs_invaldircache(vp, 0);
						np->n_direofoffset = 0;
					}
					error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldircache(vp, 0);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				np->n_direofoffset = 0;
				if (error)
					return (error);
			}
		}
#endif
		/*
		 * Don't cache symlinks.
		 */
		if (np->n_flag & NQNFSNONCACHE
		    || ((vp->v_flag & VROOT) && vp->v_type == VLNK)) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
				    vp->v_type);
			};
		}
		baddr = (caddr_t)0;
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;

			/* Regular files are read straight from the page
			 * cache via UBC windows; EOF simply breaks out. */
			error = 0;
			if (uio->uio_offset >= np->n_size) {
				break;
			}
			while (uio->uio_resid > 0) {
				void *win;
				vsize_t bytelen = MIN(np->n_size - uio->uio_offset,
				    uio->uio_resid);

				if (bytelen == 0)
					break;
				win = ubc_alloc(&vp->v_uobj, uio->uio_offset,
				    &bytelen, UBC_READ);
				error = uiomove(win, bytelen, uio);
				ubc_release(win, 0);
				if (error) {
					break;
				}
			}
			n = 0;
			break;

		case VLNK:
			/* The whole link target fits in one cache block. */
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			got_buf = 1;
			on = 0;
			break;
		case VDIR:
diragain:
			nfsstats.biocache_readdirs++;
			ndp = nfs_searchdircache(vp, uio->uio_offset,
				(nmp->nm_flag & NFSMNT_XLATECOOKIE), 0);
			if (!ndp) {
				/*
				 * We've been handed a cookie that is not
				 * in the cache. If we're not translating
				 * 32 <-> 64, it may be a value that was
				 * flushed out of the cache because it grew
				 * too big. Let the server judge if it's
				 * valid or not. In the translation case,
				 * we have no way of validating this value,
				 * so punt.
				 */
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE)
					return (EINVAL);
				ndp = nfs_enterdircache(vp, uio->uio_offset,
					uio->uio_offset, 0, 0);
			}

			if (uio->uio_offset != 0 &&
			    ndp->dc_cookie == np->n_direofoffset) {
				nfsstats.direofcache_hits++;
				return (0);
			}

			bp = nfs_getcacheblk(vp, ndp->dc_blkno, NFS_DIRBLKSIZ, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				bp->b_dcookie = ndp->dc_blkcookie;
				error = nfs_doio(bp, p);
				if (error) {
					/*
					 * Yuck! The directory has been modified on the
					 * server. Punt and let the userland code
					 * deal with it.
					 */
					brelse(bp);
					if (error == NFSERR_BAD_COOKIE) {
						nfs_invaldircache(vp, 0);
						nfs_vinvalbuf(vp, 0, cred, p, 1);
						error = EINVAL;
					}
					return (error);
				}
			}

			/*
			 * Just return if we hit EOF right away with this
			 * block. Always check here, because direofoffset
			 * may have been set by an nfsiod since the last
			 * check.
			 */
			if (np->n_direofoffset != 0 &&
			    ndp->dc_blkcookie == np->n_direofoffset) {
				brelse(bp);
				return (0);
			}

			/*
			 * Find the entry we were looking for in the block.
			 */

			en = ndp->dc_entry;

			pdp = dp = (struct dirent *)bp->b_data;
			edp = bp->b_data + bp->b_bcount - bp->b_resid;
			enn = 0;
			while (enn < en && (caddr_t)dp < edp) {
				pdp = dp;
				dp = (struct dirent *)((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			/*
			 * If the entry number was bigger than the number of
			 * entries in the block, or the cookie of the previous
			 * entry doesn't match, the directory cache is
			 * stale. Flush it and try again (i.e. go to
			 * the server).
			 */
			if ((caddr_t)dp >= edp || (caddr_t)dp + dp->d_reclen > edp ||
			    (en > 0 && NFS_GETCOOKIE(pdp) != ndp->dc_cookie)) {
#ifdef DEBUG
				printf("invalid cache: %p %p %p off %lx %lx\n",
				    pdp, dp, edp,
				    (unsigned long)uio->uio_offset,
				    (unsigned long)NFS_GETCOOKIE(pdp));
#endif
				brelse(bp);
				nfs_invaldircache(vp, 0);
				nfs_vinvalbuf(vp, 0, cred, p, 0);
				goto diragain;
			}

			on = (caddr_t)dp - bp->b_data;

			/*
			 * Cache all entries that may be exported to the
			 * user, as they may be thrown back at us. The
			 * NFSBIO_CACHECOOKIES flag indicates that all
			 * entries are being 'exported', so cache them all.
			 */

			/* NOTE(review): when starting at the first entry,
			 * advance dp once so that pdp stays one entry
			 * behind — NFS_GETCOOKIE(pdp) is what gets cached
			 * in the scan loop below; confirm intent. */
			if (en == 0 && pdp == dp) {
				dp = (struct dirent *)
					((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			if (uio->uio_resid < (bp->b_bcount - bp->b_resid - on)) {
				n = uio->uio_resid;
				enough = 1;
			} else
				n = bp->b_bcount - bp->b_resid - on;

			ep = bp->b_data + on + n;

			/*
			 * Find last complete entry to copy, caching entries
			 * (if requested) as we go.
			 */

			while ((caddr_t)dp < ep && (caddr_t)dp + dp->d_reclen <= ep) {
				if (cflag & NFSBIO_CACHECOOKIES) {
					nndp = nfs_enterdircache(vp, NFS_GETCOOKIE(pdp),
					    ndp->dc_blkcookie, enn, bp->b_lblkno);
					if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
						NFS_STASHCOOKIE32(pdp,
						    nndp->dc_cookie32);
					}
				}
				pdp = dp;
				dp = (struct dirent *)((caddr_t)dp + dp->d_reclen);
				enn++;
			}

			/*
			 * If the last requested entry was not the last in the
			 * buffer (happens if NFS_DIRFRAGSIZ < NFS_DIRBLKSIZ),
			 * cache the cookie of the last requested one, and
			 * set of the offset to it.
			 */

			if ((on + n) < bp->b_bcount - bp->b_resid) {
				curoff = NFS_GETCOOKIE(pdp);
				nndp = nfs_enterdircache(vp, curoff, ndp->dc_blkcookie,
				    enn, bp->b_lblkno);
				if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
					NFS_STASHCOOKIE32(pdp, nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			} else
				curoff = bp->b_dcookie;

			/*
			 * Always cache the entry for the next block,
			 * so that readaheads can use it.
			 */
			nndp = nfs_enterdircache(vp, bp->b_dcookie, bp->b_dcookie, 0, 0);
			if (nmp->nm_flag & NFSMNT_XLATECOOKIE) {
				if (curoff == bp->b_dcookie) {
					NFS_STASHCOOKIE32(pdp, nndp->dc_cookie32);
					curoff = nndp->dc_cookie32;
				}
			}

			n = ((caddr_t)pdp + pdp->d_reclen) - (bp->b_data + on);

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 * directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    np->n_direofoffset == 0 && !(np->n_flag & NQNFSNONCACHE)) {
				rabp = nfs_getcacheblk(vp, nndp->dc_blkno,
				    NFS_DIRBLKSIZ, p);
				if (rabp) {
					if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) {
						rabp->b_dcookie = nndp->dc_cookie;
						rabp->b_flags |= (B_READ | B_ASYNC);
						if (nfs_asyncio(rabp)) {
							rabp->b_flags |= B_INVAL;
							brelse(rabp);
						}
					} else
						brelse(rabp);
				}
			}
			got_buf = 1;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",vp->v_type);
			break;
		}

		/* Copy the selected window (VLNK/VDIR use the buffer). */
		if (n > 0) {
			if (!baddr)
				baddr = bp->b_data;
			error = uiomove(baddr + on, (int)n, uio);
		}
		/* Post-copy fixups: directories advance the offset cookie;
		 * n = 0 terminates the loop when the caller's buffer was
		 * the limiting factor. */
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				bp->b_flags |= B_INVAL;
			uio->uio_offset = curoff;
			if (enough)
				n = 0;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",vp->v_type);
		}
		if (got_buf)
			brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(v)
	void *v;
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap = v;
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	void *win;
	voff_t oldoff, origoff;
	vsize_t bytelen;
	int error = 0, iomode, must_commit;
	int extended = 0, wrotedta = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	/* Report (once) any deferred write error from an nfsiod. */
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
#ifndef NFS_V2_ONLY
	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
		(void)nfs_fsinfo(nmp, vp, cred, p);
#endif
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			/* Refresh attributes so we append at the server's
			 * idea of EOF. */
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}

	/*
	 * update the cached write creds for this node.
	 */

	if (np->n_wcred) {
		crfree(np->n_wcred);
	}
	np->n_wcred = cred;
	crhold(cred);

	/* Uncached (nqnfs) single-iovec writes go straight to the server. */
	if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		iomode = NFSV3WRITE_FILESYNC;
		error = nfs_writerpc(vp, uio, &iomode, &must_commit);
		if (must_commit)
			nfs_clearcommit(vp->v_mount);
		return (error);
	}

	origoff = uio->uio_offset;
	do {
		boolean_t extending; /* if we are extending whole pages */
		u_quad_t oldsize;
		oldoff = uio->uio_offset;
		bytelen = uio->uio_resid;

#ifndef NFS_V2_ONLY
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
#endif
		nfsstats.biocache_writes++;

		/* Grow the file size first; oldsize lets us back out
		 * if the copyin faults. */
		oldsize = np->n_size;
		np->n_flag |= NMODIFIED;
		if (np->n_size < uio->uio_offset + bytelen) {
			np->n_size = uio->uio_offset + bytelen;
		}
		/* Page-aligned writes at/past EOF need not fault in the
		 * old page contents (UBC_FAULTBUSY). */
		extending = ((uio->uio_offset & PAGE_MASK) == 0 &&
		    (bytelen & PAGE_MASK) == 0 &&
		    uio->uio_offset >= vp->v_size);
		if (extending) {
			win = ubc_alloc(&vp->v_uobj, uio->uio_offset, &bytelen,
			    UBC_WRITE | UBC_FAULTBUSY);
		} else {
			win = ubc_alloc(&vp->v_uobj, uio->uio_offset, &bytelen,
			    UBC_WRITE);
		}
		error = uiomove(win, bytelen, uio);
		ubc_release(win, 0);
		if (error) {
			if (extending) {
				/*
				 * backout size and free pages past eof.
				 */
				np->n_size = oldsize;
				(void)VOP_PUTPAGES(vp, round_page(vp->v_size),
				    0, PGO_SYNCIO | PGO_FREE);
			}
			break;
		}
		wrotedta = 1;

		/*
		 * update UVM's notion of the size now that we've
		 * copied the data into the vnode's pages.
		 */

		if (vp->v_size < uio->uio_offset) {
			uvm_vnp_setsize(vp, uio->uio_offset);
			extended = 1;
		}

		/* Crossed a write-block (nm_wsize) boundary: push the
		 * completed range out.
		 * NOTE(review): an error from this VOP_PUTPAGES does not
		 * terminate the loop; the last value of 'error' is what is
		 * eventually returned — confirm this is intended. */
		if ((oldoff & ~(nmp->nm_wsize - 1)) !=
		    (uio->uio_offset & ~(nmp->nm_wsize - 1))) {
			simple_lock(&vp->v_interlock);
			error = VOP_PUTPAGES(vp,
			    trunc_page(oldoff & ~(nmp->nm_wsize - 1)),
			    round_page((uio->uio_offset + nmp->nm_wsize - 1) &
				~(nmp->nm_wsize - 1)), PGO_CLEANIT);
		}
	} while (uio->uio_resid > 0);
	if (wrotedta)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	/* Synchronous or uncached writes: flush everything we wrote. */
	if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
		simple_lock(&vp->v_interlock);
		error = VOP_PUTPAGES(vp,
		    trunc_page(origoff & ~(nmp->nm_wsize - 1)),
		    round_page((uio->uio_offset + nmp->nm_wsize - 1) &
			~(nmp->nm_wsize - 1)),
		    PGO_CLEANIT | PGO_SYNCIO);
	}
	return error;
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		/* Interruptible mount: poll with PCATCH so a signal can
		 * abort the wait for the block. */
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, NULL, p))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers.
If another process is already 714 * doing the flush, just wait for completion. 715 */ 716 int 717 nfs_vinvalbuf(vp, flags, cred, p, intrflg) 718 struct vnode *vp; 719 int flags; 720 struct ucred *cred; 721 struct proc *p; 722 int intrflg; 723 { 724 struct nfsnode *np = VTONFS(vp); 725 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 726 int error = 0, slpflag, slptimeo; 727 728 if ((nmp->nm_flag & NFSMNT_INT) == 0) 729 intrflg = 0; 730 if (intrflg) { 731 slpflag = PCATCH; 732 slptimeo = 2 * hz; 733 } else { 734 slpflag = 0; 735 slptimeo = 0; 736 } 737 /* 738 * First wait for any other process doing a flush to complete. 739 */ 740 while (np->n_flag & NFLUSHINPROG) { 741 np->n_flag |= NFLUSHWANT; 742 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", 743 slptimeo); 744 if (error && intrflg && nfs_sigintr(nmp, NULL, p)) 745 return (EINTR); 746 } 747 748 /* 749 * Now, flush as required. 750 */ 751 np->n_flag |= NFLUSHINPROG; 752 error = vinvalbuf(vp, flags, cred, p, slpflag, 0); 753 while (error) { 754 if (intrflg && nfs_sigintr(nmp, NULL, p)) { 755 np->n_flag &= ~NFLUSHINPROG; 756 if (np->n_flag & NFLUSHWANT) { 757 np->n_flag &= ~NFLUSHWANT; 758 wakeup((caddr_t)&np->n_flag); 759 } 760 return (EINTR); 761 } 762 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); 763 } 764 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); 765 if (np->n_flag & NFLUSHWANT) { 766 np->n_flag &= ~NFLUSHWANT; 767 wakeup((caddr_t)&np->n_flag); 768 } 769 return (0); 770 } 771 772 /* 773 * Initiate asynchronous I/O. Return an error if no nfsiods are available. 774 * This is mainly to avoid queueing async I/O requests when the nfsiods 775 * are all hung on a dead server. 
 */

/*
 * bp - buffer to queue for an nfsiod; on success it is appended to the
 *      mount's nm_bufq and 0 is returned.  Returns EIO when no iod can
 *      take it, so the caller must do the I/O synchronously.
 */
int
nfs_asyncio(bp)
	struct buf *bp;
{
	int i;
	struct nfsmount *nmp;
	int gotiod, slpflag = 0, slptimeo = 0, error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */

	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			nfs_iodwant[i] = NULL;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point. If so, it will process our request.
	 */

	if (!gotiod && nmp->nm_bufqiods > 0)
		gotiod = TRUE;

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */

	if (gotiod) {

		/*
		 * Ensure that the queue never grows too large.
		 */

		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				"nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, curproc))
					return (EINTR);
				/* After one PCATCH wakeup, fall back to a
				 * plain timed sleep. */
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}

			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */

			if (nmp->nm_bufqiods == 0)
				goto again;
		}
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */

	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
869 */ 870 int 871 nfs_doio(bp, p) 872 struct buf *bp; 873 struct proc *p; 874 { 875 struct uio *uiop; 876 struct vnode *vp; 877 struct nfsnode *np; 878 struct nfsmount *nmp; 879 int error = 0, diff, len, iomode, must_commit = 0; 880 int pushedrange; 881 struct uio uio; 882 struct iovec io; 883 off_t off, cnt; 884 struct uvm_object *uobj; 885 UVMHIST_FUNC("nfs_doio"); UVMHIST_CALLED(ubchist); 886 887 vp = bp->b_vp; 888 uobj = &vp->v_uobj; 889 np = VTONFS(vp); 890 nmp = VFSTONFS(vp->v_mount); 891 uiop = &uio; 892 uiop->uio_iov = &io; 893 uiop->uio_iovcnt = 1; 894 uiop->uio_segflg = UIO_SYSSPACE; 895 uiop->uio_procp = p; 896 897 /* 898 * Historically, paging was done with physio, but no more... 899 */ 900 if (bp->b_flags & B_PHYS) { 901 /* 902 * ...though reading /dev/drum still gets us here. 903 */ 904 io.iov_len = uiop->uio_resid = bp->b_bcount; 905 /* mapping was done by vmapbuf() */ 906 io.iov_base = bp->b_data; 907 uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT; 908 if (bp->b_flags & B_READ) { 909 uiop->uio_rw = UIO_READ; 910 nfsstats.read_physios++; 911 error = nfs_readrpc(vp, uiop); 912 } else { 913 iomode = NFSV3WRITE_DATASYNC; 914 uiop->uio_rw = UIO_WRITE; 915 nfsstats.write_physios++; 916 error = nfs_writerpc(vp, uiop, &iomode, &must_commit); 917 } 918 if (error) { 919 bp->b_flags |= B_ERROR; 920 bp->b_error = error; 921 } 922 } else if (bp->b_flags & B_READ) { 923 io.iov_len = uiop->uio_resid = bp->b_bcount; 924 io.iov_base = bp->b_data; 925 uiop->uio_rw = UIO_READ; 926 switch (vp->v_type) { 927 case VREG: 928 uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT; 929 nfsstats.read_bios++; 930 error = nfs_readrpc(vp, uiop); 931 if (!error && uiop->uio_resid) { 932 933 /* 934 * If len > 0, there is a hole in the file and 935 * no writes after the hole have been pushed to 936 * the server yet. 937 * Just zero fill the rest of the valid area. 
938 */ 939 940 diff = bp->b_bcount - uiop->uio_resid; 941 len = np->n_size - ((((off_t)bp->b_blkno) << DEV_BSHIFT) 942 + diff); 943 if (len > 0) { 944 len = MIN(len, uiop->uio_resid); 945 memset((char *)bp->b_data + diff, 0, len); 946 } 947 } 948 if (p && (vp->v_flag & VTEXT) && 949 (((nmp->nm_flag & NFSMNT_NQNFS) && 950 NQNFS_CKINVALID(vp, np, ND_READ) && 951 np->n_lrev != np->n_brev) || 952 (!(nmp->nm_flag & NFSMNT_NQNFS) && 953 np->n_mtime != np->n_vattr->va_mtime.tv_sec))) { 954 uprintf("Process killed due to " 955 "text file modification\n"); 956 psignal(p, SIGKILL); 957 p->p_holdcnt++; 958 } 959 break; 960 case VLNK: 961 uiop->uio_offset = (off_t)0; 962 nfsstats.readlink_bios++; 963 error = nfs_readlinkrpc(vp, uiop, curproc->p_ucred); 964 break; 965 case VDIR: 966 nfsstats.readdir_bios++; 967 uiop->uio_offset = bp->b_dcookie; 968 if (nmp->nm_flag & NFSMNT_RDIRPLUS) { 969 error = nfs_readdirplusrpc(vp, uiop, curproc->p_ucred); 970 if (error == NFSERR_NOTSUPP) 971 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 972 } 973 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 974 error = nfs_readdirrpc(vp, uiop, curproc->p_ucred); 975 if (!error) { 976 bp->b_dcookie = uiop->uio_offset; 977 } 978 break; 979 default: 980 printf("nfs_doio: type %x unexpected\n",vp->v_type); 981 break; 982 } 983 if (error) { 984 bp->b_flags |= B_ERROR; 985 bp->b_error = error; 986 } 987 } else { 988 int i, npages = bp->b_bufsize >> PAGE_SHIFT; 989 struct vm_page *pgs[npages]; 990 boolean_t needcommit = TRUE; 991 992 if ((bp->b_flags & B_ASYNC) != 0 && NFS_ISV3(vp)) { 993 iomode = NFSV3WRITE_UNSTABLE; 994 } else { 995 iomode = NFSV3WRITE_FILESYNC; 996 } 997 998 for (i = 0; i < npages; i++) { 999 pgs[i] = uvm_pageratop((vaddr_t)bp->b_data + 1000 (i << PAGE_SHIFT)); 1001 if ((pgs[i]->flags & PG_NEEDCOMMIT) == 0) { 1002 needcommit = FALSE; 1003 } 1004 } 1005 if (!needcommit && iomode == NFSV3WRITE_UNSTABLE) { 1006 for (i = 0; i < npages; i++) { 1007 pgs[i]->flags |= PG_NEEDCOMMIT | PG_RDONLY; 1008 
pmap_page_protect(pgs[i], VM_PROT_READ); 1009 } 1010 } 1011 1012 uiop->uio_offset = (((off_t)bp->b_blkno) << DEV_BSHIFT); 1013 off = uiop->uio_offset; 1014 cnt = bp->b_bcount; 1015 1016 /* 1017 * Send the data to the server if necessary, 1018 * otherwise just send a commit rpc. 1019 */ 1020 1021 if (needcommit) { 1022 1023 /* 1024 * If the buffer is in the range that we already committed, 1025 * there's nothing to do. 1026 * 1027 * If it's in the range that we need to commit, push the 1028 * whole range at once, otherwise only push the buffer. 1029 * In both these cases, acquire the commit lock to avoid 1030 * other processes modifying the range. 1031 */ 1032 1033 lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL); 1034 if (!nfs_in_committed_range(vp, off, bp->b_bcount)) { 1035 if (nfs_in_tobecommitted_range(vp, off, bp->b_bcount)) { 1036 pushedrange = 1; 1037 off = np->n_pushlo; 1038 cnt = np->n_pushhi - np->n_pushlo; 1039 } else { 1040 pushedrange = 0; 1041 } 1042 error = nfs_commit(vp, off, cnt, curproc); 1043 if (error == 0) { 1044 if (pushedrange) { 1045 nfs_merge_commit_ranges(vp); 1046 } else { 1047 nfs_add_committed_range(vp, off, cnt); 1048 } 1049 } 1050 } 1051 lockmgr(&np->n_commitlock, LK_RELEASE, NULL); 1052 if (!error) { 1053 bp->b_resid = 0; 1054 simple_lock(&uobj->vmobjlock); 1055 for (i = 0; i < npages; i++) { 1056 pgs[i]->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY); 1057 } 1058 simple_unlock(&uobj->vmobjlock); 1059 biodone(bp); 1060 return (0); 1061 } else if (error == NFSERR_STALEWRITEVERF) { 1062 nfs_clearcommit(bp->b_vp->v_mount); 1063 } 1064 } 1065 io.iov_base = bp->b_data; 1066 io.iov_len = uiop->uio_resid = bp->b_bcount; 1067 uiop->uio_rw = UIO_WRITE; 1068 nfsstats.write_bios++; 1069 error = nfs_writerpc(vp, uiop, &iomode, &must_commit); 1070 if (!error && iomode == NFSV3WRITE_UNSTABLE) { 1071 lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL); 1072 nfs_add_tobecommitted_range(vp, off, cnt); 1073 simple_lock(&uobj->vmobjlock); 1074 for (i = 0; i < npages; 
i++) { 1075 pgs[i]->flags &= ~PG_CLEAN; 1076 } 1077 simple_unlock(&uobj->vmobjlock); 1078 if (np->n_pushhi - np->n_pushlo > nfs_commitsize) { 1079 off = np->n_pushlo; 1080 cnt = nfs_commitsize >> 1; 1081 error = nfs_commit(vp, off, cnt, curproc); 1082 if (!error) { 1083 nfs_add_committed_range(vp, off, cnt); 1084 nfs_del_tobecommitted_range(vp, off, cnt); 1085 } 1086 } 1087 lockmgr(&np->n_commitlock, LK_RELEASE, NULL); 1088 } else if (!error && needcommit) { 1089 lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL); 1090 nfs_del_committed_range(vp, off, cnt); 1091 lockmgr(&np->n_commitlock, LK_RELEASE, NULL); 1092 simple_lock(&uobj->vmobjlock); 1093 for (i = 0; i < npages; i++) { 1094 pgs[i]->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY); 1095 } 1096 simple_unlock(&uobj->vmobjlock); 1097 } else { 1098 if (error) { 1099 bp->b_flags |= B_ERROR; 1100 bp->b_error = np->n_error = error; 1101 np->n_flag |= NWRITEERR; 1102 } 1103 } 1104 } 1105 bp->b_resid = uiop->uio_resid; 1106 if (must_commit || (error == NFSERR_STALEWRITEVERF)) { 1107 nfs_clearcommit(vp->v_mount); 1108 } 1109 biodone(bp); 1110 return (error); 1111 } 1112 1113 /* 1114 * Vnode op for VM getpages. 1115 */ 1116 1117 int 1118 nfs_getpages(v) 1119 void *v; 1120 { 1121 struct vop_getpages_args /* { 1122 struct vnode *a_vp; 1123 voff_t a_offset; 1124 struct vm_page **a_m; 1125 int *a_count; 1126 int a_centeridx; 1127 vm_prot_t a_access_type; 1128 int a_advice; 1129 int a_flags; 1130 } */ *ap = v; 1131 1132 struct vnode *vp = ap->a_vp; 1133 struct uvm_object *uobj = &vp->v_uobj; 1134 struct nfsnode *np = VTONFS(vp); 1135 const int npages = *ap->a_count; 1136 struct vm_page *pg, **pgs, *opgs[npages]; 1137 off_t origoffset, len; 1138 int i, error; 1139 boolean_t v3 = NFS_ISV3(vp); 1140 boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0; 1141 boolean_t locked = (ap->a_flags & PGO_LOCKED) != 0; 1142 1143 /* 1144 * update the cached read creds for this node. 
1145 */ 1146 1147 if (np->n_rcred) { 1148 crfree(np->n_rcred); 1149 } 1150 np->n_rcred = curproc->p_ucred; 1151 crhold(np->n_rcred); 1152 1153 /* 1154 * if we have delayed truncation and it's safe, do it now. 1155 */ 1156 1157 if (ap->a_flags & PGO_SYNCIO) { 1158 nfs_delayedtruncate(vp); 1159 } 1160 1161 /* 1162 * call the genfs code to get the pages. `pgs' may be NULL 1163 * when doing read-ahead. 1164 */ 1165 1166 pgs = ap->a_m; 1167 if (write && locked && v3) { 1168 KASSERT(pgs != NULL); 1169 #ifdef DEBUG 1170 1171 /* 1172 * If PGO_LOCKED is set, real pages shouldn't exists 1173 * in the array. 1174 */ 1175 1176 for (i = 0; i < npages; i++) 1177 KDASSERT(pgs[i] == NULL || pgs[i] == PGO_DONTCARE); 1178 #endif 1179 memcpy(opgs, pgs, npages * sizeof(struct vm_pages *)); 1180 } 1181 error = genfs_getpages(v); 1182 if (error) { 1183 return (error); 1184 } 1185 1186 /* 1187 * for read faults where the nfs node is not yet marked NMODIFIED, 1188 * set PG_RDONLY on the pages so that we come back here if someone 1189 * tries to modify later via the mapping that will be entered for 1190 * this fault. 1191 */ 1192 1193 if (!write && (np->n_flag & NMODIFIED) == 0 && pgs != NULL) { 1194 if (!locked) { 1195 simple_lock(&uobj->vmobjlock); 1196 } 1197 for (i = 0; i < npages; i++) { 1198 pg = pgs[i]; 1199 if (pg == NULL || pg == PGO_DONTCARE) { 1200 continue; 1201 } 1202 pg->flags |= PG_RDONLY; 1203 } 1204 if (!locked) { 1205 simple_unlock(&uobj->vmobjlock); 1206 } 1207 } 1208 if (!write) { 1209 return (0); 1210 } 1211 1212 /* 1213 * this is a write fault, update the commit info. 1214 */ 1215 1216 origoffset = ap->a_offset; 1217 len = npages << PAGE_SHIFT; 1218 1219 if (v3) { 1220 error = lockmgr(&np->n_commitlock, 1221 LK_EXCLUSIVE | (locked ? 
LK_NOWAIT : 0), NULL); 1222 if (error) { 1223 KASSERT(locked != 0); 1224 1225 /* 1226 * Since PGO_LOCKED is set, we need to unbusy 1227 * all pages fetched by genfs_getpages() above, 1228 * tell the caller that there are no pages 1229 * available and put back original pgs array. 1230 */ 1231 1232 uvm_lock_pageq(); 1233 uvm_page_unbusy(pgs, npages); 1234 uvm_unlock_pageq(); 1235 *ap->a_count = 0; 1236 memcpy(pgs, opgs, 1237 npages * sizeof(struct vm_pages *)); 1238 return (error); 1239 } 1240 nfs_del_committed_range(vp, origoffset, len); 1241 nfs_del_tobecommitted_range(vp, origoffset, len); 1242 } 1243 np->n_flag |= NMODIFIED; 1244 if (!locked) { 1245 simple_lock(&uobj->vmobjlock); 1246 } 1247 for (i = 0; i < npages; i++) { 1248 pg = pgs[i]; 1249 if (pg == NULL || pg == PGO_DONTCARE) { 1250 continue; 1251 } 1252 pg->flags &= ~(PG_NEEDCOMMIT | PG_RDONLY); 1253 } 1254 if (!locked) { 1255 simple_unlock(&uobj->vmobjlock); 1256 } 1257 if (v3) { 1258 lockmgr(&np->n_commitlock, LK_RELEASE, NULL); 1259 } 1260 return (0); 1261 } 1262