/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 03/30/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/trace.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

struct buf *nfs_getcacheblk();
extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, bn, bn2, rabn;
	caddr_t baddr;
	int got_buf = 0, nra, error = 0, n = 0, on = 0, not_readin;
	nfsquad_t tquad;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = nmp->nm_rsize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.ts_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.ts_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.ts_sec;
			}
		}
	}
	do {

		/*
		 * Get a valid lease.
		 * If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR)
						nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		}
		if (np->n_flag & NQNFSNONCACHE) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio, cred));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
				    vp->v_type);
			}
		}
		baddr = (caddr_t)0;
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);
			bn = lbn * (biosize / DEV_BSIZE);
			not_readin = 1;

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead &&
				    (lbn + 1 + nra) * biosize < np->n_size; nra++) {
					rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE);
					if (!incore(vp, rabn)) {
						rabp = nfs_getcacheblk(vp, rabn, biosize, p);
						if (!rabp)
							return (EINTR);
						if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) {
							rabp->b_flags |= (B_READ | B_ASYNC);
							if (nfs_asyncio(rabp, cred)) {
								rabp->b_flags |= B_INVAL;
								brelse(rabp);
							}
						} else
							brelse(rabp);
					}
				}
			}

			/*
			 * If the block is in the cache and has the required data
			 * in a valid region, just copy it out.
			 * Otherwise, get the block and write back/read in,
			 * as required.
			 */
			if ((bp = incore(vp, bn)) &&
			    (bp->b_flags & (B_BUSY | B_WRITEINPROG)) ==
			    (B_BUSY | B_WRITEINPROG))
				got_buf = 0;
			else {
again:
				bp = nfs_getcacheblk(vp, bn, biosize, p);
				if (!bp)
					return (EINTR);
				got_buf = 1;
				if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
					bp->b_flags |= B_READ;
					not_readin = 0;
					error = nfs_doio(bp, cred, p);
					if (error) {
						brelse(bp);
						return (error);
					}
				}
			}
			n = min((unsigned)(biosize - on), uio->uio_resid);
			diff = np->n_size - uio->uio_offset;
			if (diff < n)
				n = diff;
			if (not_readin && n > 0) {
				if (on < bp->b_validoff || (on + n) > bp->b_validend) {
					if (!got_buf) {
						bp = nfs_getcacheblk(vp, bn, biosize, p);
						if (!bp)
							return (EINTR);
						got_buf = 1;
					}
					bp->b_flags |= B_INVAFTERWRITE;
					if (bp->b_dirtyend > 0) {
						if ((bp->b_flags & B_DELWRI) == 0)
							panic("nfsbioread");
						if (VOP_BWRITE(bp) == EINTR)
							return (EINTR);
					} else
						brelse(bp);
					goto again;
				}
			}
			vp->v_lastr = lbn;
			diff = (on >= bp->b_validend) ?
			    0 : (bp->b_validend - on);
			if (diff < n)
				n = diff;
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			got_buf = 1;
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			lbn = uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					while (error == NFSERR_BAD_COOKIE) {
						nfs_invaldir(vp);
						error = nfs_vinvalbuf(vp, 0, cred, p, 1);
						/*
						 * Yuck! The directory has been modified on the
						 * server. The only way to get the block is by
						 * reading from the beginning to get all the
						 * offset cookies.
						 */
						for (i = 0; i <= lbn && !error; i++) {
							bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
							if (!bp)
								return (EINTR);
							if ((bp->b_flags & B_DONE) == 0) {
								bp->b_flags |= B_READ;
								error = nfs_doio(bp, cred, p);
								if (error)
									brelse(bp);
							}
						}
					}
					if (error)
						return (error);
				}
			}

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 *  directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    (np->n_direofoffset == 0 ||
			    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    !(np->n_flag & NQNFSNONCACHE) &&
			    !incore(vp, lbn + 1)) {
				rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
				if (rabp) {
					if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) {
						rabp->b_flags |= (B_READ | B_ASYNC);
						if (nfs_asyncio(rabp, cred)) {
							rabp->b_flags |= B_INVAL;
							brelse(rabp);
						}
					} else
						brelse(rabp);
				}
			}
			n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
			got_buf = 1;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n", vp->v_type);
			break;
		}

		if (n > 0) {
			if (!baddr)
				baddr = bp->b_data;
			error = uiomove(baddr + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				bp->b_flags |= B_INVAL;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n", vp->v_type);
		}
		if (got_buf)
			brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, bn;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = nmp->nm_rsize;
	do {

		/*
		 * XXX make sure we aren't cached in the VM page cache
		 */
		(void)vnode_pager_uncache(vp);

		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
		bn = lbn * (biosize / DEV_BSIZE);
again:
		bp = nfs_getcacheblk(vp, bn, biosize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required,
		 * in case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
		/*
		 * If the lease is non-cachable or IO_SYNC, do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
		    (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
		    slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	register int i;

	if (nfs_numasync == 0)
		return (EIO);
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			if (bp->b_flags & B_READ) {
				if (bp->b_rcred == NOCRED && cred != NOCRED) {
					crhold(cred);
					bp->b_rcred = cred;
				}
			} else {
				bp->b_flags |= B_WRITEINPROG;
				if (bp->b_wcred == NOCRED && cred != NOCRED) {
					crhold(cred);
					bp->b_wcred = cred;
				}
			}

			TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist);
			nfs_iodwant[i] = (struct proc *)0;
			wakeup((caddr_t)&nfs_iodwant[i]);
			return (0);
		}

	/*
	 * If it is a read, or a write already marked B_WRITEINPROG or
	 * B_NOCACHE, return EIO so the process will call nfs_doio() and
	 * do it synchronously.
	 */
	if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE))
		return (EIO);

	/*
	 * Just turn the async write into a delayed write, instead of
	 * doing it synchronously. Hopefully, at least one of the nfsiods
	 * is currently doing a write for this file and will pick up the
	 * delayed writes before going back to sleep.
	 */
	bp->b_flags |= B_DELWRI;
	reassignbuf(bp, bp->b_vp);
	biodone(bp);
	return (0);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;
	nfsquad_t tquad;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		if (bp->b_flags & B_READ) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else
			panic("physio write");
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else if (bp->b_flags & B_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			if (!error) {
				bp->b_validoff = 0;
				if (uiop->uio_resid) {
					/*
					 * If len > 0, there is a hole in the file and
					 * no writes after the hole have been pushed to
					 * the server yet.
					 * Just zero fill the rest of the valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
					len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
					    + diff);
					if (len > 0) {
						len = min(len, uiop->uio_resid);
						bzero((char *)bp->b_data + diff, len);
						bp->b_validend = diff + len;
					} else
						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			    NQNFS_CKINVALID(vp, np, ND_READ) &&
			    np->n_lrev != np->n_brev) ||
			    (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			    np->n_mtime != np->n_vattr.va_mtime.ts_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_flag |= P_NOSWAP;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
			iomode = NFSV3WRITE_UNSTABLE;
		else
			iomode = NFSV3WRITE_FILESYNC;
		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		if (!error && iomode == NFSV3WRITE_UNSTABLE)
			bp->b_flags |= B_NEEDCOMMIT;
		else
			bp->b_flags &= ~B_NEEDCOMMIT;
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid and the
		 * write hasn't been pushed to the server yet, so we can't set
		 * B_ERROR and report the interruption by setting B_EINTR. For
		 * the B_ASYNC case, B_EINTR is not relevant, so the rpc attempt
		 * is essentially a noop.
		 * For the case of a V3 write rpc not being committed to stable
		 * storage, the block is still dirty and requires either a commit
		 * rpc or another write rpc with iomode == NFSV3WRITE_FILESYNC
		 * before the block is reused. This is indicated by setting the
		 * B_DELWRI and B_NEEDCOMMIT flags.
		 */
		if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			bp->b_flags |= B_DELWRI;

			/*
			 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
			 * buffer to the clean list, we have to reassign it back to the
			 * dirty one. Ugh.
			 */
			if (bp->b_flags & B_ASYNC)
				reassignbuf(bp, vp);
			else
				bp->b_flags |= B_EINTR;
		} else {
			if (error) {
				bp->b_flags |= B_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}