1 /*
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * %sccs.include.redist.c%
9 *
10 * @(#)nfs_bio.c 8.9 (Berkeley) 03/30/95
11 */
12
13
14 #include <sys/param.h>
15 #include <sys/systm.h>
16 #include <sys/resourcevar.h>
17 #include <sys/signalvar.h>
18 #include <sys/proc.h>
19 #include <sys/buf.h>
20 #include <sys/vnode.h>
21 #include <sys/trace.h>
22 #include <sys/mount.h>
23 #include <sys/kernel.h>
24
25 #include <vm/vm.h>
26
27 #include <nfs/rpcv2.h>
28 #include <nfs/nfsproto.h>
29 #include <nfs/nfs.h>
30 #include <nfs/nfsmount.h>
31 #include <nfs/nqnfs.h>
32 #include <nfs/nfsnode.h>
33
34 struct buf *nfs_getcacheblk();
35 extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
36 extern int nfs_numasync;
37 extern struct nfsstats nfsstats;
38
39 /*
40 * Vnode op for read using bio
41 * Any similarity to readip() is purely coincidental
42 */
/*
 * nfs_bioread: read from an NFS vnode (regular file, symlink or
 * directory) through the buffer cache.
 *
 *	vp	vnode to read from (VREG, VLNK or VDIR)
 *	uio	describes the destination, starting offset and length
 *	ioflag	IO_* flags (not examined in this routine)
 *	cred	credentials used for any rpcs that must be issued
 *
 * Returns 0 on success, otherwise an errno value (EINTR when a
 * buffer could not be obtained on an interruptible mount).
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, bn, bn2, rabn;	/* NOTE(review): bn2 is never used */
	caddr_t baddr;
	int got_buf = 0, nra, error = 0, n = 0, on = 0, not_readin;
	nfsquad_t tquad;		/* NOTE(review): tquad is never used */

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	/*
	 * For a V3 mount that has not yet done so, fetch the server's
	 * fsinfo (transfer sizes etc.) before sizing buffers below.
	 */
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = nmp->nm_rsize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			/* Force a fresh getattr so n_mtime is current. */
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.ts_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.ts_sec) {
				/* File changed on server: toss cached data. */
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.ts_sec;
			}
		}
	}
	do {

		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR)
						nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		}
		/*
		 * Non-cachable lease: bypass the buffer cache and do the
		 * rpc directly (directories still go through the cache
		 * below so the offset cookies stay consistent).
		 */
		if (np->n_flag & NQNFSNONCACHE) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio, cred));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
					vp->v_type);
			};
		}
		baddr = (caddr_t)0;
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);
			bn = lbn * (biosize / DEV_BSIZE);
			not_readin = 1;

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead &&
				    (lbn + 1 + nra) * biosize < np->n_size; nra++) {
					rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE);
					if (!incore(vp, rabn)) {
						rabp = nfs_getcacheblk(vp, rabn, biosize, p);
						if (!rabp)
							return (EINTR);
						if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) {
							rabp->b_flags |= (B_READ | B_ASYNC);
							/* No nfsiod free: drop the buffer. */
							if (nfs_asyncio(rabp, cred)) {
								rabp->b_flags |= B_INVAL;
								brelse(rabp);
							}
						} else
							brelse(rabp);
					}
				}
			}

			/*
			 * If the block is in the cache and has the required data
			 * in a valid region, just copy it out.
			 * Otherwise, get the block and write back/read in,
			 * as required.
			 */
			if ((bp = incore(vp, bn)) &&
			    (bp->b_flags & (B_BUSY | B_WRITEINPROG)) ==
			    (B_BUSY | B_WRITEINPROG))
				got_buf = 0;
			else {
again:
				bp = nfs_getcacheblk(vp, bn, biosize, p);
				if (!bp)
					return (EINTR);
				got_buf = 1;
				if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
					bp->b_flags |= B_READ;
					not_readin = 0;
					error = nfs_doio(bp, cred, p);
					if (error) {
						brelse(bp);
						return (error);
					}
				}
			}
			n = min((unsigned)(biosize - on), uio->uio_resid);
			diff = np->n_size - uio->uio_offset;
			if (diff < n)
				n = diff;
			if (not_readin && n > 0) {
				/*
				 * Cached block does not cover the requested
				 * region: push any dirty data and re-read.
				 */
				if (on < bp->b_validoff || (on + n) > bp->b_validend) {
					if (!got_buf) {
						bp = nfs_getcacheblk(vp, bn, biosize, p);
						if (!bp)
							return (EINTR);
						got_buf = 1;
					}
					bp->b_flags |= B_INVAFTERWRITE;
					if (bp->b_dirtyend > 0) {
						if ((bp->b_flags & B_DELWRI) == 0)
							panic("nfsbioread");
						if (VOP_BWRITE(bp) == EINTR)
							return (EINTR);
					} else
						brelse(bp);
					goto again;
				}
			}
			vp->v_lastr = lbn;
			/* Clip the transfer to the valid region. */
			diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
			if (diff < n)
				n = diff;
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			got_buf = 1;
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			lbn = uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					while (error == NFSERR_BAD_COOKIE) {
						nfs_invaldir(vp);
						error = nfs_vinvalbuf(vp, 0, cred, p, 1);
						/*
						 * Yuck! The directory has been modified on the
						 * server. The only way to get the block is by
						 * reading from the beginning to get all the
						 * offset cookies.
						 */
						for (i = 0; i <= lbn && !error; i++) {
							bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
							if (!bp)
								return (EINTR);
							if ((bp->b_flags & B_DONE) == 0) {
								bp->b_flags |= B_READ;
								error = nfs_doio(bp, cred, p);
								if (error)
									brelse(bp);
							}
						}
					}
					if (error)
						return (error);
				}
			}

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 *  directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    (np->n_direofoffset == 0 ||
			    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    !(np->n_flag & NQNFSNONCACHE) &&
			    !incore(vp, lbn + 1)) {
				rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
				if (rabp) {
					if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) {
						rabp->b_flags |= (B_READ | B_ASYNC);
						if (nfs_asyncio(rabp, cred)) {
							rabp->b_flags |= B_INVAL;
							brelse(rabp);
						}
					} else
						brelse(rabp);
				}
			}
			n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
			got_buf = 1;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",vp->v_type);
			break;
		};

		if (n > 0) {
			if (!baddr)
				baddr = bp->b_data;
			error = uiomove(baddr + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			/* A symlink is read in one shot: terminate the loop. */
			n = 0;
			break;
		case VDIR:
			/* Non-cachable lease: do not keep the dir block. */
			if (np->n_flag & NQNFSNONCACHE)
				bp->b_flags |= B_INVAL;
			break;
		default:
			printf(" nfsbioread: type %x unexpected\n",vp->v_type);
		}
		if (got_buf)
			brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
355
356 /*
357 * Vnode op for write using bio
358 */
/*
 * nfs_write: vnode write operation, through the buffer cache.
 *
 * ap->a_vp	vnode being written (must be VREG)
 * ap->a_uio	source data, offset and length
 * ap->a_ioflag	IO_APPEND / IO_SYNC honored here
 * ap->a_cred	credentials for the write rpcs
 *
 * Returns 0 on success or an errno value; a previously recorded
 * async write error (NWRITEERR) is reported on the next write.
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, bn;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	/* Report (once) any error recorded by an earlier async write. */
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			/* Refetch size from the server before appending. */
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = nmp->nm_rsize;
	do {

		/*
		 * XXX make sure we aren't cached in the VM page cache
		 */
		(void)vnode_pager_uncache(vp);

		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		/*
		 * Non-cachable lease with a simple uio: write through
		 * synchronously without touching the buffer cache.
		 */
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
		bn = lbn * (biosize / DEV_BSIZE);
again:
		bp = nfs_getcacheblk(vp, bn, biosize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		/* Extend (or establish) the dirty region to cover [on, on+n). */
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		/* Keep the valid region consistent with the dirty region. */
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			/* Full block: push it asynchronously. */
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			/* Partial block: leave it as a delayed write. */
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}
560
561 /*
562 * Get an nfs cache block.
563 * Allocate a new one if the block isn't currently in the cache
564 * and return the block marked busy. If the calling process is
565 * interrupted by a signal for an interruptible mount point, return
566 * NULL.
567 */
568 struct buf *
nfs_getcacheblk(vp,bn,size,p)569 nfs_getcacheblk(vp, bn, size, p)
570 struct vnode *vp;
571 daddr_t bn;
572 int size;
573 struct proc *p;
574 {
575 register struct buf *bp;
576 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
577
578 if (nmp->nm_flag & NFSMNT_INT) {
579 bp = getblk(vp, bn, size, PCATCH, 0);
580 while (bp == (struct buf *)0) {
581 if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
582 return ((struct buf *)0);
583 bp = getblk(vp, bn, size, 0, 2 * hz);
584 }
585 } else
586 bp = getblk(vp, bn, size, 0, 0);
587 return (bp);
588 }
589
590 /*
591 * Flush and invalidate all dirty buffers. If another process is already
592 * doing the flush, just wait for completion.
593 */
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 *
 *	vp	vnode whose buffers are flushed
 *	flags	passed through to vinvalbuf() (e.g. V_SAVE)
 *	cred	credentials for writes pushed by the flush
 *	p	calling process (for signal checks)
 *	intrflg	nonzero if the wait may be interrupted by a signal
 *		(only honored on NFSMNT_INT mounts)
 *
 * Returns 0 on success, or EINTR if interrupted. Clears NMODIFIED
 * on success. NFLUSHINPROG/NFLUSHWANT serialize concurrent flushers.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	/* Interruptibility is only meaningful on NFSMNT_INT mounts. */
	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		/* Interrupted: release the flush lock and wake waiters. */
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		/* Retry, this time sleeping with a timeout instead. */
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}
649
650 /*
651 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
652 * This is mainly to avoid queueing async I/O requests when the nfsiods
653 * are all hung on a dead server.
654 */
655 int
nfs_asyncio(bp,cred)656 nfs_asyncio(bp, cred)
657 register struct buf *bp;
658 struct ucred *cred;
659 {
660 register int i;
661
662 if (nfs_numasync == 0)
663 return (EIO);
664 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
665 if (nfs_iodwant[i]) {
666 if (bp->b_flags & B_READ) {
667 if (bp->b_rcred == NOCRED && cred != NOCRED) {
668 crhold(cred);
669 bp->b_rcred = cred;
670 }
671 } else {
672 bp->b_flags |= B_WRITEINPROG;
673 if (bp->b_wcred == NOCRED && cred != NOCRED) {
674 crhold(cred);
675 bp->b_wcred = cred;
676 }
677 }
678
679 TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist);
680 nfs_iodwant[i] = (struct proc *)0;
681 wakeup((caddr_t)&nfs_iodwant[i]);
682 return (0);
683 }
684
685 /*
686 * If it is a read or a write already marked B_WRITEINPROG or B_NOCACHE
687 * return EIO so the process will call nfs_doio() and do it
688 * synchronously.
689 */
690 if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE))
691 return (EIO);
692
693 /*
694 * Just turn the async write into a delayed write, instead of
695 * doing in synchronously. Hopefully, at least one of the nfsiods
696 * is currently doing a write for this file and will pick up the
697 * delayed writes before going back to sleep.
698 */
699 bp->b_flags |= B_DELWRI;
700 reassignbuf(bp, bp->b_vp);
701 biodone(bp);
702 return (0);
703 }
704
705 /*
706 * Do an I/O operation to/from a cache block. This may be called
707 * synchronously or from an nfsiod.
708 */
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 *
 *	bp	the buffer to fill (B_READ) or push (write)
 *	cr	credentials for the rpc
 *	p	calling process, or the nfsiod's proc
 *
 * Returns 0 or an errno value; errors are also latched into the
 * buffer (B_ERROR/b_error) and, for writes, into the nfsnode
 * (NWRITEERR/n_error). biodone() is always called on bp.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;
	nfsquad_t tquad;	/* NOTE(review): tquad is never used */

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	/* Build a single-iovec system-space uio describing the buffer. */
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else
		panic("physio write");
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
		/*
		 * A running executable changed underneath us: kill the
		 * process rather than let it run stale text.
		 */
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.ts_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
			p->p_flag |= P_NOSWAP;
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			/* Server lacks readdirplus: fall back for good. */
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		break;
	    default:
		printf("nfs_doio: type %x unexpected\n",vp->v_type);
		break;
	    };
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    /* Write: push only the dirty region of the buffer. */
	    io.iov_len = uiop->uio_resid = bp->b_dirtyend
		- bp->b_dirtyoff;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		+ bp->b_dirtyoff;
	    io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
	    uiop->uio_rw = UIO_WRITE;
	    nfsstats.write_bios++;
	    if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
		iomode = NFSV3WRITE_UNSTABLE;
	    else
		iomode = NFSV3WRITE_FILESYNC;
	    bp->b_flags |= B_WRITEINPROG;
	    error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
	    if (!error && iomode == NFSV3WRITE_UNSTABLE)
		bp->b_flags |= B_NEEDCOMMIT;
	    else
		bp->b_flags &= ~B_NEEDCOMMIT;
	    bp->b_flags &= ~B_WRITEINPROG;

	    /*
	     * For an interrupted write, the buffer is still valid and the
	     * write hasn't been pushed to the server yet, so we can't set
	     * B_ERROR and report the interruption by setting B_EINTR. For
	     * the B_ASYNC case, B_EINTR is not relevant, so the rpc attempt
	     * is essentially a noop.
	     * For the case of a V3 write rpc not being committed to stable
	     * storage, the block is still dirty and requires either a commit
	     * rpc or another write rpc with iomode == NFSV3WRITE_FILESYNC
	     * before the block is reused. This is indicated by setting the
	     * B_DELWRI and B_NEEDCOMMIT flags.
	     */
	    if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
		bp->b_flags |= B_DELWRI;

		/*
		 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
		 * buffer to the clean list, we have to reassign it back to the
		 * dirty one. Ugh.
		 */
		if (bp->b_flags & B_ASYNC)
		    reassignbuf(bp, vp);
		else
		    bp->b_flags |= B_EINTR;
	    } else {
		if (error) {
		    bp->b_flags |= B_ERROR;
		    bp->b_error = np->n_error = error;
		    np->n_flag |= NWRITEERR;
		}
		bp->b_dirtyoff = bp->b_dirtyend = 0;
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
	    nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}
878