/* xref: /original-bsd/sys/kern/vfs_bio.c (revision 07d71086) */
/*
 * Copyright (c) 1982, 1986, 1989 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley.  The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 *	@(#)vfs_bio.c	7.8 (Berkeley) 08/15/89
 */

#include "param.h"
#include "user.h"
#include "buf.h"
#include "vnode.h"
#include "trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
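/*
 * The buffer is returned to the caller busy (B_BUSY); the caller is
 * responsible for releasing it with brelse() or writing it out with
 * bwrite()/bdwrite().  The return value is 0 on a cache hit, otherwise
 * whatever biowait() reports for the read.
 */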
bread(vp, blkno, size, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct buf **bpp;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags&(B_DONE|B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller)
 */
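/*
 * If the read-ahead block turns out to be a cache hit, its buffer is
 * simply released again; the caller only ever sees the buffer for
 * blkno, obtained here or through bread() below.
 */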
breada(vp, blkno, size, rablkno, rabsize, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct buf **bpp;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags&(B_DONE|B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size),
			    blkno);
			u.u_ru.ru_inblock++;		/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size),
			    blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(vp, rablkno)) {
		rabp = getblk(vp, rablkno, rabsize);
		if (rabp->b_flags & (B_DONE|B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA,
			    pack(vp->v_mount->m_fsid[0], rabsize), rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA,
			    pack(vp->v_mount->m_fsid[0], rabsize), rablkno);
			u.u_ru.ru_inblock++;		/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, bpp));
	return (biowait(bp));
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
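/*
 * Returns the error from biowait() for synchronous writes; asynchronous
 * writes return 0, since any i/o error can only show up later, when the
 * write completes.
 */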
bwrite(bp)
	register struct buf *bp;
{
	register int flag;
	int error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bcount), bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 */
	if ((flag&B_ASYNC) == 0) {
		error = biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		bp->b_flags |= B_AGE;
		error = 0;
	}
	return (error);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
#ifdef notdef
	/*
	 * This does not work for buffers associated with
	 * vnodes that are remote - they have no dev.
	 * Besides, we don't use bio with tapes, so rather
	 * than develop a fix, we just ifdef this out for now.
	 */
	if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
#endif
	bp->b_flags |= B_DELWRI | B_DONE;
	brelse(bp);
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
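/*
 * The buffer goes back on one of the free lists:
 *	BQ_EMPTY	if it has no memory attached (b_bufsize == 0),
 *	BQ_AGE		if its contents are worthless (B_ERROR or B_INVAL)
 *			or it has been marked B_AGE,
 *	BQ_LOCKED	if it may not be reassigned (B_LOCKED),
 *	BQ_LRU		otherwise.
 */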
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	trace(TR_BRELSE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_blkno);
	/*
	 * If someone is waiting for this buffer, or for any
	 * free buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags & B_NOCACHE) {
		bp->b_flags |= B_INVAL;
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			brelvp(bp);			/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE|B_NOCACHE);
	splx(s);
}

249 
250 /*
251  * See if the block is associated with some buffer
252  * (mainly to avoid getting hung up on a wait in breada)
253  */
254 incore(vp, blkno)
255 	struct vnode *vp;
256 	daddr_t blkno;
257 {
258 	register struct buf *bp;
259 	register struct buf *dp;
260 
261 	dp = BUFHASH(vp, blkno);
262 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
263 		if (bp->b_blkno == blkno && bp->b_vp == vp &&
264 		    (bp->b_flags & B_INVAL) == 0)
265 			return (1);
266 	return (0);
267 }
268 
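/*
 * Return the block via bread() if it is already in core; otherwise
 * return a null buffer pointer without starting any I/O.
 */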
baddr(vp, blkno, size, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct buf **bpp;
{

	if (incore(vp, blkno))
		return (bread(vp, blkno, size, bpp));
	*bpp = 0;
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * If we find the buffer, but it is dirty (marked DELWRI) and
 * its size is changing, we must write it out first. When the
 * buffer is shrinking, the write is done by brealloc to avoid
 * losing the unwritten data. When the buffer is growing, the
 * write is done by getblk, so that bread will not read stale
 * disk data over the modified data in the buffer.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
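/*
 * The buffer returned is busy and off the free lists.  B_CACHE is set
 * only when the block was found in the cache; a buffer that had to be
 * reassigned comes back without it, letting the caller distinguish a
 * hit from a miss.
 */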
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow. This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute. mkm 9/85
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_vp != vp ||
		    bp->b_flags&B_INVAL)
			continue;
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size) {
			if (bp->b_bcount < size && (bp->b_flags&B_DELWRI)) {
				bp->b_flags &= ~B_ASYNC;
				(void) bwrite(bp);
				goto loop;
			}
			if (brealloc(bp, size) == 0)
				goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	if (bp->b_vp)
		brelvp(bp);
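	/*
	 * Associate the buffer with the vnode; the reference taken
	 * here is dropped in brelvp() when the association is broken.
	 */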
	VREF(vp);
	bp->b_vp = vp;
	bp->b_dev = vp->v_rdev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * get an empty block,
 * not assigned to any particular device
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	brelvp(bp);
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Allocate space associated with a buffer.
 * If the space can't be gotten, the buffer is released.
 */
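/*
 * Returns 0 if the caller must retry (the buffer was written out or
 * otherwise given up while making room); non-zero once the buffer has
 * the requested size.
 */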
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that any overlapping previous I/O
	 * is disposed of.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			(void) bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_vp == (struct vnode *)0)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC,
	    pack(bp->b_vp->v_mount->m_fsid[0], size), bp->b_blkno);
	/*
	 * Search the cache for any buffers that overlap the one that we
	 * are trying to allocate. Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI). A disk block must be mapped by at most one buffer
	 * at any point in time. Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(bp->b_vp, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_vp != bp->b_vp ||
		    (ep->b_flags & B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			(void) bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
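/*
 * If the buffer selected is dirty (B_DELWRI), its contents are first
 * pushed out with an asynchronous write and the search starts over.
 * The buffer returned is marked B_BUSY and carries no other flags.
 */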
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_blkno);
	brelvp(bp);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	/*
	 * Pick up the device's error number and pass it to the user;
	 * if there is an error but the number is 0, set a generalized
	 * code (EIO).
	 */
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 * If a callback has been requested (B_CALL), e.g. by the
 * pageout daemon, make it.  Otherwise, wake up anyone
 * waiting for the buffer.
 */
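/*
 * Typically called from the driver's interrupt routine when the
 * transfer finishes, so no sleeping is allowed here.
 */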
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Ensure that no part of a specified block is in an incore buffer;
 * "size" is given in bytes.  Any dirty overlapping buffers are
 * written out, and the last write error, if any, is returned.
 */
blkflush(vp, blkno, size)
	struct vnode *vp;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s, error, allerrors = 0;

	start = blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(vp, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_vp != vp || (ep->b_flags & B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			if (error = bwrite(ep))
				allerrors = error;
			goto loop;
		}
		splx(s);
	}
	return (allerrors);
}

/*
 * Make sure all write-behind blocks on dev (or on all devices,
 * if dev is NODEV) are flushed out.
 * (from sync)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = splbio();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if ((bp->b_flags & B_DELWRI) == 0)
			continue;
		if (dev == NODEV || dev == bp->b_dev) {
			notavail(bp);
			(void) bawrite(bp);
			splx(s);
			goto loop;
		}
	}
	splx(s);
}

#ifdef unused
/*
 * Invalidate blocks associated with vp which are on the freelist.
 * Make sure all write-behind blocks associated with vp are flushed out.
 */
binvalfree(vp)
	struct vnode *vp;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = splbio();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if (vp == (struct vnode *) 0 || vp == bp->b_vp) {
			if (bp->b_flags & B_DELWRI) {
				notavail(bp);
				(void) splx(s);
				(void) bawrite(bp);
			} else {
				bp->b_flags |= B_INVAL;
				brelvp(bp);
				(void) splx(s);
			}
			goto loop;
		}
	}
	(void) splx(s);
}
#endif /* unused */

/*
 * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
 *
 * We walk through the buffer pool and invalidate any buffers for the
 * indicated device. Normally this routine is preceded by a bflush
 * call, so that on a quiescent filesystem there will be no dirty
 * buffers when we are done. We return the count of dirty buffers when
 * we are finished.
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
	int dirty = 0;
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++) {
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_dev != dev || (bp->b_flags & B_INVAL))
				continue;
			notavail(bp);
			if (bp->b_flags & B_DELWRI) {
				(void) bawrite(bp);
				dirty++;
				continue;
			}
			bp->b_flags |= B_INVAL;
			brelvp(bp);
			brelse(bp);
		}
	}
	return (dirty);
}

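/*
 * Dissociate a buffer from its vnode, dropping the reference that
 * was taken when the association was made (see getblk).
 */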
brelvp(bp)
	struct buf *bp;
{
	struct vnode *vp;

	if (bp->b_vp == (struct vnode *) 0)
		return;
	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	vrele(vp);
}