xref: /original-bsd/sys/kern/vfs_bio.c (revision 6c394c2f)
1 /*-
2  * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This module is believed to contain source code proprietary to AT&T.
6  * Use and redistribution is subject to the Berkeley Software License
7  * Agreement and your Software Agreement with AT&T (Western Electric).
8  *
9  *	@(#)vfs_bio.c	7.51 (Berkeley) 07/12/92
10  */
11 
12 #include <sys/param.h>
13 #include <sys/proc.h>
14 #include <sys/buf.h>
15 #include <sys/vnode.h>
16 #include <sys/mount.h>
17 #include <sys/trace.h>
18 #include <sys/resourcevar.h>
19 
20 /*
21  * Initialize buffers and hash links for buffers.
22  */
23 void
24 bufinit()
25 {
26 	register int i;
27 	register struct buf *bp, *dp;
28 	register struct bufhd *hp;
29 	int base, residual;
30 
31 	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
32 		hp->b_forw = hp->b_back = (struct buf *)hp;
33 
34 	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
35 		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
36 		dp->b_flags = B_HEAD;
37 	}
38 	base = bufpages / nbuf;
39 	residual = bufpages % nbuf;
40 	for (i = 0; i < nbuf; i++) {
41 		bp = &buf[i];
42 		bp->b_dev = NODEV;
43 		bp->b_bcount = 0;
44 		bp->b_rcred = NOCRED;
45 		bp->b_wcred = NOCRED;
46 		bp->b_dirtyoff = 0;
47 		bp->b_dirtyend = 0;
48 		bp->b_validoff = 0;
49 		bp->b_validend = 0;
50 		bp->b_un.b_addr = buffers + i * MAXBSIZE;
51 		if (i < residual)
52 			bp->b_bufsize = (base + 1) * CLBYTES;
53 		else
54 			bp->b_bufsize = base * CLBYTES;
55 		binshash(bp, &bfreelist[BQ_AGE]);
56 		bp->b_flags = B_INVAL;
57 		dp = bp->b_bufsize ? &bfreelist[BQ_AGE] : &bfreelist[BQ_EMPTY];
58 		binsheadfree(bp, dp);
59 	}
60 }
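
/*
 * Sizing sketch for the loop above, with illustrative (not mandated)
 * numbers: if bufpages is 205 and nbuf is 100, then base = 2 and
 * residual = 5, so buffers 0-4 start out with 3 * CLBYTES of memory
 * and buffers 5-99 with 2 * CLBYTES.  A header left with no memory at
 * all (possible only when base == 0 and i >= residual) goes on the
 * BQ_EMPTY list rather than BQ_AGE.
 */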
61 
62 /*
63  * Find the block in the buffer pool.
64  * If the buffer is not present, allocate a new buffer and load
65  * its contents according to the filesystem fill routine.
66  */
67 bread(vp, blkno, size, cred, bpp)
68 	struct vnode *vp;
69 	daddr_t blkno;
70 	int size;
71 	struct ucred *cred;
72 	struct buf **bpp;
73 {
74 	struct proc *p = curproc;		/* XXX */
75 	register struct buf *bp;
76 
77 	if (size == 0)
78 		panic("bread: size 0");
79 	*bpp = bp = getblk(vp, blkno, size);
80 	if (bp->b_flags & (B_DONE | B_DELWRI)) {
81 		trace(TR_BREADHIT, pack(vp, size), blkno);
82 		return (0);
83 	}
84 	bp->b_flags |= B_READ;
85 	if (bp->b_bcount > bp->b_bufsize)
86 		panic("bread");
87 	if (bp->b_rcred == NOCRED && cred != NOCRED) {
88 		crhold(cred);
89 		bp->b_rcred = cred;
90 	}
91 	VOP_STRATEGY(bp);
92 	trace(TR_BREADMISS, pack(vp, size), blkno);
93 	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
94 	return (biowait(bp));
95 }
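
/*
 * Typical use of bread() by a filesystem (a sketch only; "vp", "lbn",
 * "bsize" and the surrounding error handling stand in for caller
 * context):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, bsize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine or copy from bp->b_un.b_addr ...
 *	brelse(bp);
 *
 * Note that *bpp is filled in before the I/O is started, so a buffer
 * is returned even when an error is, and the caller must release it.
 */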
96 
97 /*
98  * Operates like bread, but also starts I/O on the N specified
99  * read-ahead blocks.
100  */
101 breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
102 	struct vnode *vp;
103 	daddr_t blkno; int size;
104 	daddr_t rablkno[]; int rabsize[];
105 	int num;
106 	struct ucred *cred;
107 	struct buf **bpp;
108 {
109 	struct proc *p = curproc;		/* XXX */
110 	register struct buf *bp, *rabp;
111 	register int i;
112 
113 	bp = NULL;
114 	/*
115 	 * If the block is not memory resident,
116 	 * allocate a buffer and start I/O.
117 	 */
118 	if (!incore(vp, blkno)) {
119 		*bpp = bp = getblk(vp, blkno, size);
120 		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
121 			bp->b_flags |= B_READ;
122 			if (bp->b_bcount > bp->b_bufsize)
123 				panic("breadn");
124 			if (bp->b_rcred == NOCRED && cred != NOCRED) {
125 				crhold(cred);
126 				bp->b_rcred = cred;
127 			}
128 			VOP_STRATEGY(bp);
129 			trace(TR_BREADMISS, pack(vp, size), blkno);
130 			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
131 		} else {
132 			trace(TR_BREADHIT, pack(vp, size), blkno);
133 		}
134 	}
135 
136 	/*
137 	 * If there are read-ahead blocks, start I/O
138 	 * on them as well (as above).
139 	 */
140 	for (i = 0; i < num; i++) {
141 		if (incore(vp, rablkno[i]))
142 			continue;
143 		rabp = getblk(vp, rablkno[i], rabsize[i]);
144 		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
145 			brelse(rabp);
146 			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
147 		} else {
148 			rabp->b_flags |= B_ASYNC | B_READ;
149 			if (rabp->b_bcount > rabp->b_bufsize)
150 				panic("breadrabp");
151 			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
152 				crhold(cred);
153 				rabp->b_rcred = cred;
154 			}
155 			VOP_STRATEGY(rabp);
156 			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
157 			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
158 		}
159 	}
160 
161 	/*
162 	 * If block was memory resident, let bread get it.
163 	 * If block was not memory resident, the read was
164 	 * started above, so just wait for the read to complete.
165 	 */
166 	if (bp == NULL)
167 		return (bread(vp, blkno, size, cred, bpp));
168 	return (biowait(bp));
169 }
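
/*
 * Sketch of a breadn() call that reads one block and starts read-ahead
 * on the following one (block numbers and size are illustrative):
 *
 *	daddr_t rablkno[1];
 *	int rabsize[1];
 *
 *	rablkno[0] = lbn + 1;
 *	rabsize[0] = bsize;
 *	error = breadn(vp, lbn, bsize, rablkno, rabsize, 1, NOCRED, &bp);
 *
 * The read-ahead buffers are started B_ASYNC and released in biodone(),
 * so only the buffer for "lbn" is handed back to the caller.
 */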
170 
171 /*
172  * Synchronous write.
173  * Release buffer on completion.
174  */
175 bwrite(bp)
176 	register struct buf *bp;
177 {
178 	struct proc *p = curproc;		/* XXX */
179 	register int flag;
180 	int s, error = 0;
181 
182 	flag = bp->b_flags;
183 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
184 	if (flag & B_ASYNC) {
185 		if ((flag & B_DELWRI) == 0)
186 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
187 		else
188 			reassignbuf(bp, bp->b_vp);
189 	}
190 	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
191 	if (bp->b_bcount > bp->b_bufsize)
192 		panic("bwrite");
193 	s = splbio();
194 	bp->b_vp->v_numoutput++;
195 	splx(s);
196 	VOP_STRATEGY(bp);
197 
198 	/*
199 	 * If the write was synchronous, then await I/O completion.
200 	 * If the write was "delayed", then we put the buffer on
201 	 * the queue of blocks awaiting I/O completion status.
202 	 */
203 	if ((flag & B_ASYNC) == 0) {
204 		error = biowait(bp);
205 		if ((flag&B_DELWRI) == 0)
206 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
207 		else
208 			reassignbuf(bp, bp->b_vp);
209 		brelse(bp);
210 	} else if (flag & B_DELWRI) {
211 		s = splbio();
212 		bp->b_flags |= B_AGE;
213 		splx(s);
214 	}
215 	return (error);
216 }
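
/*
 * Sketch of the bwrite() idiom for rewriting an entire block; no prior
 * bread() is needed when every byte will be overwritten ("vp", "lbn",
 * "bsize" and "data" are assumed caller context):
 *
 *	bp = getblk(vp, lbn, bsize);
 *	bcopy(data, bp->b_un.b_addr, bsize);
 *	error = bwrite(bp);		/* waits, then releases bp */
 *
 * With B_ASYNC set (see bawrite() below), bwrite() returns as soon as
 * the I/O has been started and the buffer is released in biodone().
 */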
217 
218 int
219 vn_bwrite(ap)
220 	struct vop_bwrite_args *ap;
221 {
222 	return (bwrite(ap->a_bp));
223 }
224 
225 
226 /*
227  * Delayed write.
228  *
229  * The buffer is marked dirty, but is not queued for I/O.
230  * This routine should be used when the buffer is expected
231  * to be modified again soon, typically a small write that
232  * partially fills a buffer.
233  *
234  * NB: magnetic tapes cannot be delayed; they must be
235  * written in the order that the writes are requested.
236  */
237 bdwrite(bp)
238 	register struct buf *bp;
239 {
240 	struct proc *p = curproc;		/* XXX */
241 
242 	if ((bp->b_flags & B_DELWRI) == 0) {
243 		bp->b_flags |= B_DELWRI;
244 		reassignbuf(bp, bp->b_vp);
245 		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
246 	}
247 	/*
248 	 * If this is a tape drive, the write must be initiated.
249 	 */
250 	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
251 		bawrite(bp);
252 	} else {
253 		bp->b_flags |= (B_DONE | B_DELWRI);
254 		brelse(bp);
255 	}
256 }
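
/*
 * Sketch of the delayed-write idiom bdwrite() is intended for: a small
 * update to a block that will likely be written again soon ("off" and
 * "len" are assumed to describe the caller's partial update):
 *
 *	if (error = bread(vp, lbn, bsize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	bcopy(update, bp->b_un.b_addr + off, len);
 *	bdwrite(bp);		/* mark dirty and release; no I/O yet */
 */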
257 
258 /*
259  * Asynchronous write.
260  * Start I/O on a buffer, but do not wait for it to complete.
261  * The buffer is released when the I/O completes.
262  */
263 bawrite(bp)
264 	register struct buf *bp;
265 {
266 
267 	/*
268 	 * Setting the ASYNC flag causes bwrite to return
269 	 * after starting the I/O.
270 	 */
271 	bp->b_flags |= B_ASYNC;
272 	(void) bwrite(bp);
273 }
274 
275 /*
276  * Release a buffer.
277  * Even if the buffer is dirty, no I/O is started.
278  */
279 brelse(bp)
280 	register struct buf *bp;
281 {
282 	register struct buf *flist;
283 	int s;
284 
285 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
286 	/*
287 	 * If a process is waiting for the buffer, or
288 	 * is waiting for a free buffer, awaken it.
289 	 */
290 	if (bp->b_flags & B_WANTED)
291 		wakeup((caddr_t)bp);
292 	if (bfreelist[0].b_flags & B_WANTED) {
293 		bfreelist[0].b_flags &= ~B_WANTED;
294 		wakeup((caddr_t)bfreelist);
295 	}
296 	/*
297 	 * Retry I/O for locked buffers rather than invalidating them.
298 	 */
299 	s = splbio();
300 	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
301 		bp->b_flags &= ~B_ERROR;
302 	/*
303 	 * Disassociate buffers that are no longer valid.
304 	 */
305 	if (bp->b_flags & (B_NOCACHE | B_ERROR))
306 		bp->b_flags |= B_INVAL;
307 	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
308 		if (bp->b_vp)
309 			brelvp(bp);
310 		bp->b_flags &= ~B_DELWRI;
311 	}
312 	/*
313 	 * Stick the buffer back on a free list.
314 	 */
315 	if (bp->b_bufsize <= 0) {
316 		/* block has no buffer ... put at front of unused buffer list */
317 		flist = &bfreelist[BQ_EMPTY];
318 		binsheadfree(bp, flist);
319 	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
320 		/* block has no info ... put at front of most free list */
321 		flist = &bfreelist[BQ_AGE];
322 		binsheadfree(bp, flist);
323 	} else {
324 		if (bp->b_flags & B_LOCKED)
325 			flist = &bfreelist[BQ_LOCKED];
326 		else if (bp->b_flags & B_AGE)
327 			flist = &bfreelist[BQ_AGE];
328 		else
329 			flist = &bfreelist[BQ_LRU];
330 		binstailfree(bp, flist);
331 	}
332 	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
333 	splx(s);
334 }
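
/*
 * Summary of the free-list choice made above:
 *
 *	b_bufsize == 0			head of BQ_EMPTY (header only)
 *	B_ERROR or B_INVAL		head of BQ_AGE (contents worthless)
 *	B_LOCKED			tail of BQ_LOCKED
 *	B_AGE				tail of BQ_AGE
 *	otherwise			tail of BQ_LRU
 */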
335 
336 /*
337  * Check to see if a block is currently memory resident.
338  */
339 incore(vp, blkno)
340 	struct vnode *vp;
341 	daddr_t blkno;
342 {
343 	register struct buf *bp;
344 	register struct buf *dp;
345 
346 	dp = BUFHASH(vp, blkno);
347 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
348 		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
349 		    (bp->b_flags & B_INVAL) == 0)
350 			return (1);
351 	return (0);
352 }
353 
354 /*
355  * Check to see if a block is currently memory resident.
356  * If it is resident, return it. If it is not resident,
357  * allocate a new buffer and assign it to the block.
358  */
359 struct buf *
360 getblk(vp, blkno, size)
361 	register struct vnode *vp;
362 	daddr_t blkno;
363 	int size;
364 {
365 	register struct buf *bp, *dp;
366 	int s;
367 
368 	if (size > MAXBSIZE)
369 		panic("getblk: size too big");
370 	/*
371 	 * Search the cache for the block. If the buffer is found,
372 	 * but it is currently locked, we must wait for it to
373 	 * become available.
374 	 */
375 	dp = BUFHASH(vp, blkno);
376 loop:
377 	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
378 		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
379 		    (bp->b_flags & B_INVAL))
380 			continue;
381 		s = splbio();
382 		if (bp->b_flags & B_BUSY) {
383 			bp->b_flags |= B_WANTED;
384 			sleep((caddr_t)bp, PRIBIO + 1);
385 			splx(s);
386 			goto loop;
387 		}
388 		bremfree(bp);
389 		bp->b_flags |= B_BUSY;
390 		splx(s);
391 		if (bp->b_bcount != size) {
392 			printf("getblk: stray size");
393 			bp->b_flags |= B_INVAL;
394 			bwrite(bp);
395 			goto loop;
396 		}
397 		bp->b_flags |= B_CACHE;
398 		return (bp);
399 	}
400 	bp = getnewbuf();
401 	bremhash(bp);
402 	bgetvp(vp, bp);
403 	bp->b_bcount = 0;
404 	bp->b_lblkno = blkno;
405 	bp->b_blkno = blkno;
406 	bp->b_error = 0;
407 	bp->b_resid = 0;
408 	binshash(bp, dp);
409 	allocbuf(bp, size);
410 	return (bp);
411 }
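
/*
 * Sketch of a getblk() caller that reads only when the cached copy is
 * not already valid; B_CACHE is set above only on a cache hit, and the
 * buffer comes back B_BUSY, so it must eventually be released with
 * brelse(), bwrite() or bdwrite():
 *
 *	bp = getblk(vp, lbn, bsize);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		... fill bp->b_un.b_addr or start a read ...
 *	}
 */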
412 
413 /*
414  * Allocate a buffer.
415  * The caller will assign it to a block.
416  */
417 struct buf *
418 geteblk(size)
419 	int size;
420 {
421 	register struct buf *bp, *flist;
422 
423 	if (size > MAXBSIZE)
424 		panic("geteblk: size too big");
425 	bp = getnewbuf();
426 	bp->b_flags |= B_INVAL;
427 	bremhash(bp);
428 	flist = &bfreelist[BQ_AGE];
429 	bp->b_bcount = 0;
430 	bp->b_error = 0;
431 	bp->b_resid = 0;
432 	binshash(bp, flist);
433 	allocbuf(bp, size);
434 	return (bp);
435 }
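
/*
 * Sketch of geteblk() used for a scratch buffer with no vnode attached;
 * the buffer is marked B_INVAL, so brelse() returns it to BQ_AGE:
 *
 *	bp = geteblk(bsize);
 *	... use bp->b_un.b_addr as temporary storage ...
 *	brelse(bp);
 */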
436 
437 /*
438  * Expand or contract the actual memory allocated to a buffer.
439  * If no memory is available, release buffer and take error exit.
440  */
441 allocbuf(tp, size)
442 	register struct buf *tp;
443 	int size;
444 {
445 	register struct buf *bp, *ep;
446 	int sizealloc, take, s;
447 
448 	sizealloc = roundup(size, CLBYTES);
449 	/*
450 	 * Buffer size does not change
451 	 */
452 	if (sizealloc == tp->b_bufsize)
453 		goto out;
454 	/*
455 	 * Buffer size is shrinking.
456 	 * Place excess space in a buffer header taken from the BQ_EMPTY
457 	 * buffer list; the header is then placed on the "most free" list.
458 	 * If no extra buffer headers are available, leave the
459 	 * extra space in the present buffer.
460 	 */
461 	if (sizealloc < tp->b_bufsize) {
462 		ep = bfreelist[BQ_EMPTY].av_forw;
463 		if (ep == &bfreelist[BQ_EMPTY])
464 			goto out;
465 		s = splbio();
466 		bremfree(ep);
467 		ep->b_flags |= B_BUSY;
468 		splx(s);
469 		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
470 		    (int)tp->b_bufsize - sizealloc);
471 		ep->b_bufsize = tp->b_bufsize - sizealloc;
472 		tp->b_bufsize = sizealloc;
473 		ep->b_flags |= B_INVAL;
474 		ep->b_bcount = 0;
475 		brelse(ep);
476 		goto out;
477 	}
478 	/*
479 	 * More buffer space is needed. Get it out of buffers on
480 	 * the "most free" list, placing the empty headers on the
481 	 * BQ_EMPTY buffer header list.
482 	 */
483 	while (tp->b_bufsize < sizealloc) {
484 		take = sizealloc - tp->b_bufsize;
485 		bp = getnewbuf();
486 		if (take >= bp->b_bufsize)
487 			take = bp->b_bufsize;
488 		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
489 		    &tp->b_un.b_addr[tp->b_bufsize], take);
490 		tp->b_bufsize += take;
491 		bp->b_bufsize = bp->b_bufsize - take;
492 		if (bp->b_bcount > bp->b_bufsize)
493 			bp->b_bcount = bp->b_bufsize;
494 		if (bp->b_bufsize <= 0) {
495 			bremhash(bp);
496 			binshash(bp, &bfreelist[BQ_EMPTY]);
497 			bp->b_dev = NODEV;
498 			bp->b_error = 0;
499 			bp->b_flags |= B_INVAL;
500 		}
501 		brelse(bp);
502 	}
503 out:
504 	tp->b_bcount = size;
505 	return (1);
506 }
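
/*
 * Worked example for allocbuf(), with illustrative numbers: with
 * CLBYTES of 1024, a request of size 3000 rounds up to a sizealloc of
 * 3072.  If tp->b_bufsize is currently 8192, the excess 5120 bytes are
 * handed to a header taken from BQ_EMPTY (when one is available) and
 * freed through brelse(); if it is currently 1024, memory is taken
 * from other free buffers via getnewbuf() until 3072 bytes are
 * attached.  In either case b_bcount ends up as the caller's size,
 * 3000.
 */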
507 
508 /*
509  * Find a buffer which is available for use.
510  * Select something from a free list.
511  * Preference is to AGE list, then LRU list.
512  */
513 struct buf *
514 getnewbuf()
515 {
516 	register struct buf *bp, *dp;
517 	register struct ucred *cred;
518 	int s;
519 
520 #ifdef LFS
521 	lfs_flush();
522 #endif
523 loop:
524 	s = splbio();
525 	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
526 		if (dp->av_forw != dp)
527 			break;
528 	if (dp == bfreelist) {		/* no free blocks */
529 		dp->b_flags |= B_WANTED;
530 		sleep((caddr_t)dp, PRIBIO + 1);
531 		splx(s);
532 		goto loop;
533 	}
534 	bp = dp->av_forw;
535 	bremfree(bp);
536 	bp->b_flags |= B_BUSY;
537 	splx(s);
538 	if (bp->b_flags & B_DELWRI) {
539 		(void) bawrite(bp);
540 		goto loop;
541 	}
542 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
543 	if (bp->b_vp)
544 		brelvp(bp);
545 	if (bp->b_rcred != NOCRED) {
546 		cred = bp->b_rcred;
547 		bp->b_rcred = NOCRED;
548 		crfree(cred);
549 	}
550 	if (bp->b_wcred != NOCRED) {
551 		cred = bp->b_wcred;
552 		bp->b_wcred = NOCRED;
553 		crfree(cred);
554 	}
555 	bp->b_flags = B_BUSY;
556 	bp->b_dirtyoff = bp->b_dirtyend = 0;
557 	bp->b_validoff = bp->b_validend = 0;
558 	return (bp);
559 }
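
/*
 * Note for callers of getnewbuf(): the buffer comes back B_BUSY but is
 * still linked on the hash chain of its previous identity, which is why
 * getblk() and geteblk() above call bremhash() before rehashing it.
 * Any delayed-write buffer encountered during the scan is pushed out
 * with bawrite() and the search is restarted.
 */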
560 
561 /*
562  * Wait for I/O to complete.
563  *
564  * Extract and return any errors associated with the I/O.
565  * If the error flag is set, but no specific error is
566  * given, return EIO.
567  */
568 biowait(bp)
569 	register struct buf *bp;
570 {
571 	int s;
572 
573 	s = splbio();
574 	while ((bp->b_flags & B_DONE) == 0)
575 		sleep((caddr_t)bp, PRIBIO);
576 	splx(s);
577 	if ((bp->b_flags & B_ERROR) == 0)
578 		return (0);
579 	if (bp->b_error)
580 		return (bp->b_error);
581 	return (EIO);
582 }
583 
584 /*
585  * Mark I/O complete on a buffer.
586  *
587  * If a callback has been requested, e.g. the pageout
588  * daemon, do so. Otherwise, awaken waiting processes.
589  */
590 void
591 biodone(bp)
592 	register struct buf *bp;
593 {
594 
595 	if (bp->b_flags & B_DONE)
596 		panic("dup biodone");
597 	bp->b_flags |= B_DONE;
598 	if ((bp->b_flags & B_READ) == 0)
599 		vwakeup(bp);
600 	if (bp->b_flags & B_CALL) {
601 		bp->b_flags &= ~B_CALL;
602 		(*bp->b_iodone)(bp);
603 		return;
604 	}
605 	if (bp->b_flags & B_ASYNC)
606 		brelse(bp);
607 	else {
608 		bp->b_flags &= ~B_WANTED;
609 		wakeup((caddr_t)bp);
610 	}
611 }
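
/*
 * Sketch of the driver side that hands a buffer back through biodone();
 * the error condition is hypothetical, a real driver sets b_error and
 * b_resid from its hardware status:
 *
 *	if (hardware_error) {
 *		bp->b_flags |= B_ERROR;
 *		bp->b_error = EIO;
 *	}
 *	bp->b_resid = 0;
 *	biodone(bp);
 *
 * For B_ASYNC I/O the buffer is released here; otherwise the process
 * sleeping in biowait() is awakened.
 */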
612