/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_bio.c	7.44 (Berkeley) 12/31/91
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/specdev.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register int i;
	register struct buf *bp, *dp;
	register struct bufhd *hp;
	int base, residual;

	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
		hp->b_forw = hp->b_back = (struct buf *)hp;

	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
		dp->b_flags = B_HEAD;
	}
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_dev = NODEV;
		bp->b_bcount = 0;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = 0;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		binshash(bp, &bfreelist[BQ_AGE]);
		bp->b_flags = B_BUSY|B_INVAL;
		brelse(bp);
	}
}

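/*
 * Worked example (illustrative): with bufpages = 10 and nbuf = 4,
 * base = 2 and residual = 2, so buffers 0 and 1 each get
 * (2 + 1) * CLBYTES and buffers 2 and 3 get 2 * CLBYTES.  All
 * pages are accounted for: 2 * 3 + 2 * 2 = 10 clusters.
 */
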
/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

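/*
 * Usage sketch (illustrative; vp, lbn, bsize, and dst are
 * hypothetical): a typical filesystem read path pairs bread()
 * with brelse().
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, bsize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	bcopy(bp->b_un.b_addr, dst, (unsigned)bsize);
 *	brelse(bp);
 */
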
/*
 * Operates like bread, but also starts I/O on the specified
 * read-ahead block.
 */
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp, size), blkno);
	}

	/*
	 * If there is a read-ahead block, start I/O on it too.
	 */
	if (!incore(vp, rablkno)) {
		rabp = getblk(vp, rablkno, rabsize);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize), rablkno);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize), rablkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

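/*
 * Usage sketch (illustrative; lbn and bsize are hypothetical): a
 * sequentially-reading caller fetches the current block and primes
 * the next one in a single call:
 *
 *	error = breada(vp, lbn, bsize, lbn + 1, bsize, cred, &bp);
 */
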
/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;	/* async, non-delayed writes return 0 */

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	return (error);
}

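/*
 * Summary of the three write flavors (derived from the code above
 * and the two routines that follow):
 *
 *	bwrite(bp);	synchronous; biowait() then brelse() here
 *	bawrite(bp);	asynchronous; biodone() releases the buffer
 *	bdwrite(bp);	delayed; marked B_DELWRI, no I/O started yet
 */
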
/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags & B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

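/*
 * Free-list selection above, in tabular form:
 *
 *	b_bufsize <= 0		-> head of BQ_EMPTY
 *	B_ERROR or B_INVAL	-> head of BQ_AGE
 *	B_LOCKED		-> tail of BQ_LOCKED
 *	B_AGE			-> tail of BQ_AGE
 *	otherwise		-> tail of BQ_LRU
 */
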
/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

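/*
 * BUFHASH() itself is defined in <sys/buf.h>.  One plausible form --
 * hypothetical, shown only to make the chain walk above concrete,
 * and assuming BUFHSZ is a power of two -- folds the vnode pointer
 * and block number into one of the chain heads set up by bufinit():
 *
 *	#define	BUFHASH(vp, bn) \
 *		((struct buf *)&bufhash[((long)(vp) + (long)(bn)) & (BUFHSZ - 1)])
 */
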
/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size\n");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

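/*
 * Usage sketch (illustrative; src, lbn, and bsize are hypothetical):
 * a writer that will overwrite an entire block has no need to read
 * it first, so getblk() substitutes for bread():
 *
 *	bp = getblk(vp, lbn, bsize);
 *	bcopy(src, bp->b_un.b_addr, (unsigned)bsize);
 *	error = bwrite(bp);
 */
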
/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}

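/*
 * Usage sketch (illustrative; bsize is hypothetical): callers that
 * need scratch space tied to no (vnode, block) pair -- a driver
 * staging area, for instance -- take an empty buffer and return it
 * when done:
 *
 *	bp = geteblk(bsize);
 *	... use bp->b_un.b_addr ...
 *	brelse(bp);
 */
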
/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change.
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

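/*
 * Worked example (illustrative, assuming CLBYTES = 4096): shrinking
 * an 8K buffer to 4K moves its upper 4K into a header taken from
 * BQ_EMPTY, which is marked B_INVAL and released; growing from 4K
 * to 8K steals pages from getnewbuf() victims, one buffer at a
 * time, until b_bufsize reaches roundup(size, CLBYTES).
 */
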
/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

#ifdef LFS
	lfs_flush();
#endif
loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	return (bp);
}

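/*
 * The scan above runs from &bfreelist[BQ_AGE] down to (but not
 * including) &bfreelist[0], so with the conventional queue numbering
 * (BQ_LOCKED = 0, BQ_LRU = 1, BQ_AGE = 2) it tries BQ_AGE first,
 * then BQ_LRU, and never steals a locked buffer.  A victim found
 * with B_DELWRI still set is pushed to disk asynchronously and the
 * scan restarts.
 */
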
/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
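
/*
 * Callback sketch (illustrative; myiodone is hypothetical): an async
 * consumer such as the pageout daemon arranges for biodone() to call
 * it instead of waking sleepers:
 *
 *	bp->b_flags |= B_ASYNC | B_CALL;
 *	bp->b_iodone = myiodone;
 *	VOP_STRATEGY(bp);
 *
 * The handler must eventually brelse() or otherwise dispose of the
 * buffer, since biodone() returns immediately after the call.
 */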
594