/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_bio.c	7.58 (Berkeley) 02/02/93
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <libkern/libkern.h>

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
struct	list_entry *bufhashtbl, invalhash;
u_long	bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
#define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

struct queue_entry bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp) \
	queue_enter_head(dp, bp, struct buf *, b_freelist)
#define	binstailfree(bp, dp) \
	queue_enter_tail(dp, bp, struct buf *, b_freelist)

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
	    daddr_t start_lbn, int len, daddr_t lbn));

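/*
 * Remove a buffer from the free list it is on.
 */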
void
bremfree(bp)
	struct buf *bp;
{
	struct queue_entry *dp;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 */
	if (bp->b_freelist.qe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->qe_prev == &bp->b_freelist.qe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	queue_remove(dp, bp, struct buf *, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct queue_entry *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		queue_init(dp);
	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
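	/*
	 * Distribute the buffer pages as evenly as possible: every buffer
	 * gets `base' clusters of memory, and the first `residual' buffers
	 * each get one additional cluster.
	 */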
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size, 0, 0);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size, 0, 0);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there's read-ahead block(s), start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i], 0, 0);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 * 	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead. If this is
		 * the first read of a file, then limit read-ahead to a
		 * single block, else read as much as we're allowed.
		 */
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;


		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)))
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)				/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)		/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

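	/* Disk address increment, in DEV_BSIZE sectors, per logical block. */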
	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;
	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

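	/*
	 * Move the data for each component buffer out of the parent's
	 * contiguous area and complete the component's I/O.
	 */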
	cp = bp->b_un.b_addr + b_save->bs_bufsize;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic ("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

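	/* Remember the original flags; they determine the accounting and completion handling below. */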
	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	bp->b_flags |= B_WRITEINPROG;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag&B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		if (bp->b_flags & B_EINTR) {
			bp->b_flags &= ~B_EINTR;
			error = EINTR;
		}
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

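/*
 * Vnode op form of bwrite(): write out the buffer supplied in the
 * operation's arguments.
 */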
int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen, error, maxrun;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;
	clen = 0;

	/*
	 * Handle end of file first.  If we are appending, we need to check
	 * if the current block was allocated contiguously.  If it wasn't,
	 * then we need to fire off a previous cluster if it existed.
	 * Additionally, when we're appending, we need to figure out how
	 * to initialize vp->v_clen.
	 */
	if ((lbn + 1) * bp->b_bcount == filesize) {
		if (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE) {
			/* This block was not allocated contiguously */
			if (vp->v_clen)
			    cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
				vp->v_lastw - vp->v_cstart + 1, lbn);
			vp->v_cstart = lbn;
			clen = vp->v_clen =
			    MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
			/*
			 * Next cluster started. Write this buffer and return.
			 */
			vp->v_lastw = lbn;
			vp->v_lasta = bp->b_blkno;
			bdwrite(bp);
			return;
		}
		vp->v_lasta = bp->b_blkno;
	} else if (lbn == 0) {
		vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
	}
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if (error = VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
			bawrite(bp);
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t	lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

redo:
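	/*
	 * Skip over blocks that are not in the cache, and over the block
	 * on which I/O is currently in progress.
	 */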
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp)
			bawrite(last_bp);
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;


	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = bp->b_un.b_addr + bp->b_bufsize;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		pagemove(tbp->b_un.b_daddr, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct queue_entry *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;

	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (bp);
	return (NULL);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
{
	register struct buf *bp;
	struct list_entry *dp;
	int s, error;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp)
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
				"getblk", slptimeo);
			splx(s);
			if (error)
				return (NULL);
			goto loop;
		}
		/*
		 * The test for B_INVAL is moved down here, since there
		 * are cases where B_INVAL is set before VOP_BWRITE() is
		 * called and for NFS, the process cannot be allowed to
		 * allocate a new buffer for the same block until the write
		 * back to the server has been completed (i.e., until
		 * B_BUSY clears).
		 */
		if (bp->b_flags & B_INVAL) {
			splx(s);
			continue;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			VOP_BWRITE(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/*
	 * The loop back to the top when getnewbuf() fails is because
	 * stateless filesystems like NFS have no node locks. Thus,
	 * there is a slight chance that more than one process will
	 * try and getnewbuf() for the same block concurrently when
	 * the first sleeps in getnewbuf(). So after a sleep, go back
	 * up to the top to check the hash lists again.
	 */
	if ((bp = getnewbuf(slpflag, slptimeo)) == 0)
		goto loop;
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	while ((bp = getnewbuf(0, 0)) == NULL)
		/* void */;
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	binshash(bp, &invalhash);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		while ((bp = getnewbuf(0, 0)) == NULL)
			/* void */;
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
		if (dp->qe_next)
			break;
	if (dp == bufqueues) {		/* no free blocks */
		needbuffer = 1;
		(void) tsleep((caddr_t)&needbuffer, slpflag | (PRIBIO + 1),
			"getnewbuf", slptimeo);
		splx(s);
		return (NULL);
	}
	bp = dp->qe_next;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
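	/* Clear all flags but B_BUSY; the buffer now belongs to the caller. */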
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

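/*
 * Count the number of buffers on the LOCKED free queue.
 */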
int
count_lock_queue()
{
	register struct buf *bp;
	register int ret;

	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
		++ret;
	return(ret);
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */