xref: /original-bsd/sys/kern/vfs_bio.c (revision a6d8c59f)
1 /*-
2  * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This module is believed to contain source code proprietary to AT&T.
6  * Use and redistribution is subject to the Berkeley Software License
7  * Agreement and your Software Agreement with AT&T (Western Electric).
8  *
9  *	@(#)vfs_bio.c	7.57 (Berkeley) 12/09/92
10  */
11 
12 #include <sys/param.h>
13 #include <sys/proc.h>
14 #include <sys/buf.h>
15 #include <sys/vnode.h>
16 #include <sys/mount.h>
17 #include <sys/trace.h>
18 #include <sys/resourcevar.h>
19 #include <sys/malloc.h>
20 #include <libkern/libkern.h>
21 
22 /*
23  * Definitions for the buffer hash lists.
24  */
25 #define	BUFHASH(dvp, lbn)	\
26 	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
27 struct	list_entry *bufhashtbl, invalhash;
28 u_long	bufhash;
29 
30 /*
31  * Insq/Remq for the buffer hash lists.
32  */
33 #define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
34 #define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)
35 
36 /*
37  * Definitions for the buffer free lists.
38  */
39 #define	BQUEUES		4		/* number of free buffer queues */
40 
41 #define	BQ_LOCKED	0		/* super-blocks &c */
42 #define	BQ_LRU		1		/* lru, useful buffers */
43 #define	BQ_AGE		2		/* rubbish */
44 #define	BQ_EMPTY	3		/* buffer headers with no memory */
45 
46 struct queue_entry bufqueues[BQUEUES];
47 int needbuffer;
48 
49 /*
50  * Insq/Remq for the buffer free lists.
51  */
52 #define	binsheadfree(bp, dp) \
53 	queue_enter_head(dp, bp, struct buf *, b_freelist)
54 #define	binstailfree(bp, dp) \
55 	queue_enter_tail(dp, bp, struct buf *, b_freelist)
56 
57 /*
58  * Local declarations
59  */
60 struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
61 	    daddr_t, long, int));
62 struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
63 	    daddr_t, daddr_t, long, int, long));
64 void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
65 	    daddr_t start_lbn, int len, daddr_t lbn));
66 
67 void
68 bremfree(bp)
69 	struct buf *bp;
70 {
71 	struct queue_entry *dp;
72 
73 	/*
74 	 * We only calculate the head of the freelist when removing
75 	 * the last element of the list as that is the only time that
76 	 * it is needed (e.g. to reset the tail pointer).
77 	 * it is needed (i.e., to reset the tail pointer).
78 	if (bp->b_freelist.qe_next == NULL) {
79 		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
80 			if (dp->qe_prev == &bp->b_freelist.qe_next)
81 				break;
82 		if (dp == &bufqueues[BQUEUES])
83 			panic("bremfree: lost tail");
84 	}
85 	queue_remove(dp, bp, struct buf *, b_freelist);
86 }
87 
88 /*
89  * Initialize buffers and hash links for buffers.
90  */
91 void
92 bufinit()
93 {
94 	register struct buf *bp;
95 	struct queue_entry *dp;
96 	register int i;
97 	int base, residual;
98 
99 	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
100 		queue_init(dp);
101 	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
102 	base = bufpages / nbuf;
103 	residual = bufpages % nbuf;
104 	for (i = 0; i < nbuf; i++) {
105 		bp = &buf[i];
106 		bzero((char *)bp, sizeof *bp);
107 		bp->b_dev = NODEV;
108 		bp->b_rcred = NOCRED;
109 		bp->b_wcred = NOCRED;
110 		bp->b_un.b_addr = buffers + i * MAXBSIZE;
111 		if (i < residual)
112 			bp->b_bufsize = (base + 1) * CLBYTES;
113 		else
114 			bp->b_bufsize = base * CLBYTES;
115 		bp->b_flags = B_INVAL;
116 		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
117 		binsheadfree(bp, dp);
118 		binshash(bp, &invalhash);
119 	}
120 }
121 
122 /*
123  * Find the block in the buffer pool.
124  * If the buffer is not present, allocate a new buffer and load
125  * its contents according to the filesystem fill routine.
126  */
127 bread(vp, blkno, size, cred, bpp)
128 	struct vnode *vp;
129 	daddr_t blkno;
130 	int size;
131 	struct ucred *cred;
132 	struct buf **bpp;
133 {
134 	struct proc *p = curproc;		/* XXX */
135 	register struct buf *bp;
136 
137 	if (size == 0)
138 		panic("bread: size 0");
139 	*bpp = bp = getblk(vp, blkno, size);
140 	if (bp->b_flags & (B_DONE | B_DELWRI)) {
141 		trace(TR_BREADHIT, pack(vp, size), blkno);
142 		return (0);
143 	}
144 	bp->b_flags |= B_READ;
145 	if (bp->b_bcount > bp->b_bufsize)
146 		panic("bread");
147 	if (bp->b_rcred == NOCRED && cred != NOCRED) {
148 		crhold(cred);
149 		bp->b_rcred = cred;
150 	}
151 	VOP_STRATEGY(bp);
152 	trace(TR_BREADMISS, pack(vp, size), blkno);
153 	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
154 	return (biowait(bp));
155 }
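
/*
 * Illustrative sketch only: a typical caller reads one logical block and
 * releases it when done.  The names "lbn" and "bsize" below are
 * placeholders for a caller's logical block number and filesystem
 * block size; they are not part of this interface.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, bsize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... use bsize bytes at bp->b_un.b_addr ...
 *	brelse(bp);	(or bdwrite(bp) if the contents were modified)
 */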
156 
157 /*
158  * Operates like bread, but also starts I/O on the N specified
159  * read-ahead blocks.
160  */
161 breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
162 	struct vnode *vp;
163 	daddr_t blkno; int size;
164 	daddr_t rablkno[]; int rabsize[];
165 	int num;
166 	struct ucred *cred;
167 	struct buf **bpp;
168 {
169 	struct proc *p = curproc;		/* XXX */
170 	register struct buf *bp, *rabp;
171 	register int i;
172 
173 	bp = NULL;
174 	/*
175 	 * If the block is not memory resident,
176 	 * allocate a buffer and start I/O.
177 	 */
178 	if (!incore(vp, blkno)) {
179 		*bpp = bp = getblk(vp, blkno, size);
180 		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
181 			bp->b_flags |= B_READ;
182 			if (bp->b_bcount > bp->b_bufsize)
183 				panic("breadn");
184 			if (bp->b_rcred == NOCRED && cred != NOCRED) {
185 				crhold(cred);
186 				bp->b_rcred = cred;
187 			}
188 			VOP_STRATEGY(bp);
189 			trace(TR_BREADMISS, pack(vp, size), blkno);
190 			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
191 		} else {
192 			trace(TR_BREADHIT, pack(vp, size), blkno);
193 		}
194 	}
195 
196 	/*
197 	 * If there's read-ahead block(s), start I/O
198 	 * on them also (as above).
199 	 */
200 	for (i = 0; i < num; i++) {
201 		if (incore(vp, rablkno[i]))
202 			continue;
203 		rabp = getblk(vp, rablkno[i], rabsize[i]);
204 		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
205 			brelse(rabp);
206 			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
207 		} else {
208 			rabp->b_flags |= B_ASYNC | B_READ;
209 			if (rabp->b_bcount > rabp->b_bufsize)
210 				panic("breadrabp");
211 			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
212 				crhold(cred);
213 				rabp->b_rcred = cred;
214 			}
215 			VOP_STRATEGY(rabp);
216 			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
217 			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
218 		}
219 	}
220 
221 	/*
222 	 * If block was memory resident, let bread get it.
223 	 * If block was not memory resident, the read was
224 	 * started above, so just wait for the read to complete.
225 	 */
226 	if (bp == NULL)
227 		return (bread(vp, blkno, size, cred, bpp));
228 	return (biowait(bp));
229 }
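
/*
 * Illustrative sketch only: asking for one block of read-ahead behind
 * the block actually wanted.  The arrays "rablks" and "rasizes" are
 * hypothetical locals, not part of this interface.
 *
 *	daddr_t rablks[1];
 *	int rasizes[1];
 *
 *	rablks[0] = lbn + 1;
 *	rasizes[0] = bsize;
 *	error = breadn(vp, lbn, bsize, rablks, rasizes, 1, NOCRED, &bp);
 *
 * Only the block named by lbn is waited for; the read-ahead buffer is
 * started B_ASYNC and released by biodone when its I/O completes.
 */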
230 
231 /*
232  * We could optimize this by keeping track of where the last read-ahead
233  * was, but it would involve adding fields to the vnode.  For now, let's
234  * just get it working.
235  *
236  * This replaces bread.  If this is a bread at the beginning of a file and
237  * lastr is 0, we assume this is the first read and we'll read up to two
238  * blocks if they are sequential.  After that, we'll do regular read ahead
239  * in clustered chunks.
240  *
241  * There are 4 or 5 cases depending on how you count:
242  *	Desired block is in the cache:
243  *	    1 Not sequential access (0 I/Os).
244  *	    2 Access is sequential, do read-ahead (1 ASYNC).
245  *	Desired block is not in cache:
246  *	    3 Not sequential access (1 SYNC).
247  *	    4 Sequential access, next block is contiguous (1 SYNC).
248  *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
249  *
250  * There are potentially two buffers that require I/O.
251  * 	bp is the block requested.
252  *	rbp is the read-ahead block.
253  *	If either is NULL, then you don't have to do the I/O.
254  */
255 cluster_read(vp, filesize, lblkno, size, cred, bpp)
256 	struct vnode *vp;
257 	u_quad_t filesize;
258 	daddr_t lblkno;
259 	long size;
260 	struct ucred *cred;
261 	struct buf **bpp;
262 {
263 	struct buf *bp, *rbp;
264 	daddr_t blkno, ioblkno;
265 	long flags;
266 	int error, num_ra, alreadyincore;
267 
268 #ifdef DIAGNOSTIC
269 	if (size == 0)
270 		panic("cluster_read: size = 0");
271 #endif
272 
273 	error = 0;
274 	flags = B_READ;
275 	*bpp = bp = getblk(vp, lblkno, size);
276 	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
277 		/*
278 		 * Desired block is in cache; do any readahead ASYNC.
279 		 * Case 1, 2.
280 		 */
281 		trace(TR_BREADHIT, pack(vp, size), lblkno);
282 		flags |= B_ASYNC;
283 		ioblkno = lblkno +
284 		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
285 		alreadyincore = incore(vp, ioblkno);
286 		bp = NULL;
287 	} else {
288 		/* Block wasn't in cache, case 3, 4, 5. */
289 		trace(TR_BREADMISS, pack(vp, size), lblkno);
290 		ioblkno = lblkno;
291 		bp->b_flags |= flags;
292 		alreadyincore = 0;
293 		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
294 	}
295 	/*
296 	 * XXX
297 	 * Replace 1 with a window size based on some permutation of
298 	 * maxcontig and rot_delay.  This will let you figure out how
299 	 * many blocks you should read-ahead (case 2, 4, 5).
300 	 *
301 	 * If the access isn't sequential, cut the window size in half.
302 	 */
303 	rbp = NULL;
304 	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
305 		vp->v_ralen = max(vp->v_ralen >> 1, 1);
306 	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
307 	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
308 		/*
309 		 * Reading sequentially, and the next block is not in the
310 		 * cache.  We are going to try reading ahead. If this is
311 		 * the first read of a file, then limit read-ahead to a
312 		 * single block, else read as much as we're allowed.
313 		 */
314 		if (num_ra > vp->v_ralen) {
315 			num_ra = vp->v_ralen;
316 			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
317 		} else
318 			vp->v_ralen = num_ra + 1;
319 
320 
321 		if (num_ra)				/* case 2, 4 */
322 			rbp = cluster_rbuild(vp, filesize,
323 			    bp, ioblkno, blkno, size, num_ra, flags);
324 		else if (lblkno != 0 && ioblkno == lblkno) {
325 			/* Case 5: check how many blocks to read ahead */
326 			++ioblkno;
327 			if ((ioblkno + 1) * size > filesize ||
328 			    (error = VOP_BMAP(vp,
329 			    ioblkno, NULL, &blkno, &num_ra)))
330 				goto skip_readahead;
331 			flags |= B_ASYNC;
332 			if (num_ra)
333 				rbp = cluster_rbuild(vp, filesize,
334 				    NULL, ioblkno, blkno, size, num_ra, flags);
335 			else {
336 				rbp = getblk(vp, ioblkno, size);
337 				rbp->b_flags |= flags;
338 				rbp->b_blkno = blkno;
339 			}
340 		} else if (lblkno != 0) {
341 			/* case 2; read ahead single block */
342 			rbp = getblk(vp, ioblkno, size);
343 			rbp->b_flags |= flags;
344 			rbp->b_blkno = blkno;
345 		} else if (bp)				/* case 1, 3, block 0 */
346 			bp->b_blkno = blkno;
347 		/* Case 1 on block 0; not really doing sequential I/O */
348 
349 		if (rbp == bp)		/* case 4 */
350 			rbp = NULL;
351 		else if (rbp) {			/* case 2, 5 */
352 			trace(TR_BREADMISSRA,
353 			    pack(vp, (num_ra + 1) * size), ioblkno);
354 			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
355 		}
356 	}
357 
358 	/* XXX Kirk, do we need to make sure the bp has creds? */
359 skip_readahead:
360 	if (bp)
361 		if (bp->b_flags & (B_DONE | B_DELWRI))
362 			panic("cluster_read: DONE bp");
363 		else
364 			error = VOP_STRATEGY(bp);
365 
366 	if (rbp)
367 		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
368 			rbp->b_flags &= ~(B_ASYNC | B_READ);
369 			brelse(rbp);
370 		} else
371 			(void) VOP_STRATEGY(rbp);
372 
373 	if (bp)
374 		return(biowait(bp));
375 	return(error);
376 }
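
/*
 * Worked example of the read-ahead window above (a sketch only; the
 * numbers depend on MAXPHYS and the filesystem block size).  With 8K
 * blocks and a 64K MAXPHYS, sequential reads let v_ralen double toward
 * MAXPHYS / size = 8 blocks whenever VOP_BMAP reports enough contiguous
 * blocks, since the growth is clamped by min(MAXPHYS / size,
 * v_ralen << 1).  A non-sequential access halves the window again via
 * max(v_ralen >> 1, 1), so it never drops below a single block.
 */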
377 
378 /*
379  * If blocks are contiguous on disk, use this to provide clustered
380  * read ahead.  We will read as many blocks as possible sequentially
381  * and then parcel them up into logical blocks in the buffer hash table.
382  */
383 struct buf *
384 cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
385 	struct vnode *vp;
386 	u_quad_t filesize;
387 	struct buf *bp;
388 	daddr_t lbn;
389 	daddr_t blkno;
390 	long size;
391 	int run;
392 	long flags;
393 {
394 	struct cluster_save *b_save;
395 	struct buf *tbp;
396 	daddr_t bn;
397 	int i, inc;
398 
399 	if (size * (lbn + run + 1) > filesize)
400 		--run;
401 	if (run == 0) {
402 		if (!bp) {
403 			bp = getblk(vp, lbn, size);
404 			bp->b_blkno = blkno;
405 			bp->b_flags |= flags;
406 		}
407 		return(bp);
408 	}
409 
410 	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
411 	if (bp->b_flags & (B_DONE | B_DELWRI))
412 		return (bp);
413 
414 	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
415 	    M_SEGMENT, M_WAITOK);
416 	b_save->bs_bufsize = b_save->bs_bcount = size;
417 	b_save->bs_nchildren = 0;
418 	b_save->bs_children = (struct buf **)(b_save + 1);
419 	b_save->bs_saveaddr = bp->b_saveaddr;
420 	bp->b_saveaddr = (caddr_t) b_save;
421 
422 	inc = size / DEV_BSIZE;
423 	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
424 		if (incore(vp, lbn + i)) {
425 			if (i == 1) {
426 				bp->b_saveaddr = b_save->bs_saveaddr;
427 				bp->b_flags &= ~B_CALL;
428 				bp->b_iodone = NULL;
429 				allocbuf(bp, size);
430 				free(b_save, M_SEGMENT);
431 			} else
432 				allocbuf(bp, size * i);
433 			break;
434 		}
435 		tbp = getblk(vp, lbn + i, 0);
436 		tbp->b_bcount = tbp->b_bufsize = size;
437 		tbp->b_blkno = bn;
438 		tbp->b_flags |= flags | B_READ | B_ASYNC;
439 		++b_save->bs_nchildren;
440 		b_save->bs_children[i - 1] = tbp;
441 	}
442 	if (!(bp->b_flags & B_ASYNC))
443 		vp->v_ralen = max(vp->v_ralen - 1, 1);
444 	return(bp);
445 }
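
/*
 * Sketch of the bookkeeping above: for a cluster of run + 1 blocks
 * starting at lbn, the parent buffer bp is grown by cluster_newbuf (via
 * allocbuf) to cover the whole transfer, while each following block that
 * is not already resident gets a zero-sized header from
 * getblk(vp, lbn + i, 0).  Those headers are recorded in
 * b_save->bs_children so that cluster_callback can hand each one its
 * share of the parent's memory with pagemove and then biodone it.  With,
 * say, size == 8192 and run == 3, and none of the blocks resident, bp
 * ends up describing a single 32K transfer with three children.
 */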
446 
447 /*
448  * Either get a new buffer or grow the existing one.
449  */
450 struct buf *
451 cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
452 	struct vnode *vp;
453 	struct buf *bp;
454 	long flags;
455 	daddr_t blkno;
456 	daddr_t lblkno;
457 	long size;
458 	int run;
459 {
460 	if (!bp) {
461 		bp = getblk(vp, lblkno, size);
462 		if (bp->b_flags & (B_DONE | B_DELWRI)) {
463 			bp->b_blkno = blkno;
464 			return(bp);
465 		}
466 	}
467 	allocbuf(bp, run * size);
468 	bp->b_blkno = blkno;
469 	bp->b_iodone = cluster_callback;
470 	bp->b_flags |= flags | B_CALL;
471 	return(bp);
472 }
473 
474 /*
475  * Cleanup after a clustered read or write.
476  */
477 void
478 cluster_callback(bp)
479 	struct buf *bp;
480 {
481 	struct cluster_save *b_save;
482 	struct buf **tbp;
483 	long bsize;
484 	caddr_t cp;
485 
486 	b_save = (struct cluster_save *)(bp->b_saveaddr);
487 	bp->b_saveaddr = b_save->bs_saveaddr;
488 
489 	cp = bp->b_un.b_addr + b_save->bs_bufsize;
490 	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
491 		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
492 		cp += (*tbp)->b_bufsize;
493 		bp->b_bufsize -= (*tbp)->b_bufsize;
494 		biodone(*tbp);
495 	}
496 #ifdef DIAGNOSTIC
497 	if (bp->b_bufsize != b_save->bs_bufsize)
498 		panic("cluster_callback: more space to reclaim");
499 #endif
500 	bp->b_bcount = bp->b_bufsize;
501 	bp->b_iodone = NULL;
502 	free(b_save, M_SEGMENT);
503 	if (bp->b_flags & B_ASYNC)
504 		brelse(bp);
505 	else
506 		wakeup((caddr_t)bp);
507 }
508 
509 /*
510  * Synchronous write.
511  * Release buffer on completion.
512  */
513 bwrite(bp)
514 	register struct buf *bp;
515 {
516 	struct proc *p = curproc;		/* XXX */
517 	register int flag;
518 	int s, error = 0;
519 
520 	flag = bp->b_flags;
521 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
522 	if (flag & B_ASYNC) {
523 		if ((flag & B_DELWRI) == 0)
524 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
525 		else
526 			reassignbuf(bp, bp->b_vp);
527 	}
528 	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
529 	if (bp->b_bcount > bp->b_bufsize)
530 		panic("bwrite");
531 	s = splbio();
532 	bp->b_vp->v_numoutput++;
533 	splx(s);
534 	VOP_STRATEGY(bp);
535 
536 	/*
537 	 * If the write was synchronous, then await I/O completion.
538 	 * If the write was "delayed", then we put the buffer on
539 	 * the queue of blocks awaiting I/O completion status.
540 	 */
541 	if ((flag & B_ASYNC) == 0) {
542 		error = biowait(bp);
543 		if ((flag&B_DELWRI) == 0)
544 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
545 		else
546 			reassignbuf(bp, bp->b_vp);
547 		brelse(bp);
548 	} else if (flag & B_DELWRI) {
549 		s = splbio();
550 		bp->b_flags |= B_AGE;
551 		splx(s);
552 	}
553 	return (error);
554 }
555 
556 int
557 vn_bwrite(ap)
558 	struct vop_bwrite_args *ap;
559 {
560 	return (bwrite(ap->a_bp));
561 }
562 
563 
564 /*
565  * Delayed write.
566  *
567  * The buffer is marked dirty, but is not queued for I/O.
568  * This routine should be used when the buffer is expected
569  * to be modified again soon, typically a small write that
570  * partially fills a buffer.
571  *
572  * NB: magnetic tapes cannot be delayed; they must be
573  * written in the order that the writes are requested.
574  */
575 bdwrite(bp)
576 	register struct buf *bp;
577 {
578 	struct proc *p = curproc;		/* XXX */
579 
580 	if ((bp->b_flags & B_DELWRI) == 0) {
581 		bp->b_flags |= B_DELWRI;
582 		reassignbuf(bp, bp->b_vp);
583 		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
584 	}
585 	/*
586 	 * If this is a tape drive, the write must be initiated.
587 	 */
588 	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
589 		bawrite(bp);
590 	} else {
591 		bp->b_flags |= (B_DONE | B_DELWRI);
592 		brelse(bp);
593 	}
594 }
595 
596 /*
597  * Asynchronous write.
598  * Start I/O on a buffer, but do not wait for it to complete.
599  * The buffer is released when the I/O completes.
600  */
601 bawrite(bp)
602 	register struct buf *bp;
603 {
604 
605 	/*
606 	 * Setting the ASYNC flag causes bwrite to return
607 	 * after starting the I/O.
608 	 */
609 	bp->b_flags |= B_ASYNC;
610 	(void) bwrite(bp);
611 }
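
/*
 * Summary sketch of the three write interfaces above:
 *
 *	bwrite(bp)	start the write and wait for it; buffer released
 *	bawrite(bp)	start the write; buffer released when I/O completes
 *	bdwrite(bp)	mark B_DELWRI and release; written out later
 *
 * For example, metadata that must be stable before the caller proceeds
 * is written with bwrite, while a partially filled data block that is
 * likely to be written again shortly is left as a delayed write with
 * bdwrite.
 */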
612 
613 /*
614  * Do clustered write for FFS.
615  *
616  * Three cases:
617  *	1. Write is not sequential (write asynchronously)
618  *	Write is sequential:
619  *	2.	beginning of cluster - begin cluster
620  *	3.	middle of a cluster - add to cluster
621  *	4.	end of a cluster - asynchronously write cluster
622  */
623 void
624 cluster_write(bp, filesize)
625 	struct buf *bp;
626 	u_quad_t filesize;
627 {
628 	struct vnode *vp;
629 	daddr_t lbn;
630 	int clen, error, maxrun;
631 
632 	vp = bp->b_vp;
633 	lbn = bp->b_lblkno;
634 	clen = 0;
635 
636 	/*
637 	 * Handle end of file first.  If we are appending, we need to check
638 	 * if the current block was allocated contiguously.  If it wasn't,
639 	 * then we need to fire off a previous cluster if it existed.
640 	 * Additionally, when we're appending, we need to figure out how
641 	 * to initialize vp->v_clen.
642 	 */
643 	if ((lbn + 1) * bp->b_bcount == filesize) {
644 		if (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE) {
645 			/* This block was not allocated contiguously */
646 			if (vp->v_clen)
647 			    cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
648 				vp->v_lastw - vp->v_cstart + 1, lbn);
649 			vp->v_cstart = lbn;
650 			clen = vp->v_clen =
651 			    MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
652 			/*
653 			 * Next cluster started. Write this buffer and return.
654 			 */
655 			vp->v_lastw = lbn;
656 			vp->v_lasta = bp->b_blkno;
657 			bdwrite(bp);
658 			return;
659 		}
660 		vp->v_lasta = bp->b_blkno;
661 	} else if (lbn == 0) {
662 		vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
663 	}
664 	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1) {
665 		if (vp->v_clen != 0)
666 			/*
667 			 * Write is not sequential.
668 			 */
669 			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
670 			    vp->v_lastw - vp->v_cstart + 1, lbn);
671 		/*
672 		 * Consider beginning a cluster.
673 		 */
674 		if (error = VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
675 			bawrite(bp);
676 			vp->v_cstart = lbn + 1;
677 			vp->v_lastw = lbn;
678 			return;
679 		}
680 		vp->v_clen = clen;
681 		if (clen == 0) {		/* I/O not contiguous */
682 			vp->v_cstart = lbn + 1;
683 			bawrite(bp);
684 		} else {			/* Wait for rest of cluster */
685 			vp->v_cstart = lbn;
686 			bdwrite(bp);
687 		}
688 	} else if (lbn == vp->v_cstart + vp->v_clen) {
689 		/*
690 		 * At end of cluster, write it out.
691 		 */
692 		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
693 		    vp->v_clen + 1, lbn);
694 		vp->v_clen = 0;
695 		vp->v_cstart = lbn + 1;
696 	} else
697 		/*
698 		 * In the middle of a cluster, so just delay the
699 		 * I/O for now.
700 		 */
701 		bdwrite(bp);
702 	vp->v_lastw = lbn;
703 }
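
/*
 * Worked example of the cases above (a sketch; the numbers depend on
 * MAXBSIZE and the filesystem's f_iosize).  With 8K blocks and a 64K
 * MAXBSIZE, a file written sequentially from lbn 0 proceeds as follows:
 * the write of lbn 0 calls VOP_BMAP and, assuming it reports contiguous
 * allocation, records v_cstart and v_clen and is delayed with bdwrite;
 * writes of lbns 1 through v_clen - 1 fall into the "middle of a
 * cluster" case and are delayed as well; the write of
 * lbn == v_cstart + v_clen is the end of the cluster and triggers
 * cluster_wbuild, which gathers the delayed buffers into one
 * asynchronous transfer.  A non-sequential write at any point first
 * flushes whatever cluster has accumulated.
 */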
704 
705 
706 /*
707  * This is an awful lot like cluster_rbuild...wish they could be combined.
708  * The last lbn argument is the current block on which I/O is being
709  * performed.  Check to see that it doesn't fall in the middle of
710  * the cluster we are building.
711  */
712 void
713 cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
714 	struct vnode *vp;
715 	struct buf *last_bp;
716 	long size;
717 	daddr_t start_lbn;
718 	int len;
719 	daddr_t	lbn;
720 {
721 	struct cluster_save *b_save;
722 	struct buf *bp, *tbp;
723 	caddr_t	cp;
724 	int i, s;
725 
726 redo:
727 	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
728 		++start_lbn;
729 		--len;
730 	}
731 
732 	/* Get more memory for current buffer */
733 	if (len <= 1) {
734 		if (last_bp)
735 			bawrite(last_bp);
736 		return;
737 	}
738 
739 	bp = getblk(vp, start_lbn, size);
740 	if (!(bp->b_flags & B_DELWRI)) {
741 		++start_lbn;
742 		--len;
743 		brelse(bp);
744 		goto redo;
745 	}
746 
747 	--len;
748 	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
749 	    M_SEGMENT, M_WAITOK);
750 	b_save->bs_bcount = bp->b_bcount;
751 	b_save->bs_bufsize = bp->b_bufsize;
752 	b_save->bs_nchildren = 0;
753 	b_save->bs_children = (struct buf **)(b_save + 1);
754 	b_save->bs_saveaddr = bp->b_saveaddr;
755 	bp->b_saveaddr = (caddr_t) b_save;
756 
757 
758 	bp->b_flags |= B_CALL;
759 	bp->b_iodone = cluster_callback;
760 	cp = bp->b_un.b_addr + bp->b_bufsize;
761 	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
762 		if (!incore(vp, start_lbn) || start_lbn == lbn)
763 			break;
764 
765 		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
766 			tbp = getblk(vp, start_lbn, size);
767 #ifdef DIAGNOSTIC
768 			if (tbp->b_bcount != tbp->b_bufsize)
769 				panic("cluster_wbuild: Buffer too big");
770 #endif
771 			if (!(tbp->b_flags & B_DELWRI)) {
772 				brelse(tbp);
773 				break;
774 			}
775 		} else
776 			tbp = last_bp;
777 
778 		++b_save->bs_nchildren;
779 
780 		/* Move memory from children to parent */
781 		pagemove(tbp->b_un.b_daddr, cp, size);
782 		bp->b_bcount += size;
783 		bp->b_bufsize += size;
784 
785 		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
786 		tbp->b_flags |= B_ASYNC;
787 		s = splbio();
788 		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
789 		++tbp->b_vp->v_numoutput;
790 		splx(s);
791 		b_save->bs_children[i] = tbp;
792 
793 		cp += tbp->b_bufsize;
794 	}
795 
796 	if (i == 0) {
797 		/* None to cluster */
798 		bp->b_saveaddr = b_save->bs_saveaddr;
799 		bp->b_flags &= ~B_CALL;
800 		bp->b_iodone = NULL;
801 		free(b_save, M_SEGMENT);
802 	}
803 	bawrite(bp);
804 	if (i < len) {
805 		len -= i + 1;
806 		start_lbn += 1;
807 		goto redo;
808 	}
809 }
810 
811 /*
812  * Release a buffer.
813  * Even if the buffer is dirty, no I/O is started.
814  */
815 brelse(bp)
816 	register struct buf *bp;
817 {
818 	register struct queue_entry *flist;
819 	int s;
820 
821 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
822 	/*
823 	 * If a process is waiting for the buffer, or
824 	 * is waiting for a free buffer, awaken it.
825 	 */
826 	if (bp->b_flags & B_WANTED)
827 		wakeup((caddr_t)bp);
828 	if (needbuffer) {
829 		needbuffer = 0;
830 		wakeup((caddr_t)&needbuffer);
831 	}
832 	/*
833 	 * Retry I/O for locked buffers rather than invalidating them.
834 	 */
835 	s = splbio();
836 	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
837 		bp->b_flags &= ~B_ERROR;
838 	/*
839 	 * Disassociate buffers that are no longer valid.
840 	 */
841 	if (bp->b_flags & (B_NOCACHE | B_ERROR))
842 		bp->b_flags |= B_INVAL;
843 	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
844 		if (bp->b_vp)
845 			brelvp(bp);
846 		bp->b_flags &= ~B_DELWRI;
847 	}
848 	/*
849 	 * Stick the buffer back on a free list.
850 	 */
851 	if (bp->b_bufsize <= 0) {
852 		/* block has no buffer ... put at front of unused buffer list */
853 		flist = &bufqueues[BQ_EMPTY];
854 		binsheadfree(bp, flist);
855 	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
856 		/* block has no info ... put at front of most free list */
857 		flist = &bufqueues[BQ_AGE];
858 		binsheadfree(bp, flist);
859 	} else {
860 		if (bp->b_flags & B_LOCKED)
861 			flist = &bufqueues[BQ_LOCKED];
862 		else if (bp->b_flags & B_AGE)
863 			flist = &bufqueues[BQ_AGE];
864 		else
865 			flist = &bufqueues[BQ_LRU];
866 		binstailfree(bp, flist);
867 	}
868 	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
869 	splx(s);
870 }
871 
872 /*
873  * Check to see if a block is currently memory resident.
874  */
875 incore(vp, blkno)
876 	struct vnode *vp;
877 	daddr_t blkno;
878 {
879 	register struct buf *bp;
880 
881 	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
882 		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
883 		    (bp->b_flags & B_INVAL) == 0)
884 			return (1);
885 	return (0);
886 }
887 
888 /*
889  * Check to see if a block is currently memory resident.
890  * If it is resident, return it. If it is not resident,
891  * allocate a new buffer and assign it to the block.
892  */
893 struct buf *
894 getblk(vp, blkno, size)
895 	register struct vnode *vp;
896 	daddr_t blkno;
897 	int size;
898 {
899 	register struct buf *bp;
900 	struct list_entry *dp;
901 	int s;
902 
903 	if (size > MAXBSIZE)
904 		panic("getblk: size too big");
905 	/*
906 	 * Search the cache for the block. If the buffer is found,
907  * but it is currently locked, then we must wait for it to
908 	 * become available.
909 	 */
910 	dp = BUFHASH(vp, blkno);
911 loop:
912 	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
913 		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
914 		    (bp->b_flags & B_INVAL))
915 			continue;
916 		s = splbio();
917 		if (bp->b_flags & B_BUSY) {
918 			bp->b_flags |= B_WANTED;
919 			sleep((caddr_t)bp, PRIBIO + 1);
920 			splx(s);
921 			goto loop;
922 		}
923 		bremfree(bp);
924 		bp->b_flags |= B_BUSY;
925 		splx(s);
926 		if (bp->b_bcount != size) {
927 			printf("getblk: stray size\n");
928 			bp->b_flags |= B_INVAL;
929 			bwrite(bp);
930 			goto loop;
931 		}
932 		bp->b_flags |= B_CACHE;
933 		return (bp);
934 	}
935 	bp = getnewbuf();
936 	bremhash(bp);
937 	bgetvp(vp, bp);
938 	bp->b_bcount = 0;
939 	bp->b_lblkno = blkno;
940 	bp->b_blkno = blkno;
941 	bp->b_error = 0;
942 	bp->b_resid = 0;
943 	binshash(bp, dp);
944 	allocbuf(bp, size);
945 	return (bp);
946 }
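
/*
 * Illustrative sketch only: getblk is the right call when a block will
 * be completely overwritten, so its old contents need not be read:
 *
 *	bp = getblk(vp, lbn, bsize);
 *	bzero(bp->b_un.b_addr, bsize);
 *	... fill in the new contents ...
 *	bdwrite(bp);	(or bwrite(bp) if it must reach the disk now)
 *
 * bread should be used instead whenever the existing contents matter.
 */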
947 
948 /*
949  * Allocate a buffer.
950  * The caller will assign it to a block.
951  */
952 struct buf *
953 geteblk(size)
954 	int size;
955 {
956 	register struct buf *bp;
957 
958 	if (size > MAXBSIZE)
959 		panic("geteblk: size too big");
960 	bp = getnewbuf();
961 	bp->b_flags |= B_INVAL;
962 	bremhash(bp);
963 	binshash(bp, &invalhash);
964 	bp->b_bcount = 0;
965 	bp->b_error = 0;
966 	bp->b_resid = 0;
967 	allocbuf(bp, size);
968 	return (bp);
969 }
970 
971 /*
972  * Expand or contract the actual memory allocated to a buffer.
973  * Space is taken from or returned to other buffers as needed; it cannot fail.
974  */
975 allocbuf(tp, size)
976 	register struct buf *tp;
977 	int size;
978 {
979 	register struct buf *bp, *ep;
980 	int sizealloc, take, s;
981 
982 	sizealloc = roundup(size, CLBYTES);
983 	/*
984 	 * Buffer size does not change
985 	 */
986 	if (sizealloc == tp->b_bufsize)
987 		goto out;
988 	/*
989 	 * Buffer size is shrinking.
990 	 * Place excess space in a buffer header taken from the
991 	 * BQ_EMPTY buffer list and placed on the "most free" list.
992 	 * If no extra buffer headers are available, leave the
993 	 * extra space in the present buffer.
994 	 */
995 	if (sizealloc < tp->b_bufsize) {
996 		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
997 			goto out;
998 		s = splbio();
999 		bremfree(ep);
1000 		ep->b_flags |= B_BUSY;
1001 		splx(s);
1002 		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
1003 		    (int)tp->b_bufsize - sizealloc);
1004 		ep->b_bufsize = tp->b_bufsize - sizealloc;
1005 		tp->b_bufsize = sizealloc;
1006 		ep->b_flags |= B_INVAL;
1007 		ep->b_bcount = 0;
1008 		brelse(ep);
1009 		goto out;
1010 	}
1011 	/*
1012 	 * More buffer space is needed. Get it out of buffers on
1013 	 * the "most free" list, placing the empty headers on the
1014 	 * BQ_EMPTY buffer header list.
1015 	 */
1016 	while (tp->b_bufsize < sizealloc) {
1017 		take = sizealloc - tp->b_bufsize;
1018 		bp = getnewbuf();
1019 		if (take >= bp->b_bufsize)
1020 			take = bp->b_bufsize;
1021 		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
1022 		    &tp->b_un.b_addr[tp->b_bufsize], take);
1023 		tp->b_bufsize += take;
1024 		bp->b_bufsize = bp->b_bufsize - take;
1025 		if (bp->b_bcount > bp->b_bufsize)
1026 			bp->b_bcount = bp->b_bufsize;
1027 		if (bp->b_bufsize <= 0) {
1028 			bremhash(bp);
1029 			binshash(bp, &invalhash);
1030 			bp->b_dev = NODEV;
1031 			bp->b_error = 0;
1032 			bp->b_flags |= B_INVAL;
1033 		}
1034 		brelse(bp);
1035 	}
1036 out:
1037 	tp->b_bcount = size;
1038 	return (1);
1039 }
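
/*
 * Worked example of the arithmetic above, assuming a hypothetical 4K
 * CLBYTES: allocbuf(tp, 6144) rounds sizealloc up to 8192.  If tp
 * currently holds 4096 bytes, one pass of the loop takes the missing
 * 4096 bytes from a victim buffer returned by getnewbuf and pagemoves
 * them onto the end of tp, so b_bufsize becomes 8192 while b_bcount is
 * set to the requested 6144.  Shrinking works the other way, pushing
 * the excess space into a spare header taken from the BQ_EMPTY list.
 */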
1040 
1041 /*
1042  * Find a buffer which is available for use.
1043  * Select something from a free list.
1044  * Preference is to AGE list, then LRU list.
1045  */
1046 struct buf *
1047 getnewbuf()
1048 {
1049 	register struct buf *bp;
1050 	register struct queue_entry *dp;
1051 	register struct ucred *cred;
1052 	int s;
1053 
1054 loop:
1055 	s = splbio();
1056 	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
1057 		if (dp->qe_next)
1058 			break;
1059 	if (dp == bufqueues) {		/* no free blocks */
1060 		needbuffer = 1;
1061 		sleep((caddr_t)&needbuffer, PRIBIO + 1);
1062 		splx(s);
1063 		goto loop;
1064 	}
1065 	bp = dp->qe_next;
1066 	bremfree(bp);
1067 	bp->b_flags |= B_BUSY;
1068 	splx(s);
1069 	if (bp->b_flags & B_DELWRI) {
1070 		(void) bawrite(bp);
1071 		goto loop;
1072 	}
1073 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1074 	if (bp->b_vp)
1075 		brelvp(bp);
1076 	if (bp->b_rcred != NOCRED) {
1077 		cred = bp->b_rcred;
1078 		bp->b_rcred = NOCRED;
1079 		crfree(cred);
1080 	}
1081 	if (bp->b_wcred != NOCRED) {
1082 		cred = bp->b_wcred;
1083 		bp->b_wcred = NOCRED;
1084 		crfree(cred);
1085 	}
1086 	bp->b_flags = B_BUSY;
1087 	bp->b_dirtyoff = bp->b_dirtyend = 0;
1088 	bp->b_validoff = bp->b_validend = 0;
1089 	return (bp);
1090 }
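
/*
 * Note on the scan above: the loop starts at BQ_AGE and stops before
 * reaching bufqueues[0], so only the AGE and LRU queues are searched.
 * Buffers on BQ_LOCKED are never reclaimed here, and the headers on
 * BQ_EMPTY carry no memory; within this file they are consumed only by
 * allocbuf when it needs a spare header to hold space shed by a
 * shrinking buffer.
 */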
1091 
1092 /*
1093  * Wait for I/O to complete.
1094  *
1095  * Extract and return any errors associated with the I/O.
1096  * If the error flag is set, but no specific error is
1097  * given, return EIO.
1098  */
1099 biowait(bp)
1100 	register struct buf *bp;
1101 {
1102 	int s;
1103 
1104 	s = splbio();
1105 	while ((bp->b_flags & B_DONE) == 0)
1106 		sleep((caddr_t)bp, PRIBIO);
1107 	splx(s);
1108 	if ((bp->b_flags & B_ERROR) == 0)
1109 		return (0);
1110 	if (bp->b_error)
1111 		return (bp->b_error);
1112 	return (EIO);
1113 }
1114 
1115 /*
1116  * Mark I/O complete on a buffer.
1117  *
1118  * If a callback has been requested, e.g. the pageout
1119  * daemon, do so. Otherwise, awaken waiting processes.
1120  */
1121 void
1122 biodone(bp)
1123 	register struct buf *bp;
1124 {
1125 
1126 	if (bp->b_flags & B_DONE)
1127 		panic("dup biodone");
1128 	bp->b_flags |= B_DONE;
1129 	if ((bp->b_flags & B_READ) == 0)
1130 		vwakeup(bp);
1131 	if (bp->b_flags & B_CALL) {
1132 		bp->b_flags &= ~B_CALL;
1133 		(*bp->b_iodone)(bp);
1134 		return;
1135 	}
1136 	if (bp->b_flags & B_ASYNC)
1137 		brelse(bp);
1138 	else {
1139 		bp->b_flags &= ~B_WANTED;
1140 		wakeup((caddr_t)bp);
1141 	}
1142 }
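
/*
 * Note: the B_CALL path above is how the clustering code gets control
 * at I/O completion time; cluster_newbuf and cluster_wbuild set
 * b_iodone to cluster_callback, which redistributes the parent buffer's
 * memory and then calls biodone on each of the child buffers.
 */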
1143 
1144 int
1145 count_lock_queue()
1146 {
1147 	register struct buf *bp;
1148 	register int ret;
1149 
1150 	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
1151 	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
1152 		++ret;
1153 	return(ret);
1154 }
1155 
1156 #ifdef DIAGNOSTIC
1157 /*
1158  * Print out statistics on the current allocation of the buffer pool.
1159  * Can be enabled to print out on every ``sync'' by setting the
1160  * kernel debugging variable ``syncprt''.
1161  */
1162 void
1163 vfs_bufstats()
1164 {
1165 	int s, i, j, count;
1166 	register struct buf *bp;
1167 	register struct queue_entry *dp;
1168 	int counts[MAXBSIZE/CLBYTES+1];
1169 	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };
1170 
1171 	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1172 		count = 0;
1173 		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1174 			counts[j] = 0;
1175 		s = splbio();
1176 		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
1177 			counts[bp->b_bufsize/CLBYTES]++;
1178 			count++;
1179 		}
1180 		splx(s);
1181 		printf("%s: total-%d", bname[i], count);
1182 		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1183 			if (counts[j] != 0)
1184 				printf(", %d-%d", j * CLBYTES, counts[j]);
1185 		printf("\n");
1186 	}
1187 }
1188 #endif /* DIAGNOSTIC */
1189