/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_bio.c	7.59.1.1 (Berkeley) 05/10/93
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <libkern/libkern.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
struct	list_entry *bufhashtbl, invalhash;
u_long	bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
#define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

struct queue_entry bufqueues[BQUEUES];
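/* Set when a process is waiting for a free buffer; cleared in brelse(). */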
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp) \
	queue_enter_head(dp, bp, struct buf *, b_freelist)
#define	binstailfree(bp, dp) \
	queue_enter_tail(dp, bp, struct buf *, b_freelist)

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
	    daddr_t start_lbn, int len, daddr_t lbn));

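/*
 * Remove a buffer from the free list that it is on.
 */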
void
bremfree(bp)
	struct buf *bp;
{
	struct queue_entry *dp;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 */
	if (bp->b_freelist.qe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->qe_prev == &bp->b_freelist.qe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	queue_remove(dp, bp, struct buf *, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct queue_entry *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		queue_init(dp);
	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size, 0, 0);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size, 0, 0);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there are read-ahead blocks, start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i], 0, 0);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 * 	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead. If this is
		 * the first read of a file, then limit read-ahead to a
		 * single block, else read as much as we're allowed.
		 */
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;


		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)))
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)				/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)		/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		{
			daddr_t temp;
			VOP_BMAP(tbp->b_vp, tbp->b_lblkno, NULL, &temp, NULL);
			if (temp != bn) {
				printf("Block: %d Assigned address: %x Bmap address: %x\n",
					    tbp->b_lblkno, tbp->b_blkno, temp);
				panic("cluster_rbuild: wrong disk address");
			}
		}
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;
	daddr_t	daddr;
	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	cp = bp->b_un.b_addr + b_save->bs_bufsize;
	daddr = bp->b_blkno + b_save->bs_bufsize / DEV_BSIZE;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		if ((*tbp)->b_blkno != daddr) {
			struct inode *ip;
			printf("cluster_callback: bad disk address:\n");
			printf("Clustered Block: %d DiskAddr: %x bytes left: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("\toriginal size: %d flags: %x\n", bp->b_bcount,
			    bp->b_flags);
			printf("Child Block: %d DiskAddr: %x bytes: %d\n",
			    (*tbp)->b_lblkno, (*tbp)->b_blkno,
			    (*tbp)->b_bufsize);
			ip = VTOI((*tbp)->b_vp);
			printf("daddr: %x i_size %qd\n", daddr, ip->i_size);
			if ((*tbp)->b_lblkno < NDADDR)
				printf("Child block pointer from inode: %x\n",
				    ip->i_din.di_db[(*tbp)->b_lblkno]);
			spl0();
			panic ("cluster_callback: bad disk address");
		}
		daddr += (*tbp)->b_bufsize / DEV_BSIZE;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic ("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	bp->b_flags |= B_WRITEINPROG;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag&B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		if (bp->b_flags & B_EINTR) {
			bp->b_flags &= ~B_EINTR;
			error = EINTR;
		}
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

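/*
 * Vnode op wrapper: pass a VOP_BWRITE request through to bwrite().
 */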
int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if ((lbn + 1) * bp->b_bcount == filesize)
			/* End of file, make cluster as large as possible */
			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		} else
			clen = 0;
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t	lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;


	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = bp->b_un.b_addr + bp->b_bufsize;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_un.b_daddr, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct queue_entry *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;

	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (bp);
	return (NULL);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
{
	register struct buf *bp;
	struct list_entry *dp;
	int s, error;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp)
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
				"getblk", slptimeo);
			splx(s);
			if (error)
				return (NULL);
			goto loop;
		}
		/*
		 * The test for B_INVAL is moved down here, since there
		 * are cases where B_INVAL is set before VOP_BWRITE() is
		 * called and for NFS, the process cannot be allowed to
		 * allocate a new buffer for the same block until the write
		 * back to the server has been completed (i.e., B_BUSY clears).
		 */
		if (bp->b_flags & B_INVAL) {
			splx(s);
			continue;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			VOP_BWRITE(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/*
	 * The loop back to the top when getnewbuf() fails is because
	 * stateless filesystems like NFS have no node locks. Thus,
	 * there is a slight chance that more than one process will
	 * try and getnewbuf() for the same block concurrently when
	 * the first sleeps in getnewbuf(). So after a sleep, go back
	 * up to the top to check the hash lists again.
	 */
	if ((bp = getnewbuf(slpflag, slptimeo)) == 0)
		goto loop;
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	while ((bp = getnewbuf(0, 0)) == NULL)
		/* void */;
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	binshash(bp, &invalhash);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		while ((bp = getnewbuf(0, 0)) == NULL)
			/* void */;
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;
	struct buf *abp;
	static int losecnt = 0;

loop:
	s = splbio();
	abp = NULL;
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--) {
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			if (abp == NULL)
				abp = bp;
			if ((bp->b_flags & B_DELWRI) &&
			    bp->b_vp && VOP_ISLOCKED(bp->b_vp))
				continue;
			goto found;
		}
	}
	if (dp == bufqueues) {		/* no free blocks */
		if (abp) {
			bp = abp;
			bp->b_flags |= B_XXX;
			if (losecnt++ < 20) {
				vprint("skipping blkno check", bp->b_vp);
				printf("\tlblkno %d, blkno %d\n",
				   bp->b_lblkno, bp->b_blkno);
			}
			goto found;
		}
		needbuffer = 1;
		(void) tsleep((caddr_t)&needbuffer, slpflag | (PRIBIO + 1),
			"getnewbuf", slptimeo);
		splx(s);
		return (NULL);
	}
found:
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

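/*
 * Return the number of buffers currently on the LOCKED free queue.
 */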
int
count_lock_queue()
{
	register struct buf *bp;
	register int ret;

	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
		++ret;
	return(ret);
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */