xref: /original-bsd/sys/kern/vfs_cluster.c (revision 10042f30)
/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.48 (Berkeley) 05/15/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/specdev.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register int i;
	register struct buf *bp, *dp;
	register struct bufhd *hp;
	int base, residual;

	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
		hp->b_forw = hp->b_back = (struct buf *)hp;

	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
		dp->b_flags = B_HEAD;
	}
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_dev = NODEV;
		bp->b_bcount = 0;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = 0;
		bp->b_validoff = 0;
		bp->b_validend = 0;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		binshash(bp, &bfreelist[BQ_AGE]);
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bfreelist[BQ_AGE] : &bfreelist[BQ_EMPTY];
		binsheadfree(bp, dp);
	}
}
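
/*
 * Worked example of the base/residual split above (illustrative values,
 * not taken from this file): if bufpages were 250 and nbuf were 64, then
 * base = 250 / 64 = 3 and residual = 250 % 64 = 58, so the first 58
 * buffers would start with (3 + 1) * CLBYTES of memory and the remaining
 * 6 with 3 * CLBYTES.  Any buffer left with zero bytes of memory goes on
 * the BQ_EMPTY list rather than BQ_AGE.
 */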

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	USES_VOP_STRATEGY;
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}
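
/*
 * Illustrative sketch, not part of the original file: the typical caller
 * pattern for bread().  The function name and the parameters vp, lbn,
 * bsize, cred, dst, and len are assumptions standing in for whatever a
 * filesystem's read path would supply; only bread(), brelse(), bcopy(),
 * and the buf fields used here come from the kernel itself.
 */
static int
example_read_block(vp, lbn, bsize, cred, dst, len)
	struct vnode *vp;
	daddr_t lbn;
	int bsize, len;
	struct ucred *cred;
	caddr_t dst;
{
	struct buf *bp;
	int error;

	/* Look the block up in the cache; I/O is started only on a miss. */
	error = bread(vp, lbn, bsize, cred, &bp);
	if (error) {
		brelse(bp);		/* release the buffer even on error */
		return (error);
	}
	/* Data is valid at bp->b_un.b_addr while the buffer is held busy. */
	bcopy(bp->b_un.b_addr, dst, len);
	brelse(bp);			/* drop B_BUSY; the block stays cached */
	return (0);
}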

/*
 * Operates like bread, but also starts I/O on the num specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	USES_VOP_STRATEGY;
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp, size), blkno);
	}

	/*
	 * If there are read-ahead blocks, start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If the block was memory resident, let bread get it.
	 * If the block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}
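
/*
 * Illustrative sketch, not part of the original file: how a filesystem
 * read path might use breadn() to prefetch the next logical block.  The
 * names (vp, lbn, bsize, cred) and the single-block read-ahead policy are
 * assumptions made for the example, not something this file prescribes.
 */
static int
example_read_with_readahead(vp, lbn, bsize, cred, bpp)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	struct ucred *cred;
	struct buf **bpp;
{
	daddr_t ralbn;
	int rasize;

	/* Read block lbn now and start asynchronous I/O on block lbn + 1. */
	ralbn = lbn + 1;
	rasize = bsize;
	return (breadn(vp, lbn, bsize, &ralbn, &rasize, 1, cred, bpp));
}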

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	USES_VOP_STRATEGY;
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	USES_VOP_IOCTL;
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}
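
/*
 * Illustrative sketch, not part of the original file: choosing among the
 * three write interfaces above.  The buffer bp is assumed to have been
 * obtained with getblk() or bread() and modified in place; the flags
 * "wait" and "partial" stand in for whatever policy the caller applies.
 */
static int
example_write_block(bp, wait, partial)
	struct buf *bp;
	int wait, partial;
{

	if (wait)
		return (bwrite(bp));	/* synchronous: start I/O and sleep */
	if (partial) {
		bdwrite(bp);		/* delayed: mark dirty, write later */
		return (0);
	}
	bawrite(bp);			/* asynchronous: start I/O, no wait */
	return (0);
}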

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags & B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}
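
/*
 * Illustrative sketch, not part of the original file: using getblk() to
 * obtain a buffer for a block that is about to be written in full, so no
 * read from disk is needed.  The bzero() stands in for whatever contents
 * the caller would place in the block; vp, lbn, and bsize are assumed.
 */
static struct buf *
example_get_and_fill(vp, lbn, bsize)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
{
	struct buf *bp;

	/* Returns the cached buffer on a hit, or a fresh one on a miss. */
	bp = getblk(vp, lbn, bsize);
	if ((bp->b_flags & B_CACHE) == 0)
		bzero(bp->b_un.b_addr, bsize);	/* caller supplies contents */
	/* Still B_BUSY here; the caller must bwrite/bdwrite or brelse it. */
	return (bp);
}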

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

#ifdef LFS
	lfs_flush();
#endif
loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
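
/*
 * Illustrative sketch, not part of the original file: how an asynchronous
 * caller can be notified of completion through the B_CALL hook checked in
 * biodone() above.  The names example_io_done and example_start_async_read,
 * and the single-block read they perform, are assumptions for the example;
 * the flag handling mirrors the read-ahead path in breadn().
 */
static void
example_io_done(bp)
	struct buf *bp;
{

	/* Runs from biodone(); the caller owns the buffer at this point. */
	if ((bp->b_flags & B_ERROR) && bp->b_error == 0)
		bp->b_error = EIO;
	brelse(bp);
}

static void
example_start_async_read(vp, blkno, size)
	struct vnode *vp;
	daddr_t blkno;
	int size;
{
	USES_VOP_STRATEGY;
	struct buf *bp;

	bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		brelse(bp);		/* already valid; nothing to start */
		return;
	}
	bp->b_flags |= B_READ | B_ASYNC | B_CALL;
	bp->b_iodone = example_io_done;
	VOP_STRATEGY(bp);		/* biodone() will call example_io_done */
}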