/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.49 (Berkeley) 06/23/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/specdev.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register int i;
	register struct buf *bp, *dp;
	register struct bufhd *hp;
	int base, residual;

	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
		hp->b_forw = hp->b_back = (struct buf *)hp;

	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
		dp->b_flags = B_HEAD;
	}
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
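	/*
	 * Illustrative arithmetic (editorial example, hypothetical
	 * numbers): with bufpages = 205 and nbuf = 100, base = 2 and
	 * residual = 5, so the first 5 buffers below get 3 clusters
	 * ((base + 1) * CLBYTES) and the remaining 95 get 2.
	 */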
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_dev = NODEV;
		bp->b_bcount = 0;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = 0;
		bp->b_validoff = 0;
		bp->b_validend = 0;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		binshash(bp, &bfreelist[BQ_AGE]);
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bfreelist[BQ_AGE] : &bfreelist[BQ_EMPTY];
		binsheadfree(bp, dp);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	USES_VOP_STRATEGY;
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}
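
/*
 * Illustrative usage sketch (editorial addition, not part of the
 * original source): a filesystem read path might fetch a logical block
 * with bread and release it when done.  The function name and the
 * parameters `lbn' and `bsize' are hypothetical.
 */
#ifdef notdef
static int
example_bread(vp, lbn, bsize, cred)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	struct ucred *cred;
{
	struct buf *bp;
	int error;

	/* getblk, VOP_STRATEGY, and biowait are all hidden inside bread */
	if (error = bread(vp, lbn, bsize, cred, &bp)) {
		brelse(bp);			/* release even on error */
		return (error);
	}
	/* data is valid at bp->b_un.b_addr for bp->b_bcount bytes */
	brelse(bp);
	return (0);
}
#endif /* notdef */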

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	USES_VOP_STRATEGY;
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there are read-ahead blocks, start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}
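
/*
 * Illustrative usage sketch (editorial addition): a sequential reader
 * might ask breadn for one block plus one read-ahead block.  The names
 * `lbn' and `bsize' are hypothetical; breadn takes parallel arrays of
 * read-ahead block numbers and sizes.
 */
#ifdef notdef
	struct buf *bp;
	daddr_t rablkno[1];
	int rabsize[1];

	rablkno[0] = lbn + 1;			/* next logical block */
	rabsize[0] = bsize;
	/* returns once lbn is in core; the read-ahead proceeds async */
	error = breadn(vp, lbn, bsize, rablkno, rabsize, 1, cred, &bp);
	/* caller must brelse(bp) when finished with the data */
#endif /* notdef */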

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	USES_VOP_STRATEGY;
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	USES_VOP_IOCTL;
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}
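
/*
 * Illustrative sketch (editorial addition): choosing among the three
 * write flavors for a buffer obtained from getblk or bread.  The
 * predicate names are hypothetical.
 */
#ifdef notdef
	if (must_reach_disk_before_returning)
		error = bwrite(bp);	/* start I/O, wait, then release */
	else if (block_is_complete)
		bawrite(bp);		/* start I/O, release on completion */
	else
		bdwrite(bp);		/* just mark dirty; write deferred */
#endif /* notdef */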

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags & B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}
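
/*
 * Editorial recap of the free-list placement above:
 *	b_bufsize == 0		-> head of BQ_EMPTY
 *	B_ERROR or B_INVAL	-> head of BQ_AGE
 *	B_LOCKED		-> tail of BQ_LOCKED
 *	B_AGE			-> tail of BQ_AGE
 *	otherwise		-> tail of BQ_LRU
 */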

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}
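
/*
 * Illustrative sketch (editorial addition): incore makes no state
 * changes, so callers such as breadn above use it to skip blocks that
 * are already cached before committing to a getblk.  The names `lbn',
 * `bsize', and `rabp' are hypothetical.
 */
#ifdef notdef
	if (!incore(vp, lbn + 1))
		rabp = getblk(vp, lbn + 1, bsize);	/* as breadn does */
#endif /* notdef */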

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size\n");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}
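
/*
 * Illustrative usage sketch (editorial addition): when a caller will
 * overwrite an entire logical block, getblk avoids the read that bread
 * would do.  The names `lbn', `bsize', and `src' are hypothetical.
 */
#ifdef notdef
	struct buf *bp;

	bp = getblk(vp, lbn, bsize);
	bcopy(src, bp->b_un.b_addr, bsize);	/* fill the whole block */
	bawrite(bp);				/* write back asynchronously */
#endif /* notdef */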

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}
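
/*
 * Illustrative usage sketch (editorial addition): geteblk yields a
 * buffer with no block identity, usable as short-term kernel scratch
 * space; brelse returns it to a free list.
 */
#ifdef notdef
	struct buf *bp;

	bp = geteblk(MAXBSIZE);		/* largest legal request */
	/* ... use bp->b_un.b_addr as scratch memory ... */
	brelse(bp);			/* B_INVAL was set by geteblk */
#endif /* notdef */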

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}
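
/*
 * Illustrative arithmetic (editorial example, hypothetical numbers):
 * growing a 4096-byte buffer to size = 6000 with CLBYTES = 2048 rounds
 * sizealloc up to 6144, so the loop above must move 2048 bytes; if the
 * donor buffer from getnewbuf holds only 1024, take = 1024 and a second
 * donor is needed.  b_bcount ends up as the caller's exact size, 6000.
 */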

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

#ifdef LFS
	lfs_flush();
#endif
loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
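
/*
 * Illustrative sketch (editorial addition): the B_CALL path above lets
 * a subsystem such as the pageout daemon learn of completion without
 * sleeping in biowait.  The callback name `mydone' is hypothetical.
 */
#ifdef notdef
void
mydone(bp)
	register struct buf *bp;
{
	/* biodone calls this at interrupt level; do not sleep here */
	brelse(bp);
}

	/* in the caller: */
	bp->b_flags |= B_ASYNC | B_CALL;
	bp->b_iodone = mydone;
	VOP_STRATEGY(bp);
	/* biodone clears B_CALL and invokes mydone when the I/O ends */
#endif /* notdef */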