xref: /original-bsd/sys/kern/vfs_cluster.c (revision a91856c6)
/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.42 (Berkeley) 05/10/91
 */

#include "param.h"
#include "proc.h"
#include "buf.h"
#include "vnode.h"
#include "specdev.h"
#include "mount.h"
#include "trace.h"
#include "resourcevar.h"

/*
 * Initialize the buffer pool and the buffer hash chains.
 */
bufinit()
{
	register int i;
	register struct buf *bp, *dp;
	register struct bufhd *hp;
	int base, residual;

	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
		hp->b_forw = hp->b_back = (struct buf *)hp;

	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
		dp->b_flags = B_HEAD;
	}
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_dev = NODEV;
		bp->b_bcount = 0;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = 0;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		binshash(bp, &bfreelist[BQ_AGE]);
		bp->b_flags = B_BUSY|B_INVAL;
		brelse(bp);
	}
}
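
/*
 * Worked example (illustrative, not part of the original source):
 * with bufpages = 205 and nbuf = 100, base = 2 and residual = 5,
 * so the first 5 buffers above are seeded with 3 * CLBYTES of
 * memory and the remaining 95 with 2 * CLBYTES, spreading the
 * leftover pages evenly across the pool.
 */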

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}
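
/*
 * Sketch of a typical bread() caller (illustrative only; "lbn" and
 * "bsize" are hypothetical names, not part of this file):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, bsize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...use bp->b_un.b_addr...
 *	brelse(bp);
 */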

/*
 * Operates like bread, but also starts I/O on the specified
 * read-ahead block.
 */
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp, size), blkno);
	}

	/*
	 * If there is a read-ahead block, start I/O on it too.
	 */
	if (!incore(vp, rablkno)) {
		rabp = getblk(vp, rablkno, rabsize);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize), rablkno);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize), rablkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}
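
/*
 * Sketch of a sequential read with one block of read-ahead
 * (illustrative; "lbn" and "bsize" are hypothetical):
 *
 *	error = breada(vp, lbn, bsize, lbn + 1, bsize, NOCRED, &bp);
 *
 * The buffer for lbn is returned through bp; the buffer for lbn + 1
 * is left in the cache for a later bread() to find already done.
 */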

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		bp->b_flags |= B_AGE;
	}
	return (error);
}
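
/*
 * Sketch of the usual synchronous update cycle (illustrative):
 *
 *	if (error = bread(vp, lbn, bsize, NOCRED, &bp))
 *		...
 *	...modify bp->b_un.b_addr...
 *	error = bwrite(bp);
 *
 * bwrite() starts the I/O, sleeps in biowait(), and releases the
 * buffer itself, so the caller must not brelse() it afterwards.
 */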

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}
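
/*
 * Sketch of a partial-block update using delayed write (illustrative;
 * "off" and "len" are hypothetical):
 *
 *	bread(vp, lbn, bsize, NOCRED, &bp);
 *	bcopy(data, bp->b_un.b_addr + off, len);
 *	bdwrite(bp);
 *
 * The block stays in the cache marked B_DELWRI and is only pushed
 * to disk later, when the buffer is reclaimed or explicitly synced.
 */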

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags & B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}
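
/*
 * Illustrative hint (not in the original source): a caller that
 * knows a block will not be wanted again soon can mark it before
 * release so it is reclaimed ahead of the LRU buffers:
 *
 *	bp->b_flags |= B_AGE;
 *	brelse(bp);
 *
 * which places the buffer at the tail of BQ_AGE instead of BQ_LRU.
 */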

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found
	 * but is currently busy, we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size\n");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}
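
/*
 * Sketch of claiming a cache buffer for a block that will be written
 * whole, so its old contents need not be read (illustrative; clrbuf()
 * is assumed to zero the buffer's data area):
 *
 *	bp = getblk(vp, lbn, bsize);
 *	clrbuf(bp);
 *	...fill bp->b_un.b_addr...
 *	bwrite(bp);
 */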

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}
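
/*
 * Sketch of using an "empty" buffer as scratch memory, e.g. by a
 * driver that needs a temporary transfer buffer (illustrative):
 *
 *	bp = geteblk(bsize);
 *	...use bp->b_un.b_addr as scratch...
 *	brelse(bp);
 */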

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If the buffer shrinks, the excess memory is given away to another
 * buffer header; if it grows, memory is taken from buffers on the
 * free lists.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}
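
/*
 * Worked example (illustrative): growing a buffer from 4096 to a
 * requested size of 6000 bytes with CLBYTES = 1024 rounds sizealloc
 * up to 6144; the loop above then steals 2048 bytes worth of pages
 * from free-list buffers and pagemove()s them onto the end of tp.
 */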

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}
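
/*
 * Sketch of starting a read by hand and waiting on it (illustrative;
 * this is essentially what bread() does on a cache miss):
 *
 *	bp->b_flags |= B_READ;
 *	VOP_STRATEGY(bp);
 *	error = biowait(bp);
 */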

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
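
/*
 * Sketch of the B_CALL completion protocol used by, e.g., the pageout
 * daemon (illustrative; "iodone_fn" is a hypothetical handler):
 *
 *	bp->b_flags |= B_CALL;
 *	bp->b_iodone = iodone_fn;
 *	VOP_STRATEGY(bp);
 *
 * When the transfer finishes, biodone() clears B_CALL and invokes
 * (*bp->b_iodone)(bp) instead of waking a sleeper; the handler is
 * then responsible for releasing the buffer.
 */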
589