xref: /original-bsd/sys/kern/vfs_cluster.c (revision 753853ba)
/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.46 (Berkeley) 02/05/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/specdev.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register int i;
	register struct buf *bp, *dp;
	register struct bufhd *hp;
	int base, residual;

	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
		hp->b_forw = hp->b_back = (struct buf *)hp;

	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
		dp->b_flags = B_HEAD;
	}
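	/*
	 * Divide the bufpages clusters of buffer memory evenly among the
	 * nbuf headers; the first "residual" headers each take one extra
	 * cluster so none is left over.  Every header also gets a MAXBSIZE
	 * window of virtual address space regardless of how much physical
	 * memory it currently holds.
	 */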
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_dev = NODEV;
		bp->b_bcount = 0;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = 0;
		bp->b_validoff = 0;
		bp->b_validend = 0;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		binshash(bp, &bfreelist[BQ_AGE]);
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bfreelist[BQ_AGE] : &bfreelist[BQ_EMPTY];
		binsheadfree(bp, dp);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}
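
/*
 * Example (illustrative sketch, not part of the original source): a
 * filesystem read routine would typically call bread() with the vnode,
 * a logical block number, and the block size, check the return value,
 * copy data out of the buffer, and then brelse() it.  The names lbn and
 * bsize are hypothetical here; real callers compute them from the file's
 * block map.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, bsize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... use bp->b_un.b_addr ...
 *	brelse(bp);
 */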

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp, size), blkno);
	}

	/*
	 * If there are read-ahead blocks, start I/O
	 * on them as well (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}
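
/*
 * Example (illustrative sketch): reading block lbn while prefetching the
 * next block.  The parallel arrays hold the read-ahead block numbers and
 * sizes; lbn and bsize are hypothetical values a caller would take from
 * the file's block map.
 *
 *	daddr_t ra_blkno[1];
 *	int ra_size[1];
 *	struct buf *bp;
 *	int error;
 *
 *	ra_blkno[0] = lbn + 1;
 *	ra_size[0] = bsize;
 *	error = breadn(vp, lbn, bsize, ra_blkno, ra_size, 1, NOCRED, &bp);
 */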

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

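	/*
	 * Snapshot the flags before they are cleared below: the saved copy
	 * is used later to decide whether this write was asynchronous
	 * and/or delayed, so the output accounting and reassignbuf() are
	 * each done exactly once per dirty buffer.
	 */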
	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag&B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
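		/*
		 * Mark the buffer done as well as delayed so that a later
		 * lookup treats its contents as valid, then release it with
		 * the write still pending.
		 */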
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags & B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found
	 * but is currently busy, we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
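			/*
			 * A cached buffer of the wrong size cannot simply be
			 * reused: mark it invalid, write it out synchronously
			 * (in case it holds delayed-write data) so it is
			 * released, and retry the lookup.
			 */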
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
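	/*
	 * The allocation is always a whole number of clusters; for example,
	 * if CLBYTES were 1024, a 1536-byte request would round up to 2048
	 * (the actual CLBYTES value is machine-dependent).
	 */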
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
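		/*
		 * getnewbuf() returned a busy buffer we may cannibalize:
		 * move "take" bytes of pages from its tail onto the end of
		 * tp, then release whatever is left of the donor (as an
		 * empty header if all of its space was taken).
		 */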
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

#ifdef LFS
	lfs_flush();
#endif
loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
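	/*
	 * If the buffer chosen holds a delayed write, it must be flushed
	 * before the header can be recycled; start the write asynchronously
	 * and go look for another buffer.
	 */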
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

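	/*
	 * The B_DONE test and the sleep are done at splbio so that the
	 * wakeup from biodone() cannot slip in between them.
	 */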
	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
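	/*
	 * For a completed write, vwakeup() tells the vnode layer that one
	 * of its outstanding writes has finished (balancing the
	 * v_numoutput++ done in bwrite()).
	 */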
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}