/*	vfs_cluster.c	4.39	82/11/13	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"
#include "../h/trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, dev, blkno);
		return(bp);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, dev, blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	biowait(bp);
	return(bp);
}
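
/*
 * Illustrative sketch (not part of the original source): a typical caller
 * reads a block with bread(), inspects the data, and hands the buffer back
 * to the cache with brelse().  The examine_block() routine and the choice
 * of parameters are assumptions made only for this example.
 */
#ifdef notdef
example_read(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = bread(dev, blkno, size);		/* cached copy, read if needed */
	if (u.u_error == 0)
		examine_block(bp->b_un.b_addr, bp->b_bcount);
	brelse(bp);				/* give the buffer back to the cache */
}
#endif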

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rabsize)
	dev_t dev;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, dev, blkno);
			u.u_ru.ru_inblock++;		/* pay for read */
		} else
			trace(TR_BREADHIT, dev, blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, rabsize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, dev, rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, dev, rablkno);
			u.u_ru.ru_inblock++;		/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(dev, blkno, size));
	biowait(bp);
	return (bp);
}
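
/*
 * Illustrative sketch (not part of the original source): a file system
 * reading sequentially can use breada() to overlap the transfer of the next
 * block with processing of the current one.  bmap() here is only a stand-in
 * for whatever logical-to-physical mapping the caller has; the surrounding
 * names are assumptions made for the example.
 */
#ifdef notdef
example_sequential(dev, lbn, size)
	dev_t dev;
	daddr_t lbn;
	int size;
{
	register struct buf *bp;
	daddr_t blkno, rablkno;

	blkno = bmap(lbn);			/* current block on the device */
	rablkno = bmap(lbn + 1);		/* next block, fetched asynchronously */
	bp = breada(dev, blkno, size, rablkno, size);
	/* ... use bp->b_un.b_addr ... */
	brelse(bp);
}
#endif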

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 * Otherwise, the i/o must be finished and we check for
	 * an error.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		u.u_error = geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}
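
/*
 * Illustrative sketch (not part of the original source): choosing among the
 * three write flavors.  A full block that will not be touched again soon is
 * pushed asynchronously with bawrite(); a partially filled block that will
 * probably be extended shortly is left dirty in the cache with bdwrite();
 * plain bwrite() is for callers that must know the write has completed.
 * The "fullblock" flag below is an assumption made for the example.
 */
#ifdef notdef
example_write(bp, fullblock)
	register struct buf *bp;
	int fullblock;
{

	if (fullblock)
		bawrite(bp);		/* start i/o, don't wait */
	else
		bdwrite(bp);		/* mark delayed-write, keep in cache */
}
#endif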

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	/*
	 * If someone is waiting for the buffer, or
	 * is waiting for a free buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = spl6();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Retrieve the indicated block if it is already in core;
 * otherwise return 0 without initiating any I/O.
 */
struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}
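
/*
 * Illustrative sketch (not part of the original source): using baddr() to
 * take advantage of cached data without forcing a disk read, and falling
 * back to other work when the block is absent.  The surrounding names are
 * assumptions made for the example.
 */
#ifdef notdef
example_probe(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = baddr(dev, blkno, size);
	if (bp == 0)
		return (0);		/* not cached; caller may schedule a read later */
	/* ... consume bp->b_un.b_addr ... */
	brelse(bp);
	return (1);
}
#endif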

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return(bp);
}
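
/*
 * Illustrative sketch (not part of the original source): the usual way to
 * update part of a block is to claim it with getblk() (or bread() first if
 * the old contents matter), copy the new data in, and let bdwrite() schedule
 * the delayed write.  bcopy() is the kernel byte copy; the offset and length
 * parameters are assumptions made for the example.
 */
#ifdef notdef
example_update(dev, blkno, size, off, src, len)
	dev_t dev;
	daddr_t blkno;
	int size, off, len;
	caddr_t src;
{
	register struct buf *bp;

	if (off == 0 && len == size)
		bp = getblk(dev, blkno, size);	/* whole block: old data irrelevant */
	else
		bp = bread(dev, blkno, size);	/* partial update: read first */
	bcopy(src, bp->b_un.b_addr + off, (unsigned)len);
	bdwrite(bp);				/* write-behind; brelse done inside */
}
#endif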

/*
 * Get an empty block, not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	binshash(bp, flist);
	bp->b_dev = (dev_t)NODEV;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return(bp);
}
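
/*
 * Illustrative sketch (not part of the original source): geteblk() hands out
 * scratch space that is not associated with any device block, useful for
 * temporary kernel work areas; the buffer is returned with brelse() when
 * done.  fill_scratch() is a hypothetical consumer used only here.
 */
#ifdef notdef
example_scratch(size)
	int size;
{
	register struct buf *bp;

	bp = geteblk(size);			/* private, unassociated buffer */
	fill_scratch(bp->b_un.b_addr, size);
	brelse(bp);				/* B_INVAL keeps it off the useful lists */
}
#endif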

/*
 * Allocate space associated with a buffer.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that any overlapping
	 * previous I/O is disposed of.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		allocbuf(bp, size);
		return (1);
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV) {
		allocbuf(bp, size);
		return (1);
	}

	/*
	 * Search the cache for any buffers that overlap the one that we
	 * are trying to allocate.  Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI).  A disk block must be mapped by at most one buffer
	 * at any point in time.  Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + (size / DEV_BSIZE) - 1;
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + (ep->b_bcount / DEV_BSIZE) <= start)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	allocbuf(bp, size);
	return (1);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		notavail(ep);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = (dev_t)NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
}

/*
 * Release space associated with a buffer.
 */
bfree(bp)
	struct buf *bp;
{
	/*
	 * This stub is provided to allow the system to reclaim
	 * memory from the buffer pool. Currently we do not migrate
	 * memory between the buffer memory pool and the user memory
	 * pool.
	 */
	bp->b_bcount = 0;
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = spl6();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = spl6();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer. If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon. Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{
	register int s;

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		cnt.v_pgout++;
		cnt.v_pgpgout += bp->b_bcount / NBPG;
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
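
/*
 * Illustrative sketch (not part of the original source): a block device
 * driver's interrupt routine calls biodone() when the transfer at the head
 * of its queue has finished, which wakes a synchronous caller sleeping in
 * biowait() or releases an asynchronous buffer via brelse().  The xxtab
 * queue head and the xxerror() check are hypothetical, used only here.
 */
#ifdef notdef
struct buf xxtab;			/* hypothetical per-controller queue head */

exampleintr()
{
	register struct buf *bp;

	bp = xxtab.av_forw;			/* transfer just completed */
	xxtab.av_forw = bp->av_forw;		/* advance the device queue */
	if (xxerror())				/* hypothetical hardware status check */
		bp->b_flags |= B_ERROR;
	biodone(bp);				/* mark done, wake waiters */
	/* ... start the next transfer, if any ... */
}
#endif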

/*
 * Ensure that no part of a specified block is in an incore buffer.
 */
blkflush(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s;

	start = blkno;
	last = start + (size / DEV_BSIZE) - 1;
	dp = BUFHASH(dev, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + (ep->b_bcount / DEV_BSIZE) <= start)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			bwrite(ep);
			goto loop;
		}
		splx(s);
	}
}
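
/*
 * Illustrative sketch (not part of the original source): before reading a
 * block range directly from the device (bypassing the cache), a caller can
 * use blkflush() so that any delayed-write data covering that range reaches
 * the disk first.  raw_transfer() is a hypothetical uncached read helper.
 */
#ifdef notdef
example_rawread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	long size;
{

	blkflush(dev, blkno, size);		/* push overlapping write-behind data */
	raw_transfer(dev, blkno, size);		/* hypothetical uncached read */
}
#endif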

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (Called from umount and update, and temporarily from pagein.)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = spl6();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if ((bp->b_flags & B_DELWRI) == 0)
			continue;
		if (dev == NODEV || dev == bp->b_dev) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			goto loop;
		}
	}
	splx(s);
}
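
/*
 * Illustrative sketch (not part of the original source): the periodic
 * update()/sync path pushes every delayed-write buffer for every device by
 * passing NODEV, while an unmount flushes only the device being detached.
 */
#ifdef notdef
example_sync()
{

	bflush(NODEV);			/* write out all write-behind blocks */
}
#endif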

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0, set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error)==0)
			return (EIO);
	return (error);
}

/*
 * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o. Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues. Until that happy day, this suffices for
 * correctness.						... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
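
/*
 * Illustrative sketch (not part of the original source): a typical unmount
 * sequence first writes back any dirty blocks for the device with bflush()
 * and then discards the now-stale cached copies with binval(), so a later
 * mount cannot see old data.
 */
#ifdef notdef
example_umount(dev)
	dev_t dev;
{

	bflush(dev);			/* push delayed writes for this device */
	binval(dev);			/* invalidate remaining cached blocks */
}
#endif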