1 /*	$OpenBSD: vfs_bio.c,v 1.175 2016/06/07 01:31:54 tedu Exp $	*/
2 /*	$NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $	*/
3 
4 /*
5  * Copyright (c) 1994 Christopher G. Demetriou
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
39  */
40 
41 /*
42  * Some references:
43  *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
44  *	Leffler, et al.: The Design and Implementation of the 4.3BSD
45  *		UNIX Operating System (Addison Wesley, 1989)
46  */
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/proc.h>
51 #include <sys/buf.h>
52 #include <sys/vnode.h>
53 #include <sys/mount.h>
54 #include <sys/malloc.h>
55 #include <sys/pool.h>
56 #include <sys/resourcevar.h>
57 #include <sys/conf.h>
58 #include <sys/kernel.h>
59 #include <sys/specdev.h>
60 #include <uvm/uvm_extern.h>
61 
62 int nobuffers;
63 int needbuffer;
64 struct bio_ops bioops;
65 
66 /* private bufcache functions */
67 void bufcache_init(void);
68 void bufcache_adjust(void);
69 
70 /*
71  * Buffer pool for I/O buffers.
72  */
73 struct pool bufpool;
74 struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead);
75 void buf_put(struct buf *);
76 
77 struct buf *bio_doread(struct vnode *, daddr_t, int, int);
78 struct buf *buf_get(struct vnode *, daddr_t, size_t);
79 void bread_cluster_callback(struct buf *);
80 
81 struct bcachestats bcstats;  /* counters */
82 long lodirtypages;      /* dirty page count low water mark */
83 long hidirtypages;      /* dirty page count high water mark */
84 long targetpages;   	/* target number of pages for cache size */
85 long buflowpages;	/* smallest size cache allowed */
86 long bufhighpages; 	/* largest size cache allowed */
87 long bufbackpages; 	/* minimum number of pages we shrink when asked to */
88 
89 vsize_t bufkvm;
90 
91 struct proc *cleanerproc;
92 int bd_req;			/* Sleep point for cleaner daemon. */
93 
94 #define NUM_CACHES 2
95 #define DMA_CACHE 0
96 struct bufcache cleancache[NUM_CACHES];
97 struct bufqueue dirtyqueue;
98 
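/*
 * Free a buffer that is no longer in use: release its backing memory
 * and return the buf itself to bufpool.  Must be called at splbio()
 * with the buffer clean and off the free and vnode lists.
 */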
99 void
100 buf_put(struct buf *bp)
101 {
102 	splassert(IPL_BIO);
103 
104 #ifdef DIAGNOSTIC
105 	if (bp->b_pobj != NULL)
106 		KASSERT(bp->b_bufsize > 0);
107 	if (ISSET(bp->b_flags, B_DELWRI))
108 		panic("buf_put: releasing dirty buffer");
109 	if (bp->b_freelist.tqe_next != NOLIST &&
110 	    bp->b_freelist.tqe_next != (void *)-1)
111 		panic("buf_put: still on the free list");
112 	if (bp->b_vnbufs.le_next != NOLIST &&
113 	    bp->b_vnbufs.le_next != (void *)-1)
114 		panic("buf_put: still on the vnode list");
115 	if (!LIST_EMPTY(&bp->b_dep))
116 		panic("buf_put: b_dep is not empty");
117 #endif
118 
119 	LIST_REMOVE(bp, b_list);
120 	bcstats.numbufs--;
121 
122 	if (buf_dealloc_mem(bp) != 0)
123 		return;
124 	pool_put(&bufpool, bp);
125 }
126 
127 /*
128  * Initialize buffers and hash links for buffers.
129  */
130 void
131 bufinit(void)
132 {
133 	u_int64_t dmapages;
134 
135 	dmapages = uvm_pagecount(&dma_constraint);
136 	/* take away a guess at how much of this the kernel will consume */
137 	dmapages -= (atop(physmem) - atop(uvmexp.free));
138 
139 	/*
140 	 * If MD code doesn't say otherwise, use up to 10% of DMA'able
141 	 * memory for buffers.
142 	 */
143 	if (bufcachepercent == 0)
144 		bufcachepercent = 10;
145 
146 	/*
147 	 * XXX these values and their same use in kern_sysctl
148 	 * need to move into buf.h
149 	 */
150 	KASSERT(bufcachepercent <= 90);
151 	KASSERT(bufcachepercent >= 5);
152 	if (bufpages == 0)
153 		bufpages = dmapages * bufcachepercent / 100;
154 	if (bufpages < BCACHE_MIN)
155 		bufpages = BCACHE_MIN;
156 	KASSERT(bufpages < dmapages);
157 
158 	bufhighpages = bufpages;
159 
160 	/*
161 	 * Set the base backoff level for the buffer cache.  We will
162 	 * not allow uvm to steal back more than this number of pages.
163 	 */
164 	buflowpages = dmapages * 5 / 100;
165 	if (buflowpages < BCACHE_MIN)
166 		buflowpages = BCACHE_MIN;
167 
168 	/*
169 	 * Set bufbackpages to 10 percent of the low water mark, but never
170 	 * more than 100 pages.
171 	 */
172 
173 	bufbackpages = buflowpages * 10 / 100;
174 	if (bufbackpages > 100)
175 		bufbackpages = 100;
176 
177 	/*
178 	 * If the MD code does not say otherwise, reserve 10% of kva
179 	 * space for mapping buffers.
180 	 */
181 	if (bufkvm == 0)
182 		bufkvm = VM_KERNEL_SPACE_SIZE / 10;
183 
184 	/*
185 	 * Don't use more than twice the amount of bufpages for mappings.
186 	 * It's twice since we map things sparsely.
187 	 */
188 	if (bufkvm > bufpages * PAGE_SIZE)
189 		bufkvm = bufpages * PAGE_SIZE;
190 	/*
191 	 * Round bufkvm down to a multiple of MAXPHYS, since we allocate
192 	 * va space in MAXPHYS-sized chunks.
193 	 */
194 	bufkvm &= ~(MAXPHYS - 1);
195 
196 	pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
197 	pool_setipl(&bufpool, IPL_BIO);
198 
199 	bufcache_init();
200 
201 	/*
202 	 * bufkvm is passed as an argument because it is static, while
203 	 * bufpages is a global because it can change while running.
204 	 */
205 	buf_mem_init(bufkvm);
206 
207 	/*
208 	 * Set the dirty page high water mark to be less than the low
209 	 * water mark for pages in the buffer cache. This ensures we
210 	 * can always back off by throwing away clean pages, and give
211 	 * ourselves a chance to write out the dirty pages eventually.
212 	 */
213 	hidirtypages = (buflowpages / 4) * 3;
214 	lodirtypages = buflowpages / 2;
215 
216 	/*
217 	 * We are allowed to use up to the reserve.
218 	 */
219 	targetpages = bufpages - RESERVE_PAGES;
220 }
221 
222 /*
223  * Resize the buffer cache to use newbufpages pages.
224  */
225 void
226 bufadjust(int newbufpages)
227 {
228 	struct buf *bp;
229 	int s;
230 
231 	if (newbufpages < buflowpages)
232 		newbufpages = buflowpages;
233 
234 	s = splbio();
235 	bufpages = newbufpages;
236 
237 	/*
238 	 * We are allowed to use up to the reserve
239 	 */
240 	targetpages = bufpages - RESERVE_PAGES;
241 
242 	/*
243 	 * Shrinking the cache happens here only if someone has manually
244 	 * adjusted bufcachepercent - or the pagedaemon has told us
245 	 * to give back memory *now* - so we give it all back.
246 	 */
247 	while ((bp = bufcache_getanycleanbuf()) &&
248 	    (bcstats.numbufpages > targetpages)) {
249 		bufcache_take(bp);
250 		if (bp->b_vp) {
251 			RB_REMOVE(buf_rb_bufs,
252 			    &bp->b_vp->v_bufs_tree, bp);
253 			brelvp(bp);
254 		}
255 		buf_put(bp);
256 	}
257 	bufcache_adjust();
258 
259 	/*
260 	 * Wake up the cleaner if we have lots of dirty pages,
261 	 * or if we are getting low on buffer cache kva.
262 	 */
263 	if ((UNCLEAN_PAGES >= hidirtypages) ||
264 	    bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS)
265 		wakeup(&bd_req);
266 
267 	splx(s);
268 }
269 
270 /*
271  * Make the buffer cache back off from cachepct.
272  */
273 int
274 bufbackoff(struct uvm_constraint_range *range, long size)
275 {
276 	/*
277 	 * Back off "size" buffer cache pages. Called by the page
278 	 * daemon to consume buffer cache pages rather than scanning.
279 	 *
280 	 * It returns 0 to the pagedaemon to indicate that it has
281 	 * succeeded in freeing enough pages. It returns -1 to
282 	 * indicate that it could not and the pagedaemon should take
283 	 * other measures.
284 	 *
285 	 */
286 	long pdelta, oldbufpages;
287 
288 	/*
289 	 * Back off by at least bufbackpages. If the page daemon gave us
290 	 * a larger size, back off by that much.
291 	 */
292 	pdelta = (size > bufbackpages) ? size : bufbackpages;
293 
294 	if (bufpages <= buflowpages)
295 		return (-1);
296 	if (bufpages - pdelta < buflowpages)
297 		pdelta = bufpages - buflowpages;
298 	oldbufpages = bufpages;
299 	bufadjust(bufpages - pdelta);
300 	if (oldbufpages - bufpages < size)
301 		return (-1); /* we did not free what we were asked */
302 	else
303 		return (0);
304 }
305 
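/*
 * Mark a DMA_CACHE buffer as no longer requiring DMA-reachable memory.
 * For now this only clears B_DMA; the pages themselves are not moved.
 */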
306 void
307 buf_flip_high(struct buf *bp)
308 {
309 	KASSERT(ISSET(bp->b_flags, B_BC));
310 	KASSERT(ISSET(bp->b_flags, B_DMA));
311 	KASSERT(bp->cache == DMA_CACHE);
312 	CLR(bp->b_flags, B_DMA);
313 	/* XXX does nothing to buffer for now */
314 }
315 
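/*
 * Ensure a busy buffer is flagged as DMA-reachable before I/O is
 * started on it, and reset it to "hot" in DMA_CACHE so that it lands
 * on that cache's hotqueue when it is released.
 */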
316 void
317 buf_flip_dma(struct buf *bp)
318 {
319 	KASSERT(ISSET(bp->b_flags, B_BC));
320 	KASSERT(ISSET(bp->b_flags, B_BUSY));
321 	if (!ISSET(bp->b_flags, B_DMA)) {
322 		KASSERT(bp->cache > DMA_CACHE);
323 		KASSERT(bp->cache < NUM_CACHES);
324 		/* XXX does not flip buffer for now */
325 		/* make buffer hot, in DMA_CACHE, once it gets released. */
326 		CLR(bp->b_flags, B_COLD);
327 		CLR(bp->b_flags, B_WARM);
328 		SET(bp->b_flags, B_DMA);
329 		bp->cache = DMA_CACHE;
330 	}
331 }
332 
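/*
 * Common helper for the read routines below: look the buffer up with
 * getblk() and start a read on it unless it already contains valid
 * data.  Synchronous callers wait for completion with biowait().
 */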
333 struct buf *
334 bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
335 {
336 	struct buf *bp;
337 	struct mount *mp;
338 
339 	bp = getblk(vp, blkno, size, 0, 0);
340 
341 	/*
342 	 * If buffer does not have valid data, start a read.
343 	 * Note that if buffer is B_INVAL, getblk() won't return it.
344 	 * Therefore, it's valid if its I/O has completed or been delayed.
345 	 */
346 	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
347 		SET(bp->b_flags, B_READ | async);
348 		bcstats.pendingreads++;
349 		bcstats.numreads++;
350 		VOP_STRATEGY(bp);
351 		/* Pay for the read. */
352 		curproc->p_ru.ru_inblock++;			/* XXX */
353 	} else if (async) {
354 		brelse(bp);
355 	}
356 
357 	mp = vp->v_type == VBLK? vp->v_specmountpoint : vp->v_mount;
358 
359 	/*
360 	 * Collect statistics on synchronous and asynchronous reads.
361 	 * Reads from block devices are charged to their associated
362 	 * filesystem (if any).
363 	 */
364 	if (mp != NULL) {
365 		if (async == 0)
366 			mp->mnt_stat.f_syncreads++;
367 		else
368 			mp->mnt_stat.f_asyncreads++;
369 	}
370 
371 	return (bp);
372 }
373 
374 /*
375  * Read a disk block.
376  * This algorithm is described in Bach (p.54).
377  */
378 int
379 bread(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
380 {
381 	struct buf *bp;
382 
383 	/* Get buffer for block. */
384 	bp = *bpp = bio_doread(vp, blkno, size, 0);
385 
386 	/* Wait for the read to complete, and return result. */
387 	return (biowait(bp));
388 }
389 
390 /*
391  * Read-ahead multiple disk blocks. The first is sync, the rest async.
392  * Trivial modification to the breada algorithm presented in Bach (p.55).
393  */
394 int
395 breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t rablks[],
396     int rasizes[], int nrablks, struct buf **bpp)
397 {
398 	struct buf *bp;
399 	int i;
400 
401 	bp = *bpp = bio_doread(vp, blkno, size, 0);
402 
403 	/*
404 	 * For each of the read-ahead blocks, start a read, if necessary.
405 	 */
406 	for (i = 0; i < nrablks; i++) {
407 		/* If it's in the cache, just go on to next one. */
408 		if (incore(vp, rablks[i]))
409 			continue;
410 
411 		/* Get a buffer for the read-ahead block */
412 		(void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
413 	}
414 
415 	/* Wait until the read of the first block completes, and return the result. */
416 	return (biowait(bp));
417 }
418 
419 /*
420  * Called from interrupt context.
421  */
422 void
423 bread_cluster_callback(struct buf *bp)
424 {
425 	struct buf **xbpp = bp->b_saveaddr;
426 	int i;
427 
428 	if (xbpp[1] != NULL) {
429 		size_t newsize = xbpp[1]->b_bufsize;
430 
431 		/*
432 		 * Shrink this buffer's mapping to only cover its part of
433 		 * the total I/O.
434 		 */
435 		buf_fix_mapping(bp, newsize);
436 		bp->b_bcount = newsize;
437 	}
438 
439 	for (i = 1; xbpp[i] != NULL; i++) {
440 		if (ISSET(bp->b_flags, B_ERROR))
441 			SET(xbpp[i]->b_flags, B_INVAL | B_ERROR);
442 		biodone(xbpp[i]);
443 	}
444 
445 	free(xbpp, M_TEMP, 0);
446 
447 	if (ISSET(bp->b_flags, B_ASYNC)) {
448 		brelse(bp);
449 	} else {
450 		CLR(bp->b_flags, B_WANTED);
451 		wakeup(bp);
452 	}
453 }
454 
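/*
 * Read the block at blkno and, when the following blocks are
 * contiguous on disk, read ahead as many of them as fit in a single
 * MAXPHYS-sized transfer.  The read-ahead buffers share the pages of
 * the cluster buffer and are finished off in bread_cluster_callback().
 */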
455 int
456 bread_cluster(struct vnode *vp, daddr_t blkno, int size, struct buf **rbpp)
457 {
458 	struct buf *bp, **xbpp;
459 	int howmany, maxra, i, inc;
460 	daddr_t sblkno;
461 
462 	*rbpp = bio_doread(vp, blkno, size, 0);
463 
464 	/*
465 	 * If the buffer is in the cache skip any I/O operation.
466 	 */
467 	if (ISSET((*rbpp)->b_flags, B_CACHE))
468 		goto out;
469 
470 	if (size != round_page(size))
471 		goto out;
472 
473 	if (VOP_BMAP(vp, blkno + 1, NULL, &sblkno, &maxra))
474 		goto out;
475 
476 	maxra++;
477 	if (sblkno == -1 || maxra < 2)
478 		goto out;
479 
480 	howmany = MAXPHYS / size;
481 	if (howmany > maxra)
482 		howmany = maxra;
483 
484 	xbpp = mallocarray(howmany + 1, sizeof(struct buf *), M_TEMP, M_NOWAIT);
485 	if (xbpp == NULL)
486 		goto out;
487 
488 	for (i = howmany - 1; i >= 0; i--) {
489 		size_t sz;
490 
491 		/*
492 		 * First buffer allocates big enough size to cover what
493 		 * all the other buffers need.
494 		 */
495 		sz = i == 0 ? howmany * size : 0;
496 
497 		xbpp[i] = buf_get(vp, blkno + i + 1, sz);
498 		if (xbpp[i] == NULL) {
499 			for (++i; i < howmany; i++) {
500 				SET(xbpp[i]->b_flags, B_INVAL);
501 				brelse(xbpp[i]);
502 			}
503 			free(xbpp, M_TEMP, 0);
504 			goto out;
505 		}
506 	}
507 
508 	bp = xbpp[0];
509 
510 	xbpp[howmany] = NULL;
511 
512 	inc = btodb(size);
513 
514 	for (i = 1; i < howmany; i++) {
515 		bcstats.pendingreads++;
516 		bcstats.numreads++;
517 		/*
518 		 * We set B_DMA here because bp above will be B_DMA,
519 		 * and we are playing buffer slice-n-dice games from
520 		 * the memory allocated in bp.
521 		 */
522 		SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC);
523 		xbpp[i]->b_blkno = sblkno + (i * inc);
524 		xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
525 		xbpp[i]->b_data = NULL;
526 		xbpp[i]->b_pobj = bp->b_pobj;
527 		xbpp[i]->b_poffs = bp->b_poffs + (i * size);
528 	}
529 
530 	KASSERT(bp->b_lblkno == blkno + 1);
531 	KASSERT(bp->b_vp == vp);
532 
533 	bp->b_blkno = sblkno;
534 	SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
535 
536 	bp->b_saveaddr = (void *)xbpp;
537 	bp->b_iodone = bread_cluster_callback;
538 
539 	bcstats.pendingreads++;
540 	bcstats.numreads++;
541 	VOP_STRATEGY(bp);
542 	curproc->p_ru.ru_inblock++;
543 
544 out:
545 	return (biowait(*rbpp));
546 }
547 
548 /*
549  * Block write.  Described in Bach (p.56)
550  */
551 int
552 bwrite(struct buf *bp)
553 {
554 	int rv, async, wasdelayed, s;
555 	struct vnode *vp;
556 	struct mount *mp;
557 
558 	vp = bp->b_vp;
559 	if (vp != NULL)
560 		mp = vp->v_type == VBLK? vp->v_specmountpoint : vp->v_mount;
561 	else
562 		mp = NULL;
563 
564 	/*
565 	 * Remember buffer type, to switch on it later.  If the write was
566 	 * synchronous, but the file system was mounted with MNT_ASYNC,
567 	 * convert it to a delayed write.
568 	 * XXX note that this relies on delayed tape writes being converted
569 	 * to async, not sync writes (which is safe, but ugly).
570 	 */
571 	async = ISSET(bp->b_flags, B_ASYNC);
572 	if (!async && mp && ISSET(mp->mnt_flag, MNT_ASYNC)) {
573 		bdwrite(bp);
574 		return (0);
575 	}
576 
577 	/*
578 	 * Collect statistics on synchronous and asynchronous writes.
579 	 * Writes to block devices are charged to their associated
580 	 * filesystem (if any).
581 	 */
582 	if (mp != NULL) {
583 		if (async)
584 			mp->mnt_stat.f_asyncwrites++;
585 		else
586 			mp->mnt_stat.f_syncwrites++;
587 	}
588 	bcstats.pendingwrites++;
589 	bcstats.numwrites++;
590 
591 	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
592 	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
593 
594 	s = splbio();
595 
596 	/*
597 	 * If not synchronous, pay for the I/O operation and make
598 	 * sure the buf is on the correct vnode queue.  We have
599 	 * to do this now, because if we don't, the vnode may not
600 	 * be properly notified that its I/O has completed.
601 	 */
602 	if (wasdelayed) {
603 		reassignbuf(bp);
604 	} else
605 		curproc->p_ru.ru_oublock++;
606 
607 
608 	/* Initiate disk write.  Make sure the appropriate party is charged. */
609 	bp->b_vp->v_numoutput++;
610 	splx(s);
611 	buf_flip_dma(bp);
612 	SET(bp->b_flags, B_WRITEINPROG);
613 	VOP_STRATEGY(bp);
614 
615 	/*
616 	 * If the queue is above the high water mark, wait till
617 	 * the number of outstanding write bufs drops below the low
618 	 * water mark.
619 	 */
620 	if (bp->b_bq)
621 		bufq_wait(bp->b_bq);
622 
623 	if (async)
624 		return (0);
625 
626 	/*
627 	 * If I/O was synchronous, wait for it to complete.
628 	 */
629 	rv = biowait(bp);
630 
631 	/* Release the buffer. */
632 	brelse(bp);
633 
634 	return (rv);
635 }
636 
637 
638 /*
639  * Delayed write.
640  *
641  * The buffer is marked dirty, but is not queued for I/O.
642  * This routine should be used when the buffer is expected
643  * to be modified again soon, typically a small write that
644  * partially fills a buffer.
645  *
646  * NB: magnetic tapes cannot be delayed; they must be
647  * written in the order that the writes are requested.
648  *
649  * Described in Leffler, et al. (pp. 208-213).
650  */
651 void
652 bdwrite(struct buf *bp)
653 {
654 	int s;
655 
656 	/*
657 	 * If the block hasn't been seen before:
658 	 *	(1) Mark it as having been seen,
659 	 *	(2) Charge for the write.
660 	 *	(3) Make sure it's on its vnode's correct block list,
661 	 *	(4) If a buffer is rewritten, move it to end of dirty list
662 	 */
663 	if (!ISSET(bp->b_flags, B_DELWRI)) {
664 		SET(bp->b_flags, B_DELWRI);
665 		s = splbio();
666 		buf_flip_dma(bp);
667 		reassignbuf(bp);
668 		splx(s);
669 		curproc->p_ru.ru_oublock++;		/* XXX */
670 	}
671 
672 	/* If this is a tape block, write the block now. */
673 	if (major(bp->b_dev) < nblkdev &&
674 	    bdevsw[major(bp->b_dev)].d_type == D_TAPE) {
675 		bawrite(bp);
676 		return;
677 	}
678 
679 	/* Otherwise, the "write" is done, so mark and release the buffer. */
680 	CLR(bp->b_flags, B_NEEDCOMMIT);
681 	SET(bp->b_flags, B_DONE);
682 	brelse(bp);
683 }
684 
685 /*
686  * Asynchronous block write; just an asynchronous bwrite().
687  */
688 void
689 bawrite(struct buf *bp)
690 {
691 
692 	SET(bp->b_flags, B_ASYNC);
693 	VOP_BWRITE(bp);
694 }
695 
696 /*
697  * Must be called at splbio()
698  */
699 void
700 buf_dirty(struct buf *bp)
701 {
702 	splassert(IPL_BIO);
703 
704 #ifdef DIAGNOSTIC
705 	if (!ISSET(bp->b_flags, B_BUSY))
706 		panic("Trying to dirty buffer on freelist!");
707 #endif
708 
709 	if (ISSET(bp->b_flags, B_DELWRI) == 0) {
710 		SET(bp->b_flags, B_DELWRI);
711 		buf_flip_dma(bp);
712 		reassignbuf(bp);
713 	}
714 }
715 
716 /*
717  * Must be called at splbio()
718  */
719 void
720 buf_undirty(struct buf *bp)
721 {
722 	splassert(IPL_BIO);
723 
724 #ifdef DIAGNOSTIC
725 	if (!ISSET(bp->b_flags, B_BUSY))
726 		panic("Trying to undirty buffer on freelist!");
727 #endif
728 	if (ISSET(bp->b_flags, B_DELWRI)) {
729 		CLR(bp->b_flags, B_DELWRI);
730 		reassignbuf(bp);
731 	}
732 }
733 
734 /*
735  * Release a buffer on to the free lists.
736  * Described in Bach (p. 46).
737  */
738 void
739 brelse(struct buf *bp)
740 {
741 	int s;
742 
743 	s = splbio();
744 
745 	if (bp->b_data != NULL)
746 		KASSERT(bp->b_bufsize > 0);
747 
748 	/*
749 	 * Determine which queue the buffer should be on, then put it there.
750 	 */
751 
752 	/* If it's not cacheable, or an error, mark it invalid. */
753 	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
754 		SET(bp->b_flags, B_INVAL);
755 
756 	if (ISSET(bp->b_flags, B_INVAL)) {
757 		/*
758 		 * If the buffer is invalid, free it now rather than leaving
759 		 * it in a queue and wasting memory.
760 		 */
761 		if (LIST_FIRST(&bp->b_dep) != NULL)
762 			buf_deallocate(bp);
763 
764 		if (ISSET(bp->b_flags, B_DELWRI)) {
765 			CLR(bp->b_flags, B_DELWRI);
766 		}
767 
768 		if (bp->b_vp) {
769 			RB_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree,
770 			    bp);
771 			brelvp(bp);
772 		}
773 		bp->b_vp = NULL;
774 
775 		/*
776 		 * Wake up any processes waiting for _this_ buffer to
777 		 * become free. They are not allowed to grab it
778 		 * since it will be freed. But the only sleeper is
779 		 * getblk and it will restart the operation after
780 		 * sleep.
781 		 */
782 		if (ISSET(bp->b_flags, B_WANTED)) {
783 			CLR(bp->b_flags, B_WANTED);
784 			wakeup(bp);
785 		}
786 		buf_put(bp);
787 	} else {
788 		/*
789 		 * It has valid data.  Put it on the end of the appropriate
790 		 * queue, so that it'll stick around for as long as possible.
791 		 */
792 		bufcache_release(bp);
793 
794 		/* Unlock the buffer. */
795 		CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
796 		buf_release(bp);
797 
798 		/* Wake up any processes waiting for _this_ buffer to
799 		 * become free. */
800 		if (ISSET(bp->b_flags, B_WANTED)) {
801 			CLR(bp->b_flags, B_WANTED);
802 			wakeup(bp);
803 		}
804 	}
805 
806 	/* Wake up syncer and cleaner processes waiting for buffers. */
807 	if (nobuffers) {
808 		nobuffers = 0;
809 		wakeup(&nobuffers);
810 	}
811 
812 	/* Wake up any processes waiting for any buffer to become free. */
813 	if (needbuffer && bcstats.numbufpages < targetpages &&
814 	    bcstats.kvaslots_avail > RESERVE_SLOTS) {
815 		needbuffer = 0;
816 		wakeup(&needbuffer);
817 	}
818 
819 	splx(s);
820 }
821 
822 /*
823  * Determine if a block is in the cache. Just look on what would be its hash
824  * chain. If it's there, return a pointer to it, unless it's marked invalid.
825  */
826 struct buf *
827 incore(struct vnode *vp, daddr_t blkno)
828 {
829 	struct buf *bp;
830 	struct buf b;
831 	int s;
832 
833 	s = splbio();
834 
835 	/* Search buf lookup tree */
836 	b.b_lblkno = blkno;
837 	bp = RB_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
838 	if (bp != NULL && ISSET(bp->b_flags, B_INVAL))
839 		bp = NULL;
840 
841 	splx(s);
842 	return (bp);
843 }
844 
845 /*
846  * Get a block of requested size that is associated with
847  * a given vnode and block offset. If it is found in the
848  * block cache, mark it as having been found, make it busy
849  * and return it. Otherwise, return an empty block of the
850  * correct size. It is up to the caller to ensure that the
851  * cached blocks are of the correct size.
852  */
853 struct buf *
854 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
855 {
856 	struct buf *bp;
857 	struct buf b;
858 	int s, error;
859 
860 	/*
861 	 * XXX
862 	 * The following is an inlined version of 'incore()', but with
863 	 * the 'invalid' test moved to after the 'busy' test.  It's
864 	 * necessary because there are some cases in which the NFS
865 	 * code sets B_INVAL prior to writing data to the server, but
866 	 * in which the buffers actually contain valid data.  In this
867 	 * case, we can't allow the system to allocate a new buffer for
868 	 * the block until the write is finished.
869 	 */
870 start:
871 	s = splbio();
872 	b.b_lblkno = blkno;
873 	bp = RB_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
874 	if (bp != NULL) {
875 		if (ISSET(bp->b_flags, B_BUSY)) {
876 			SET(bp->b_flags, B_WANTED);
877 			error = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
878 			    slptimeo);
879 			splx(s);
880 			if (error)
881 				return (NULL);
882 			goto start;
883 		}
884 
885 		if (!ISSET(bp->b_flags, B_INVAL)) {
886 			bcstats.cachehits++;
887 			SET(bp->b_flags, B_CACHE);
888 			bufcache_take(bp);
889 			buf_acquire(bp);
890 			splx(s);
891 			return (bp);
892 		}
893 	}
894 	splx(s);
895 
896 	if ((bp = buf_get(vp, blkno, size)) == NULL)
897 		goto start;
898 
899 	return (bp);
900 }
901 
902 /*
903  * Get an empty, disassociated buffer of the given size.
904  */
905 struct buf *
906 geteblk(int size)
907 {
908 	struct buf *bp;
909 
910 	while ((bp = buf_get(NULL, 0, size)) == NULL)
911 		continue;
912 
913 	return (bp);
914 }
915 
916 /*
917  * Allocate a buffer.
918  */
919 struct buf *
920 buf_get(struct vnode *vp, daddr_t blkno, size_t size)
921 {
922 	struct buf *bp;
923 	int poolwait = size == 0 ? PR_NOWAIT : PR_WAITOK;
924 	int npages;
925 	int s;
926 
927 	s = splbio();
928 	if (size) {
929 		/*
930 		 * Wake up the cleaner if we have lots of dirty pages,
931 		 * or if we are getting low on buffer cache kva.
932 		 */
933 		if (UNCLEAN_PAGES >= hidirtypages ||
934 		    bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS)
935 			wakeup(&bd_req);
936 
937 		npages = atop(round_page(size));
938 
939 		/*
940 		 * If our cache has been previously shrunk, allow it to
941 		 * grow again with use, up to bufhighpages
942 		 * (the size given by bufcachepercent).
943 		 */
944 		if (bufpages < bufhighpages)
945 			bufadjust(bufhighpages);
946 
947 		/*
948 		 * If we would go over the page target with our
949 		 * new allocation, free enough buffers first
950 		 * to stay at the target with our new allocation.
951 		 */
952 		while ((bcstats.numbufpages + npages > targetpages) &&
953 		    (bp = bufcache_getanycleanbuf())) {
954 			bufcache_take(bp);
955 			if (bp->b_vp) {
956 				RB_REMOVE(buf_rb_bufs,
957 				    &bp->b_vp->v_bufs_tree, bp);
958 				brelvp(bp);
959 			}
960 			buf_put(bp);
961 		}
962 
963 		/*
964 		 * If we get here, we tried to free the world above
965 		 * and still could not get below the target.  Wake the
966 		 * cleaner and wait for it to push some buffers out.
967 		 */
968 		if ((bcstats.numbufpages + npages > targetpages ||
969 		    bcstats.kvaslots_avail <= RESERVE_SLOTS) &&
970 		    curproc != syncerproc && curproc != cleanerproc) {
971 			wakeup(&bd_req);
972 			needbuffer++;
973 			tsleep(&needbuffer, PRIBIO, "needbuffer", 0);
974 			splx(s);
975 			return (NULL);
976 		}
977 		if (bcstats.numbufpages + npages > bufpages) {
978 			/* cleaner or syncer */
979 			nobuffers = 1;
980 			tsleep(&nobuffers, PRIBIO, "nobuffers", 0);
981 			splx(s);
982 			return (NULL);
983 		}
984 	}
985 
986 	bp = pool_get(&bufpool, poolwait|PR_ZERO);
987 
988 	if (bp == NULL) {
989 		splx(s);
990 		return (NULL);
991 	}
992 
993 	bp->b_freelist.tqe_next = NOLIST;
994 	bp->b_dev = NODEV;
995 	LIST_INIT(&bp->b_dep);
996 	bp->b_bcount = size;
997 
998 	buf_acquire_nomap(bp);
999 
1000 	if (vp != NULL) {
1001 		/*
1002 		 * We insert the buffer into the hash with B_BUSY set
1003 		 * while we allocate pages for it. This way any getblk
1004 		 * that happens while we allocate pages will wait for
1005 		 * this buffer instead of starting its own buf_get.
1006 		 *
1007 		 * But first, we check if someone beat us to it.
1008 		 */
1009 		if (incore(vp, blkno)) {
1010 			pool_put(&bufpool, bp);
1011 			splx(s);
1012 			return (NULL);
1013 		}
1014 
1015 		bp->b_blkno = bp->b_lblkno = blkno;
1016 		bgetvp(vp, bp);
1017 		if (RB_INSERT(buf_rb_bufs, &vp->v_bufs_tree, bp))
1018 			panic("buf_get: dup lblk vp %p bp %p", vp, bp);
1019 	} else {
1020 		bp->b_vnbufs.le_next = NOLIST;
1021 		SET(bp->b_flags, B_INVAL);
1022 		bp->b_vp = NULL;
1023 	}
1024 
1025 	LIST_INSERT_HEAD(&bufhead, bp, b_list);
1026 	bcstats.numbufs++;
1027 
1028 	if (size) {
1029 		buf_alloc_pages(bp, round_page(size));
1030 		KASSERT(ISSET(bp->b_flags, B_DMA));
1031 		buf_map(bp);
1032 	}
1033 
1034 	SET(bp->b_flags, B_BC);
1035 	splx(s);
1036 
1037 	return (bp);
1038 }
1039 
1040 /*
1041  * Buffer cleaning daemon.
1042  */
1043 void
1044 buf_daemon(struct proc *p)
1045 {
1046 	struct buf *bp = NULL;
1047 	int s, pushed = 0;
1048 
1049 	cleanerproc = curproc;
1050 
1051 	s = splbio();
1052 	for (;;) {
1053 		if (bp == NULL || (pushed >= 16 &&
1054 		    UNCLEAN_PAGES < hidirtypages &&
1055 		    bcstats.kvaslots_avail > 2 * RESERVE_SLOTS)){
1056 			pushed = 0;
1057 			/*
1058 			 * Wake up anyone who was waiting for buffers
1059 			 * to be released.
1060 			 */
1061 			if (needbuffer) {
1062 				needbuffer = 0;
1063 				wakeup(&needbuffer);
1064 			}
1065 			tsleep(&bd_req, PRIBIO - 7, "cleaner", 0);
1066 		}
1067 
1068 		while ((bp = bufcache_getdirtybuf())) {
1069 
1070 			if (UNCLEAN_PAGES < lodirtypages &&
1071 			    bcstats.kvaslots_avail > 2 * RESERVE_SLOTS &&
1072 			    pushed >= 16)
1073 				break;
1074 
1075 			bufcache_take(bp);
1076 			buf_acquire(bp);
1077 			splx(s);
1078 
1079 			if (ISSET(bp->b_flags, B_INVAL)) {
1080 				brelse(bp);
1081 				s = splbio();
1082 				continue;
1083 			}
1084 #ifdef DIAGNOSTIC
1085 			if (!ISSET(bp->b_flags, B_DELWRI))
1086 				panic("Clean buffer on dirty queue");
1087 #endif
1088 			if (LIST_FIRST(&bp->b_dep) != NULL &&
1089 			    !ISSET(bp->b_flags, B_DEFERRED) &&
1090 			    buf_countdeps(bp, 0, 0)) {
1091 				SET(bp->b_flags, B_DEFERRED);
1092 				s = splbio();
1093 				bufcache_release(bp);
1094 				buf_release(bp);
1095 				continue;
1096 			}
1097 
1098 			bawrite(bp);
1099 			pushed++;
1100 
1101 			sched_pause();
1102 
1103 			s = splbio();
1104 		}
1105 	}
1106 }
1107 
1108 /*
1109  * Wait for operations on the buffer to complete.
1110  * When they do, extract and return the I/O's error value.
1111  */
1112 int
1113 biowait(struct buf *bp)
1114 {
1115 	int s;
1116 
1117 	KASSERT(!(bp->b_flags & B_ASYNC));
1118 
1119 	s = splbio();
1120 	while (!ISSET(bp->b_flags, B_DONE))
1121 		tsleep(bp, PRIBIO + 1, "biowait", 0);
1122 	splx(s);
1123 
1124 	/* check for interruption of I/O (e.g. via NFS), then errors. */
1125 	if (ISSET(bp->b_flags, B_EINTR)) {
1126 		CLR(bp->b_flags, B_EINTR);
1127 		return (EINTR);
1128 	}
1129 
1130 	if (ISSET(bp->b_flags, B_ERROR))
1131 		return (bp->b_error ? bp->b_error : EIO);
1132 	else
1133 		return (0);
1134 }
1135 
1136 /*
1137  * Mark I/O complete on a buffer.
1138  *
1139  * If a callback has been requested, e.g. by the pageout
1140  * daemon, invoke it.  Otherwise, awaken waiting processes.
1141  *
1142  * [ Leffler, et al., says on p.247:
1143  *	"This routine wakes up the blocked process, frees the buffer
1144  *	for an asynchronous write, or, for a request by the pagedaemon
1145  *	process, invokes a procedure specified in the buffer structure" ]
1146  *
1147  * In real life, the pagedaemon (or other system processes) wants
1148  * to do async stuff too, and doesn't want the buffer brelse()'d.
1149  * (for swap pager, that puts swap buffers on the free lists (!!!),
1150  * for the vn device, that puts malloc'd buffers on the free lists!)
1151  *
1152  * Must be called at splbio().
1153  */
1154 void
1155 biodone(struct buf *bp)
1156 {
1157 	splassert(IPL_BIO);
1158 
1159 	if (ISSET(bp->b_flags, B_DONE))
1160 		panic("biodone already");
1161 	SET(bp->b_flags, B_DONE);		/* note that it's done */
1162 
1163 	if (bp->b_bq)
1164 		bufq_done(bp->b_bq, bp);
1165 
1166 	if (LIST_FIRST(&bp->b_dep) != NULL)
1167 		buf_complete(bp);
1168 
1169 	if (!ISSET(bp->b_flags, B_READ)) {
1170 		CLR(bp->b_flags, B_WRITEINPROG);
1171 		vwakeup(bp->b_vp);
1172 	}
1173 	if (bcstats.numbufs &&
1174 	    (!(ISSET(bp->b_flags, B_RAW) || ISSET(bp->b_flags, B_PHYS)))) {
1175 		if (!ISSET(bp->b_flags, B_READ)) {
1176 			bcstats.pendingwrites--;
1177 		} else
1178 			bcstats.pendingreads--;
1179 	}
1180 	if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
1181 		CLR(bp->b_flags, B_CALL);	/* but note callout done */
1182 		(*bp->b_iodone)(bp);
1183 	} else {
1184 		if (ISSET(bp->b_flags, B_ASYNC)) {/* if async, release it */
1185 			brelse(bp);
1186 		} else {			/* or just wakeup the buffer */
1187 			CLR(bp->b_flags, B_WANTED);
1188 			wakeup(bp);
1189 		}
1190 	}
1191 }
1192 
1193 #ifdef DDB
1194 void	bcstats_print(int (*)(const char *, ...)
1195     __attribute__((__format__(__kprintf__,1,2))));
1196 /*
1197  * bcstats_print: ddb hook to print interesting buffer cache counters
1198  */
1199 void
1200 bcstats_print(
1201     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1202 {
1203 	(*pr)("Current Buffer Cache status:\n");
1204 	(*pr)("numbufs %lld busymapped %lld, delwri %lld\n",
1205 	    bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs);
1206 	(*pr)("kvaslots %lld avail kva slots %lld\n",
1207 	    bcstats.kvaslots, bcstats.kvaslots_avail);
1208 	(*pr)("bufpages %lld, dirtypages %lld\n",
1209 	    bcstats.numbufpages, bcstats.numdirtypages);
1210 	(*pr)("pendingreads %lld, pendingwrites %lld\n",
1211 	    bcstats.pendingreads, bcstats.pendingwrites);
1212 }
1213 #endif
1214 
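/*
 * Adjust the count of valid bytes in a buffer without changing the
 * size of its allocation.
 */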
1215 void
1216 buf_adjcnt(struct buf *bp, long ncount)
1217 {
1218 	KASSERT(ncount <= bp->b_bufsize);
1219 	bp->b_bcount = ncount;
1220 }
1221 
1222 /* bufcache freelist code below */
1223 /*
1224  * Copyright (c) 2014 Ted Unangst <tedu@openbsd.org>
1225  *
1226  * Permission to use, copy, modify, and distribute this software for any
1227  * purpose with or without fee is hereby granted, provided that the above
1228  * copyright notice and this permission notice appear in all copies.
1229  *
1230  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1231  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1232  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1233  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1234  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1235  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1236  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1237  */
1238 
1239 /*
1240  * The code below implements a variant of the 2Q buffer cache algorithm by
1241  * Johnson and Shasha.
1242  *
1243  * General Outline
1244  * We divide the buffer cache into three working sets: current, previous,
1245  * and long term. Each list is itself LRU and buffers get promoted and moved
1246  * around between them. A buffer starts its life in the current working set.
1247  * As time passes and newer buffers push it out, it will turn into the previous
1248  * working set and is subject to recycling. But if it's accessed again from
1249  * the previous working set, that's an indication that it's actually in the
1250  * long term working set, so we promote it there. The separation of current
1251  * and previous working sets prevents us from promoting a buffer that's only
1252  * temporarily hot to the long term cache.
1253  *
1254  * The objective is to provide scan resistance by making the long term
1255  * working set ineligible for immediate recycling, even as the current
1256  * working set is rapidly turned over.
1257  *
1258  * Implementation
1259  * The code below identifies the current, previous, and long term sets as
1260  * hotqueue, coldqueue, and warmqueue. The hot and warm queues are capped at
1261  * 1/3 of the total clean pages, after which point they start pushing their
1262  * oldest buffers into coldqueue.
1263  * A buf always starts out with neither WARM nor COLD flags set (implying HOT).
1264  * When released, it will be returned to the tail of the hotqueue list.
1265  * When the hotqueue gets too large, the oldest hot buf will be moved to the
1266  * coldqueue, with the B_COLD flag set. When a cold buf is released, we set
1267  * the B_WARM flag and put it onto the warmqueue. Warm bufs are also
1268  * directly returned to the end of the warmqueue. As with the hotqueue, when
1269  * the warmqueue grows too large, B_WARM bufs are moved onto the coldqueue.
1270  *
1271  * Note that this design does still support large working sets, greater
1272  * than the cap of hotqueue or warmqueue would imply. The coldqueue is still
1273  * cached and has no maximum length. The hot and warm queues form a Y feeding
1274  * into the coldqueue. Moving bufs between queues is constant time, so this
1275  * design decays to one long warm->cold queue.
1276  *
1277  * In the 2Q paper, hotqueue and coldqueue are A1in and A1out. The warmqueue
1278  * is Am. We always cache pages, as opposed to pointers to pages for A1.
1279  *
1280  * This implementation adds support for multiple 2q caches.
1281  *
1282  * If we have more than one 2Q cache, then as bufs fall off the cold queue
1283  * for recycling, bufs that have been warm before (which retain the
1284  * B_WARM flag in addition to B_COLD) can be put into the hot queue of
1285  * a second level 2Q cache.  Buffers that are only B_COLD are
1286  * recycled.  Bufs falling off the last cache's cold queue are always
1287  * recycled.
1288  *
1289  */
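
/*
 * Illustrative lifecycle under these rules: a buffer released for the
 * first time goes to the tail of its cache's hotqueue.  When the
 * hotqueue grows past its limit, chillbufs() marks the oldest buffer
 * B_COLD and moves it to the coldqueue.  If that buffer is used and
 * released again, bufcache_release() sets B_WARM and puts it on the
 * warmqueue; should the warmqueue overflow later, the buffer is
 * chilled back onto the coldqueue with B_WARM still set, and when it
 * reaches the head of the coldqueue in that state,
 * bufcache_getcleanbuf() promotes it into the hot queue of the next
 * cache (when one exists) instead of recycling it.  A buffer that is
 * only B_COLD when it reaches the head of the coldqueue is recycled.
 */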
1290 
1291 /*
1292  * This function is called when a hot or warm queue may have exceeded its
1293  * size limit.  It will move a buf to the coldqueue.
1294  */
1295 int chillbufs(struct bufcache *cache, struct bufqueue *queue,
1296     int64_t *queuepages);
1297 
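/*
 * Initialize the hot, cold and warm queues of each cache level and the
 * shared dirty queue.
 */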
1298 void
1299 bufcache_init(void)
1300 {
1301 	int i;
1302 	for (i = 0; i < NUM_CACHES; i++) {
1303 		TAILQ_INIT(&cleancache[i].hotqueue);
1304 		TAILQ_INIT(&cleancache[i].coldqueue);
1305 		TAILQ_INIT(&cleancache[i].warmqueue);
1306 	}
1307 	TAILQ_INIT(&dirtyqueue);
1308 }
1309 
1310 /*
1311  * If the buffer caches have shrunk, we may need to rebalance our queues.
1312  */
1313 void
1314 bufcache_adjust(void)
1315 {
1316 	int i;
1317 	for (i = 0; i < NUM_CACHES; i++) {
1318 		while (chillbufs(&cleancache[i], &cleancache[i].warmqueue,
1319 		    &cleancache[i].warmbufpages) ||
1320 		    chillbufs(&cleancache[i], &cleancache[i].hotqueue,
1321 		    &cleancache[i].hotbufpages))
1322 			continue;
1323 	}
1324 }
1325 
1326 /*
1327  * Get a clean buffer from the cache.  If "discard" is set, do not promote
1328  * previously warm buffers as normal, because we are tossing everything
1329  * away, for example when hibernating.
1330  */
1331 struct buf *
1332 bufcache_getcleanbuf(int cachenum, int discard)
1333 {
1334 	struct buf *bp = NULL;
1335 	struct bufcache *cache = &cleancache[cachenum];
1336 
1337 	splassert(IPL_BIO);
1338 
1339 	/* try the cold queue first */
1340 	while ((bp = TAILQ_FIRST(&cache->coldqueue))) {
1341 		if ((!discard) &&
1342 		    cachenum < NUM_CACHES - 1 && ISSET(bp->b_flags, B_WARM)) {
1343 			/*
1344 			 * If this buffer was warm before, move it to
1345 			 * the hot queue in the next cache.
1346 			 */
1347 			TAILQ_REMOVE(&cache->coldqueue, bp, b_freelist);
1348 			CLR(bp->b_flags, B_WARM);
1349 			CLR(bp->b_flags, B_COLD);
1350 			int64_t pages = atop(bp->b_bufsize);
1351 			KASSERT(bp->cache == cachenum);
1352 			if (bp->cache == 0)
1353 				buf_flip_high(bp);
1354 			bp->cache++;
1355 			struct bufcache *newcache = &cleancache[bp->cache];
1356 			newcache->cachepages += pages;
1357 			newcache->hotbufpages += pages;
1358 			chillbufs(newcache, &newcache->hotqueue,
1359 			    &newcache->hotbufpages);
1360 			TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist);
1361 		}
1362 		else
1363 			/* buffer is cold - give it up */
1364 			return bp;
1365 	}
1366 	if ((bp = TAILQ_FIRST(&cache->warmqueue)))
1367 		return bp;
1368 	if ((bp = TAILQ_FIRST(&cache->hotqueue)))
1369 		return bp;
1370 	return bp;
1371 }
1372 
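/*
 * Look for a clean buffer to recycle in caches start..end, trying the
 * higher-numbered caches first.
 */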
1373 struct buf *
1374 bufcache_getcleanbuf_range(int start, int end, int discard)
1375 {
1376 	int i, j = start, q = end;
1377 	struct buf *bp = NULL;
1378 
1379 	/*
1380 	 * XXX in theory we could promote warm buffers into a previous queue
1381 	 * so in the pathological case where we go through all the caches
1382 	 * without getting a buffer we have to start at the beginning again.
1383 	 */
1384 	while (j <= q)	{
1385 		for (i = q; i >= j; i--)
1386 			if ((bp = bufcache_getcleanbuf(i, discard)))
1387 				return(bp);
1388 		j++;
1389 	}
1390 	return bp;
1391 }
1392 
1393 struct buf *
1394 bufcache_getanycleanbuf(void)
1395 {
1396 	return bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 0);
1397 }
1398 
1399 
1400 struct buf *
1401 bufcache_getdirtybuf(void)
1402 {
1403 	return TAILQ_FIRST(&dirtyqueue);
1404 }
1405 
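/*
 * Remove a buffer from whichever clean or dirty queue it is currently
 * on and update the page accounting, before the buffer is reused,
 * written back or freed.
 */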
1406 void
1407 bufcache_take(struct buf *bp)
1408 {
1409 	struct bufqueue *queue;
1410 	int64_t pages;
1411 
1412 	splassert(IPL_BIO);
1413 
1414 	KASSERT(ISSET(bp->b_flags, B_BC));
1415 	KASSERT(bp->cache >= DMA_CACHE);
1416 	KASSERT(bp->cache < NUM_CACHES);
1417 	pages = atop(bp->b_bufsize);
1418 	struct bufcache *cache = &cleancache[bp->cache];
1419 	if (!ISSET(bp->b_flags, B_DELWRI)) {
1420 		if (ISSET(bp->b_flags, B_COLD)) {
1421 			queue = &cache->coldqueue;
1422 		} else if (ISSET(bp->b_flags, B_WARM)) {
1423 			queue = &cache->warmqueue;
1424 			cache->warmbufpages -= pages;
1425 		} else {
1426 			queue = &cache->hotqueue;
1427 			cache->hotbufpages -= pages;
1428 		}
1429 		bcstats.numcleanpages -= pages;
1430 		cache->cachepages -= pages;
1431 	} else {
1432 		queue = &dirtyqueue;
1433 		bcstats.numdirtypages -= pages;
1434 		bcstats.delwribufs--;
1435 	}
1436 	TAILQ_REMOVE(queue, bp, b_freelist);
1437 }
1438 
1439 /* move buffers from a hot or warm queue to a cold queue in a cache */
1440 int
1441 chillbufs(struct bufcache *cache, struct bufqueue *queue, int64_t *queuepages)
1442 {
1443 	struct buf *bp;
1444 	int64_t limit, pages;
1445 
1446 	/*
1447 	 * The warm and hot queues may each hold up to a third of the cache's
1448 	 * clean pages.  A minimum size of 96 pages prevents too much "wobbling".
1449 	 */
1450 	limit = cache->cachepages / 3;
1451 	if (*queuepages > 96 && *queuepages > limit) {
1452 		bp = TAILQ_FIRST(queue);
1453 		if (!bp)
1454 			panic("inconsistent bufpage counts");
1455 		pages = atop(bp->b_bufsize);
1456 		*queuepages -= pages;
1457 		TAILQ_REMOVE(queue, bp, b_freelist);
1458 		/* we do not clear B_WARM */
1459 		SET(bp->b_flags, B_COLD);
1460 		TAILQ_INSERT_TAIL(&cache->coldqueue, bp, b_freelist);
1461 		return 1;
1462 	}
1463 	return 0;
1464 }
1465 
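/*
 * Put a buffer back on a queue: delayed-write buffers go on the dirty
 * queue; clean buffers go on the hot or warm queue of their cache
 * according to the 2Q rules described above.
 */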
1466 void
1467 bufcache_release(struct buf *bp)
1468 {
1469 	struct bufqueue *queue;
1470 	int64_t pages;
1471 	struct bufcache *cache = &cleancache[bp->cache];
1472 	pages = atop(bp->b_bufsize);
1473 	KASSERT(ISSET(bp->b_flags, B_BC));
1474 	KASSERT((ISSET(bp->b_flags, B_DMA) && bp->cache == 0)
1475 	    || ((!ISSET(bp->b_flags, B_DMA)) && bp->cache > 0));
1476 	if (!ISSET(bp->b_flags, B_DELWRI)) {
1477 		int64_t *queuepages;
1478 		if (ISSET(bp->b_flags, B_WARM | B_COLD)) {
1479 			SET(bp->b_flags, B_WARM);
1480 			CLR(bp->b_flags, B_COLD);
1481 			queue = &cache->warmqueue;
1482 			queuepages = &cache->warmbufpages;
1483 		} else {
1484 			queue = &cache->hotqueue;
1485 			queuepages = &cache->hotbufpages;
1486 		}
1487 		*queuepages += pages;
1488 		bcstats.numcleanpages += pages;
1489 		cache->cachepages += pages;
1490 		chillbufs(cache, queue, queuepages);
1491 	} else {
1492 		queue = &dirtyqueue;
1493 		bcstats.numdirtypages += pages;
1494 		bcstats.delwribufs++;
1495 	}
1496 	TAILQ_INSERT_TAIL(queue, bp, b_freelist);
1497 }
1498 
1499 #ifdef HIBERNATE
1500 /*
1501  * Nuke the buffer cache from orbit when hibernating. We do not want to save
1502  * any clean cache pages to swap and read them back. the original disk files
1503  * are just as good.
1504  */
1505 void
1506 hibernate_suspend_bufcache(void)
1507 {
1508 	struct buf *bp;
1509 	int s;
1510 
1511 	s = splbio();
1512 	/* Chuck away all the cache pages.  Discard bufs, do not promote. */
1513 	while ((bp = bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 1))) {
1514 		bufcache_take(bp);
1515 		if (bp->b_vp) {
1516 			RB_REMOVE(buf_rb_bufs,
1517 			    &bp->b_vp->v_bufs_tree, bp);
1518 			brelvp(bp);
1519 		}
1520 		buf_put(bp);
1521 	}
1522 	splx(s);
1523 }
1524 
1525 void
1526 hibernate_resume_bufcache(void)
1527 {
1528 	/* XXX Nothing needed here for now */
1529 }
1530 #endif /* HIBERNATE */
1531