1 /* $OpenBSD: vfs_bio.c,v 1.175 2016/06/07 01:31:54 tedu Exp $ */ 2 /* $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $ */ 3 4 /* 5 * Copyright (c) 1994 Christopher G. Demetriou 6 * Copyright (c) 1982, 1986, 1989, 1993 7 * The Regents of the University of California. All rights reserved. 8 * (c) UNIX System Laboratories, Inc. 9 * All or some portions of this file are derived from material licensed 10 * to the University of California by American Telephone and Telegraph 11 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 12 * the permission of UNIX System Laboratories, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 39 */ 40 41 /* 42 * Some references: 43 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986) 44 * Leffler, et al.: The Design and Implementation of the 4.3BSD 45 * UNIX Operating System (Addison Welley, 1989) 46 */ 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/proc.h> 51 #include <sys/buf.h> 52 #include <sys/vnode.h> 53 #include <sys/mount.h> 54 #include <sys/malloc.h> 55 #include <sys/pool.h> 56 #include <sys/resourcevar.h> 57 #include <sys/conf.h> 58 #include <sys/kernel.h> 59 #include <sys/specdev.h> 60 #include <uvm/uvm_extern.h> 61 62 int nobuffers; 63 int needbuffer; 64 struct bio_ops bioops; 65 66 /* private bufcache functions */ 67 void bufcache_init(void); 68 void bufcache_adjust(void); 69 70 /* 71 * Buffer pool for I/O buffers. 
72 */ 73 struct pool bufpool; 74 struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead); 75 void buf_put(struct buf *); 76 77 struct buf *bio_doread(struct vnode *, daddr_t, int, int); 78 struct buf *buf_get(struct vnode *, daddr_t, size_t); 79 void bread_cluster_callback(struct buf *); 80 81 struct bcachestats bcstats; /* counters */ 82 long lodirtypages; /* dirty page count low water mark */ 83 long hidirtypages; /* dirty page count high water mark */ 84 long targetpages; /* target number of pages for cache size */ 85 long buflowpages; /* smallest size cache allowed */ 86 long bufhighpages; /* largest size cache allowed */ 87 long bufbackpages; /* minimum number of pages we shrink when asked to */ 88 89 vsize_t bufkvm; 90 91 struct proc *cleanerproc; 92 int bd_req; /* Sleep point for cleaner daemon. */ 93 94 #define NUM_CACHES 2 95 #define DMA_CACHE 0 96 struct bufcache cleancache[NUM_CACHES]; 97 struct bufqueue dirtyqueue; 98 99 void 100 buf_put(struct buf *bp) 101 { 102 splassert(IPL_BIO); 103 104 #ifdef DIAGNOSTIC 105 if (bp->b_pobj != NULL) 106 KASSERT(bp->b_bufsize > 0); 107 if (ISSET(bp->b_flags, B_DELWRI)) 108 panic("buf_put: releasing dirty buffer"); 109 if (bp->b_freelist.tqe_next != NOLIST && 110 bp->b_freelist.tqe_next != (void *)-1) 111 panic("buf_put: still on the free list"); 112 if (bp->b_vnbufs.le_next != NOLIST && 113 bp->b_vnbufs.le_next != (void *)-1) 114 panic("buf_put: still on the vnode list"); 115 if (!LIST_EMPTY(&bp->b_dep)) 116 panic("buf_put: b_dep is not empty"); 117 #endif 118 119 LIST_REMOVE(bp, b_list); 120 bcstats.numbufs--; 121 122 if (buf_dealloc_mem(bp) != 0) 123 return; 124 pool_put(&bufpool, bp); 125 } 126 127 /* 128 * Initialize buffers and hash links for buffers. 
129 */ 130 void 131 bufinit(void) 132 { 133 u_int64_t dmapages; 134 135 dmapages = uvm_pagecount(&dma_constraint); 136 /* take away a guess at how much of this the kernel will consume */ 137 dmapages -= (atop(physmem) - atop(uvmexp.free)); 138 139 /* 140 * If MD code doesn't say otherwise, use up to 10% of DMA'able 141 * memory for buffers. 142 */ 143 if (bufcachepercent == 0) 144 bufcachepercent = 10; 145 146 /* 147 * XXX these values and their same use in kern_sysctl 148 * need to move into buf.h 149 */ 150 KASSERT(bufcachepercent <= 90); 151 KASSERT(bufcachepercent >= 5); 152 if (bufpages == 0) 153 bufpages = dmapages * bufcachepercent / 100; 154 if (bufpages < BCACHE_MIN) 155 bufpages = BCACHE_MIN; 156 KASSERT(bufpages < dmapages); 157 158 bufhighpages = bufpages; 159 160 /* 161 * Set the base backoff level for the buffer cache. We will 162 * not allow uvm to steal back more than this number of pages. 163 */ 164 buflowpages = dmapages * 5 / 100; 165 if (buflowpages < BCACHE_MIN) 166 buflowpages = BCACHE_MIN; 167 168 /* 169 * set bufbackpages to 100 pages, or 10 percent of the low water mark 170 * if we don't have that many pages. 171 */ 172 173 bufbackpages = buflowpages * 10 / 100; 174 if (bufbackpages > 100) 175 bufbackpages = 100; 176 177 /* 178 * If the MD code does not say otherwise, reserve 10% of kva 179 * space for mapping buffers. 180 */ 181 if (bufkvm == 0) 182 bufkvm = VM_KERNEL_SPACE_SIZE / 10; 183 184 /* 185 * Don't use more than twice the amount of bufpages for mappings. 186 * It's twice since we map things sparsely. 187 */ 188 if (bufkvm > bufpages * PAGE_SIZE) 189 bufkvm = bufpages * PAGE_SIZE; 190 /* 191 * Round bufkvm to MAXPHYS because we allocate chunks of va space 192 * in MAXPHYS chunks. 
193 */ 194 bufkvm &= ~(MAXPHYS - 1); 195 196 pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL); 197 pool_setipl(&bufpool, IPL_BIO); 198 199 bufcache_init(); 200 201 /* 202 * hmm - bufkvm is an argument because it's static, while 203 * bufpages is global because it can change while running. 204 */ 205 buf_mem_init(bufkvm); 206 207 /* 208 * Set the dirty page high water mark to be less than the low 209 * water mark for pages in the buffer cache. This ensures we 210 * can always back off by throwing away clean pages, and give 211 * ourselves a chance to write out the dirty pages eventually. 212 */ 213 hidirtypages = (buflowpages / 4) * 3; 214 lodirtypages = buflowpages / 2; 215 216 /* 217 * We are allowed to use up to the reserve. 218 */ 219 targetpages = bufpages - RESERVE_PAGES; 220 } 221 222 /* 223 * Change cachepct 224 */ 225 void 226 bufadjust(int newbufpages) 227 { 228 struct buf *bp; 229 int s; 230 231 if (newbufpages < buflowpages) 232 newbufpages = buflowpages; 233 234 s = splbio(); 235 bufpages = newbufpages; 236 237 /* 238 * We are allowed to use up to the reserve 239 */ 240 targetpages = bufpages - RESERVE_PAGES; 241 242 /* 243 * Shrinking the cache happens here only if someone has manually 244 * adjusted bufcachepercent - or the pagedaemon has told us 245 * to give back memory *now* - so we give it all back. 246 */ 247 while ((bp = bufcache_getanycleanbuf()) && 248 (bcstats.numbufpages > targetpages)) { 249 bufcache_take(bp); 250 if (bp->b_vp) { 251 RB_REMOVE(buf_rb_bufs, 252 &bp->b_vp->v_bufs_tree, bp); 253 brelvp(bp); 254 } 255 buf_put(bp); 256 } 257 bufcache_adjust(); 258 259 /* 260 * Wake up the cleaner if we have lots of dirty pages, 261 * or if we are getting low on buffer cache kva. 262 */ 263 if ((UNCLEAN_PAGES >= hidirtypages) || 264 bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS) 265 wakeup(&bd_req); 266 267 splx(s); 268 } 269 270 /* 271 * Make the buffer cache back off from cachepct. 
272 */ 273 int 274 bufbackoff(struct uvm_constraint_range *range, long size) 275 { 276 /* 277 * Back off "size" buffer cache pages. Called by the page 278 * daemon to consume buffer cache pages rather than scanning. 279 * 280 * It returns 0 to the pagedaemon to indicate that it has 281 * succeeded in freeing enough pages. It returns -1 to 282 * indicate that it could not and the pagedaemon should take 283 * other measures. 284 * 285 */ 286 long pdelta, oldbufpages; 287 288 /* 289 * Back off by at least bufbackpages. If the page daemon gave us 290 * a larger size, back off by that much. 291 */ 292 pdelta = (size > bufbackpages) ? size : bufbackpages; 293 294 if (bufpages <= buflowpages) 295 return(-1); 296 if (bufpages - pdelta < buflowpages) 297 pdelta = bufpages - buflowpages; 298 oldbufpages = bufpages; 299 bufadjust(bufpages - pdelta); 300 if (oldbufpages - bufpages < size) 301 return (-1); /* we did not free what we were asked */ 302 else 303 return(0); 304 } 305 306 void 307 buf_flip_high(struct buf *bp) 308 { 309 KASSERT(ISSET(bp->b_flags, B_BC)); 310 KASSERT(ISSET(bp->b_flags, B_DMA)); 311 KASSERT(bp->cache == DMA_CACHE); 312 CLR(bp->b_flags, B_DMA); 313 /* XXX does nothing to buffer for now */ 314 } 315 316 void 317 buf_flip_dma(struct buf *bp) 318 { 319 KASSERT(ISSET(bp->b_flags, B_BC)); 320 KASSERT(ISSET(bp->b_flags, B_BUSY)); 321 if (!ISSET(bp->b_flags, B_DMA)) { 322 KASSERT(bp->cache > DMA_CACHE); 323 KASSERT(bp->cache < NUM_CACHES); 324 /* XXX does not flip buffer for now */ 325 /* make buffer hot, in DMA_CACHE, once it gets released. */ 326 CLR(bp->b_flags, B_COLD); 327 CLR(bp->b_flags, B_WARM); 328 SET(bp->b_flags, B_DMA); 329 bp->cache = DMA_CACHE; 330 } 331 } 332 333 struct buf * 334 bio_doread(struct vnode *vp, daddr_t blkno, int size, int async) 335 { 336 struct buf *bp; 337 struct mount *mp; 338 339 bp = getblk(vp, blkno, size, 0, 0); 340 341 /* 342 * If buffer does not have valid data, start a read. 
343 * Note that if buffer is B_INVAL, getblk() won't return it. 344 * Therefore, it's valid if its I/O has completed or been delayed. 345 */ 346 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) { 347 SET(bp->b_flags, B_READ | async); 348 bcstats.pendingreads++; 349 bcstats.numreads++; 350 VOP_STRATEGY(bp); 351 /* Pay for the read. */ 352 curproc->p_ru.ru_inblock++; /* XXX */ 353 } else if (async) { 354 brelse(bp); 355 } 356 357 mp = vp->v_type == VBLK? vp->v_specmountpoint : vp->v_mount; 358 359 /* 360 * Collect statistics on synchronous and asynchronous reads. 361 * Reads from block devices are charged to their associated 362 * filesystem (if any). 363 */ 364 if (mp != NULL) { 365 if (async == 0) 366 mp->mnt_stat.f_syncreads++; 367 else 368 mp->mnt_stat.f_asyncreads++; 369 } 370 371 return (bp); 372 } 373 374 /* 375 * Read a disk block. 376 * This algorithm described in Bach (p.54). 377 */ 378 int 379 bread(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp) 380 { 381 struct buf *bp; 382 383 /* Get buffer for block. */ 384 bp = *bpp = bio_doread(vp, blkno, size, 0); 385 386 /* Wait for the read to complete, and return result. */ 387 return (biowait(bp)); 388 } 389 390 /* 391 * Read-ahead multiple disk blocks. The first is sync, the rest async. 392 * Trivial modification to the breada algorithm presented in Bach (p.55). 393 */ 394 int 395 breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t rablks[], 396 int rasizes[], int nrablks, struct buf **bpp) 397 { 398 struct buf *bp; 399 int i; 400 401 bp = *bpp = bio_doread(vp, blkno, size, 0); 402 403 /* 404 * For each of the read-ahead blocks, start a read, if necessary. 405 */ 406 for (i = 0; i < nrablks; i++) { 407 /* If it's in the cache, just go on to next one. */ 408 if (incore(vp, rablks[i])) 409 continue; 410 411 /* Get a buffer for the read-ahead block */ 412 (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC); 413 } 414 415 /* Otherwise, we had to start a read for it; wait until it's valid. 
*/ 416 return (biowait(bp)); 417 } 418 419 /* 420 * Called from interrupt context. 421 */ 422 void 423 bread_cluster_callback(struct buf *bp) 424 { 425 struct buf **xbpp = bp->b_saveaddr; 426 int i; 427 428 if (xbpp[1] != NULL) { 429 size_t newsize = xbpp[1]->b_bufsize; 430 431 /* 432 * Shrink this buffer's mapping to only cover its part of 433 * the total I/O. 434 */ 435 buf_fix_mapping(bp, newsize); 436 bp->b_bcount = newsize; 437 } 438 439 for (i = 1; xbpp[i] != 0; i++) { 440 if (ISSET(bp->b_flags, B_ERROR)) 441 SET(xbpp[i]->b_flags, B_INVAL | B_ERROR); 442 biodone(xbpp[i]); 443 } 444 445 free(xbpp, M_TEMP, 0); 446 447 if (ISSET(bp->b_flags, B_ASYNC)) { 448 brelse(bp); 449 } else { 450 CLR(bp->b_flags, B_WANTED); 451 wakeup(bp); 452 } 453 } 454 455 int 456 bread_cluster(struct vnode *vp, daddr_t blkno, int size, struct buf **rbpp) 457 { 458 struct buf *bp, **xbpp; 459 int howmany, maxra, i, inc; 460 daddr_t sblkno; 461 462 *rbpp = bio_doread(vp, blkno, size, 0); 463 464 /* 465 * If the buffer is in the cache skip any I/O operation. 466 */ 467 if (ISSET((*rbpp)->b_flags, B_CACHE)) 468 goto out; 469 470 if (size != round_page(size)) 471 goto out; 472 473 if (VOP_BMAP(vp, blkno + 1, NULL, &sblkno, &maxra)) 474 goto out; 475 476 maxra++; 477 if (sblkno == -1 || maxra < 2) 478 goto out; 479 480 howmany = MAXPHYS / size; 481 if (howmany > maxra) 482 howmany = maxra; 483 484 xbpp = mallocarray(howmany + 1, sizeof(struct buf *), M_TEMP, M_NOWAIT); 485 if (xbpp == NULL) 486 goto out; 487 488 for (i = howmany - 1; i >= 0; i--) { 489 size_t sz; 490 491 /* 492 * First buffer allocates big enough size to cover what 493 * all the other buffers need. 494 */ 495 sz = i == 0 ? 
howmany * size : 0; 496 497 xbpp[i] = buf_get(vp, blkno + i + 1, sz); 498 if (xbpp[i] == NULL) { 499 for (++i; i < howmany; i++) { 500 SET(xbpp[i]->b_flags, B_INVAL); 501 brelse(xbpp[i]); 502 } 503 free(xbpp, M_TEMP, 0); 504 goto out; 505 } 506 } 507 508 bp = xbpp[0]; 509 510 xbpp[howmany] = 0; 511 512 inc = btodb(size); 513 514 for (i = 1; i < howmany; i++) { 515 bcstats.pendingreads++; 516 bcstats.numreads++; 517 /* 518 * We set B_DMA here because bp above will be B_DMA, 519 * and we are playing buffer slice-n-dice games from 520 * the memory allocated in bp. 521 */ 522 SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC); 523 xbpp[i]->b_blkno = sblkno + (i * inc); 524 xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size; 525 xbpp[i]->b_data = NULL; 526 xbpp[i]->b_pobj = bp->b_pobj; 527 xbpp[i]->b_poffs = bp->b_poffs + (i * size); 528 } 529 530 KASSERT(bp->b_lblkno == blkno + 1); 531 KASSERT(bp->b_vp == vp); 532 533 bp->b_blkno = sblkno; 534 SET(bp->b_flags, B_READ | B_ASYNC | B_CALL); 535 536 bp->b_saveaddr = (void *)xbpp; 537 bp->b_iodone = bread_cluster_callback; 538 539 bcstats.pendingreads++; 540 bcstats.numreads++; 541 VOP_STRATEGY(bp); 542 curproc->p_ru.ru_inblock++; 543 544 out: 545 return (biowait(*rbpp)); 546 } 547 548 /* 549 * Block write. Described in Bach (p.56) 550 */ 551 int 552 bwrite(struct buf *bp) 553 { 554 int rv, async, wasdelayed, s; 555 struct vnode *vp; 556 struct mount *mp; 557 558 vp = bp->b_vp; 559 if (vp != NULL) 560 mp = vp->v_type == VBLK? vp->v_specmountpoint : vp->v_mount; 561 else 562 mp = NULL; 563 564 /* 565 * Remember buffer type, to switch on it later. If the write was 566 * synchronous, but the file system was mounted with MNT_ASYNC, 567 * convert it to a delayed write. 568 * XXX note that this relies on delayed tape writes being converted 569 * to async, not sync writes (which is safe, but ugly). 
570 */ 571 async = ISSET(bp->b_flags, B_ASYNC); 572 if (!async && mp && ISSET(mp->mnt_flag, MNT_ASYNC)) { 573 bdwrite(bp); 574 return (0); 575 } 576 577 /* 578 * Collect statistics on synchronous and asynchronous writes. 579 * Writes to block devices are charged to their associated 580 * filesystem (if any). 581 */ 582 if (mp != NULL) { 583 if (async) 584 mp->mnt_stat.f_asyncwrites++; 585 else 586 mp->mnt_stat.f_syncwrites++; 587 } 588 bcstats.pendingwrites++; 589 bcstats.numwrites++; 590 591 wasdelayed = ISSET(bp->b_flags, B_DELWRI); 592 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI)); 593 594 s = splbio(); 595 596 /* 597 * If not synchronous, pay for the I/O operation and make 598 * sure the buf is on the correct vnode queue. We have 599 * to do this now, because if we don't, the vnode may not 600 * be properly notified that its I/O has completed. 601 */ 602 if (wasdelayed) { 603 reassignbuf(bp); 604 } else 605 curproc->p_ru.ru_oublock++; 606 607 608 /* Initiate disk write. Make sure the appropriate party is charged. */ 609 bp->b_vp->v_numoutput++; 610 splx(s); 611 buf_flip_dma(bp); 612 SET(bp->b_flags, B_WRITEINPROG); 613 VOP_STRATEGY(bp); 614 615 /* 616 * If the queue is above the high water mark, wait till 617 * the number of outstanding write bufs drops below the low 618 * water mark. 619 */ 620 if (bp->b_bq) 621 bufq_wait(bp->b_bq); 622 623 if (async) 624 return (0); 625 626 /* 627 * If I/O was synchronous, wait for it to complete. 628 */ 629 rv = biowait(bp); 630 631 /* Release the buffer. */ 632 brelse(bp); 633 634 return (rv); 635 } 636 637 638 /* 639 * Delayed write. 640 * 641 * The buffer is marked dirty, but is not queued for I/O. 642 * This routine should be used when the buffer is expected 643 * to be modified again soon, typically a small write that 644 * partially fills a buffer. 645 * 646 * NB: magnetic tapes cannot be delayed; they must be 647 * written in the order that the writes are requested. 648 * 649 * Described in Leffler, et al. 
(pp. 208-213). 650 */ 651 void 652 bdwrite(struct buf *bp) 653 { 654 int s; 655 656 /* 657 * If the block hasn't been seen before: 658 * (1) Mark it as having been seen, 659 * (2) Charge for the write. 660 * (3) Make sure it's on its vnode's correct block list, 661 * (4) If a buffer is rewritten, move it to end of dirty list 662 */ 663 if (!ISSET(bp->b_flags, B_DELWRI)) { 664 SET(bp->b_flags, B_DELWRI); 665 s = splbio(); 666 buf_flip_dma(bp); 667 reassignbuf(bp); 668 splx(s); 669 curproc->p_ru.ru_oublock++; /* XXX */ 670 } 671 672 /* If this is a tape block, write the block now. */ 673 if (major(bp->b_dev) < nblkdev && 674 bdevsw[major(bp->b_dev)].d_type == D_TAPE) { 675 bawrite(bp); 676 return; 677 } 678 679 /* Otherwise, the "write" is done, so mark and release the buffer. */ 680 CLR(bp->b_flags, B_NEEDCOMMIT); 681 SET(bp->b_flags, B_DONE); 682 brelse(bp); 683 } 684 685 /* 686 * Asynchronous block write; just an asynchronous bwrite(). 687 */ 688 void 689 bawrite(struct buf *bp) 690 { 691 692 SET(bp->b_flags, B_ASYNC); 693 VOP_BWRITE(bp); 694 } 695 696 /* 697 * Must be called at splbio() 698 */ 699 void 700 buf_dirty(struct buf *bp) 701 { 702 splassert(IPL_BIO); 703 704 #ifdef DIAGNOSTIC 705 if (!ISSET(bp->b_flags, B_BUSY)) 706 panic("Trying to dirty buffer on freelist!"); 707 #endif 708 709 if (ISSET(bp->b_flags, B_DELWRI) == 0) { 710 SET(bp->b_flags, B_DELWRI); 711 buf_flip_dma(bp); 712 reassignbuf(bp); 713 } 714 } 715 716 /* 717 * Must be called at splbio() 718 */ 719 void 720 buf_undirty(struct buf *bp) 721 { 722 splassert(IPL_BIO); 723 724 #ifdef DIAGNOSTIC 725 if (!ISSET(bp->b_flags, B_BUSY)) 726 panic("Trying to undirty buffer on freelist!"); 727 #endif 728 if (ISSET(bp->b_flags, B_DELWRI)) { 729 CLR(bp->b_flags, B_DELWRI); 730 reassignbuf(bp); 731 } 732 } 733 734 /* 735 * Release a buffer on to the free lists. 736 * Described in Bach (p. 46). 
737 */ 738 void 739 brelse(struct buf *bp) 740 { 741 int s; 742 743 s = splbio(); 744 745 if (bp->b_data != NULL) 746 KASSERT(bp->b_bufsize > 0); 747 748 /* 749 * Determine which queue the buffer should be on, then put it there. 750 */ 751 752 /* If it's not cacheable, or an error, mark it invalid. */ 753 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR))) 754 SET(bp->b_flags, B_INVAL); 755 756 if (ISSET(bp->b_flags, B_INVAL)) { 757 /* 758 * If the buffer is invalid, free it now rather than leaving 759 * it in a queue and wasting memory. 760 */ 761 if (LIST_FIRST(&bp->b_dep) != NULL) 762 buf_deallocate(bp); 763 764 if (ISSET(bp->b_flags, B_DELWRI)) { 765 CLR(bp->b_flags, B_DELWRI); 766 } 767 768 if (bp->b_vp) { 769 RB_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, 770 bp); 771 brelvp(bp); 772 } 773 bp->b_vp = NULL; 774 775 /* 776 * Wake up any processes waiting for _this_ buffer to 777 * become free. They are not allowed to grab it 778 * since it will be freed. But the only sleeper is 779 * getblk and it will restart the operation after 780 * sleep. 781 */ 782 if (ISSET(bp->b_flags, B_WANTED)) { 783 CLR(bp->b_flags, B_WANTED); 784 wakeup(bp); 785 } 786 buf_put(bp); 787 } else { 788 /* 789 * It has valid data. Put it on the end of the appropriate 790 * queue, so that it'll stick around for as long as possible. 791 */ 792 bufcache_release(bp); 793 794 /* Unlock the buffer. */ 795 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED)); 796 buf_release(bp); 797 798 /* Wake up any processes waiting for _this_ buffer to 799 * become free. */ 800 if (ISSET(bp->b_flags, B_WANTED)) { 801 CLR(bp->b_flags, B_WANTED); 802 wakeup(bp); 803 } 804 } 805 806 /* Wake up syncer and cleaner processes waiting for buffers. */ 807 if (nobuffers) { 808 nobuffers = 0; 809 wakeup(&nobuffers); 810 } 811 812 /* Wake up any processes waiting for any buffer to become free. 
*/ 813 if (needbuffer && bcstats.numbufpages < targetpages && 814 bcstats.kvaslots_avail > RESERVE_SLOTS) { 815 needbuffer = 0; 816 wakeup(&needbuffer); 817 } 818 819 splx(s); 820 } 821 822 /* 823 * Determine if a block is in the cache. Just look on what would be its hash 824 * chain. If it's there, return a pointer to it, unless it's marked invalid. 825 */ 826 struct buf * 827 incore(struct vnode *vp, daddr_t blkno) 828 { 829 struct buf *bp; 830 struct buf b; 831 int s; 832 833 s = splbio(); 834 835 /* Search buf lookup tree */ 836 b.b_lblkno = blkno; 837 bp = RB_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b); 838 if (bp != NULL && ISSET(bp->b_flags, B_INVAL)) 839 bp = NULL; 840 841 splx(s); 842 return (bp); 843 } 844 845 /* 846 * Get a block of requested size that is associated with 847 * a given vnode and block offset. If it is found in the 848 * block cache, mark it as having been found, make it busy 849 * and return it. Otherwise, return an empty block of the 850 * correct size. It is up to the caller to ensure that the 851 * cached blocks be of the correct size. 852 */ 853 struct buf * 854 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo) 855 { 856 struct buf *bp; 857 struct buf b; 858 int s, error; 859 860 /* 861 * XXX 862 * The following is an inlined version of 'incore()', but with 863 * the 'invalid' test moved to after the 'busy' test. It's 864 * necessary because there are some cases in which the NFS 865 * code sets B_INVAL prior to writing data to the server, but 866 * in which the buffers actually contain valid data. In this 867 * case, we can't allow the system to allocate a new buffer for 868 * the block until the write is finished. 
869 */ 870 start: 871 s = splbio(); 872 b.b_lblkno = blkno; 873 bp = RB_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b); 874 if (bp != NULL) { 875 if (ISSET(bp->b_flags, B_BUSY)) { 876 SET(bp->b_flags, B_WANTED); 877 error = tsleep(bp, slpflag | (PRIBIO + 1), "getblk", 878 slptimeo); 879 splx(s); 880 if (error) 881 return (NULL); 882 goto start; 883 } 884 885 if (!ISSET(bp->b_flags, B_INVAL)) { 886 bcstats.cachehits++; 887 SET(bp->b_flags, B_CACHE); 888 bufcache_take(bp); 889 buf_acquire(bp); 890 splx(s); 891 return (bp); 892 } 893 } 894 splx(s); 895 896 if ((bp = buf_get(vp, blkno, size)) == NULL) 897 goto start; 898 899 return (bp); 900 } 901 902 /* 903 * Get an empty, disassociated buffer of given size. 904 */ 905 struct buf * 906 geteblk(int size) 907 { 908 struct buf *bp; 909 910 while ((bp = buf_get(NULL, 0, size)) == NULL) 911 continue; 912 913 return (bp); 914 } 915 916 /* 917 * Allocate a buffer. 918 */ 919 struct buf * 920 buf_get(struct vnode *vp, daddr_t blkno, size_t size) 921 { 922 struct buf *bp; 923 int poolwait = size == 0 ? PR_NOWAIT : PR_WAITOK; 924 int npages; 925 int s; 926 927 s = splbio(); 928 if (size) { 929 /* 930 * Wake up the cleaner if we have lots of dirty pages, 931 * or if we are getting low on buffer cache kva. 932 */ 933 if (UNCLEAN_PAGES >= hidirtypages || 934 bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS) 935 wakeup(&bd_req); 936 937 npages = atop(round_page(size)); 938 939 /* 940 * if our cache has been previously shrunk, 941 * allow it to grow again with use up to 942 * bufhighpages (cachepercent) 943 */ 944 if (bufpages < bufhighpages) 945 bufadjust(bufhighpages); 946 947 /* 948 * If we would go over the page target with our 949 * new allocation, free enough buffers first 950 * to stay at the target with our new allocation. 
951 */ 952 while ((bcstats.numbufpages + npages > targetpages) && 953 (bp = bufcache_getanycleanbuf())) { 954 bufcache_take(bp); 955 if (bp->b_vp) { 956 RB_REMOVE(buf_rb_bufs, 957 &bp->b_vp->v_bufs_tree, bp); 958 brelvp(bp); 959 } 960 buf_put(bp); 961 } 962 963 /* 964 * If we get here, we tried to free the world down 965 * above, and couldn't get down - Wake the cleaner 966 * and wait for it to push some buffers out. 967 */ 968 if ((bcstats.numbufpages + npages > targetpages || 969 bcstats.kvaslots_avail <= RESERVE_SLOTS) && 970 curproc != syncerproc && curproc != cleanerproc) { 971 wakeup(&bd_req); 972 needbuffer++; 973 tsleep(&needbuffer, PRIBIO, "needbuffer", 0); 974 splx(s); 975 return (NULL); 976 } 977 if (bcstats.numbufpages + npages > bufpages) { 978 /* cleaner or syncer */ 979 nobuffers = 1; 980 tsleep(&nobuffers, PRIBIO, "nobuffers", 0); 981 splx(s); 982 return (NULL); 983 } 984 } 985 986 bp = pool_get(&bufpool, poolwait|PR_ZERO); 987 988 if (bp == NULL) { 989 splx(s); 990 return (NULL); 991 } 992 993 bp->b_freelist.tqe_next = NOLIST; 994 bp->b_dev = NODEV; 995 LIST_INIT(&bp->b_dep); 996 bp->b_bcount = size; 997 998 buf_acquire_nomap(bp); 999 1000 if (vp != NULL) { 1001 /* 1002 * We insert the buffer into the hash with B_BUSY set 1003 * while we allocate pages for it. This way any getblk 1004 * that happens while we allocate pages will wait for 1005 * this buffer instead of starting its own buf_get. 1006 * 1007 * But first, we check if someone beat us to it. 
1008 */ 1009 if (incore(vp, blkno)) { 1010 pool_put(&bufpool, bp); 1011 splx(s); 1012 return (NULL); 1013 } 1014 1015 bp->b_blkno = bp->b_lblkno = blkno; 1016 bgetvp(vp, bp); 1017 if (RB_INSERT(buf_rb_bufs, &vp->v_bufs_tree, bp)) 1018 panic("buf_get: dup lblk vp %p bp %p", vp, bp); 1019 } else { 1020 bp->b_vnbufs.le_next = NOLIST; 1021 SET(bp->b_flags, B_INVAL); 1022 bp->b_vp = NULL; 1023 } 1024 1025 LIST_INSERT_HEAD(&bufhead, bp, b_list); 1026 bcstats.numbufs++; 1027 1028 if (size) { 1029 buf_alloc_pages(bp, round_page(size)); 1030 KASSERT(ISSET(bp->b_flags, B_DMA)); 1031 buf_map(bp); 1032 } 1033 1034 SET(bp->b_flags, B_BC); 1035 splx(s); 1036 1037 return (bp); 1038 } 1039 1040 /* 1041 * Buffer cleaning daemon. 1042 */ 1043 void 1044 buf_daemon(struct proc *p) 1045 { 1046 struct buf *bp = NULL; 1047 int s, pushed = 0; 1048 1049 cleanerproc = curproc; 1050 1051 s = splbio(); 1052 for (;;) { 1053 if (bp == NULL || (pushed >= 16 && 1054 UNCLEAN_PAGES < hidirtypages && 1055 bcstats.kvaslots_avail > 2 * RESERVE_SLOTS)){ 1056 pushed = 0; 1057 /* 1058 * Wake up anyone who was waiting for buffers 1059 * to be released. 
1060 */ 1061 if (needbuffer) { 1062 needbuffer = 0; 1063 wakeup(&needbuffer); 1064 } 1065 tsleep(&bd_req, PRIBIO - 7, "cleaner", 0); 1066 } 1067 1068 while ((bp = bufcache_getdirtybuf())) { 1069 1070 if (UNCLEAN_PAGES < lodirtypages && 1071 bcstats.kvaslots_avail > 2 * RESERVE_SLOTS && 1072 pushed >= 16) 1073 break; 1074 1075 bufcache_take(bp); 1076 buf_acquire(bp); 1077 splx(s); 1078 1079 if (ISSET(bp->b_flags, B_INVAL)) { 1080 brelse(bp); 1081 s = splbio(); 1082 continue; 1083 } 1084 #ifdef DIAGNOSTIC 1085 if (!ISSET(bp->b_flags, B_DELWRI)) 1086 panic("Clean buffer on dirty queue"); 1087 #endif 1088 if (LIST_FIRST(&bp->b_dep) != NULL && 1089 !ISSET(bp->b_flags, B_DEFERRED) && 1090 buf_countdeps(bp, 0, 0)) { 1091 SET(bp->b_flags, B_DEFERRED); 1092 s = splbio(); 1093 bufcache_release(bp); 1094 buf_release(bp); 1095 continue; 1096 } 1097 1098 bawrite(bp); 1099 pushed++; 1100 1101 sched_pause(); 1102 1103 s = splbio(); 1104 } 1105 } 1106 } 1107 1108 /* 1109 * Wait for operations on the buffer to complete. 1110 * When they do, extract and return the I/O's error value. 1111 */ 1112 int 1113 biowait(struct buf *bp) 1114 { 1115 int s; 1116 1117 KASSERT(!(bp->b_flags & B_ASYNC)); 1118 1119 s = splbio(); 1120 while (!ISSET(bp->b_flags, B_DONE)) 1121 tsleep(bp, PRIBIO + 1, "biowait", 0); 1122 splx(s); 1123 1124 /* check for interruption of I/O (e.g. via NFS), then errors. */ 1125 if (ISSET(bp->b_flags, B_EINTR)) { 1126 CLR(bp->b_flags, B_EINTR); 1127 return (EINTR); 1128 } 1129 1130 if (ISSET(bp->b_flags, B_ERROR)) 1131 return (bp->b_error ? bp->b_error : EIO); 1132 else 1133 return (0); 1134 } 1135 1136 /* 1137 * Mark I/O complete on a buffer. 1138 * 1139 * If a callback has been requested, e.g. the pageout 1140 * daemon, do so. Otherwise, awaken waiting processes. 
1141 * 1142 * [ Leffler, et al., says on p.247: 1143 * "This routine wakes up the blocked process, frees the buffer 1144 * for an asynchronous write, or, for a request by the pagedaemon 1145 * process, invokes a procedure specified in the buffer structure" ] 1146 * 1147 * In real life, the pagedaemon (or other system processes) wants 1148 * to do async stuff to, and doesn't want the buffer brelse()'d. 1149 * (for swap pager, that puts swap buffers on the free lists (!!!), 1150 * for the vn device, that puts malloc'd buffers on the free lists!) 1151 * 1152 * Must be called at splbio(). 1153 */ 1154 void 1155 biodone(struct buf *bp) 1156 { 1157 splassert(IPL_BIO); 1158 1159 if (ISSET(bp->b_flags, B_DONE)) 1160 panic("biodone already"); 1161 SET(bp->b_flags, B_DONE); /* note that it's done */ 1162 1163 if (bp->b_bq) 1164 bufq_done(bp->b_bq, bp); 1165 1166 if (LIST_FIRST(&bp->b_dep) != NULL) 1167 buf_complete(bp); 1168 1169 if (!ISSET(bp->b_flags, B_READ)) { 1170 CLR(bp->b_flags, B_WRITEINPROG); 1171 vwakeup(bp->b_vp); 1172 } 1173 if (bcstats.numbufs && 1174 (!(ISSET(bp->b_flags, B_RAW) || ISSET(bp->b_flags, B_PHYS)))) { 1175 if (!ISSET(bp->b_flags, B_READ)) { 1176 bcstats.pendingwrites--; 1177 } else 1178 bcstats.pendingreads--; 1179 } 1180 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */ 1181 CLR(bp->b_flags, B_CALL); /* but note callout done */ 1182 (*bp->b_iodone)(bp); 1183 } else { 1184 if (ISSET(bp->b_flags, B_ASYNC)) {/* if async, release it */ 1185 brelse(bp); 1186 } else { /* or just wakeup the buffer */ 1187 CLR(bp->b_flags, B_WANTED); 1188 wakeup(bp); 1189 } 1190 } 1191 } 1192 1193 #ifdef DDB 1194 void bcstats_print(int (*)(const char *, ...) 1195 __attribute__((__format__(__kprintf__,1,2)))); 1196 /* 1197 * bcstats_print: ddb hook to print interesting buffer cache counters 1198 */ 1199 void 1200 bcstats_print( 1201 int (*pr)(const char *, ...) 
__attribute__((__format__(__kprintf__,1,2)))) 1202 { 1203 (*pr)("Current Buffer Cache status:\n"); 1204 (*pr)("numbufs %lld busymapped %lld, delwri %lld\n", 1205 bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs); 1206 (*pr)("kvaslots %lld avail kva slots %lld\n", 1207 bcstats.kvaslots, bcstats.kvaslots_avail); 1208 (*pr)("bufpages %lld, dirtypages %lld\n", 1209 bcstats.numbufpages, bcstats.numdirtypages); 1210 (*pr)("pendingreads %lld, pendingwrites %lld\n", 1211 bcstats.pendingreads, bcstats.pendingwrites); 1212 } 1213 #endif 1214 1215 void 1216 buf_adjcnt(struct buf *bp, long ncount) 1217 { 1218 KASSERT(ncount <= bp->b_bufsize); 1219 bp->b_bcount = ncount; 1220 } 1221 1222 /* bufcache freelist code below */ 1223 /* 1224 * Copyright (c) 2014 Ted Unangst <tedu@openbsd.org> 1225 * 1226 * Permission to use, copy, modify, and distribute this software for any 1227 * purpose with or without fee is hereby granted, provided that the above 1228 * copyright notice and this permission notice appear in all copies. 1229 * 1230 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 1231 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 1232 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 1233 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 1234 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 1235 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 1236 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 1237 */ 1238 1239 /* 1240 * The code below implements a variant of the 2Q buffer cache algorithm by 1241 * Johnson and Shasha. 1242 * 1243 * General Outline 1244 * We divide the buffer cache into three working sets: current, previous, 1245 * and long term. Each list is itself LRU and buffers get promoted and moved 1246 * around between them. A buffer starts its life in the current working set. 
 * As time passes and newer buffers push it out, it will turn into the previous
 * working set and is subject to recycling. But if it's accessed again from
 * the previous working set, that's an indication that it's actually in the
 * long term working set, so we promote it there. The separation of current
 * and previous working sets prevents us from promoting a buffer that's only
 * temporarily hot to the long term cache.
 *
 * The objective is to provide scan resistance by making the long term
 * working set ineligible for immediate recycling, even as the current
 * working set is rapidly turned over.
 *
 * Implementation
 * The code below identifies the current, previous, and long term sets as
 * hotqueue, coldqueue, and warmqueue. The hot and warm queues are capped at
 * 1/3 of the total clean pages, after which point they start pushing their
 * oldest buffers into coldqueue.
 * A buf always starts out with neither the WARM nor the COLD flag set
 * (implying HOT). When released, it will be returned to the tail of the
 * hotqueue list.
 * When the hotqueue gets too large, the oldest hot buf will be moved to the
 * coldqueue, with the B_COLD flag set. When a cold buf is released, we set
 * the B_WARM flag and put it onto the warmqueue. Warm bufs are also
 * directly returned to the end of the warmqueue. As with the hotqueue, when
 * the warmqueue grows too large, B_WARM bufs are moved onto the coldqueue.
 *
 * Note that this design does still support large working sets, greater
 * than the cap of hotqueue or warmqueue would imply. The coldqueue is still
 * cached and has no maximum length. The hot and warm queues form a Y feeding
 * into the coldqueue. Moving bufs between queues is constant time, so this
 * design decays to one long warm->cold queue.
 *
 * In the 2Q paper, hotqueue and coldqueue are A1in and A1out. The warmqueue
 * is Am.
 * We always cache pages, as opposed to pointers to pages for A1.
 *
 * This implementation adds support for multiple 2q caches.
 *
 * If we have more than one 2q cache, as bufs fall off the cold queue
 * for recycling, bufs that have been warm before (which retain the
 * B_WARM flag in addition to B_COLD) can be put into the hot queue of
 * a second level 2Q cache. Buffers which are only B_COLD are
 * recycled. Bufs falling off the last cache's cold queue are always
 * recycled.
 *
 */

/*
 * This function is called when a hot or warm queue may have exceeded its
 * size limit. It will move a buf to the coldqueue.
 */
int chillbufs(struct bufcache *cache, struct bufqueue *queue,
    int64_t *queuepages);

/*
 * Initialize the hot, cold and warm queues of every clean cache level,
 * plus the single shared dirty queue, all empty.
 */
void
bufcache_init(void)
{
	int i;

	for (i = 0; i < NUM_CACHES; i++) {
		TAILQ_INIT(&cleancache[i].hotqueue);
		TAILQ_INIT(&cleancache[i].coldqueue);
		TAILQ_INIT(&cleancache[i].warmqueue);
	}
	TAILQ_INIT(&dirtyqueue);
}

/*
 * if the buffer caches have shrunk, we may need to rebalance our queues.
 */
void
bufcache_adjust(void)
{
	int i;

	for (i = 0; i < NUM_CACHES; i++) {
		/*
		 * Keep chilling until both the warm and the hot queue of
		 * this cache level are back under their size limits.
		 */
		while (chillbufs(&cleancache[i], &cleancache[i].warmqueue,
		    &cleancache[i].warmbufpages) ||
		    chillbufs(&cleancache[i], &cleancache[i].hotqueue,
		    &cleancache[i].hotbufpages))
			continue;
	}
}

/*
 * Get a clean buffer from the cache.
 * If "discard" is set do not promote
 * previously warm buffers as normal, because we are tossing everything
 * away such as in a hibernation
 */
struct buf *
bufcache_getcleanbuf(int cachenum, int discard)
{
	struct buf *bp = NULL;
	struct bufcache *cache = &cleancache[cachenum];

	splassert(IPL_BIO);

	/* try cold queue */
	while ((bp = TAILQ_FIRST(&cache->coldqueue))) {
		if ((!discard) &&
		    cachenum < NUM_CACHES - 1 && ISSET(bp->b_flags, B_WARM)) {
			/*
			 * If this buffer was warm before, move it to
			 * the hot queue in the next cache
			 */
			TAILQ_REMOVE(&cache->coldqueue, bp, b_freelist);
			CLR(bp->b_flags, B_WARM);
			CLR(bp->b_flags, B_COLD);
			int64_t pages = atop(bp->b_bufsize);
			KASSERT(bp->cache == cachenum);
			/* Leaving cache 0 means leaving DMA-reachable memory. */
			if (bp->cache == 0)
				buf_flip_high(bp);
			bp->cache++;
			struct bufcache *newcache = &cleancache[bp->cache];
			newcache->cachepages += pages;
			newcache->hotbufpages += pages;
			/* The next cache's hot queue may now be over limit. */
			chillbufs(newcache, &newcache->hotqueue,
			    &newcache->hotbufpages);
			TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist);
		}
		else
			/* buffer is cold - give it up */
			return bp;
	}
	/* No cold buffers left; fall back on warm, then hot. */
	if ((bp = TAILQ_FIRST(&cache->warmqueue)))
		return bp;
	if ((bp = TAILQ_FIRST(&cache->hotqueue)))
		return bp;
	return bp;
}

/*
 * Scan the cache levels from "end" down to "start" for a clean buffer,
 * restarting the scan since promotion may move buffers to other levels.
 */
struct buf *
bufcache_getcleanbuf_range(int start, int end, int discard)
{
	int i, j = start, q = end;
	struct buf *bp = NULL;

	/*
	 * XXX in theory we could promote warm buffers into a previous queue
	 * so in the pathological case of where we go through all the caches
	 * without getting a buffer we have to start at the beginning again.
	 */
	while (j <= q) {
		for (i = q; i >= j; i--)
			if ((bp = bufcache_getcleanbuf(i, discard)))
				return(bp);
		j++;
	}
	return bp;
}

/* Get a clean buffer from any cache level, promoting as normal. */
struct buf *
bufcache_getanycleanbuf(void)
{
	return bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 0);
}


/* Return the oldest dirty (delayed write) buffer, or NULL if none. */
struct buf *
bufcache_getdirtybuf(void)
{
	return TAILQ_FIRST(&dirtyqueue);
}

/*
 * Remove a buffer from whichever freelist it currently sits on, and
 * adjust the page accounting for that list.
 */
void
bufcache_take(struct buf *bp)
{
	struct bufqueue *queue;
	int64_t pages;

	splassert(IPL_BIO);

	KASSERT(ISSET(bp->b_flags, B_BC));
	KASSERT(bp->cache >= DMA_CACHE);
	KASSERT((bp->cache < NUM_CACHES));
	pages = atop(bp->b_bufsize);
	struct bufcache *cache = &cleancache[bp->cache];
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		/* Clean buffer: B_COLD/B_WARM flags select the 2Q list. */
		if (ISSET(bp->b_flags, B_COLD)) {
			queue = &cache->coldqueue;
		} else if (ISSET(bp->b_flags, B_WARM)) {
			queue = &cache->warmqueue;
			cache->warmbufpages -= pages;
		} else {
			queue = &cache->hotqueue;
			cache->hotbufpages -= pages;
		}
		bcstats.numcleanpages -= pages;
		cache->cachepages -= pages;
	} else {
		/* Dirty buffers all live on the single dirty queue. */
		queue = &dirtyqueue;
		bcstats.numdirtypages -= pages;
		bcstats.delwribufs--;
	}
	TAILQ_REMOVE(queue, bp, b_freelist);
}

/* move buffers from a hot or warm queue to a cold queue in a cache */
int
chillbufs(struct bufcache *cache, struct bufqueue *queue, int64_t *queuepages)
{
	struct buf *bp;
	int64_t limit, pages;

	/*
	 * The warm and hot queues are allowed to be up to one third each.
	 * We impose a minimum size of 96 to prevent too much "wobbling".
	 */
	limit = cache->cachepages / 3;
	if (*queuepages > 96 && *queuepages > limit) {
		bp = TAILQ_FIRST(queue);
		if (!bp)
			panic("inconsistent bufpage counts");
		pages = atop(bp->b_bufsize);
		*queuepages -= pages;
		TAILQ_REMOVE(queue, bp, b_freelist);
		/* we do not clear B_WARM */
		SET(bp->b_flags, B_COLD);
		TAILQ_INSERT_TAIL(&cache->coldqueue, bp, b_freelist);
		return 1;	/* moved one buf; caller may want to retry */
	}
	return 0;
}

/*
 * Return a buffer to the tail of the appropriate freelist, adjusting
 * the page accounting. A previously cold buffer is marked B_WARM and
 * enters the warm queue; a fresh buffer enters the hot queue.
 */
void
bufcache_release(struct buf *bp)
{
	struct bufqueue *queue;
	int64_t pages;
	struct bufcache *cache = &cleancache[bp->cache];

	pages = atop(bp->b_bufsize);
	KASSERT(ISSET(bp->b_flags, B_BC));
	/* B_DMA buffers belong in cache 0 and only there, per the flip
	 * done in bufcache_getcleanbuf(). */
	KASSERT((ISSET(bp->b_flags, B_DMA) && bp->cache == 0)
	    || ((!ISSET(bp->b_flags, B_DMA)) && bp->cache > 0));
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		int64_t *queuepages;
		if (ISSET(bp->b_flags, B_WARM | B_COLD)) {
			/* Seen before (cold or warm): goes to the warm queue. */
			SET(bp->b_flags, B_WARM);
			CLR(bp->b_flags, B_COLD);
			queue = &cache->warmqueue;
			queuepages = &cache->warmbufpages;
		} else {
			/* Neither flag set: the buffer is hot. */
			queue = &cache->hotqueue;
			queuepages = &cache->hotbufpages;
		}
		*queuepages += pages;
		bcstats.numcleanpages += pages;
		cache->cachepages += pages;
		/* The queue we just grew may now exceed its limit. */
		chillbufs(cache, queue, queuepages);
	} else {
		queue = &dirtyqueue;
		bcstats.numdirtypages += pages;
		bcstats.delwribufs++;
	}
	TAILQ_INSERT_TAIL(queue, bp, b_freelist);
}

#ifdef HIBERNATE
/*
 * Nuke the buffer cache from orbit when hibernating. We do not want to save
 * any clean cache pages to swap and read them back. The original disk files
 * are just as good.
 */
void
hibernate_suspend_bufcache(void)
{
	struct buf *bp;
	int s;

	s = splbio();
	/* Chuck away all the cache pages.. discard bufs, do not promote */
	while ((bp = bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1,
	    1))) {
		bufcache_take(bp);
		if (bp->b_vp) {
			RB_REMOVE(buf_rb_bufs,
			    &bp->b_vp->v_bufs_tree, bp);
			brelvp(bp);
		}
		buf_put(bp);
	}
	splx(s);
}

void
hibernate_resume_bufcache(void)
{
	/* XXX Nothing needed here for now */
}
#endif /* HIBERNATE */