1 /* 2 * Copyright (c) 1994,1997 John S. Dyson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice immediately at the beginning of the file, without modification, 10 * this list of conditions, and the following disclaimer. 11 * 2. Absolutely no warranty of function or purpose is made by the author 12 * John S. Dyson. 13 * 14 * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $ 15 */ 16 17 /* 18 * this file contains a new buffer I/O scheme implementing a coherent 19 * VM object and buffer cache scheme. Pains have been taken to make 20 * sure that the performance degradation associated with schemes such 21 * as this is not realized. 22 * 23 * Author: John S. Dyson 24 * Significant help during the development and debugging phases 25 * had been provided by David Greenman, also of the FreeBSD core team. 26 * 27 * see man buf(9) for more info. Note that man buf(9) doesn't reflect 28 * the actual buf/bio implementation in DragonFly. 29 */ 30 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/buf.h> 34 #include <sys/conf.h> 35 #include <sys/devicestat.h> 36 #include <sys/eventhandler.h> 37 #include <sys/lock.h> 38 #include <sys/malloc.h> 39 #include <sys/mount.h> 40 #include <sys/kernel.h> 41 #include <sys/kthread.h> 42 #include <sys/proc.h> 43 #include <sys/reboot.h> 44 #include <sys/resourcevar.h> 45 #include <sys/sysctl.h> 46 #include <sys/vmmeter.h> 47 #include <sys/vnode.h> 48 #include <sys/dsched.h> 49 #include <vm/vm.h> 50 #include <vm/vm_param.h> 51 #include <vm/vm_kern.h> 52 #include <vm/vm_pageout.h> 53 #include <vm/vm_page.h> 54 #include <vm/vm_object.h> 55 #include <vm/vm_extern.h> 56 #include <vm/vm_map.h> 57 #include <vm/vm_pager.h> 58 #include <vm/swap_pager.h> 59 60 #include <sys/buf2.h> 61 #include <sys/thread2.h> 62 #include <sys/spinlock2.h> 63 #include <sys/mplock2.h> 64 #include <vm/vm_page2.h> 65 66 #include "opt_ddb.h" 67 #ifdef DDB 68 #include <ddb/ddb.h> 69 #endif 70 71 /* 72 * Buffer queues. 
73 */ 74 enum bufq_type { 75 BQUEUE_NONE, /* not on any queue */ 76 BQUEUE_LOCKED, /* locked buffers */ 77 BQUEUE_CLEAN, /* non-B_DELWRI buffers */ 78 BQUEUE_DIRTY, /* B_DELWRI buffers */ 79 BQUEUE_DIRTY_HW, /* B_DELWRI buffers - heavy weight */ 80 BQUEUE_EMPTY, /* empty buffer headers */ 81 82 BUFFER_QUEUES /* number of buffer queues */ 83 }; 84 85 typedef enum bufq_type bufq_type_t; 86 87 #define BD_WAKE_SIZE 16384 88 #define BD_WAKE_MASK (BD_WAKE_SIZE - 1) 89 90 TAILQ_HEAD(bqueues, buf); 91 92 struct bufpcpu { 93 struct spinlock spin; 94 struct bqueues bufqueues[BUFFER_QUEUES]; 95 } __cachealign; 96 97 struct bufpcpu bufpcpu[MAXCPU]; 98 99 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); 100 101 struct buf *buf; /* buffer header pool */ 102 103 static void vfs_clean_pages(struct buf *bp); 104 static void vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m); 105 #if 0 106 static void vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m); 107 #endif 108 static void vfs_vmio_release(struct buf *bp); 109 static int flushbufqueues(struct buf *marker, bufq_type_t q); 110 static void repurposebuf(struct buf *bp, int size); 111 static vm_page_t bio_page_alloc(struct buf *bp, vm_object_t obj, 112 vm_pindex_t pg, int deficit); 113 114 static void bd_signal(long totalspace); 115 static void buf_daemon(void); 116 static void buf_daemon_hw(void); 117 118 /* 119 * bogus page -- for I/O to/from partially complete buffers 120 * this is a temporary solution to the problem, but it is not 121 * really that bad. it would be better to split the buffer 122 * for input in the case of buffers partially already in memory, 123 * but the code is intricate enough already. 124 */ 125 vm_page_t bogus_page; 126 127 /* 128 * These are all static, but make the ones we export globals so we do 129 * not need to use compiler magic. 130 */ 131 long bufspace; /* atomic ops */ 132 long maxbufspace; 133 static long bufmallocspace; /* atomic ops */ 134 long maxbufmallocspace, lobufspace, hibufspace; 135 static long lorunningspace; 136 static long hirunningspace; 137 static long dirtykvaspace; /* atomic */ 138 long dirtybufspace; /* atomic (global for systat) */ 139 static long dirtybufcount; /* atomic */ 140 static long dirtybufspacehw; /* atomic */ 141 static long dirtybufcounthw; /* atomic */ 142 static long runningbufspace; /* atomic */ 143 static long runningbufcount; /* atomic */ 144 static long repurposedspace; 145 long lodirtybufspace; 146 long hidirtybufspace; 147 static int getnewbufcalls; 148 static int recoverbufcalls; 149 static int needsbuffer; /* atomic */ 150 static int runningbufreq; /* atomic */ 151 static int bd_request; /* atomic */ 152 static int bd_request_hw; /* atomic */ 153 static u_int bd_wake_ary[BD_WAKE_SIZE]; 154 static u_int bd_wake_index; 155 static u_int vm_cycle_point = 40; /* 23-36 will migrate more act->inact */ 156 static int debug_commit; 157 static int debug_bufbio; 158 static long bufcache_bw = 200 * 1024 * 1024; 159 static long bufcache_bw_accum; 160 static int bufcache_bw_ticks; 161 162 static struct thread *bufdaemon_td; 163 static struct thread *bufdaemonhw_td; 164 static u_int lowmempgallocs; 165 static u_int lowmempgfails; 166 static u_int flushperqueue = 1024; 167 static int repurpose_enable; 168 169 /* 170 * Sysctls for operational control of the buffer cache. 
 */
SYSCTL_UINT(_vfs, OID_AUTO, flushperqueue, CTLFLAG_RW, &flushperqueue, 0,
	"Number of buffers to flush from each per-cpu queue");
SYSCTL_LONG(_vfs, OID_AUTO, lodirtybufspace, CTLFLAG_RW, &lodirtybufspace, 0,
	"Number of dirty buffers to flush before bufdaemon becomes inactive");
SYSCTL_LONG(_vfs, OID_AUTO, hidirtybufspace, CTLFLAG_RW, &hidirtybufspace, 0,
	"High watermark used to trigger explicit flushing of dirty buffers");
SYSCTL_LONG(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
	"Minimum amount of buffer space required for active I/O");
SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
	"Maximum amount of buffer space usable for active I/O");
SYSCTL_LONG(_vfs, OID_AUTO, bufcache_bw, CTLFLAG_RW, &bufcache_bw, 0,
	"Buffer-cache -> VM page cache transfer bandwidth");
SYSCTL_UINT(_vfs, OID_AUTO, lowmempgallocs, CTLFLAG_RW, &lowmempgallocs, 0,
	"Page allocations done during periods of very low free memory");
SYSCTL_UINT(_vfs, OID_AUTO, lowmempgfails, CTLFLAG_RW, &lowmempgfails, 0,
	"Page allocations which failed during periods of very low free memory");
SYSCTL_UINT(_vfs, OID_AUTO, vm_cycle_point, CTLFLAG_RW, &vm_cycle_point, 0,
	"Transition point (0-64) for recycling pages to the active or inactive queue");
SYSCTL_UINT(_vfs, OID_AUTO, repurpose_enable, CTLFLAG_RW, &repurpose_enable, 0,
	"Enable buffer cache VM repurposing for high-I/O");
/*
 * Sysctls determining current state of the buffer cache.
 */
SYSCTL_LONG(_vfs, OID_AUTO, nbuf, CTLFLAG_RD, &nbuf, 0,
	"Total number of buffers in buffer cache");
SYSCTL_LONG(_vfs, OID_AUTO, dirtykvaspace, CTLFLAG_RD, &dirtykvaspace, 0,
	"KVA reserved by dirty buffers (all)");
SYSCTL_LONG(_vfs, OID_AUTO, dirtybufspace, CTLFLAG_RD, &dirtybufspace, 0,
	"Pending bytes of dirty buffers (all)");
SYSCTL_LONG(_vfs, OID_AUTO, dirtybufspacehw, CTLFLAG_RD, &dirtybufspacehw, 0,
	"Pending bytes of dirty buffers (heavy weight)");
SYSCTL_LONG(_vfs, OID_AUTO, dirtybufcount, CTLFLAG_RD, &dirtybufcount, 0,
	"Pending number of dirty buffers");
SYSCTL_LONG(_vfs, OID_AUTO, dirtybufcounthw, CTLFLAG_RD, &dirtybufcounthw, 0,
	"Pending number of dirty buffers (heavy weight)");
SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
	"I/O bytes currently in progress due to asynchronous writes");
SYSCTL_LONG(_vfs, OID_AUTO, runningbufcount, CTLFLAG_RD, &runningbufcount, 0,
	"I/O buffers currently in progress due to asynchronous writes");
SYSCTL_LONG(_vfs, OID_AUTO, repurposedspace, CTLFLAG_RD, &repurposedspace, 0,
	"Buffer-cache memory repurposed in-place");
SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
	"Hard limit on maximum amount of memory usable for buffer space");
SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
	"Soft limit on maximum amount of memory usable for buffer space");
SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
	"Minimum amount of memory to reserve for system buffer space");
SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
	"Amount of memory available for buffers");
SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RD, &maxbufmallocspace,
	0, "Maximum amount of memory reserved for buffers using malloc");
SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
	"Amount of memory left for buffers using malloc-scheme");
225 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, 0, 226 "New buffer header acquisition requests"); 227 SYSCTL_INT(_vfs, OID_AUTO, recoverbufcalls, CTLFLAG_RD, &recoverbufcalls, 0, 228 "Recover VM space in an emergency"); 229 SYSCTL_INT(_vfs, OID_AUTO, debug_commit, CTLFLAG_RW, &debug_commit, 0, ""); 230 SYSCTL_INT(_vfs, OID_AUTO, debug_bufbio, CTLFLAG_RW, &debug_bufbio, 0, ""); 231 SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf), 232 "sizeof(struct buf)"); 233 234 char *buf_wmesg = BUF_WMESG; 235 236 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ 237 #define VFS_BIO_NEED_UNUSED02 0x02 238 #define VFS_BIO_NEED_UNUSED04 0x04 239 #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ 240 241 /* 242 * Called when buffer space is potentially available for recovery. 243 * getnewbuf() will block on this flag when it is unable to free 244 * sufficient buffer space. Buffer space becomes recoverable when 245 * bp's get placed back in the queues. 246 */ 247 static __inline void 248 bufspacewakeup(void) 249 { 250 /* 251 * If someone is waiting for BUF space, wake them up. Even 252 * though we haven't freed the kva space yet, the waiting 253 * process will be able to now. 254 */ 255 for (;;) { 256 int flags = needsbuffer; 257 cpu_ccfence(); 258 if ((flags & VFS_BIO_NEED_BUFSPACE) == 0) 259 break; 260 if (atomic_cmpset_int(&needsbuffer, flags, 261 flags & ~VFS_BIO_NEED_BUFSPACE)) { 262 wakeup(&needsbuffer); 263 break; 264 } 265 /* retry */ 266 } 267 } 268 269 /* 270 * runningbufwakeup: 271 * 272 * Accounting for I/O in progress. 273 * 274 */ 275 static __inline void 276 runningbufwakeup(struct buf *bp) 277 { 278 long totalspace; 279 long flags; 280 281 if ((totalspace = bp->b_runningbufspace) != 0) { 282 atomic_add_long(&runningbufspace, -totalspace); 283 atomic_add_long(&runningbufcount, -1); 284 bp->b_runningbufspace = 0; 285 286 /* 287 * see waitrunningbufspace() for limit test. 288 */ 289 for (;;) { 290 flags = runningbufreq; 291 cpu_ccfence(); 292 if (flags == 0) 293 break; 294 if (atomic_cmpset_int(&runningbufreq, flags, 0)) { 295 wakeup(&runningbufreq); 296 break; 297 } 298 /* retry */ 299 } 300 bd_signal(totalspace); 301 } 302 } 303 304 /* 305 * bufcountwakeup: 306 * 307 * Called when a buffer has been added to one of the free queues to 308 * account for the buffer and to wakeup anyone waiting for free buffers. 309 * This typically occurs when large amounts of metadata are being handled 310 * by the buffer cache ( else buffer space runs out first, usually ). 311 */ 312 static __inline void 313 bufcountwakeup(void) 314 { 315 long flags; 316 317 for (;;) { 318 flags = needsbuffer; 319 if (flags == 0) 320 break; 321 if (atomic_cmpset_int(&needsbuffer, flags, 322 (flags & ~VFS_BIO_NEED_ANY))) { 323 wakeup(&needsbuffer); 324 break; 325 } 326 /* retry */ 327 } 328 } 329 330 /* 331 * waitrunningbufspace() 332 * 333 * If runningbufspace exceeds 4/6 hirunningspace we block until 334 * runningbufspace drops to 3/6 hirunningspace. We also block if another 335 * thread blocked here in order to be fair, even if runningbufspace 336 * is now lower than the limit. 337 * 338 * The caller may be using this function to block in a tight loop, we 339 * must block while runningbufspace is greater than at least 340 * hirunningspace * 3 / 6. 
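 *
 * A rough illustration of the hysteresis (the numbers are hypothetical,
 * the real watermarks are sized in bufinit()): with hirunningspace at
 * 6MB, new asynchronous writers begin blocking once roughly 4MB (4/6)
 * of write I/O is in flight and are woken again as completions drain
 * the total back toward 3MB (3/6).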
341 */ 342 void 343 waitrunningbufspace(void) 344 { 345 long limit = hirunningspace * 4 / 6; 346 long flags; 347 348 while (runningbufspace > limit || runningbufreq) { 349 tsleep_interlock(&runningbufreq, 0); 350 flags = atomic_fetchadd_int(&runningbufreq, 1); 351 if (runningbufspace > limit || flags) 352 tsleep(&runningbufreq, PINTERLOCKED, "wdrn1", hz); 353 } 354 } 355 356 /* 357 * buf_dirty_count_severe: 358 * 359 * Return true if we have too many dirty buffers. 360 */ 361 int 362 buf_dirty_count_severe(void) 363 { 364 return (runningbufspace + dirtykvaspace >= hidirtybufspace || 365 dirtybufcount >= nbuf / 2); 366 } 367 368 /* 369 * Return true if the amount of running I/O is severe and BIOQ should 370 * start bursting. 371 */ 372 int 373 buf_runningbufspace_severe(void) 374 { 375 return (runningbufspace >= hirunningspace * 4 / 6); 376 } 377 378 /* 379 * vfs_buf_test_cache: 380 * 381 * Called when a buffer is extended. This function clears the B_CACHE 382 * bit if the newly extended portion of the buffer does not contain 383 * valid data. 384 * 385 * NOTE! Dirty VM pages are not processed into dirty (B_DELWRI) buffer 386 * cache buffers. The VM pages remain dirty, as someone had mmap()'d 387 * them while a clean buffer was present. 388 */ 389 static __inline__ 390 void 391 vfs_buf_test_cache(struct buf *bp, 392 vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, 393 vm_page_t m) 394 { 395 if (bp->b_flags & B_CACHE) { 396 int base = (foff + off) & PAGE_MASK; 397 if (vm_page_is_valid(m, base, size) == 0) 398 bp->b_flags &= ~B_CACHE; 399 } 400 } 401 402 /* 403 * bd_speedup() 404 * 405 * Spank the buf_daemon[_hw] if the total dirty buffer space exceeds the 406 * low water mark. 407 */ 408 static __inline__ 409 void 410 bd_speedup(void) 411 { 412 if (dirtykvaspace < lodirtybufspace && dirtybufcount < nbuf / 2) 413 return; 414 415 if (bd_request == 0 && 416 (dirtykvaspace > lodirtybufspace / 2 || 417 dirtybufcount - dirtybufcounthw >= nbuf / 2)) { 418 if (atomic_fetchadd_int(&bd_request, 1) == 0) 419 wakeup(&bd_request); 420 } 421 if (bd_request_hw == 0 && 422 (dirtykvaspace > lodirtybufspace / 2 || 423 dirtybufcounthw >= nbuf / 2)) { 424 if (atomic_fetchadd_int(&bd_request_hw, 1) == 0) 425 wakeup(&bd_request_hw); 426 } 427 } 428 429 /* 430 * bd_heatup() 431 * 432 * Get the buf_daemon heated up when the number of running and dirty 433 * buffers exceeds the mid-point. 434 * 435 * Return the total number of dirty bytes past the second mid point 436 * as a measure of how much excess dirty data there is in the system. 437 */ 438 long 439 bd_heatup(void) 440 { 441 long mid1; 442 long mid2; 443 long totalspace; 444 445 mid1 = lodirtybufspace + (hidirtybufspace - lodirtybufspace) / 2; 446 447 totalspace = runningbufspace + dirtykvaspace; 448 if (totalspace >= mid1 || dirtybufcount >= nbuf / 2) { 449 bd_speedup(); 450 mid2 = mid1 + (hidirtybufspace - mid1) / 2; 451 if (totalspace >= mid2) 452 return(totalspace - mid2); 453 } 454 return(0); 455 } 456 457 /* 458 * bd_wait() 459 * 460 * Wait for the buffer cache to flush (totalspace) bytes worth of 461 * buffers, then return. 462 * 463 * Regardless this function blocks while the number of dirty buffers 464 * exceeds hidirtybufspace. 465 */ 466 void 467 bd_wait(long totalspace) 468 { 469 u_int i; 470 u_int j; 471 u_int mi; 472 int count; 473 474 if (curthread == bufdaemonhw_td || curthread == bufdaemon_td) 475 return; 476 477 while (totalspace > 0) { 478 bd_heatup(); 479 480 /* 481 * Order is important. 
Suppliers adjust bd_wake_index after 482 * updating runningbufspace/dirtykvaspace. We want to fetch 483 * bd_wake_index before accessing. Any error should thus 484 * be in our favor. 485 */ 486 i = atomic_fetchadd_int(&bd_wake_index, 0); 487 if (totalspace > runningbufspace + dirtykvaspace) 488 totalspace = runningbufspace + dirtykvaspace; 489 count = totalspace / MAXBSIZE; 490 if (count >= BD_WAKE_SIZE / 2) 491 count = BD_WAKE_SIZE / 2; 492 i = i + count; 493 mi = i & BD_WAKE_MASK; 494 495 /* 496 * This is not a strict interlock, so we play a bit loose 497 * with locking access to dirtybufspace*. We have to re-check 498 * bd_wake_index to ensure that it hasn't passed us. 499 */ 500 tsleep_interlock(&bd_wake_ary[mi], 0); 501 atomic_add_int(&bd_wake_ary[mi], 1); 502 j = atomic_fetchadd_int(&bd_wake_index, 0); 503 if ((int)(i - j) >= 0) 504 tsleep(&bd_wake_ary[mi], PINTERLOCKED, "flstik", hz); 505 506 totalspace = runningbufspace + dirtykvaspace - hidirtybufspace; 507 } 508 } 509 510 /* 511 * bd_signal() 512 * 513 * This function is called whenever runningbufspace or dirtykvaspace 514 * is reduced. Track threads waiting for run+dirty buffer I/O 515 * complete. 516 */ 517 static void 518 bd_signal(long totalspace) 519 { 520 u_int i; 521 522 if (totalspace > 0) { 523 if (totalspace > MAXBSIZE * BD_WAKE_SIZE) 524 totalspace = MAXBSIZE * BD_WAKE_SIZE; 525 while (totalspace > 0) { 526 i = atomic_fetchadd_int(&bd_wake_index, 1); 527 i &= BD_WAKE_MASK; 528 if (atomic_readandclear_int(&bd_wake_ary[i])) 529 wakeup(&bd_wake_ary[i]); 530 totalspace -= MAXBSIZE; 531 } 532 } 533 } 534 535 /* 536 * BIO tracking support routines. 537 * 538 * Release a ref on a bio_track. Wakeup requests are atomically released 539 * along with the last reference so bk_active will never wind up set to 540 * only 0x80000000. 541 */ 542 static 543 void 544 bio_track_rel(struct bio_track *track) 545 { 546 int active; 547 int desired; 548 549 /* 550 * Shortcut 551 */ 552 active = track->bk_active; 553 if (active == 1 && atomic_cmpset_int(&track->bk_active, 1, 0)) 554 return; 555 556 /* 557 * Full-on. Note that the wait flag is only atomically released on 558 * the 1->0 count transition. 559 * 560 * We check for a negative count transition using bit 30 since bit 31 561 * has a different meaning. 562 */ 563 for (;;) { 564 desired = (active & 0x7FFFFFFF) - 1; 565 if (desired) 566 desired |= active & 0x80000000; 567 if (atomic_cmpset_int(&track->bk_active, active, desired)) { 568 if (desired & 0x40000000) 569 panic("bio_track_rel: bad count: %p", track); 570 if (active & 0x80000000) 571 wakeup(track); 572 break; 573 } 574 active = track->bk_active; 575 } 576 } 577 578 /* 579 * Wait for the tracking count to reach 0. 580 * 581 * Use atomic ops such that the wait flag is only set atomically when 582 * bk_active is non-zero. 583 */ 584 int 585 bio_track_wait(struct bio_track *track, int slp_flags, int slp_timo) 586 { 587 int active; 588 int desired; 589 int error; 590 591 /* 592 * Shortcut 593 */ 594 if (track->bk_active == 0) 595 return(0); 596 597 /* 598 * Full-on. Note that the wait flag may only be atomically set if 599 * the active count is non-zero. 600 * 601 * NOTE: We cannot optimize active == desired since a wakeup could 602 * clear active prior to our tsleep_interlock(). 
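 *
 * For reference, bk_active packs a reference count into the low 31 bits
 * and uses bit 31 (0x80000000) as the "someone is sleeping" flag, which
 * is why the release path above tests bit 30 to detect an underflowed
 * count.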
 */
	error = 0;
	while ((active = track->bk_active) != 0) {
		cpu_ccfence();
		desired = active | 0x80000000;
		tsleep_interlock(track, slp_flags);
		if (atomic_cmpset_int(&track->bk_active, active, desired)) {
			error = tsleep(track, slp_flags | PINTERLOCKED,
				       "trwait", slp_timo);
			if (error)
				break;
		}
	}
	return (error);
}

/*
 * bufinit:
 *
 *	Load time initialization of the buffer cache, called from
 *	machine-dependent initialization code.
 */
static
void
bufinit(void *dummy __unused)
{
	struct bufpcpu *pcpu;
	struct buf *bp;
	vm_offset_t bogus_offset;
	int i;
	int j;
	long n;

	/* next, make a null set of free lists */
	for (i = 0; i < ncpus; ++i) {
		pcpu = &bufpcpu[i];
		spin_init(&pcpu->spin, "bufinit");
		for (j = 0; j < BUFFER_QUEUES; j++)
			TAILQ_INIT(&pcpu->bufqueues[j]);
	}

	/*
	 * Finally, initialize each buffer header and stick on empty q.
	 * Each buffer gets its own KVA reservation.
	 */
	i = 0;
	pcpu = &bufpcpu[i];

	for (n = 0; n < nbuf; n++) {
		bp = &buf[n];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_cmd = BUF_CMD_DONE;
		bp->b_qindex = BQUEUE_EMPTY;
		bp->b_qcpu = i;
		bp->b_kvabase = (void *)(vm_map_min(&buffer_map) +
					 MAXBSIZE * n);
		bp->b_kvasize = MAXBSIZE;
		initbufbio(bp);
		xio_init(&bp->b_xio);
		buf_dep_init(bp);
		TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
				  bp, b_freelist);

		i = (i + 1) % ncpus;
		pcpu = &bufpcpu[i];
	}

	/*
	 * maxbufspace is the absolute maximum amount of buffer space we are
	 * allowed to reserve in KVM and in real terms.  The absolute maximum
	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
	 * used by most other processes.  The differential is required to
	 * ensure that buf_daemon is able to run when other processes might
	 * be blocked waiting for buffer space.
	 *
	 * Calculate hysteresis (lobufspace, hibufspace).  Don't make it
	 * too large or we might lockup a cpu for too long a period of
	 * time in our tight loop.
	 */
	maxbufspace = nbuf * NBUFCALCSIZE;
	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
	lobufspace = hibufspace * 7 / 8;
	if (hibufspace - lobufspace > 64 * 1024 * 1024)
		lobufspace = hibufspace - 64 * 1024 * 1024;
	if (lobufspace > hibufspace - MAXBSIZE)
		lobufspace = hibufspace - MAXBSIZE;

	lorunningspace = 512 * 1024;
	/* hirunningspace -- see below */

	/*
	 * Limit the amount of malloc memory since it is wired permanently
	 * into the kernel space.  Even though this is accounted for in
	 * the buffer allocation, we don't want the malloced region to grow
	 * uncontrolled.  The malloc scheme improves memory utilization
	 * significantly on average (small) directories.
	 */
	maxbufmallocspace = hibufspace / 20;

	/*
	 * Reduce the chance of a deadlock occurring by limiting the number
	 * of delayed-write dirty buffers we allow to stack up.
	 *
	 * We don't want too much actually queued to the device at once
	 * (XXX this needs to be per-mount!), because the buffers will
	 * wind up locked for a very long period of time while the I/O
	 * drains.
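	 *
	 * As a rough example of how the derived watermarks below relate
	 * (the actual values depend on nbuf): with hibufspace at 512MB the
	 * assignments yield hidirtybufspace = 256MB, lodirtybufspace =
	 * 128MB and hirunningspace = 32MB (never less than 1MB).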
	 */
	hidirtybufspace = hibufspace / 2;	/* dirty + running */
	hirunningspace = hibufspace / 16;	/* locked & queued to device */
	if (hirunningspace < 1024 * 1024)
		hirunningspace = 1024 * 1024;

	dirtykvaspace = 0;
	dirtybufspace = 0;
	dirtybufspacehw = 0;

	lodirtybufspace = hidirtybufspace / 2;

	/*
	 * Maximum number of async ops initiated per buf_daemon loop.  This is
	 * somewhat of a hack at the moment, we really need to limit ourselves
	 * based on the number of bytes of I/O in-transit that were initiated
	 * from buf_daemon.
	 */

	bogus_offset = kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
	vm_object_hold(&kernel_object);
	bogus_page = vm_page_alloc(&kernel_object,
				   (bogus_offset >> PAGE_SHIFT),
				   VM_ALLOC_NORMAL);
	vm_object_drop(&kernel_object);
	vmstats.v_wire_count++;

}

SYSINIT(do_bufinit, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, bufinit, NULL);

/*
 * Initialize the embedded bio structures, typically used by
 * deprecated code which tries to allocate its own struct bufs.
 */
void
initbufbio(struct buf *bp)
{
	bp->b_bio1.bio_buf = bp;
	bp->b_bio1.bio_prev = NULL;
	bp->b_bio1.bio_offset = NOOFFSET;
	bp->b_bio1.bio_next = &bp->b_bio2;
	bp->b_bio1.bio_done = NULL;
	bp->b_bio1.bio_flags = 0;

	bp->b_bio2.bio_buf = bp;
	bp->b_bio2.bio_prev = &bp->b_bio1;
	bp->b_bio2.bio_offset = NOOFFSET;
	bp->b_bio2.bio_next = NULL;
	bp->b_bio2.bio_done = NULL;
	bp->b_bio2.bio_flags = 0;

	BUF_LOCKINIT(bp);
}

/*
 * Reinitialize the embedded bio structures as well as any additional
 * translation cache layers.
 */
void
reinitbufbio(struct buf *bp)
{
	struct bio *bio;

	for (bio = &bp->b_bio1; bio; bio = bio->bio_next) {
		bio->bio_done = NULL;
		bio->bio_offset = NOOFFSET;
	}
}

/*
 * Undo the effects of an initbufbio().
 */
void
uninitbufbio(struct buf *bp)
{
	dsched_buf_exit(bp);
	BUF_LOCKFREE(bp);
}

/*
 * Push another BIO layer onto an existing BIO and return it.  The new
 * BIO layer may already exist, holding cached translation data.
 */
struct bio *
push_bio(struct bio *bio)
{
	struct bio *nbio;

	if ((nbio = bio->bio_next) == NULL) {
		int index = bio - &bio->bio_buf->b_bio_array[0];
		if (index >= NBUF_BIO - 1) {
			panic("push_bio: too many layers %d for bp %p",
			      index, bio->bio_buf);
		}
		nbio = &bio->bio_buf->b_bio_array[index + 1];
		bio->bio_next = nbio;
		nbio->bio_prev = bio;
		nbio->bio_buf = bio->bio_buf;
		nbio->bio_offset = NOOFFSET;
		nbio->bio_done = NULL;
		nbio->bio_next = NULL;
	}
	KKASSERT(nbio->bio_done == NULL);
	return(nbio);
}

/*
 * Pop a BIO translation layer, returning the previous layer.  The BIO
 * must have been previously pushed.
 */
struct bio *
pop_bio(struct bio *bio)
{
	return(bio->bio_prev);
}

void
clearbiocache(struct bio *bio)
{
	while (bio) {
		bio->bio_offset = NOOFFSET;
		bio = bio->bio_next;
	}
}

/*
 * Remove the buffer from the appropriate free list.
 * (the caller must hold the per-cpu queue spin lock)
 */
static __inline void
_bremfree(struct buf *bp)
{
	struct bufpcpu *pcpu = &bufpcpu[bp->b_qcpu];

	if (bp->b_qindex != BQUEUE_NONE) {
		KASSERT(BUF_REFCNTNB(bp) == 1,
			("bremfree: bp %p not locked", bp));
		TAILQ_REMOVE(&pcpu->bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = BQUEUE_NONE;
	} else {
		if (BUF_REFCNTNB(bp) <= 1)
			panic("bremfree: removing a buffer not on a queue");
	}
}

/*
 * bremfree() - must be called with a locked buffer
 */
void
bremfree(struct buf *bp)
{
	struct bufpcpu *pcpu = &bufpcpu[bp->b_qcpu];

	spin_lock(&pcpu->spin);
	_bremfree(bp);
	spin_unlock(&pcpu->spin);
}

/*
 * bremfree_locked - must be called with pcpu->spin locked
 */
static void
bremfree_locked(struct buf *bp)
{
	_bremfree(bp);
}

/*
 * This version of bread issues any required I/O asynchronously and
 * makes a callback on completion.
 *
 * The callback must check whether BIO_DONE is set in the bio and issue
 * bpdone(bp, 0) if it isn't.  The callback is responsible for clearing
 * BIO_DONE and disposing of the I/O (bqrelse()ing it).
 */
void
breadcb(struct vnode *vp, off_t loffset, int size,
	void (*func)(struct bio *), void *arg)
{
	struct buf *bp;

	bp = getblk(vp, loffset, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
		bp->b_cmd = BUF_CMD_READ;
		bp->b_bio1.bio_done = func;
		bp->b_bio1.bio_caller_info1.ptr = arg;
		vfs_busy_pages(vp, bp);
		BUF_KERNPROC(bp);
		vn_strategy(vp, &bp->b_bio1);
	} else if (func) {
		/*
		 * Since we are issuing the callback synchronously it cannot
		 * race the BIO_DONE, so no need for atomic ops here.
		 */
		/*bp->b_bio1.bio_done = func;*/
		bp->b_bio1.bio_caller_info1.ptr = arg;
		bp->b_bio1.bio_flags |= BIO_DONE;
		func(&bp->b_bio1);
	} else {
		bqrelse(bp);
	}
}

/*
 * breadnx() - Terminal function for bread() and breadn().
 *
 * This function will start asynchronous I/O on read-ahead blocks as well
 * as satisfy the primary request.
 *
 * We must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is
 * set, the buffer is valid and we do not have to do anything.
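 *
 * A minimal usage sketch (illustrative only, error handling elided);
 * callers normally go through the bread()/breadn() wrappers rather than
 * calling breadnx() directly:
 *
 *	struct buf *bp;
 *
 *	error = bread(vp, loffset, size, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	(consume bp->b_data)
 *	bqrelse(bp);		(keep the data cached for re-use)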
926 */ 927 int 928 breadnx(struct vnode *vp, off_t loffset, int size, off_t *raoffset, 929 int *rabsize, int cnt, struct buf **bpp) 930 { 931 struct buf *bp, *rabp; 932 int i; 933 int rv = 0, readwait = 0; 934 935 if (*bpp) 936 bp = *bpp; 937 else 938 *bpp = bp = getblk(vp, loffset, size, 0, 0); 939 940 /* if not found in cache, do some I/O */ 941 if ((bp->b_flags & B_CACHE) == 0) { 942 bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); 943 bp->b_cmd = BUF_CMD_READ; 944 bp->b_bio1.bio_done = biodone_sync; 945 bp->b_bio1.bio_flags |= BIO_SYNC; 946 vfs_busy_pages(vp, bp); 947 vn_strategy(vp, &bp->b_bio1); 948 ++readwait; 949 } 950 951 for (i = 0; i < cnt; i++, raoffset++, rabsize++) { 952 if (inmem(vp, *raoffset)) 953 continue; 954 rabp = getblk(vp, *raoffset, *rabsize, 0, 0); 955 956 if ((rabp->b_flags & B_CACHE) == 0) { 957 rabp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); 958 rabp->b_cmd = BUF_CMD_READ; 959 vfs_busy_pages(vp, rabp); 960 BUF_KERNPROC(rabp); 961 vn_strategy(vp, &rabp->b_bio1); 962 } else { 963 brelse(rabp); 964 } 965 } 966 if (readwait) 967 rv = biowait(&bp->b_bio1, "biord"); 968 return (rv); 969 } 970 971 /* 972 * bwrite: 973 * 974 * Synchronous write, waits for completion. 975 * 976 * Write, release buffer on completion. (Done by iodone 977 * if async). Do not bother writing anything if the buffer 978 * is invalid. 979 * 980 * Note that we set B_CACHE here, indicating that buffer is 981 * fully valid and thus cacheable. This is true even of NFS 982 * now so we set it generally. This could be set either here 983 * or in biodone() since the I/O is synchronous. We put it 984 * here. 985 */ 986 int 987 bwrite(struct buf *bp) 988 { 989 int error; 990 991 if (bp->b_flags & B_INVAL) { 992 brelse(bp); 993 return (0); 994 } 995 if (BUF_REFCNTNB(bp) == 0) 996 panic("bwrite: buffer is not busy???"); 997 998 /* 999 * NOTE: We no longer mark the buffer clear prior to the vn_strategy() 1000 * call because it will remove the buffer from the vnode's 1001 * dirty buffer list prematurely and possibly cause filesystem 1002 * checks to race buffer flushes. This is now handled in 1003 * bpdone(). 1004 * 1005 * bundirty(bp); REMOVED 1006 */ 1007 1008 bp->b_flags &= ~(B_ERROR | B_EINTR); 1009 bp->b_flags |= B_CACHE; 1010 bp->b_cmd = BUF_CMD_WRITE; 1011 bp->b_bio1.bio_done = biodone_sync; 1012 bp->b_bio1.bio_flags |= BIO_SYNC; 1013 vfs_busy_pages(bp->b_vp, bp); 1014 1015 /* 1016 * Normal bwrites pipeline writes. NOTE: b_bufsize is only 1017 * valid for vnode-backed buffers. 1018 */ 1019 bsetrunningbufspace(bp, bp->b_bufsize); 1020 vn_strategy(bp->b_vp, &bp->b_bio1); 1021 error = biowait(&bp->b_bio1, "biows"); 1022 brelse(bp); 1023 1024 return (error); 1025 } 1026 1027 /* 1028 * bawrite: 1029 * 1030 * Asynchronous write. Start output on a buffer, but do not wait for 1031 * it to complete. The buffer is released when the output completes. 1032 * 1033 * bwrite() ( or the VOP routine anyway ) is responsible for handling 1034 * B_INVAL buffers. Not us. 1035 */ 1036 void 1037 bawrite(struct buf *bp) 1038 { 1039 if (bp->b_flags & B_INVAL) { 1040 brelse(bp); 1041 return; 1042 } 1043 if (BUF_REFCNTNB(bp) == 0) 1044 panic("bawrite: buffer is not busy???"); 1045 1046 /* 1047 * NOTE: We no longer mark the buffer clear prior to the vn_strategy() 1048 * call because it will remove the buffer from the vnode's 1049 * dirty buffer list prematurely and possibly cause filesystem 1050 * checks to race buffer flushes. This is now handled in 1051 * bpdone(). 
	 *
	 *	bundirty(bp); REMOVED
	 */
	bp->b_flags &= ~(B_ERROR | B_EINTR);
	bp->b_flags |= B_CACHE;
	bp->b_cmd = BUF_CMD_WRITE;
	KKASSERT(bp->b_bio1.bio_done == NULL);
	vfs_busy_pages(bp->b_vp, bp);

	/*
	 * Normal bwrites pipeline writes.  NOTE: b_bufsize is only
	 * valid for vnode-backed buffers.
	 */
	bsetrunningbufspace(bp, bp->b_bufsize);
	BUF_KERNPROC(bp);
	vn_strategy(bp->b_vp, &bp->b_bio1);
}

/*
 * bowrite:
 *
 *	Ordered write.  Start output on a buffer, and flag it so that the
 *	device will write it in the order it was queued.  The buffer is
 *	released when the output completes.  bwrite() ( or the VOP routine
 *	anyway ) is responsible for handling B_INVAL buffers.
 */
int
bowrite(struct buf *bp)
{
	bp->b_flags |= B_ORDERED;
	bawrite(bp);
	return (0);
}

/*
 * bdwrite:
 *
 *	Delayed write. (Buffer is marked dirty).  Do not bother writing
 *	anything if the buffer is marked invalid.
 *
 *	Note that since the buffer must be completely valid, we can safely
 *	set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 *	biodone() in order to prevent getblk from writing the buffer
 *	out synchronously.
 */
void
bdwrite(struct buf *bp)
{
	if (BUF_REFCNTNB(bp) == 0)
		panic("bdwrite: buffer is not busy");

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	bdirty(bp);

	dsched_buf_enter(bp);	/* might stack */

	/*
	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
	 * true even of NFS now.
	 */
	bp->b_flags |= B_CACHE;

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other data
	 * structure the filesystem needs -- is still in memory now, it is
	 * a good thing to do this.  Note also, that if the pageout daemon
	 * is requesting a sync -- there might not be enough memory to do
	 * the bmap then...  So, this is important to do.
	 */
	if (bp->b_bio2.bio_offset == NOOFFSET) {
		VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
			 NULL, NULL, BUF_CMD_WRITE);
	}

	/*
	 * Because the underlying pages may still be mapped and
	 * writable trying to set the dirty buffer (b_dirtyoff/end)
	 * range here will be inaccurate.
	 *
	 * However, we must still clean the pages to satisfy the
	 * vnode_pager and pageout daemon, so they think the pages
	 * have been "cleaned".  What has really occurred is that
	 * they've been earmarked for later writing by the buffer
	 * cache.
	 *
	 * So we get the b_dirtyoff/end update but will not actually
	 * depend on it (NFS that is) until the pages are busied for
	 * writing later on.
	 */
	vfs_clean_pages(bp);
	bqrelse(bp);

	/*
	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
	 * due to the softdep code.
	 */
}

/*
 * Fake write - return pages to VM system as dirty, leave the buffer clean.
 * This is used by tmpfs.
 *
 * It is important for any VFS using this routine to NOT use it for
 * IO_SYNC or IO_ASYNC operations which occur when the system really
 * wants to flush VM pages to backing store.
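 *
 * (No I/O is initiated here; the pages are simply flagged with
 * vm_page_need_commit() below and the buffer is requeued clean.)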
1162 */ 1163 void 1164 buwrite(struct buf *bp) 1165 { 1166 vm_page_t m; 1167 int i; 1168 1169 /* 1170 * Only works for VMIO buffers. If the buffer is already 1171 * marked for delayed-write we can't avoid the bdwrite(). 1172 */ 1173 if ((bp->b_flags & B_VMIO) == 0 || (bp->b_flags & B_DELWRI)) { 1174 bdwrite(bp); 1175 return; 1176 } 1177 1178 /* 1179 * Mark as needing a commit. 1180 */ 1181 for (i = 0; i < bp->b_xio.xio_npages; i++) { 1182 m = bp->b_xio.xio_pages[i]; 1183 vm_page_need_commit(m); 1184 } 1185 bqrelse(bp); 1186 } 1187 1188 /* 1189 * bdirty: 1190 * 1191 * Turn buffer into delayed write request by marking it B_DELWRI. 1192 * B_RELBUF and B_NOCACHE must be cleared. 1193 * 1194 * We reassign the buffer to itself to properly update it in the 1195 * dirty/clean lists. 1196 * 1197 * Must be called from a critical section. 1198 * The buffer must be on BQUEUE_NONE. 1199 */ 1200 void 1201 bdirty(struct buf *bp) 1202 { 1203 KASSERT(bp->b_qindex == BQUEUE_NONE, 1204 ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); 1205 if (bp->b_flags & B_NOCACHE) { 1206 kprintf("bdirty: clearing B_NOCACHE on buf %p\n", bp); 1207 bp->b_flags &= ~B_NOCACHE; 1208 } 1209 if (bp->b_flags & B_INVAL) { 1210 kprintf("bdirty: warning, dirtying invalid buffer %p\n", bp); 1211 } 1212 bp->b_flags &= ~B_RELBUF; 1213 1214 if ((bp->b_flags & B_DELWRI) == 0) { 1215 lwkt_gettoken(&bp->b_vp->v_token); 1216 bp->b_flags |= B_DELWRI; 1217 reassignbuf(bp); 1218 lwkt_reltoken(&bp->b_vp->v_token); 1219 1220 atomic_add_long(&dirtybufcount, 1); 1221 atomic_add_long(&dirtykvaspace, bp->b_kvasize); 1222 atomic_add_long(&dirtybufspace, bp->b_bufsize); 1223 if (bp->b_flags & B_HEAVY) { 1224 atomic_add_long(&dirtybufcounthw, 1); 1225 atomic_add_long(&dirtybufspacehw, bp->b_bufsize); 1226 } 1227 bd_heatup(); 1228 } 1229 } 1230 1231 /* 1232 * Set B_HEAVY, indicating that this is a heavy-weight buffer that 1233 * needs to be flushed with a different buf_daemon thread to avoid 1234 * deadlocks. B_HEAVY also imposes restrictions in getnewbuf(). 1235 */ 1236 void 1237 bheavy(struct buf *bp) 1238 { 1239 if ((bp->b_flags & B_HEAVY) == 0) { 1240 bp->b_flags |= B_HEAVY; 1241 if (bp->b_flags & B_DELWRI) { 1242 atomic_add_long(&dirtybufcounthw, 1); 1243 atomic_add_long(&dirtybufspacehw, bp->b_bufsize); 1244 } 1245 } 1246 } 1247 1248 /* 1249 * bundirty: 1250 * 1251 * Clear B_DELWRI for buffer. 1252 * 1253 * Must be called from a critical section. 1254 * 1255 * The buffer is typically on BQUEUE_NONE but there is one case in 1256 * brelse() that calls this function after placing the buffer on 1257 * a different queue. 1258 */ 1259 void 1260 bundirty(struct buf *bp) 1261 { 1262 if (bp->b_flags & B_DELWRI) { 1263 lwkt_gettoken(&bp->b_vp->v_token); 1264 bp->b_flags &= ~B_DELWRI; 1265 reassignbuf(bp); 1266 lwkt_reltoken(&bp->b_vp->v_token); 1267 1268 atomic_add_long(&dirtybufcount, -1); 1269 atomic_add_long(&dirtykvaspace, -bp->b_kvasize); 1270 atomic_add_long(&dirtybufspace, -bp->b_bufsize); 1271 if (bp->b_flags & B_HEAVY) { 1272 atomic_add_long(&dirtybufcounthw, -1); 1273 atomic_add_long(&dirtybufspacehw, -bp->b_bufsize); 1274 } 1275 bd_signal(bp->b_bufsize); 1276 } 1277 /* 1278 * Since it is now being written, we can clear its deferred write flag. 1279 */ 1280 bp->b_flags &= ~B_DEFERRED; 1281 } 1282 1283 /* 1284 * Set the b_runningbufspace field, used to track how much I/O is 1285 * in progress at any given moment. 
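 * The accounting performed here is reversed by runningbufwakeup() when
 * the I/O completes, which also kicks bd_signal() for threads sleeping
 * in bd_wait().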
1286 */ 1287 void 1288 bsetrunningbufspace(struct buf *bp, int bytes) 1289 { 1290 bp->b_runningbufspace = bytes; 1291 if (bytes) { 1292 atomic_add_long(&runningbufspace, bytes); 1293 atomic_add_long(&runningbufcount, 1); 1294 } 1295 } 1296 1297 /* 1298 * brelse: 1299 * 1300 * Release a busy buffer and, if requested, free its resources. The 1301 * buffer will be stashed in the appropriate bufqueue[] allowing it 1302 * to be accessed later as a cache entity or reused for other purposes. 1303 */ 1304 void 1305 brelse(struct buf *bp) 1306 { 1307 struct bufpcpu *pcpu; 1308 #ifdef INVARIANTS 1309 int saved_flags = bp->b_flags; 1310 #endif 1311 1312 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), 1313 ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1314 1315 /* 1316 * If B_NOCACHE is set we are being asked to destroy the buffer and 1317 * its backing store. Clear B_DELWRI. 1318 * 1319 * B_NOCACHE is set in two cases: (1) when the caller really wants 1320 * to destroy the buffer and backing store and (2) when the caller 1321 * wants to destroy the buffer and backing store after a write 1322 * completes. 1323 */ 1324 if ((bp->b_flags & (B_NOCACHE|B_DELWRI)) == (B_NOCACHE|B_DELWRI)) { 1325 bundirty(bp); 1326 } 1327 1328 if ((bp->b_flags & (B_INVAL | B_DELWRI)) == B_DELWRI) { 1329 /* 1330 * A re-dirtied buffer is only subject to destruction 1331 * by B_INVAL. B_ERROR and B_NOCACHE are ignored. 1332 */ 1333 /* leave buffer intact */ 1334 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || 1335 (bp->b_bufsize <= 0)) { 1336 /* 1337 * Either a failed read or we were asked to free or not 1338 * cache the buffer. This path is reached with B_DELWRI 1339 * set only if B_INVAL is already set. B_NOCACHE governs 1340 * backing store destruction. 1341 * 1342 * NOTE: HAMMER will set B_LOCKED in buf_deallocate if the 1343 * buffer cannot be immediately freed. 1344 */ 1345 bp->b_flags |= B_INVAL; 1346 if (LIST_FIRST(&bp->b_dep) != NULL) 1347 buf_deallocate(bp); 1348 if (bp->b_flags & B_DELWRI) { 1349 atomic_add_long(&dirtybufcount, -1); 1350 atomic_add_long(&dirtykvaspace, -bp->b_kvasize); 1351 atomic_add_long(&dirtybufspace, -bp->b_bufsize); 1352 if (bp->b_flags & B_HEAVY) { 1353 atomic_add_long(&dirtybufcounthw, -1); 1354 atomic_add_long(&dirtybufspacehw, 1355 -bp->b_bufsize); 1356 } 1357 bd_signal(bp->b_bufsize); 1358 } 1359 bp->b_flags &= ~(B_DELWRI | B_CACHE); 1360 } 1361 1362 /* 1363 * We must clear B_RELBUF if B_DELWRI or B_LOCKED is set, 1364 * or if b_refs is non-zero. 1365 * 1366 * If vfs_vmio_release() is called with either bit set, the 1367 * underlying pages may wind up getting freed causing a previous 1368 * write (bdwrite()) to get 'lost' because pages associated with 1369 * a B_DELWRI bp are marked clean. Pages associated with a 1370 * B_LOCKED buffer may be mapped by the filesystem. 1371 * 1372 * If we want to release the buffer ourselves (rather then the 1373 * originator asking us to release it), give the originator a 1374 * chance to countermand the release by setting B_LOCKED. 1375 * 1376 * We still allow the B_INVAL case to call vfs_vmio_release(), even 1377 * if B_DELWRI is set. 1378 * 1379 * If B_DELWRI is not set we may have to set B_RELBUF if we are low 1380 * on pages to return pages to the VM page queues. 
	 */
	if ((bp->b_flags & (B_DELWRI | B_LOCKED)) || bp->b_refs) {
		bp->b_flags &= ~B_RELBUF;
	} else if (vm_page_count_min(0)) {
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_deallocate(bp);		/* can set B_LOCKED */
		if (bp->b_flags & (B_DELWRI | B_LOCKED))
			bp->b_flags &= ~B_RELBUF;
		else
			bp->b_flags |= B_RELBUF;
	}

	/*
	 * Make sure b_cmd is clear.  It may have already been cleared by
	 * biodone().
	 *
	 * At this point destroying the buffer is governed by the B_INVAL
	 * or B_RELBUF flags.
	 */
	bp->b_cmd = BUF_CMD_DONE;
	dsched_buf_exit(bp);

	/*
	 * VMIO buffer rundown.  Make sure the VM page array is restored
	 * after an I/O may have replaced some of the pages with bogus pages
	 * in order to not destroy dirty pages in a fill-in read.
	 *
	 * Note that due to the code above, if a buffer is marked B_DELWRI
	 * then the B_RELBUF and B_NOCACHE bits will always be clear.
	 * B_INVAL may still be set, however.
	 *
	 * For clean buffers, B_INVAL or B_RELBUF will destroy the buffer
	 * but not the backing store.  B_NOCACHE will destroy the backing
	 * store.
	 *
	 * Note that dirty NFS buffers contain byte-granular write ranges
	 * and should not be destroyed w/ B_INVAL even if the backing store
	 * is left intact.
	 */
	if (bp->b_flags & B_VMIO) {
		/*
		 * Rundown for VMIO buffers which are not dirty NFS buffers.
		 */
		int i, j, resid;
		vm_page_t m;
		off_t foff;
		vm_pindex_t poff;
		vm_object_t obj;
		struct vnode *vp;

		vp = bp->b_vp;

		/*
		 * Get the base offset and length of the buffer.  Note that
		 * in the VMIO case if the buffer block size is not
		 * page-aligned then b_data pointer may not be page-aligned.
		 * But our b_xio.xio_pages array *IS* page aligned.
		 *
		 * block sizes less than DEV_BSIZE (usually 512) are not
		 * supported due to the page granularity bits (m->valid,
		 * m->dirty, etc...).
		 *
		 * See man buf(9) for more information
		 */

		resid = bp->b_bufsize;
		foff = bp->b_loffset;

		for (i = 0; i < bp->b_xio.xio_npages; i++) {
			m = bp->b_xio.xio_pages[i];

			/*
			 * If we hit a bogus page, fixup *all* of them
			 * now.  Note that we left these pages wired
			 * when we removed them so they had better exist,
			 * and they cannot be ripped out from under us so
			 * no critical section protection is necessary.
			 */
			if (m == bogus_page) {
				obj = vp->v_object;
				poff = OFF_TO_IDX(bp->b_loffset);

				vm_object_hold(obj);
				for (j = i; j < bp->b_xio.xio_npages; j++) {
					vm_page_t mtmp;

					mtmp = bp->b_xio.xio_pages[j];
					if (mtmp == bogus_page) {
						if ((bp->b_flags & B_HASBOGUS) == 0)
							panic("brelse: bp %p corrupt bogus", bp);
						mtmp = vm_page_lookup(obj, poff + j);
						if (!mtmp)
							panic("brelse: bp %p page %d missing", bp, j);
						bp->b_xio.xio_pages[j] = mtmp;
					}
				}
				vm_object_drop(obj);

				if ((bp->b_flags & B_HASBOGUS) ||
				    (bp->b_flags & B_INVAL) == 0) {
					pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
						    bp->b_xio.xio_pages,
						    bp->b_xio.xio_npages);
					bp->b_flags &= ~B_HASBOGUS;
				}
				m = bp->b_xio.xio_pages[i];
			}

			/*
			 * Invalidate the backing store if B_NOCACHE is set
			 * (e.g. used with vinvalbuf()).
If this is NFS 1490 * we impose a requirement that the block size be 1491 * a multiple of PAGE_SIZE and create a temporary 1492 * hack to basically invalidate the whole page. The 1493 * problem is that NFS uses really odd buffer sizes 1494 * especially when tracking piecemeal writes and 1495 * it also vinvalbuf()'s a lot, which would result 1496 * in only partial page validation and invalidation 1497 * here. If the file page is mmap()'d, however, 1498 * all the valid bits get set so after we invalidate 1499 * here we would end up with weird m->valid values 1500 * like 0xfc. nfs_getpages() can't handle this so 1501 * we clear all the valid bits for the NFS case 1502 * instead of just some of them. 1503 * 1504 * The real bug is the VM system having to set m->valid 1505 * to VM_PAGE_BITS_ALL for faulted-in pages, which 1506 * itself is an artifact of the whole 512-byte 1507 * granular mess that exists to support odd block 1508 * sizes and UFS meta-data block sizes (e.g. 6144). 1509 * A complete rewrite is required. 1510 * 1511 * XXX 1512 */ 1513 if (bp->b_flags & (B_NOCACHE|B_ERROR)) { 1514 int poffset = foff & PAGE_MASK; 1515 int presid; 1516 1517 presid = PAGE_SIZE - poffset; 1518 if (bp->b_vp->v_tag == VT_NFS && 1519 bp->b_vp->v_type == VREG) { 1520 ; /* entire page */ 1521 } else if (presid > resid) { 1522 presid = resid; 1523 } 1524 KASSERT(presid >= 0, ("brelse: extra page")); 1525 vm_page_set_invalid(m, poffset, presid); 1526 1527 /* 1528 * Also make sure any swap cache is removed 1529 * as it is now stale (HAMMER in particular 1530 * uses B_NOCACHE to deal with buffer 1531 * aliasing). 1532 */ 1533 swap_pager_unswapped(m); 1534 } 1535 resid -= PAGE_SIZE - (foff & PAGE_MASK); 1536 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 1537 } 1538 if (bp->b_flags & (B_INVAL | B_RELBUF)) 1539 vfs_vmio_release(bp); 1540 } else { 1541 /* 1542 * Rundown for non-VMIO buffers. 1543 */ 1544 if (bp->b_flags & (B_INVAL | B_RELBUF)) { 1545 if (bp->b_bufsize) 1546 allocbuf(bp, 0); 1547 KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); 1548 if (bp->b_vp) 1549 brelvp(bp); 1550 } 1551 } 1552 1553 if (bp->b_qindex != BQUEUE_NONE) 1554 panic("brelse: free buffer onto another queue???"); 1555 if (BUF_REFCNTNB(bp) > 1) { 1556 /* Temporary panic to verify exclusive locking */ 1557 /* This panic goes away when we allow shared refs */ 1558 panic("brelse: multiple refs"); 1559 /* NOT REACHED */ 1560 return; 1561 } 1562 1563 /* 1564 * Figure out the correct queue to place the cleaned up buffer on. 1565 * Buffers placed in the EMPTY or EMPTYKVA had better already be 1566 * disassociated from their vnode. 1567 * 1568 * Return the buffer to its original pcpu area 1569 */ 1570 pcpu = &bufpcpu[bp->b_qcpu]; 1571 spin_lock(&pcpu->spin); 1572 1573 if (bp->b_flags & B_LOCKED) { 1574 /* 1575 * Buffers that are locked are placed in the locked queue 1576 * immediately, regardless of their state. 1577 */ 1578 bp->b_qindex = BQUEUE_LOCKED; 1579 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex], 1580 bp, b_freelist); 1581 } else if (bp->b_bufsize == 0) { 1582 /* 1583 * Buffers with no memory. Due to conditionals near the top 1584 * of brelse() such buffers should probably already be 1585 * marked B_INVAL and disassociated from their vnode. 
1586 */ 1587 bp->b_flags |= B_INVAL; 1588 KASSERT(bp->b_vp == NULL, 1589 ("bp1 %p flags %08x/%08x vnode %p " 1590 "unexpectededly still associated!", 1591 bp, saved_flags, bp->b_flags, bp->b_vp)); 1592 KKASSERT((bp->b_flags & B_HASHED) == 0); 1593 bp->b_qindex = BQUEUE_EMPTY; 1594 TAILQ_INSERT_HEAD(&pcpu->bufqueues[bp->b_qindex], 1595 bp, b_freelist); 1596 } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) { 1597 /* 1598 * Buffers with junk contents. Again these buffers had better 1599 * already be disassociated from their vnode. 1600 */ 1601 KASSERT(bp->b_vp == NULL, 1602 ("bp2 %p flags %08x/%08x vnode %p unexpectededly " 1603 "still associated!", 1604 bp, saved_flags, bp->b_flags, bp->b_vp)); 1605 KKASSERT((bp->b_flags & B_HASHED) == 0); 1606 bp->b_flags |= B_INVAL; 1607 bp->b_qindex = BQUEUE_CLEAN; 1608 TAILQ_INSERT_HEAD(&pcpu->bufqueues[bp->b_qindex], 1609 bp, b_freelist); 1610 } else { 1611 /* 1612 * Remaining buffers. These buffers are still associated with 1613 * their vnode. 1614 */ 1615 switch(bp->b_flags & (B_DELWRI|B_HEAVY)) { 1616 case B_DELWRI: 1617 bp->b_qindex = BQUEUE_DIRTY; 1618 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex], 1619 bp, b_freelist); 1620 break; 1621 case B_DELWRI | B_HEAVY: 1622 bp->b_qindex = BQUEUE_DIRTY_HW; 1623 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex], 1624 bp, b_freelist); 1625 break; 1626 default: 1627 /* 1628 * NOTE: Buffers are always placed at the end of the 1629 * queue. If B_AGE is not set the buffer will cycle 1630 * through the queue twice. 1631 */ 1632 bp->b_qindex = BQUEUE_CLEAN; 1633 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex], 1634 bp, b_freelist); 1635 break; 1636 } 1637 } 1638 spin_unlock(&pcpu->spin); 1639 1640 /* 1641 * If B_INVAL, clear B_DELWRI. We've already placed the buffer 1642 * on the correct queue but we have not yet unlocked it. 1643 */ 1644 if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) 1645 bundirty(bp); 1646 1647 /* 1648 * The bp is on an appropriate queue unless locked. If it is not 1649 * locked or dirty we can wakeup threads waiting for buffer space. 1650 * 1651 * We've already handled the B_INVAL case ( B_DELWRI will be clear 1652 * if B_INVAL is set ). 1653 */ 1654 if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) 1655 bufcountwakeup(); 1656 1657 /* 1658 * Something we can maybe free or reuse 1659 */ 1660 if (bp->b_bufsize || bp->b_kvasize) 1661 bufspacewakeup(); 1662 1663 /* 1664 * Clean up temporary flags and unlock the buffer. 1665 */ 1666 bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF | B_DIRECT); 1667 BUF_UNLOCK(bp); 1668 } 1669 1670 /* 1671 * bqrelse: 1672 * 1673 * Release a buffer back to the appropriate queue but do not try to free 1674 * it. The buffer is expected to be used again soon. 1675 * 1676 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by 1677 * biodone() to requeue an async I/O on completion. It is also used when 1678 * known good buffers need to be requeued but we think we may need the data 1679 * again soon. 1680 * 1681 * XXX we should be able to leave the B_RELBUF hint set on completion. 
1682 */ 1683 void 1684 bqrelse(struct buf *bp) 1685 { 1686 struct bufpcpu *pcpu; 1687 1688 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), 1689 ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1690 1691 if (bp->b_qindex != BQUEUE_NONE) 1692 panic("bqrelse: free buffer onto another queue???"); 1693 if (BUF_REFCNTNB(bp) > 1) { 1694 /* do not release to free list */ 1695 panic("bqrelse: multiple refs"); 1696 return; 1697 } 1698 1699 buf_act_advance(bp); 1700 1701 pcpu = &bufpcpu[bp->b_qcpu]; 1702 spin_lock(&pcpu->spin); 1703 1704 if (bp->b_flags & B_LOCKED) { 1705 /* 1706 * Locked buffers are released to the locked queue. However, 1707 * if the buffer is dirty it will first go into the dirty 1708 * queue and later on after the I/O completes successfully it 1709 * will be released to the locked queue. 1710 */ 1711 bp->b_qindex = BQUEUE_LOCKED; 1712 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex], 1713 bp, b_freelist); 1714 } else if (bp->b_flags & B_DELWRI) { 1715 bp->b_qindex = (bp->b_flags & B_HEAVY) ? 1716 BQUEUE_DIRTY_HW : BQUEUE_DIRTY; 1717 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex], 1718 bp, b_freelist); 1719 } else if (vm_page_count_min(0)) { 1720 /* 1721 * We are too low on memory, we have to try to free the 1722 * buffer (most importantly: the wired pages making up its 1723 * backing store) *now*. 1724 */ 1725 spin_unlock(&pcpu->spin); 1726 brelse(bp); 1727 return; 1728 } else { 1729 bp->b_qindex = BQUEUE_CLEAN; 1730 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex], 1731 bp, b_freelist); 1732 } 1733 spin_unlock(&pcpu->spin); 1734 1735 /* 1736 * We have now placed the buffer on the proper queue, but have yet 1737 * to unlock it. 1738 */ 1739 if ((bp->b_flags & B_LOCKED) == 0 && 1740 ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0)) { 1741 bufcountwakeup(); 1742 } 1743 1744 /* 1745 * Something we can maybe free or reuse. 1746 */ 1747 if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) 1748 bufspacewakeup(); 1749 1750 /* 1751 * Final cleanup and unlock. Clear bits that are only used while a 1752 * buffer is actively locked. 1753 */ 1754 bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF); 1755 dsched_buf_exit(bp); 1756 BUF_UNLOCK(bp); 1757 } 1758 1759 /* 1760 * Hold a buffer, preventing it from being reused. This will prevent 1761 * normal B_RELBUF operations on the buffer but will not prevent B_INVAL 1762 * operations. If a B_INVAL operation occurs the buffer will remain held 1763 * but the underlying pages may get ripped out. 1764 * 1765 * These functions are typically used in VOP_READ/VOP_WRITE functions 1766 * to hold a buffer during a copyin or copyout, preventing deadlocks 1767 * or recursive lock panics when read()/write() is used over mmap()'d 1768 * space. 1769 * 1770 * NOTE: bqhold() requires that the buffer be locked at the time of the 1771 * hold. bqdrop() has no requirements other than the buffer having 1772 * previously been held. 1773 */ 1774 void 1775 bqhold(struct buf *bp) 1776 { 1777 atomic_add_int(&bp->b_refs, 1); 1778 } 1779 1780 void 1781 bqdrop(struct buf *bp) 1782 { 1783 KKASSERT(bp->b_refs > 0); 1784 atomic_add_int(&bp->b_refs, -1); 1785 } 1786 1787 /* 1788 * Return backing pages held by the buffer 'bp' back to the VM system. 1789 * This routine is called when the bp is invalidated, released, or 1790 * reused. 1791 * 1792 * The KVA mapping (b_data) for the underlying pages is removed by 1793 * this function. 1794 * 1795 * WARNING! This routine is integral to the low memory critical path 1796 * when a buffer is B_RELBUF'd. 
 *	    If the system has a severe page
 *	    deficit we need to get the page(s) onto the PQ_FREE or PQ_CACHE
 *	    queues so they can be reused in the current pageout daemon
 *	    pass.
 */
static void
vfs_vmio_release(struct buf *bp)
{
	int i;
	vm_page_t m;

	for (i = 0; i < bp->b_xio.xio_npages; i++) {
		m = bp->b_xio.xio_pages[i];
		bp->b_xio.xio_pages[i] = NULL;

		/*
		 * We need to own the page in order to safely unwire it.
		 */
		vm_page_busy_wait(m, FALSE, "vmiopg");

		/*
		 * The VFS is telling us this is not a meta-data buffer
		 * even if it is backed by a block device.
		 */
		if (bp->b_flags & B_NOTMETA)
			vm_page_flag_set(m, PG_NOTMETA);

		/*
		 * This is a very important bit of code.  We try to track
		 * VM page use whether the pages are wired into the buffer
		 * cache or not.  While wired into the buffer cache the
		 * bp tracks the act_count.
		 *
		 * We can choose to place unwired pages on the inactive
		 * queue (0) or active queue (1).  If we place too many
		 * on the active queue the queue will cycle the act_count
		 * on pages we'd like to keep, just from single-use pages
		 * (such as when doing a tar-up or file scan).
		 */
		if (bp->b_act_count < vm_cycle_point)
			vm_page_unwire(m, 0);
		else
			vm_page_unwire(m, 1);

		/*
		 * If the wire_count has dropped to 0 we may need to take
		 * further action before unbusying the page.
		 *
		 * WARNING: vm_page_try_*() also checks PG_NEED_COMMIT for us.
		 */
		if (m->wire_count == 0) {
			if (bp->b_flags & B_DIRECT) {
				/*
				 * Attempt to free the page if B_DIRECT is
				 * set, the caller does not desire the page
				 * to be cached.
				 */
				vm_page_wakeup(m);
				vm_page_try_to_free(m);
			} else if ((bp->b_flags & B_NOTMETA) ||
				   vm_page_count_min(0)) {
				/*
				 * Attempt to move the page to PQ_CACHE
				 * if B_NOTMETA is set.  This flag is set
				 * by HAMMER to remove one of the two pages
				 * present when double buffering is enabled.
				 *
				 * Attempt to move the page to PQ_CACHE
				 * if we have a severe page deficit.  This
				 * will cause buffer cache operations related
				 * to pageouts to recycle the related pages
				 * in order to avoid a low memory deadlock.
				 */
				m->act_count = bp->b_act_count;
				vm_page_wakeup(m);
				vm_page_try_to_cache(m);
			} else {
				/*
				 * Nominal case, leave the page on the
				 * queue the original unwiring placed it on
				 * (active or inactive).
				 */
				m->act_count = bp->b_act_count;
				vm_page_wakeup(m);
			}
		} else {
			vm_page_wakeup(m);
		}
	}

	/*
	 * Zero out the pmap pte's for the mapping, but don't bother
	 * invalidating the TLB.  The range will be properly invalidated
	 * when new pages are entered into the mapping.
	 *
	 * This in particular reduces tmpfs tear-down overhead and reduces
	 * buffer cache re-use overhead (one invalidation sequence instead
	 * of two per re-use).
1894 */ 1895 pmap_qremove_noinval(trunc_page((vm_offset_t) bp->b_data), 1896 bp->b_xio.xio_npages); 1897 if (bp->b_bufsize) { 1898 atomic_add_long(&bufspace, -bp->b_bufsize); 1899 bp->b_bufsize = 0; 1900 bufspacewakeup(); 1901 } 1902 bp->b_xio.xio_npages = 0; 1903 bp->b_flags &= ~B_VMIO; 1904 KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); 1905 if (bp->b_vp) 1906 brelvp(bp); 1907 } 1908 1909 /* 1910 * Find and initialize a new buffer header, freeing up existing buffers 1911 * in the bufqueues as necessary. The new buffer is returned locked. 1912 * 1913 * If repurpose is non-NULL getnewbuf() is allowed to re-purpose an existing 1914 * buffer. The buffer will be disassociated, its page and page mappings 1915 * left intact, and returned with *repurpose set to 1. Else *repurpose is set 1916 * to 0. If 1, the caller must repurpose the underlying VM pages. 1917 * 1918 * If repurpose is NULL getnewbuf() is not allowed to re-purpose an 1919 * existing buffer. That is, it must completely initialize the returned 1920 * buffer. 1921 * 1922 * Important: B_INVAL is not set. If the caller wishes to throw the 1923 * buffer away, the caller must set B_INVAL prior to calling brelse(). 1924 * 1925 * We block if: 1926 * We have insufficient buffer headers 1927 * We have insufficient buffer space 1928 * 1929 * To avoid VFS layer recursion we do not flush dirty buffers ourselves. 1930 * Instead we ask the buf daemon to do it for us. We attempt to 1931 * avoid piecemeal wakeups of the pageout daemon. 1932 */ 1933 struct buf * 1934 getnewbuf(int blkflags, int slptimeo, int size, int maxsize, 1935 struct vm_object **repurposep) 1936 { 1937 struct bufpcpu *pcpu; 1938 struct buf *bp; 1939 struct buf *nbp; 1940 int nqindex; 1941 int nqcpu; 1942 int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0; 1943 int maxloops = 200000; 1944 int restart_reason = 0; 1945 struct buf *restart_bp = NULL; 1946 static char flushingbufs[MAXCPU]; 1947 char *flushingp; 1948 1949 /* 1950 * We can't afford to block since we might be holding a vnode lock, 1951 * which may prevent system daemons from running. We deal with 1952 * low-memory situations by proactively returning memory and running 1953 * async I/O rather then sync I/O. 1954 */ 1955 1956 ++getnewbufcalls; 1957 nqcpu = mycpu->gd_cpuid; 1958 flushingp = &flushingbufs[nqcpu]; 1959 restart: 1960 if (bufspace < lobufspace) 1961 *flushingp = 0; 1962 1963 if (debug_bufbio && --maxloops == 0) 1964 panic("getnewbuf, excessive loops on cpu %d restart %d (%p)", 1965 mycpu->gd_cpuid, restart_reason, restart_bp); 1966 1967 /* 1968 * Setup for scan. If we do not have enough free buffers, 1969 * we setup a degenerate case that immediately fails. Note 1970 * that if we are specially marked process, we are allowed to 1971 * dip into our reserves. 1972 * 1973 * The scanning sequence is nominally: EMPTY->CLEAN 1974 */ 1975 pcpu = &bufpcpu[nqcpu]; 1976 spin_lock(&pcpu->spin); 1977 1978 /* 1979 * Determine if repurposing should be disallowed. Generally speaking 1980 * do not repurpose buffers if the buffer cache hasn't capped. Also 1981 * control repurposing based on buffer-cache -> main-memory bandwidth. 1982 * That is, we want to recycle buffers normally up until the buffer 1983 * cache bandwidth (new-buffer bw) exceeds bufcache_bw. 
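 *
 * Concretely: the code below adds this request's size to bufcache_bw_accum
 * (the accumulator is zeroed roughly once per second), and repurposep is
 * forced back to NULL while bufspace is still below lobufspace or the
 * accumulated bandwidth has not yet exceeded bufcache_bw.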
1984 * 1985 * (This is heuristical, SMP collisions are ok) 1986 */ 1987 if (repurposep) { 1988 int delta = ticks - bufcache_bw_ticks; 1989 if (delta < 0 || delta >= hz) { 1990 atomic_swap_long(&bufcache_bw_accum, 0); 1991 atomic_swap_int(&bufcache_bw_ticks, ticks); 1992 } 1993 atomic_add_long(&bufcache_bw_accum, size); 1994 if (bufspace < lobufspace) { 1995 repurposep = NULL; 1996 } else if (bufcache_bw_accum < bufcache_bw) { 1997 repurposep = NULL; 1998 } 1999 } 2000 2001 /* 2002 * Prime the scan for this cpu. Locate the first buffer to 2003 * check. If we are flushing buffers we must skip the 2004 * EMPTY queue. 2005 */ 2006 nqindex = BQUEUE_EMPTY; 2007 nbp = TAILQ_FIRST(&pcpu->bufqueues[BQUEUE_EMPTY]); 2008 if (nbp == NULL || *flushingp || repurposep) { 2009 nqindex = BQUEUE_CLEAN; 2010 nbp = TAILQ_FIRST(&pcpu->bufqueues[BQUEUE_CLEAN]); 2011 } 2012 2013 /* 2014 * Run scan, possibly freeing data and/or kva mappings on the fly, 2015 * depending. 2016 * 2017 * WARNING! spin is held! 2018 */ 2019 while ((bp = nbp) != NULL) { 2020 int qindex = nqindex; 2021 2022 nbp = TAILQ_NEXT(bp, b_freelist); 2023 2024 /* 2025 * BQUEUE_CLEAN - B_AGE special case. If not set the bp 2026 * cycles through the queue twice before being selected. 2027 */ 2028 if (qindex == BQUEUE_CLEAN && 2029 (bp->b_flags & B_AGE) == 0 && nbp) { 2030 bp->b_flags |= B_AGE; 2031 TAILQ_REMOVE(&pcpu->bufqueues[qindex], 2032 bp, b_freelist); 2033 TAILQ_INSERT_TAIL(&pcpu->bufqueues[qindex], 2034 bp, b_freelist); 2035 continue; 2036 } 2037 2038 /* 2039 * Calculate next bp ( we can only use it if we do not block 2040 * or do other fancy things ). 2041 */ 2042 if (nbp == NULL) { 2043 switch(qindex) { 2044 case BQUEUE_EMPTY: 2045 nqindex = BQUEUE_CLEAN; 2046 if ((nbp = TAILQ_FIRST(&pcpu->bufqueues[BQUEUE_CLEAN]))) 2047 break; 2048 /* fall through */ 2049 case BQUEUE_CLEAN: 2050 /* 2051 * nbp is NULL. 2052 */ 2053 break; 2054 } 2055 } 2056 2057 /* 2058 * Sanity Checks 2059 */ 2060 KASSERT(bp->b_qindex == qindex, 2061 ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); 2062 2063 /* 2064 * Note: we no longer distinguish between VMIO and non-VMIO 2065 * buffers. 2066 */ 2067 KASSERT((bp->b_flags & B_DELWRI) == 0, 2068 ("delwri buffer %p found in queue %d", bp, qindex)); 2069 2070 /* 2071 * Do not try to reuse a buffer with a non-zero b_refs. 2072 * This is an unsynchronized test. A synchronized test 2073 * is also performed after we lock the buffer. 2074 */ 2075 if (bp->b_refs) 2076 continue; 2077 2078 /* 2079 * Start freeing the bp. This is somewhat involved. nbp 2080 * remains valid only for BQUEUE_EMPTY bp's. Buffers 2081 * on the clean list must be disassociated from their 2082 * current vnode. Buffers on the empty lists have 2083 * already been disassociated. 2084 * 2085 * b_refs is checked after locking along with queue changes. 2086 * We must check here to deal with zero->nonzero transitions 2087 * made by the owner of the buffer lock, which is used by 2088 * VFS's to hold the buffer while issuing an unlocked 2089 * uiomove()s. We cannot invalidate the buffer's pages 2090 * for this case. Once we successfully lock a buffer the 2091 * only 0->1 transitions of b_refs will occur via findblk(). 2092 * 2093 * We must also check for queue changes after successful 2094 * locking as the current lock holder may dispose of the 2095 * buffer and change its queue. 
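 *
 * (Both checks are performed below: a failed non-blocking BUF_LOCK()
 * sleeps briefly and restarts the scan, and a queue or b_refs change
 * detected after locking unlocks the buffer and restarts.)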
2096 */ 2097 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 2098 spin_unlock(&pcpu->spin); 2099 tsleep(&bd_request, 0, "gnbxxx", (hz + 99) / 100); 2100 restart_reason = 1; 2101 restart_bp = bp; 2102 goto restart; 2103 } 2104 if (bp->b_qindex != qindex || bp->b_refs) { 2105 spin_unlock(&pcpu->spin); 2106 BUF_UNLOCK(bp); 2107 restart_reason = 2; 2108 restart_bp = bp; 2109 goto restart; 2110 } 2111 bremfree_locked(bp); 2112 spin_unlock(&pcpu->spin); 2113 2114 /* 2115 * Dependancies must be handled before we disassociate the 2116 * vnode. 2117 * 2118 * NOTE: HAMMER will set B_LOCKED if the buffer cannot 2119 * be immediately disassociated. HAMMER then becomes 2120 * responsible for releasing the buffer. 2121 * 2122 * NOTE: spin is UNLOCKED now. 2123 */ 2124 if (LIST_FIRST(&bp->b_dep) != NULL) { 2125 buf_deallocate(bp); 2126 if (bp->b_flags & B_LOCKED) { 2127 bqrelse(bp); 2128 restart_reason = 3; 2129 restart_bp = bp; 2130 goto restart; 2131 } 2132 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2133 } 2134 2135 /* 2136 * CLEAN buffers have content or associations that must be 2137 * cleaned out if not repurposing. 2138 */ 2139 if (qindex == BQUEUE_CLEAN) { 2140 if (bp->b_flags & B_VMIO) { 2141 if (repurpose_enable && 2142 repurposep && bp->b_bufsize && 2143 (bp->b_flags & (B_DELWRI | B_MALLOC)) == 0) { 2144 *repurposep = bp->b_vp->v_object; 2145 vm_object_hold(*repurposep); 2146 } else { 2147 vfs_vmio_release(bp); 2148 } 2149 } 2150 if (bp->b_vp) 2151 brelvp(bp); 2152 } 2153 2154 /* 2155 * NOTE: nbp is now entirely invalid. We can only restart 2156 * the scan from this point on. 2157 * 2158 * Get the rest of the buffer freed up. b_kva* is still 2159 * valid after this operation. 2160 */ 2161 KASSERT(bp->b_vp == NULL, 2162 ("bp3 %p flags %08x vnode %p qindex %d " 2163 "unexpectededly still associated!", 2164 bp, bp->b_flags, bp->b_vp, qindex)); 2165 KKASSERT((bp->b_flags & B_HASHED) == 0); 2166 2167 if (repurposep == NULL || *repurposep == NULL) { 2168 if (bp->b_bufsize) 2169 allocbuf(bp, 0); 2170 } 2171 2172 if (bp->b_flags & (B_VNDIRTY | B_VNCLEAN | B_HASHED)) { 2173 kprintf("getnewbuf: caught bug vp queue " 2174 "%p/%08x qidx %d\n", 2175 bp, bp->b_flags, qindex); 2176 brelvp(bp); 2177 } 2178 bp->b_flags = B_BNOCLIP; 2179 bp->b_cmd = BUF_CMD_DONE; 2180 bp->b_vp = NULL; 2181 bp->b_error = 0; 2182 bp->b_resid = 0; 2183 bp->b_bcount = 0; 2184 if (repurposep == NULL || *repurposep == NULL) 2185 bp->b_xio.xio_npages = 0; 2186 bp->b_dirtyoff = bp->b_dirtyend = 0; 2187 bp->b_act_count = ACT_INIT; 2188 reinitbufbio(bp); 2189 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2190 buf_dep_init(bp); 2191 if (blkflags & GETBLK_BHEAVY) 2192 bp->b_flags |= B_HEAVY; 2193 2194 if (bufspace >= hibufspace) 2195 *flushingp = 1; 2196 if (bufspace < lobufspace) 2197 *flushingp = 0; 2198 if (*flushingp) { 2199 if (repurposep && *repurposep != NULL) { 2200 bp->b_flags |= B_VMIO; 2201 vfs_vmio_release(bp); 2202 if (bp->b_bufsize) 2203 allocbuf(bp, 0); 2204 vm_object_drop(*repurposep); 2205 *repurposep = NULL; 2206 } 2207 bp->b_flags |= B_INVAL; 2208 brelse(bp); 2209 restart_reason = 5; 2210 restart_bp = bp; 2211 goto restart; 2212 } 2213 2214 /* 2215 * b_refs can transition to a non-zero value while we hold 2216 * the buffer locked due to a findblk(). Our brelvp() above 2217 * interlocked any future possible transitions due to 2218 * findblk()s. 2219 * 2220 * If we find b_refs to be non-zero we can destroy the 2221 * buffer's contents but we cannot yet reuse the buffer. 
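 *
 * (If b_refs is found to be non-zero below, any held repurpose object is
 * dropped, the buffer is marked B_INVAL and brelse()'d, and the scan is
 * restarted rather than reusing it.)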
2222 */ 2223 if (bp->b_refs) { 2224 if (repurposep && *repurposep != NULL) { 2225 bp->b_flags |= B_VMIO; 2226 vfs_vmio_release(bp); 2227 if (bp->b_bufsize) 2228 allocbuf(bp, 0); 2229 vm_object_drop(*repurposep); 2230 *repurposep = NULL; 2231 } 2232 bp->b_flags |= B_INVAL; 2233 brelse(bp); 2234 restart_reason = 6; 2235 restart_bp = bp; 2236 2237 goto restart; 2238 } 2239 2240 /* 2241 * We found our buffer! 2242 */ 2243 break; 2244 } 2245 2246 /* 2247 * If we exhausted our list, iterate other cpus. If that fails, 2248 * sleep as appropriate. We may have to wakeup various daemons 2249 * and write out some dirty buffers. 2250 * 2251 * Generally we are sleeping due to insufficient buffer space. 2252 * 2253 * NOTE: spin is held if bp is NULL, else it is not held. 2254 */ 2255 if (bp == NULL) { 2256 int flags; 2257 char *waitmsg; 2258 2259 spin_unlock(&pcpu->spin); 2260 2261 nqcpu = (nqcpu + 1) % ncpus; 2262 if (nqcpu != mycpu->gd_cpuid) { 2263 restart_reason = 7; 2264 restart_bp = bp; 2265 goto restart; 2266 } 2267 2268 if (bufspace >= hibufspace) { 2269 waitmsg = "bufspc"; 2270 flags = VFS_BIO_NEED_BUFSPACE; 2271 } else { 2272 waitmsg = "newbuf"; 2273 flags = VFS_BIO_NEED_ANY; 2274 } 2275 2276 bd_speedup(); /* heeeelp */ 2277 atomic_set_int(&needsbuffer, flags); 2278 while (needsbuffer & flags) { 2279 int value; 2280 2281 tsleep_interlock(&needsbuffer, 0); 2282 value = atomic_fetchadd_int(&needsbuffer, 0); 2283 if (value & flags) { 2284 if (tsleep(&needsbuffer, PINTERLOCKED|slpflags, 2285 waitmsg, slptimeo)) { 2286 return (NULL); 2287 } 2288 } 2289 } 2290 } else { 2291 /* 2292 * We finally have a valid bp. Reset b_data. 2293 * 2294 * (spin is not held) 2295 */ 2296 bp->b_data = bp->b_kvabase; 2297 } 2298 return(bp); 2299 } 2300 2301 /* 2302 * buf_daemon: 2303 * 2304 * Buffer flushing daemon. Buffers are normally flushed by the 2305 * update daemon but if it cannot keep up this process starts to 2306 * take the load in an attempt to prevent getnewbuf() from blocking. 2307 * 2308 * Once a flush is initiated it does not stop until the number 2309 * of buffers falls below lodirtybuffers, but we will wake up anyone 2310 * waiting at the mid-point. 2311 */ 2312 static struct kproc_desc buf_kp = { 2313 "bufdaemon", 2314 buf_daemon, 2315 &bufdaemon_td 2316 }; 2317 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, 2318 kproc_start, &buf_kp); 2319 2320 static struct kproc_desc bufhw_kp = { 2321 "bufdaemon_hw", 2322 buf_daemon_hw, 2323 &bufdaemonhw_td 2324 }; 2325 SYSINIT(bufdaemon_hw, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, 2326 kproc_start, &bufhw_kp); 2327 2328 static void 2329 buf_daemon1(struct thread *td, int queue, int (*buf_limit_fn)(long), 2330 int *bd_req) 2331 { 2332 long limit; 2333 struct buf *marker; 2334 2335 marker = kmalloc(sizeof(*marker), M_BIOBUF, M_WAITOK | M_ZERO); 2336 marker->b_flags |= B_MARKER; 2337 marker->b_qindex = BQUEUE_NONE; 2338 marker->b_qcpu = 0; 2339 2340 /* 2341 * This process needs to be suspended prior to shutdown sync. 2342 */ 2343 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, 2344 td, SHUTDOWN_PRI_LAST); 2345 curthread->td_flags |= TDF_SYSTHREAD; 2346 2347 /* 2348 * This process is allowed to take the buffer cache to the limit 2349 */ 2350 for (;;) { 2351 kproc_suspend_loop(); 2352 2353 /* 2354 * Do the flush as long as the number of dirty buffers 2355 * (including those running) exceeds lodirtybufspace. 2356 * 2357 * When flushing limit running I/O to hirunningspace 2358 * Do the flush. 
Limit the amount of in-transit I/O we 2359 * allow to build up, otherwise we would completely saturate 2360 * the I/O system. Wakeup any waiting processes before we 2361 * normally would so they can run in parallel with our drain. 2362 * 2363 * Our aggregate normal+HW lo water mark is lodirtybufspace, 2364 * but because we split the operation into two threads we 2365 * have to cut it in half for each thread. 2366 */ 2367 waitrunningbufspace(); 2368 limit = lodirtybufspace / 2; 2369 while (buf_limit_fn(limit)) { 2370 if (flushbufqueues(marker, queue) == 0) 2371 break; 2372 if (runningbufspace < hirunningspace) 2373 continue; 2374 waitrunningbufspace(); 2375 } 2376 2377 /* 2378 * We reached our low water mark, reset the 2379 * request and sleep until we are needed again. 2380 * The sleep is just so the suspend code works. 2381 */ 2382 tsleep_interlock(bd_req, 0); 2383 if (atomic_swap_int(bd_req, 0) == 0) 2384 tsleep(bd_req, PINTERLOCKED, "psleep", hz); 2385 } 2386 /* NOT REACHED */ 2387 /*kfree(marker, M_BIOBUF);*/ 2388 } 2389 2390 static int 2391 buf_daemon_limit(long limit) 2392 { 2393 return (runningbufspace + dirtykvaspace > limit || 2394 dirtybufcount - dirtybufcounthw >= nbuf / 2); 2395 } 2396 2397 static int 2398 buf_daemon_hw_limit(long limit) 2399 { 2400 return (runningbufspace + dirtykvaspace > limit || 2401 dirtybufcounthw >= nbuf / 2); 2402 } 2403 2404 static void 2405 buf_daemon(void) 2406 { 2407 buf_daemon1(bufdaemon_td, BQUEUE_DIRTY, buf_daemon_limit, 2408 &bd_request); 2409 } 2410 2411 static void 2412 buf_daemon_hw(void) 2413 { 2414 buf_daemon1(bufdaemonhw_td, BQUEUE_DIRTY_HW, buf_daemon_hw_limit, 2415 &bd_request_hw); 2416 } 2417 2418 /* 2419 * Flush up to (flushperqueue) buffers in the dirty queue. Each cpu has a 2420 * localized version of the queue. Each call made to this function iterates 2421 * to another cpu. It is desireable to flush several buffers from the same 2422 * cpu's queue at once, as these are likely going to be linear. 2423 * 2424 * We must be careful to free up B_INVAL buffers instead of write them, which 2425 * NFS is particularly sensitive to. 2426 * 2427 * B_RELBUF may only be set by VFSs. We do set B_AGE to indicate that we 2428 * really want to try to get the buffer out and reuse it due to the write 2429 * load on the machine. 2430 * 2431 * We must lock the buffer in order to check its validity before we can mess 2432 * with its contents. spin isn't enough. 2433 */ 2434 static int 2435 flushbufqueues(struct buf *marker, bufq_type_t q) 2436 { 2437 struct bufpcpu *pcpu; 2438 struct buf *bp; 2439 int r = 0; 2440 u_int loops = flushperqueue; 2441 int lcpu = marker->b_qcpu; 2442 2443 KKASSERT(marker->b_qindex == BQUEUE_NONE); 2444 KKASSERT(marker->b_flags & B_MARKER); 2445 2446 again: 2447 /* 2448 * Spinlock needed to perform operations on the queue and may be 2449 * held through a non-blocking BUF_LOCK(), but cannot be held when 2450 * BUF_UNLOCK()ing or through any other major operation. 
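 *
 * The caller-supplied B_MARKER buf is kept on the queue so the scan
 * position survives the points where the spinlock must be dropped; it is
 * moved to just after each buffer we select (TAILQ_INSERT_AFTER) and the
 * scan resumes from the marker after relocking.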
2451 */ 2452 pcpu = &bufpcpu[marker->b_qcpu]; 2453 spin_lock(&pcpu->spin); 2454 marker->b_qindex = q; 2455 TAILQ_INSERT_HEAD(&pcpu->bufqueues[q], marker, b_freelist); 2456 bp = marker; 2457 2458 while ((bp = TAILQ_NEXT(bp, b_freelist)) != NULL) { 2459 /* 2460 * NOTE: spinlock is always held at the top of the loop 2461 */ 2462 if (bp->b_flags & B_MARKER) 2463 continue; 2464 if ((bp->b_flags & B_DELWRI) == 0) { 2465 kprintf("Unexpected clean buffer %p\n", bp); 2466 continue; 2467 } 2468 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) 2469 continue; 2470 KKASSERT(bp->b_qcpu == marker->b_qcpu && bp->b_qindex == q); 2471 2472 /* 2473 * Once the buffer is locked we will have no choice but to 2474 * unlock the spinlock around a later BUF_UNLOCK and re-set 2475 * bp = marker when looping. Move the marker now to make 2476 * things easier. 2477 */ 2478 TAILQ_REMOVE(&pcpu->bufqueues[q], marker, b_freelist); 2479 TAILQ_INSERT_AFTER(&pcpu->bufqueues[q], bp, marker, b_freelist); 2480 2481 /* 2482 * Must recheck B_DELWRI after successfully locking 2483 * the buffer. 2484 */ 2485 if ((bp->b_flags & B_DELWRI) == 0) { 2486 spin_unlock(&pcpu->spin); 2487 BUF_UNLOCK(bp); 2488 spin_lock(&pcpu->spin); 2489 bp = marker; 2490 continue; 2491 } 2492 2493 /* 2494 * Remove the buffer from its queue. We still own the 2495 * spinlock here. 2496 */ 2497 _bremfree(bp); 2498 2499 /* 2500 * Disposing of an invalid buffer counts as a flush op 2501 */ 2502 if (bp->b_flags & B_INVAL) { 2503 spin_unlock(&pcpu->spin); 2504 brelse(bp); 2505 goto doloop; 2506 } 2507 2508 /* 2509 * Release the spinlock for the more complex ops we 2510 * are now going to do. 2511 */ 2512 spin_unlock(&pcpu->spin); 2513 lwkt_yield(); 2514 2515 /* 2516 * This is a bit messy 2517 */ 2518 if (LIST_FIRST(&bp->b_dep) != NULL && 2519 (bp->b_flags & B_DEFERRED) == 0 && 2520 buf_countdeps(bp, 0)) { 2521 spin_lock(&pcpu->spin); 2522 TAILQ_INSERT_TAIL(&pcpu->bufqueues[q], bp, b_freelist); 2523 bp->b_qindex = q; 2524 bp->b_flags |= B_DEFERRED; 2525 spin_unlock(&pcpu->spin); 2526 BUF_UNLOCK(bp); 2527 spin_lock(&pcpu->spin); 2528 bp = marker; 2529 continue; 2530 } 2531 2532 /* 2533 * spinlock not held here. 2534 * 2535 * If the buffer has a dependancy, buf_checkwrite() must 2536 * also return 0 for us to be able to initate the write. 2537 * 2538 * If the buffer is flagged B_ERROR it may be requeued 2539 * over and over again, we try to avoid a live lock. 2540 */ 2541 if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) { 2542 brelse(bp); 2543 } else if (bp->b_flags & B_ERROR) { 2544 tsleep(bp, 0, "bioer", 1); 2545 bp->b_flags &= ~B_AGE; 2546 cluster_awrite(bp); 2547 } else { 2548 bp->b_flags |= B_AGE; 2549 cluster_awrite(bp); 2550 } 2551 /* bp invalid but needs to be NULL-tested if we break out */ 2552 doloop: 2553 spin_lock(&pcpu->spin); 2554 ++r; 2555 if (--loops == 0) 2556 break; 2557 bp = marker; 2558 } 2559 /* bp is invalid here but can be NULL-tested to advance */ 2560 2561 TAILQ_REMOVE(&pcpu->bufqueues[q], marker, b_freelist); 2562 marker->b_qindex = BQUEUE_NONE; 2563 spin_unlock(&pcpu->spin); 2564 2565 /* 2566 * Advance the marker to be fair. 2567 */ 2568 marker->b_qcpu = (marker->b_qcpu + 1) % ncpus; 2569 if (bp == NULL) { 2570 if (marker->b_qcpu != lcpu) 2571 goto again; 2572 } 2573 2574 return (r); 2575 } 2576 2577 /* 2578 * inmem: 2579 * 2580 * Returns true if no I/O is needed to access the associated VM object. 2581 * This is like findblk except it also hunts around in the VM system for 2582 * the data. 
2583 * 2584 * Note that we ignore vm_page_free() races from interrupts against our 2585 * lookup, since if the caller is not protected our return value will not 2586 * be any more valid then otherwise once we exit the critical section. 2587 */ 2588 int 2589 inmem(struct vnode *vp, off_t loffset) 2590 { 2591 vm_object_t obj; 2592 vm_offset_t toff, tinc, size; 2593 vm_page_t m; 2594 int res = 1; 2595 2596 if (findblk(vp, loffset, FINDBLK_TEST)) 2597 return 1; 2598 if (vp->v_mount == NULL) 2599 return 0; 2600 if ((obj = vp->v_object) == NULL) 2601 return 0; 2602 2603 size = PAGE_SIZE; 2604 if (size > vp->v_mount->mnt_stat.f_iosize) 2605 size = vp->v_mount->mnt_stat.f_iosize; 2606 2607 vm_object_hold(obj); 2608 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { 2609 m = vm_page_lookup(obj, OFF_TO_IDX(loffset + toff)); 2610 if (m == NULL) { 2611 res = 0; 2612 break; 2613 } 2614 tinc = size; 2615 if (tinc > PAGE_SIZE - ((toff + loffset) & PAGE_MASK)) 2616 tinc = PAGE_SIZE - ((toff + loffset) & PAGE_MASK); 2617 if (vm_page_is_valid(m, 2618 (vm_offset_t) ((toff + loffset) & PAGE_MASK), tinc) == 0) { 2619 res = 0; 2620 break; 2621 } 2622 } 2623 vm_object_drop(obj); 2624 return (res); 2625 } 2626 2627 /* 2628 * findblk: 2629 * 2630 * Locate and return the specified buffer. Unless flagged otherwise, 2631 * a locked buffer will be returned if it exists or NULL if it does not. 2632 * 2633 * findblk()'d buffers are still on the bufqueues and if you intend 2634 * to use your (locked NON-TEST) buffer you need to bremfree(bp) 2635 * and possibly do other stuff to it. 2636 * 2637 * FINDBLK_TEST - Do not lock the buffer. The caller is responsible 2638 * for locking the buffer and ensuring that it remains 2639 * the desired buffer after locking. 2640 * 2641 * FINDBLK_NBLOCK - Lock the buffer non-blocking. If we are unable 2642 * to acquire the lock we return NULL, even if the 2643 * buffer exists. 2644 * 2645 * FINDBLK_REF - Returns the buffer ref'd, which prevents normal 2646 * reuse by getnewbuf() but does not prevent 2647 * disassociation (B_INVAL). Used to avoid deadlocks 2648 * against random (vp,loffset)s due to reassignment. 2649 * 2650 * (0) - Lock the buffer blocking. 2651 */ 2652 struct buf * 2653 findblk(struct vnode *vp, off_t loffset, int flags) 2654 { 2655 struct buf *bp; 2656 int lkflags; 2657 2658 lkflags = LK_EXCLUSIVE; 2659 if (flags & FINDBLK_NBLOCK) 2660 lkflags |= LK_NOWAIT; 2661 2662 for (;;) { 2663 /* 2664 * Lookup. Ref the buf while holding v_token to prevent 2665 * reuse (but does not prevent diassociation). 2666 */ 2667 lwkt_gettoken_shared(&vp->v_token); 2668 bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset); 2669 if (bp == NULL) { 2670 lwkt_reltoken(&vp->v_token); 2671 return(NULL); 2672 } 2673 bqhold(bp); 2674 lwkt_reltoken(&vp->v_token); 2675 2676 /* 2677 * If testing only break and return bp, do not lock. 2678 */ 2679 if (flags & FINDBLK_TEST) 2680 break; 2681 2682 /* 2683 * Lock the buffer, return an error if the lock fails. 2684 * (only FINDBLK_NBLOCK can cause the lock to fail). 2685 */ 2686 if (BUF_LOCK(bp, lkflags)) { 2687 atomic_subtract_int(&bp->b_refs, 1); 2688 /* bp = NULL; not needed */ 2689 return(NULL); 2690 } 2691 2692 /* 2693 * Revalidate the locked buf before allowing it to be 2694 * returned. 
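 *
 * (If the identity changed while we slept on the lock, the ref and the
 * lock are dropped and the lookup is retried.)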
2695 */ 2696 if (bp->b_vp == vp && bp->b_loffset == loffset) 2697 break; 2698 atomic_subtract_int(&bp->b_refs, 1); 2699 BUF_UNLOCK(bp); 2700 } 2701 2702 /* 2703 * Success 2704 */ 2705 if ((flags & FINDBLK_REF) == 0) 2706 atomic_subtract_int(&bp->b_refs, 1); 2707 return(bp); 2708 } 2709 2710 /* 2711 * getcacheblk: 2712 * 2713 * Similar to getblk() except only returns the buffer if it is 2714 * B_CACHE and requires no other manipulation. Otherwise NULL 2715 * is returned. NULL is also returned if GETBLK_NOWAIT is set 2716 * and the getblk() would block. 2717 * 2718 * If B_RAM is set the buffer might be just fine, but we return 2719 * NULL anyway because we want the code to fall through to the 2720 * cluster read to issue more read-aheads. Otherwise read-ahead breaks. 2721 * 2722 * If blksize is 0 the buffer cache buffer must already be fully 2723 * cached. 2724 * 2725 * If blksize is non-zero getblk() will be used, allowing a buffer 2726 * to be reinstantiated from its VM backing store. The buffer must 2727 * still be fully cached after reinstantiation to be returned. 2728 */ 2729 struct buf * 2730 getcacheblk(struct vnode *vp, off_t loffset, int blksize, int blkflags) 2731 { 2732 struct buf *bp; 2733 int fndflags = (blkflags & GETBLK_NOWAIT) ? FINDBLK_NBLOCK : 0; 2734 2735 if (blksize) { 2736 bp = getblk(vp, loffset, blksize, blkflags, 0); 2737 if (bp) { 2738 if ((bp->b_flags & (B_INVAL | B_CACHE)) == B_CACHE) { 2739 bp->b_flags &= ~B_AGE; 2740 if (bp->b_flags & B_RAM) { 2741 bqrelse(bp); 2742 bp = NULL; 2743 } 2744 } else { 2745 brelse(bp); 2746 bp = NULL; 2747 } 2748 } 2749 } else { 2750 bp = findblk(vp, loffset, fndflags); 2751 if (bp) { 2752 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == 2753 B_CACHE) { 2754 bp->b_flags &= ~B_AGE; 2755 bremfree(bp); 2756 } else { 2757 BUF_UNLOCK(bp); 2758 bp = NULL; 2759 } 2760 } 2761 } 2762 return (bp); 2763 } 2764 2765 /* 2766 * getblk: 2767 * 2768 * Get a block given a specified block and offset into a file/device. 2769 * B_INVAL may or may not be set on return. The caller should clear 2770 * B_INVAL prior to initiating a READ. 2771 * 2772 * IT IS IMPORTANT TO UNDERSTAND THAT IF YOU CALL GETBLK() AND B_CACHE 2773 * IS NOT SET, YOU MUST INITIALIZE THE RETURNED BUFFER, ISSUE A READ, 2774 * OR SET B_INVAL BEFORE RETIRING IT. If you retire a getblk'd buffer 2775 * without doing any of those things the system will likely believe 2776 * the buffer to be valid (especially if it is not B_VMIO), and the 2777 * next getblk() will return the buffer with B_CACHE set. 2778 * 2779 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for 2780 * an existing buffer. 2781 * 2782 * For a VMIO buffer, B_CACHE is modified according to the backing VM. 2783 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set 2784 * and then cleared based on the backing VM. If the previous buffer is 2785 * non-0-sized but invalid, B_CACHE will be cleared. 2786 * 2787 * If getblk() must create a new buffer, the new buffer is returned with 2788 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which 2789 * case it is returned with B_INVAL clear and B_CACHE set based on the 2790 * backing VM. 2791 * 2792 * getblk() also forces a bwrite() for any B_DELWRI buffer whos 2793 * B_CACHE bit is clear. 2794 * 2795 * What this means, basically, is that the caller should use B_CACHE to 2796 * determine whether the buffer is fully valid or not and should clear 2797 * B_INVAL prior to issuing a read. 
If the caller intends to validate 2798 * the buffer by loading its data area with something, the caller needs 2799 * to clear B_INVAL. If the caller does this without issuing an I/O, 2800 * the caller should set B_CACHE ( as an optimization ), else the caller 2801 * should issue the I/O and biodone() will set B_CACHE if the I/O was 2802 * a write attempt or if it was a successfull read. If the caller 2803 * intends to issue a READ, the caller must clear B_INVAL and B_ERROR 2804 * prior to issuing the READ. biodone() will *not* clear B_INVAL. 2805 * 2806 * getblk flags: 2807 * 2808 * GETBLK_PCATCH - catch signal if blocked, can cause NULL return 2809 * GETBLK_BHEAVY - heavy-weight buffer cache buffer 2810 */ 2811 struct buf * 2812 getblk(struct vnode *vp, off_t loffset, int size, int blkflags, int slptimeo) 2813 { 2814 struct buf *bp; 2815 int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0; 2816 int error; 2817 int lkflags; 2818 2819 if (size > MAXBSIZE) 2820 panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE); 2821 if (vp->v_object == NULL) 2822 panic("getblk: vnode %p has no object!", vp); 2823 2824 loop: 2825 if ((bp = findblk(vp, loffset, FINDBLK_REF | FINDBLK_TEST)) != NULL) { 2826 /* 2827 * The buffer was found in the cache, but we need to lock it. 2828 * We must acquire a ref on the bp to prevent reuse, but 2829 * this will not prevent disassociation (brelvp()) so we 2830 * must recheck (vp,loffset) after acquiring the lock. 2831 * 2832 * Without the ref the buffer could potentially be reused 2833 * before we acquire the lock and create a deadlock 2834 * situation between the thread trying to reuse the buffer 2835 * and us due to the fact that we would wind up blocking 2836 * on a random (vp,loffset). 2837 */ 2838 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 2839 if (blkflags & GETBLK_NOWAIT) { 2840 bqdrop(bp); 2841 return(NULL); 2842 } 2843 lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL; 2844 if (blkflags & GETBLK_PCATCH) 2845 lkflags |= LK_PCATCH; 2846 error = BUF_TIMELOCK(bp, lkflags, "getblk", slptimeo); 2847 if (error) { 2848 bqdrop(bp); 2849 if (error == ENOLCK) 2850 goto loop; 2851 return (NULL); 2852 } 2853 /* buffer may have changed on us */ 2854 } 2855 bqdrop(bp); 2856 2857 /* 2858 * Once the buffer has been locked, make sure we didn't race 2859 * a buffer recyclement. Buffers that are no longer hashed 2860 * will have b_vp == NULL, so this takes care of that check 2861 * as well. 2862 */ 2863 if (bp->b_vp != vp || bp->b_loffset != loffset) { 2864 #if 0 2865 kprintf("Warning buffer %p (vp %p loffset %lld) " 2866 "was recycled\n", 2867 bp, vp, (long long)loffset); 2868 #endif 2869 BUF_UNLOCK(bp); 2870 goto loop; 2871 } 2872 2873 /* 2874 * If SZMATCH any pre-existing buffer must be of the requested 2875 * size or NULL is returned. The caller absolutely does not 2876 * want getblk() to bwrite() the buffer on a size mismatch. 2877 */ 2878 if ((blkflags & GETBLK_SZMATCH) && size != bp->b_bcount) { 2879 BUF_UNLOCK(bp); 2880 return(NULL); 2881 } 2882 2883 /* 2884 * All vnode-based buffers must be backed by a VM object. 2885 */ 2886 KKASSERT(bp->b_flags & B_VMIO); 2887 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 2888 bp->b_flags &= ~B_AGE; 2889 2890 /* 2891 * Make sure that B_INVAL buffers do not have a cached 2892 * block number translation. 
2893 */ 2894 if ((bp->b_flags & B_INVAL) && (bp->b_bio2.bio_offset != NOOFFSET)) { 2895 kprintf("Warning invalid buffer %p (vp %p loffset %lld)" 2896 " did not have cleared bio_offset cache\n", 2897 bp, vp, (long long)loffset); 2898 clearbiocache(&bp->b_bio2); 2899 } 2900 2901 /* 2902 * The buffer is locked. B_CACHE is cleared if the buffer is 2903 * invalid. 2904 */ 2905 if (bp->b_flags & B_INVAL) 2906 bp->b_flags &= ~B_CACHE; 2907 bremfree(bp); 2908 2909 /* 2910 * Any size inconsistency with a dirty buffer or a buffer 2911 * with a softupdates dependency must be resolved. Resizing 2912 * the buffer in such circumstances can lead to problems. 2913 * 2914 * Dirty or dependent buffers are written synchronously. 2915 * Other types of buffers are simply released and 2916 * reconstituted as they may be backed by valid, dirty VM 2917 * pages (but not marked B_DELWRI). 2918 * 2919 * NFS NOTE: NFS buffers which straddle EOF are oddly-sized 2920 * and may be left over from a prior truncation (and thus 2921 * no longer represent the actual EOF point), so we 2922 * definitely do not want to B_NOCACHE the backing store. 2923 */ 2924 if (size != bp->b_bcount) { 2925 if (bp->b_flags & B_DELWRI) { 2926 bp->b_flags |= B_RELBUF; 2927 bwrite(bp); 2928 } else if (LIST_FIRST(&bp->b_dep)) { 2929 bp->b_flags |= B_RELBUF; 2930 bwrite(bp); 2931 } else { 2932 bp->b_flags |= B_RELBUF; 2933 brelse(bp); 2934 } 2935 goto loop; 2936 } 2937 KKASSERT(size <= bp->b_kvasize); 2938 KASSERT(bp->b_loffset != NOOFFSET, 2939 ("getblk: no buffer offset")); 2940 2941 /* 2942 * A buffer with B_DELWRI set and B_CACHE clear must 2943 * be committed before we can return the buffer in 2944 * order to prevent the caller from issuing a read 2945 * ( due to B_CACHE not being set ) and overwriting 2946 * it. 2947 * 2948 * Most callers, including NFS and FFS, need this to 2949 * operate properly either because they assume they 2950 * can issue a read if B_CACHE is not set, or because 2951 * ( for example ) an uncached B_DELWRI might loop due 2952 * to softupdates re-dirtying the buffer. In the latter 2953 * case, B_CACHE is set after the first write completes, 2954 * preventing further loops. 2955 * 2956 * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE 2957 * above while extending the buffer, we cannot allow the 2958 * buffer to remain with B_CACHE set after the write 2959 * completes or it will represent a corrupt state. To 2960 * deal with this we set B_NOCACHE to scrap the buffer 2961 * after the write. 2962 * 2963 * XXX Should this be B_RELBUF instead of B_NOCACHE? 2964 * I'm not even sure this state is still possible 2965 * now that getblk() writes out any dirty buffers 2966 * on size changes. 2967 * 2968 * We might be able to do something fancy, like setting 2969 * B_CACHE in bwrite() except if B_DELWRI is already set, 2970 * so the below call doesn't set B_CACHE, but that gets real 2971 * confusing. This is much easier. 2972 */ 2973 2974 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { 2975 kprintf("getblk: Warning, bp %p loff=%jx DELWRI set " 2976 "and CACHE clear, b_flags %08x\n", 2977 bp, (uintmax_t)bp->b_loffset, bp->b_flags); 2978 bp->b_flags |= B_NOCACHE; 2979 bwrite(bp); 2980 goto loop; 2981 } 2982 } else { 2983 /* 2984 * Buffer is not in-core, create new buffer. The buffer 2985 * returned by getnewbuf() is locked. Note that the returned 2986 * buffer is also considered valid (not marked B_INVAL). 2987 * 2988 * Calculating the offset for the I/O requires figuring out 2989 * the block size.
We use DEV_BSIZE for VBLK or VCHR and 2990 * the mount's f_iosize otherwise. If the vnode does not 2991 * have an associated mount we assume that the passed size is 2992 * the block size. 2993 * 2994 * Note that vn_isdisk() cannot be used here since it may 2995 * return a failure for numerous reasons. Note that the 2996 * buffer size may be larger then the block size (the caller 2997 * will use block numbers with the proper multiple). Beware 2998 * of using any v_* fields which are part of unions. In 2999 * particular, in DragonFly the mount point overloading 3000 * mechanism uses the namecache only and the underlying 3001 * directory vnode is not a special case. 3002 */ 3003 int bsize, maxsize; 3004 vm_object_t repurpose; 3005 3006 if (vp->v_type == VBLK || vp->v_type == VCHR) 3007 bsize = DEV_BSIZE; 3008 else if (vp->v_mount) 3009 bsize = vp->v_mount->mnt_stat.f_iosize; 3010 else 3011 bsize = size; 3012 3013 maxsize = size + (loffset & PAGE_MASK); 3014 maxsize = imax(maxsize, bsize); 3015 repurpose = NULL; 3016 3017 /* 3018 * Allow repurposing. The returned buffer may contain VM 3019 * pages associated with its previous incarnation. These 3020 * pages must be repurposed for the new buffer (hopefully 3021 * without disturbing the KVM mapping). 3022 * 3023 * WARNING! If repurpose != NULL on return, the buffer will 3024 * still contain some data from its prior 3025 * incarnation. We MUST properly dispose of this 3026 * data. 3027 */ 3028 bp = getnewbuf(blkflags, slptimeo, size, maxsize, &repurpose); 3029 if (bp == NULL) { 3030 if (slpflags || slptimeo) 3031 return NULL; 3032 goto loop; 3033 } 3034 3035 /* 3036 * Atomically insert the buffer into the hash, so that it can 3037 * be found by findblk(). 3038 * 3039 * If bgetvp() returns non-zero a collision occured, and the 3040 * bp will not be associated with the vnode. 3041 * 3042 * Make sure the translation layer has been cleared. 3043 */ 3044 bp->b_loffset = loffset; 3045 bp->b_bio2.bio_offset = NOOFFSET; 3046 /* bp->b_bio2.bio_next = NULL; */ 3047 3048 if (bgetvp(vp, bp, size)) { 3049 if (repurpose) { 3050 bp->b_flags |= B_VMIO; 3051 repurposebuf(bp, 0); 3052 vm_object_drop(repurpose); 3053 } 3054 bp->b_flags |= B_INVAL; 3055 brelse(bp); 3056 goto loop; 3057 } 3058 3059 /* 3060 * All vnode-based buffers must be backed by a VM object. 3061 */ 3062 KKASSERT(vp->v_object != NULL); 3063 bp->b_flags |= B_VMIO; 3064 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 3065 3066 /* 3067 * If we allowed repurposing of the buffer it will contain 3068 * free-but-held vm_page's, already kmapped, that can be 3069 * repurposed. The repurposebuf() code handles reassigning 3070 * those pages to the new (object, offsets) and dealing with 3071 * the case where the pages already exist. 3072 */ 3073 if (repurpose) { 3074 repurposebuf(bp, size); 3075 vm_object_drop(repurpose); 3076 } else { 3077 allocbuf(bp, size); 3078 } 3079 } 3080 return (bp); 3081 } 3082 3083 /* 3084 * regetblk(bp) 3085 * 3086 * Reacquire a buffer that was previously released to the locked queue, 3087 * or reacquire a buffer which is interlocked by having bioops->io_deallocate 3088 * set B_LOCKED (which handles the acquisition race). 3089 * 3090 * To this end, either B_LOCKED must be set or the dependancy list must be 3091 * non-empty. 
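 *
 * (regetblk() itself just acquires the buffer lock, blocking if necessary,
 * and then bremfree()s the buffer from whatever queue it was released to.)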
3092 */ 3093 void 3094 regetblk(struct buf *bp) 3095 { 3096 KKASSERT((bp->b_flags & B_LOCKED) || LIST_FIRST(&bp->b_dep) != NULL); 3097 BUF_LOCK(bp, LK_EXCLUSIVE | LK_RETRY); 3098 bremfree(bp); 3099 } 3100 3101 /* 3102 * geteblk: 3103 * 3104 * Get an empty, disassociated buffer of given size. The buffer is 3105 * initially set to B_INVAL. 3106 * 3107 * critical section protection is not required for the allocbuf() 3108 * call because races are impossible here. 3109 */ 3110 struct buf * 3111 geteblk(int size) 3112 { 3113 struct buf *bp; 3114 3115 while ((bp = getnewbuf(0, 0, size, MAXBSIZE, NULL)) == NULL) 3116 ; 3117 allocbuf(bp, size); 3118 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ 3119 3120 return (bp); 3121 } 3122 3123 /* 3124 * allocbuf: 3125 * 3126 * This code constitutes the buffer memory from either anonymous system 3127 * memory (in the case of non-VMIO operations) or from an associated 3128 * VM object (in the case of VMIO operations). This code is able to 3129 * resize a buffer up or down. 3130 * 3131 * Note that this code is tricky, and has many complications to resolve 3132 * deadlock or inconsistant data situations. Tread lightly!!! 3133 * There are B_CACHE and B_DELWRI interactions that must be dealt with by 3134 * the caller. Calling this code willy nilly can result in the loss of 3135 * data. 3136 * 3137 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with 3138 * B_CACHE for the non-VMIO case. 3139 * 3140 * This routine does not need to be called from a critical section but you 3141 * must own the buffer. 3142 */ 3143 void 3144 allocbuf(struct buf *bp, int size) 3145 { 3146 int newbsize, mbsize; 3147 int i; 3148 3149 if (BUF_REFCNT(bp) == 0) 3150 panic("allocbuf: buffer not busy"); 3151 3152 if (bp->b_kvasize < size) 3153 panic("allocbuf: buffer too small"); 3154 3155 if ((bp->b_flags & B_VMIO) == 0) { 3156 caddr_t origbuf; 3157 int origbufsize; 3158 /* 3159 * Just get anonymous memory from the kernel. Don't 3160 * mess with B_CACHE. 3161 */ 3162 mbsize = roundup2(size, DEV_BSIZE); 3163 if (bp->b_flags & B_MALLOC) 3164 newbsize = mbsize; 3165 else 3166 newbsize = round_page(size); 3167 3168 if (newbsize < bp->b_bufsize) { 3169 /* 3170 * Malloced buffers are not shrunk 3171 */ 3172 if (bp->b_flags & B_MALLOC) { 3173 if (newbsize) { 3174 bp->b_bcount = size; 3175 } else { 3176 kfree(bp->b_data, M_BIOBUF); 3177 if (bp->b_bufsize) { 3178 atomic_subtract_long(&bufmallocspace, bp->b_bufsize); 3179 bp->b_bufsize = 0; 3180 bufspacewakeup(); 3181 } 3182 bp->b_data = bp->b_kvabase; 3183 bp->b_bcount = 0; 3184 bp->b_flags &= ~B_MALLOC; 3185 } 3186 return; 3187 } 3188 vm_hold_free_pages( 3189 bp, 3190 (vm_offset_t) bp->b_data + newbsize, 3191 (vm_offset_t) bp->b_data + bp->b_bufsize); 3192 } else if (newbsize > bp->b_bufsize) { 3193 /* 3194 * We only use malloced memory on the first allocation. 3195 * and revert to page-allocated memory when the buffer 3196 * grows. 3197 */ 3198 if ((bufmallocspace < maxbufmallocspace) && 3199 (bp->b_bufsize == 0) && 3200 (mbsize <= PAGE_SIZE/2)) { 3201 3202 bp->b_data = kmalloc(mbsize, M_BIOBUF, M_WAITOK); 3203 bp->b_bufsize = mbsize; 3204 bp->b_bcount = size; 3205 bp->b_flags |= B_MALLOC; 3206 atomic_add_long(&bufmallocspace, mbsize); 3207 return; 3208 } 3209 origbuf = NULL; 3210 origbufsize = 0; 3211 /* 3212 * If the buffer is growing on its other-than-first 3213 * allocation, then we revert to the page-allocation 3214 * scheme. 
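 *
 * (The malloc'ed contents are preserved: origbuf/origbufsize are saved
 * below, the backing pages are wired in with vm_hold_load_pages(), and
 * the old data is bcopy()'d over before the malloc'ed area is kfree()'d.)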
3215 */ 3216 if (bp->b_flags & B_MALLOC) { 3217 origbuf = bp->b_data; 3218 origbufsize = bp->b_bufsize; 3219 bp->b_data = bp->b_kvabase; 3220 if (bp->b_bufsize) { 3221 atomic_subtract_long(&bufmallocspace, 3222 bp->b_bufsize); 3223 bp->b_bufsize = 0; 3224 bufspacewakeup(); 3225 } 3226 bp->b_flags &= ~B_MALLOC; 3227 newbsize = round_page(newbsize); 3228 } 3229 vm_hold_load_pages( 3230 bp, 3231 (vm_offset_t) bp->b_data + bp->b_bufsize, 3232 (vm_offset_t) bp->b_data + newbsize); 3233 if (origbuf) { 3234 bcopy(origbuf, bp->b_data, origbufsize); 3235 kfree(origbuf, M_BIOBUF); 3236 } 3237 } 3238 } else { 3239 vm_page_t m; 3240 int desiredpages; 3241 3242 newbsize = roundup2(size, DEV_BSIZE); 3243 desiredpages = ((int)(bp->b_loffset & PAGE_MASK) + 3244 newbsize + PAGE_MASK) >> PAGE_SHIFT; 3245 KKASSERT(desiredpages <= XIO_INTERNAL_PAGES); 3246 3247 if (bp->b_flags & B_MALLOC) 3248 panic("allocbuf: VMIO buffer can't be malloced"); 3249 /* 3250 * Set B_CACHE initially if buffer is 0 length or will become 3251 * 0-length. 3252 */ 3253 if (size == 0 || bp->b_bufsize == 0) 3254 bp->b_flags |= B_CACHE; 3255 3256 if (newbsize < bp->b_bufsize) { 3257 /* 3258 * DEV_BSIZE aligned new buffer size is less then the 3259 * DEV_BSIZE aligned existing buffer size. Figure out 3260 * if we have to remove any pages. 3261 */ 3262 if (desiredpages < bp->b_xio.xio_npages) { 3263 for (i = desiredpages; i < bp->b_xio.xio_npages; i++) { 3264 /* 3265 * the page is not freed here -- it 3266 * is the responsibility of 3267 * vnode_pager_setsize 3268 */ 3269 m = bp->b_xio.xio_pages[i]; 3270 KASSERT(m != bogus_page, 3271 ("allocbuf: bogus page found")); 3272 vm_page_busy_wait(m, TRUE, "biodep"); 3273 bp->b_xio.xio_pages[i] = NULL; 3274 vm_page_unwire(m, 0); 3275 vm_page_wakeup(m); 3276 } 3277 pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + 3278 (desiredpages << PAGE_SHIFT), (bp->b_xio.xio_npages - desiredpages)); 3279 bp->b_xio.xio_npages = desiredpages; 3280 } 3281 } else if (size > bp->b_bcount) { 3282 /* 3283 * We are growing the buffer, possibly in a 3284 * byte-granular fashion. 3285 */ 3286 struct vnode *vp; 3287 vm_object_t obj; 3288 vm_offset_t toff; 3289 vm_offset_t tinc; 3290 3291 /* 3292 * Step 1, bring in the VM pages from the object, 3293 * allocating them if necessary. We must clear 3294 * B_CACHE if these pages are not valid for the 3295 * range covered by the buffer. 3296 */ 3297 vp = bp->b_vp; 3298 obj = vp->v_object; 3299 3300 vm_object_hold(obj); 3301 while (bp->b_xio.xio_npages < desiredpages) { 3302 vm_page_t m; 3303 vm_pindex_t pi; 3304 int error; 3305 3306 pi = OFF_TO_IDX(bp->b_loffset) + 3307 bp->b_xio.xio_npages; 3308 3309 /* 3310 * Blocking on m->busy might lead to a 3311 * deadlock: 3312 * 3313 * vm_fault->getpages->cluster_read->allocbuf 3314 */ 3315 m = vm_page_lookup_busy_try(obj, pi, FALSE, 3316 &error); 3317 if (error) { 3318 vm_page_sleep_busy(m, FALSE, "pgtblk"); 3319 continue; 3320 } 3321 if (m == NULL) { 3322 /* 3323 * note: must allocate system pages 3324 * since blocking here could intefere 3325 * with paging I/O, no matter which 3326 * process we are. 3327 */ 3328 m = bio_page_alloc(bp, obj, pi, desiredpages - bp->b_xio.xio_npages); 3329 if (m) { 3330 vm_page_wire(m); 3331 vm_page_wakeup(m); 3332 bp->b_flags &= ~B_CACHE; 3333 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; 3334 ++bp->b_xio.xio_npages; 3335 } 3336 continue; 3337 } 3338 3339 /* 3340 * We found a page and were able to busy it. 
3341 */ 3342 vm_page_wire(m); 3343 vm_page_wakeup(m); 3344 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; 3345 ++bp->b_xio.xio_npages; 3346 if (bp->b_act_count < m->act_count) 3347 bp->b_act_count = m->act_count; 3348 } 3349 vm_object_drop(obj); 3350 3351 /* 3352 * Step 2. We've loaded the pages into the buffer, 3353 * we have to figure out if we can still have B_CACHE 3354 * set. Note that B_CACHE is set according to the 3355 * byte-granular range ( bcount and size ), not the 3356 * aligned range ( newbsize ). 3357 * 3358 * The VM test is against m->valid, which is DEV_BSIZE 3359 * aligned. Needless to say, the validity of the data 3360 * needs to also be DEV_BSIZE aligned. Note that this 3361 * fails with NFS if the server or some other client 3362 * extends the file's EOF. If our buffer is resized, 3363 * B_CACHE may remain set! XXX 3364 */ 3365 3366 toff = bp->b_bcount; 3367 tinc = PAGE_SIZE - ((bp->b_loffset + toff) & PAGE_MASK); 3368 3369 while ((bp->b_flags & B_CACHE) && toff < size) { 3370 vm_pindex_t pi; 3371 3372 if (tinc > (size - toff)) 3373 tinc = size - toff; 3374 3375 pi = ((bp->b_loffset & PAGE_MASK) + toff) >> 3376 PAGE_SHIFT; 3377 3378 vfs_buf_test_cache( 3379 bp, 3380 bp->b_loffset, 3381 toff, 3382 tinc, 3383 bp->b_xio.xio_pages[pi] 3384 ); 3385 toff += tinc; 3386 tinc = PAGE_SIZE; 3387 } 3388 3389 /* 3390 * Step 3, fixup the KVM pmap. Remember that 3391 * bp->b_data is relative to bp->b_loffset, but 3392 * bp->b_loffset may be offset into the first page. 3393 */ 3394 bp->b_data = (caddr_t) 3395 trunc_page((vm_offset_t)bp->b_data); 3396 pmap_qenter((vm_offset_t)bp->b_data, 3397 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 3398 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 3399 (vm_offset_t)(bp->b_loffset & PAGE_MASK)); 3400 } 3401 atomic_add_long(&bufspace, newbsize - bp->b_bufsize); 3402 } 3403 3404 /* adjust space use on already-dirty buffer */ 3405 if (bp->b_flags & B_DELWRI) { 3406 /* dirtykvaspace unchanged */ 3407 atomic_add_long(&dirtybufspace, newbsize - bp->b_bufsize); 3408 if (bp->b_flags & B_HEAVY) { 3409 atomic_add_long(&dirtybufspacehw, 3410 newbsize - bp->b_bufsize); 3411 } 3412 } 3413 bp->b_bufsize = newbsize; /* actual buffer allocation */ 3414 bp->b_bcount = size; /* requested buffer size */ 3415 bufspacewakeup(); 3416 } 3417 3418 /* 3419 * repurposebuf() (VMIO only) 3420 * 3421 * This performs a function similar to allocbuf() but the passed-in buffer 3422 * may contain some detrius from its previous incarnation in the form of 3423 * the page array. We try to repurpose the underlying pages. 3424 * 3425 * This code is nominally called to recycle buffer cache buffers AND (if 3426 * they are clean) to also recycle their underlying pages. We currently 3427 * can only recycle unmapped, clean pages. The code is called when buffer 3428 * cache 'newbuf' bandwidth exceeds (bufrate_cache) bytes per second. 
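 *
 * Caller-side sketch (this mirrors the getblk() code earlier in this file):
 *
 *	bp = getnewbuf(blkflags, slptimeo, size, maxsize, &repurpose);
 *	...
 *	if (repurpose) {
 *		repurposebuf(bp, size);
 *		vm_object_drop(repurpose);
 *	} else {
 *		allocbuf(bp, size);
 *	}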
3429 */ 3430 static 3431 void 3432 repurposebuf(struct buf *bp, int size) 3433 { 3434 int newbsize; 3435 int desiredpages; 3436 vm_offset_t toff; 3437 vm_offset_t tinc; 3438 vm_object_t obj; 3439 vm_page_t m; 3440 int i; 3441 int must_reenter = 0; 3442 long deaccumulate = 0; 3443 3444 3445 KKASSERT((bp->b_flags & (B_VMIO | B_DELWRI | B_MALLOC)) == B_VMIO); 3446 if (BUF_REFCNT(bp) == 0) 3447 panic("repurposebuf: buffer not busy"); 3448 3449 if (bp->b_kvasize < size) 3450 panic("repurposebuf: buffer too small"); 3451 3452 newbsize = roundup2(size, DEV_BSIZE); 3453 desiredpages = ((int)(bp->b_loffset & PAGE_MASK) + 3454 newbsize + PAGE_MASK) >> PAGE_SHIFT; 3455 KKASSERT(desiredpages <= XIO_INTERNAL_PAGES); 3456 3457 /* 3458 * Buffer starts out 0-length with B_CACHE set. We will clear 3459 * As we check the backing store we will clear B_CACHE if necessary. 3460 */ 3461 atomic_add_long(&bufspace, newbsize - bp->b_bufsize); 3462 bp->b_bufsize = 0; 3463 bp->b_bcount = 0; 3464 bp->b_flags |= B_CACHE; 3465 3466 if (desiredpages) { 3467 obj = bp->b_vp->v_object; 3468 vm_object_hold(obj); 3469 } else { 3470 obj = NULL; 3471 } 3472 3473 /* 3474 * Step 1, bring in the VM pages from the object, repurposing or 3475 * allocating them if necessary. We must clear B_CACHE if these 3476 * pages are not valid for the range covered by the buffer. 3477 * 3478 * We are growing the buffer, possibly in a byte-granular fashion. 3479 */ 3480 for (i = 0; i < desiredpages; ++i) { 3481 vm_pindex_t pi; 3482 int error; 3483 int iswired; 3484 3485 pi = OFF_TO_IDX(bp->b_loffset) + i; 3486 3487 /* 3488 * Blocking on m->busy might lead to a 3489 * deadlock: 3490 * 3491 * vm_fault->getpages->cluster_read->allocbuf 3492 */ 3493 m = (i < bp->b_xio.xio_npages) ? bp->b_xio.xio_pages[i] : NULL; 3494 bp->b_xio.xio_pages[i] = NULL; 3495 KASSERT(m != bogus_page, ("repurposebuf: bogus page found")); 3496 m = vm_page_repurpose(obj, pi, FALSE, &error, m, 3497 &must_reenter, &iswired); 3498 3499 if (error) { 3500 vm_page_sleep_busy(m, FALSE, "pgtblk"); 3501 --i; /* retry */ 3502 continue; 3503 } 3504 if (m == NULL) { 3505 /* 3506 * note: must allocate system pages 3507 * since blocking here could intefere 3508 * with paging I/O, no matter which 3509 * process we are. 3510 */ 3511 must_reenter = 1; 3512 m = bio_page_alloc(bp, obj, pi, desiredpages - i); 3513 if (m) { 3514 vm_page_wire(m); 3515 vm_page_wakeup(m); 3516 bp->b_flags &= ~B_CACHE; 3517 bp->b_xio.xio_pages[i] = m; 3518 if (m->valid) 3519 deaccumulate += PAGE_SIZE; 3520 } else { 3521 --i; /* retry */ 3522 } 3523 continue; 3524 } 3525 if (m->valid) 3526 deaccumulate += PAGE_SIZE; 3527 3528 /* 3529 * We found a page and were able to busy it. 3530 */ 3531 if (!iswired) 3532 vm_page_wire(m); 3533 vm_page_wakeup(m); 3534 bp->b_xio.xio_pages[i] = m; 3535 if (bp->b_act_count < m->act_count) 3536 bp->b_act_count = m->act_count; 3537 } 3538 if (desiredpages) 3539 vm_object_drop(obj); 3540 3541 /* 3542 * Even though its a new buffer, any pages already in the VM 3543 * page cache should not count towards I/O bandwidth. 3544 */ 3545 if (deaccumulate) 3546 atomic_add_long(&bufcache_bw_accum, -deaccumulate); 3547 3548 /* 3549 * Clean-up any loose pages. 
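 *
 * (i.e. any previously-instantiated pages beyond desiredpages are unwired
 * here and the excess kva mappings are removed with pmap_qremove().)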
3550 */ 3551 while (i < bp->b_xio.xio_npages) { 3552 m = bp->b_xio.xio_pages[i]; 3553 KASSERT(m != bogus_page, ("repurposebuf: bogus page found")); 3554 vm_page_busy_wait(m, TRUE, "biodep"); 3555 bp->b_xio.xio_pages[i] = NULL; 3556 vm_page_unwire(m, 0); 3557 vm_page_wakeup(m); 3558 ++i; 3559 } 3560 if (desiredpages < bp->b_xio.xio_npages) { 3561 pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + 3562 (desiredpages << PAGE_SHIFT), 3563 (bp->b_xio.xio_npages - desiredpages)); 3564 } 3565 bp->b_xio.xio_npages = desiredpages; 3566 3567 /* 3568 * Step 2. We've loaded the pages into the buffer, 3569 * we have to figure out if we can still have B_CACHE 3570 * set. Note that B_CACHE is set according to the 3571 * byte-granular range ( bcount and size ), not the 3572 * aligned range ( newbsize ). 3573 * 3574 * The VM test is against m->valid, which is DEV_BSIZE 3575 * aligned. Needless to say, the validity of the data 3576 * needs to also be DEV_BSIZE aligned. Note that this 3577 * fails with NFS if the server or some other client 3578 * extends the file's EOF. If our buffer is resized, 3579 * B_CACHE may remain set! XXX 3580 */ 3581 toff = bp->b_bcount; 3582 tinc = PAGE_SIZE - ((bp->b_loffset + toff) & PAGE_MASK); 3583 3584 while ((bp->b_flags & B_CACHE) && toff < size) { 3585 vm_pindex_t pi; 3586 3587 if (tinc > (size - toff)) 3588 tinc = size - toff; 3589 3590 pi = ((bp->b_loffset & PAGE_MASK) + toff) >> PAGE_SHIFT; 3591 3592 vfs_buf_test_cache(bp, bp->b_loffset, toff, 3593 tinc, bp->b_xio.xio_pages[pi]); 3594 toff += tinc; 3595 tinc = PAGE_SIZE; 3596 } 3597 3598 /* 3599 * Step 3, fixup the KVM pmap. Remember that 3600 * bp->b_data is relative to bp->b_loffset, but 3601 * bp->b_loffset may be offset into the first page. 3602 */ 3603 bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); 3604 if (must_reenter) { 3605 pmap_qenter((vm_offset_t)bp->b_data, 3606 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 3607 } else { 3608 atomic_add_long(&repurposedspace, newbsize); 3609 } 3610 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 3611 (vm_offset_t)(bp->b_loffset & PAGE_MASK)); 3612 3613 if (newbsize < bp->b_bufsize) 3614 bufspacewakeup(); 3615 bp->b_bufsize = newbsize; /* actual buffer allocation */ 3616 bp->b_bcount = size; /* requested buffer size */ 3617 } 3618 3619 /* 3620 * biowait: 3621 * 3622 * Wait for buffer I/O completion, returning error status. B_EINTR 3623 * is converted into an EINTR error but not cleared (since a chain 3624 * of biowait() calls may occur). 3625 * 3626 * On return bpdone() will have been called but the buffer will remain 3627 * locked and will not have been brelse()'d. 3628 * 3629 * NOTE! If a timeout is specified and ETIMEDOUT occurs the I/O is 3630 * likely still in progress on return. 3631 * 3632 * NOTE! This operation is on a BIO, not a BUF. 3633 * 3634 * NOTE! 
BIO_DONE is cleared by vn_strategy() 3635 */ 3636 static __inline int 3637 _biowait(struct bio *bio, const char *wmesg, int to) 3638 { 3639 struct buf *bp = bio->bio_buf; 3640 u_int32_t flags; 3641 u_int32_t nflags; 3642 int error; 3643 3644 KKASSERT(bio == &bp->b_bio1); 3645 for (;;) { 3646 flags = bio->bio_flags; 3647 if (flags & BIO_DONE) 3648 break; 3649 nflags = flags | BIO_WANT; 3650 tsleep_interlock(bio, 0); 3651 if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) { 3652 if (wmesg) 3653 error = tsleep(bio, PINTERLOCKED, wmesg, to); 3654 else if (bp->b_cmd == BUF_CMD_READ) 3655 error = tsleep(bio, PINTERLOCKED, "biord", to); 3656 else 3657 error = tsleep(bio, PINTERLOCKED, "biowr", to); 3658 if (error) { 3659 kprintf("tsleep error biowait %d\n", error); 3660 return (error); 3661 } 3662 } 3663 } 3664 3665 /* 3666 * Finish up. 3667 */ 3668 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 3669 bio->bio_flags &= ~(BIO_DONE | BIO_SYNC); 3670 if (bp->b_flags & B_EINTR) 3671 return (EINTR); 3672 if (bp->b_flags & B_ERROR) 3673 return (bp->b_error ? bp->b_error : EIO); 3674 return (0); 3675 } 3676 3677 int 3678 biowait(struct bio *bio, const char *wmesg) 3679 { 3680 return(_biowait(bio, wmesg, 0)); 3681 } 3682 3683 int 3684 biowait_timeout(struct bio *bio, const char *wmesg, int to) 3685 { 3686 return(_biowait(bio, wmesg, to)); 3687 } 3688 3689 /* 3690 * This associates a tracking count with an I/O. vn_strategy() and 3691 * dev_dstrategy() do this automatically but there are a few cases 3692 * where a vnode or device layer is bypassed when a block translation 3693 * is cached. In such cases bio_start_transaction() may be called on 3694 * the bypassed layers so the system gets an I/O in progress indication 3695 * for those higher layers. 3696 */ 3697 void 3698 bio_start_transaction(struct bio *bio, struct bio_track *track) 3699 { 3700 bio->bio_track = track; 3701 bio_track_ref(track); 3702 dsched_buf_enter(bio->bio_buf); /* might stack */ 3703 } 3704 3705 /* 3706 * Initiate I/O on a vnode. 3707 * 3708 * SWAPCACHE OPERATION: 3709 * 3710 * Real buffer cache buffers have a non-NULL bp->b_vp. Unfortunately 3711 * devfs also uses b_vp for fake buffers so we also have to check 3712 * that B_PAGING is 0. In this case the passed 'vp' is probably the 3713 * underlying block device. The swap assignments are related to the 3714 * buffer cache buffer's b_vp, not the passed vp. 3715 * 3716 * The passed vp == bp->b_vp only in the case where the strategy call 3717 * is made on the vp itself for its own buffers (a regular file or 3718 * block device vp). The filesystem usually then re-calls vn_strategy() 3719 * after translating the request to an underlying device. 3720 * 3721 * Cluster buffers set B_CLUSTER and the passed vp is the vp of the 3722 * underlying buffer cache buffers. 3723 * 3724 * We can only deal with page-aligned buffers at the moment, because 3725 * we can't tell what the real dirty state for pages straddling a buffer 3726 * are. 3727 * 3728 * In order to call swap_pager_strategy() we must provide the VM object 3729 * and base offset for the underlying buffer cache pages so it can find 3730 * the swap blocks. 3731 */ 3732 void 3733 vn_strategy(struct vnode *vp, struct bio *bio) 3734 { 3735 struct bio_track *track; 3736 struct buf *bp = bio->bio_buf; 3737 3738 KKASSERT(bp->b_cmd != BUF_CMD_DONE); 3739 3740 /* 3741 * Set when an I/O is issued on the bp. Cleared by consumers 3742 * (aka HAMMER), allowing the consumer to determine if I/O had 3743 * actually occurred. 
3744 */ 3745 bp->b_flags |= B_IOISSUED; 3746 3747 /* 3748 * Handle the swap cache intercept. 3749 */ 3750 if (vn_cache_strategy(vp, bio)) 3751 return; 3752 3753 /* 3754 * Otherwise do the operation through the filesystem 3755 */ 3756 if (bp->b_cmd == BUF_CMD_READ) 3757 track = &vp->v_track_read; 3758 else 3759 track = &vp->v_track_write; 3760 KKASSERT((bio->bio_flags & BIO_DONE) == 0); 3761 bio->bio_track = track; 3762 bio_track_ref(track); 3763 dsched_buf_enter(bp); /* might stack */ 3764 vop_strategy(*vp->v_ops, vp, bio); 3765 } 3766 3767 static void vn_cache_strategy_callback(struct bio *bio); 3768 3769 int 3770 vn_cache_strategy(struct vnode *vp, struct bio *bio) 3771 { 3772 struct buf *bp = bio->bio_buf; 3773 struct bio *nbio; 3774 vm_object_t object; 3775 vm_page_t m; 3776 int i; 3777 3778 /* 3779 * Stop using swapcache if paniced, dumping, or dumped 3780 */ 3781 if (panicstr || dumping) 3782 return(0); 3783 3784 /* 3785 * Is this buffer cache buffer suitable for reading from 3786 * the swap cache? 3787 */ 3788 if (vm_swapcache_read_enable == 0 || 3789 bp->b_cmd != BUF_CMD_READ || 3790 ((bp->b_flags & B_CLUSTER) == 0 && 3791 (bp->b_vp == NULL || (bp->b_flags & B_PAGING))) || 3792 ((int)bp->b_loffset & PAGE_MASK) != 0 || 3793 (bp->b_bcount & PAGE_MASK) != 0) { 3794 return(0); 3795 } 3796 3797 /* 3798 * Figure out the original VM object (it will match the underlying 3799 * VM pages). Note that swap cached data uses page indices relative 3800 * to that object, not relative to bio->bio_offset. 3801 */ 3802 if (bp->b_flags & B_CLUSTER) 3803 object = vp->v_object; 3804 else 3805 object = bp->b_vp->v_object; 3806 3807 /* 3808 * In order to be able to use the swap cache all underlying VM 3809 * pages must be marked as such, and we can't have any bogus pages. 3810 */ 3811 for (i = 0; i < bp->b_xio.xio_npages; ++i) { 3812 m = bp->b_xio.xio_pages[i]; 3813 if ((m->flags & PG_SWAPPED) == 0) 3814 break; 3815 if (m == bogus_page) 3816 break; 3817 } 3818 3819 /* 3820 * If we are good then issue the I/O using swap_pager_strategy(). 3821 * 3822 * We can only do this if the buffer actually supports object-backed 3823 * I/O. If it doesn't npages will be 0. 3824 */ 3825 if (i && i == bp->b_xio.xio_npages) { 3826 m = bp->b_xio.xio_pages[0]; 3827 nbio = push_bio(bio); 3828 nbio->bio_done = vn_cache_strategy_callback; 3829 nbio->bio_offset = ptoa(m->pindex); 3830 KKASSERT(m->object == object); 3831 swap_pager_strategy(object, nbio); 3832 return(1); 3833 } 3834 return(0); 3835 } 3836 3837 /* 3838 * This is a bit of a hack but since the vn_cache_strategy() function can 3839 * override a VFS's strategy function we must make sure that the bio, which 3840 * is probably bio2, doesn't leak an unexpected offset value back to the 3841 * filesystem. The filesystem (e.g. UFS) might otherwise assume that the 3842 * bio went through its own file strategy function and the the bio2 offset 3843 * is a cached disk offset when, in fact, it isn't. 3844 */ 3845 static void 3846 vn_cache_strategy_callback(struct bio *bio) 3847 { 3848 bio->bio_offset = NOOFFSET; 3849 biodone(pop_bio(bio)); 3850 } 3851 3852 /* 3853 * bpdone: 3854 * 3855 * Finish I/O on a buffer after all BIOs have been processed. 3856 * Called when the bio chain is exhausted or by biowait. If called 3857 * by biowait, elseit is typically 0. 3858 * 3859 * bpdone is also responsible for setting B_CACHE in a B_VMIO bp. 3860 * In a non-VMIO bp, B_CACHE will be set on the next getblk() 3861 * assuming B_INVAL is clear. 
3862 * 3863 * For the VMIO case, we set B_CACHE if the op was a read and no 3864 * read error occured, or if the op was a write. B_CACHE is never 3865 * set if the buffer is invalid or otherwise uncacheable. 3866 * 3867 * bpdone does not mess with B_INVAL, allowing the I/O routine or the 3868 * initiator to leave B_INVAL set to brelse the buffer out of existance 3869 * in the biodone routine. 3870 * 3871 * bpdone is responsible for calling bundirty() on the buffer after a 3872 * successful write. We previously did this prior to initiating the 3873 * write under the assumption that the buffer might be dirtied again 3874 * while the write was in progress, however doing it before-hand creates 3875 * a race condition prior to the call to vn_strategy() where the 3876 * filesystem may not be aware that a dirty buffer is present. 3877 * It should not be possible for the buffer or its underlying pages to 3878 * be redirtied prior to bpdone()'s unbusying of the underlying VM 3879 * pages. 3880 */ 3881 void 3882 bpdone(struct buf *bp, int elseit) 3883 { 3884 buf_cmd_t cmd; 3885 3886 KASSERT(BUF_REFCNTNB(bp) > 0, 3887 ("bpdone: bp %p not busy %d", bp, BUF_REFCNTNB(bp))); 3888 KASSERT(bp->b_cmd != BUF_CMD_DONE, 3889 ("bpdone: bp %p already done!", bp)); 3890 3891 /* 3892 * No more BIOs are left. All completion functions have been dealt 3893 * with, now we clean up the buffer. 3894 */ 3895 cmd = bp->b_cmd; 3896 bp->b_cmd = BUF_CMD_DONE; 3897 3898 /* 3899 * Only reads and writes are processed past this point. 3900 */ 3901 if (cmd != BUF_CMD_READ && cmd != BUF_CMD_WRITE) { 3902 if (cmd == BUF_CMD_FREEBLKS) 3903 bp->b_flags |= B_NOCACHE; 3904 if (elseit) 3905 brelse(bp); 3906 return; 3907 } 3908 3909 /* 3910 * A failed write must re-dirty the buffer unless B_INVAL 3911 * was set. 3912 * 3913 * A successful write must clear the dirty flag. This is done after 3914 * the write to ensure that the buffer remains on the vnode's dirty 3915 * list for filesystem interlocks / checks until the write is actually 3916 * complete. HAMMER2 is sensitive to this issue. 3917 * 3918 * Only applicable to normal buffers (with VPs). vinum buffers may 3919 * not have a vp. 3920 * 3921 * Must be done prior to calling buf_complete() as the callback might 3922 * re-dirty the buffer. 3923 */ 3924 if (cmd == BUF_CMD_WRITE) { 3925 if ((bp->b_flags & (B_ERROR | B_INVAL)) == B_ERROR) { 3926 bp->b_flags &= ~B_NOCACHE; 3927 if (bp->b_vp) 3928 bdirty(bp); 3929 } else { 3930 if (bp->b_vp) 3931 bundirty(bp); 3932 } 3933 } 3934 3935 /* 3936 * Warning: softupdates may re-dirty the buffer, and HAMMER can do 3937 * a lot worse. 
XXX - move this above the clearing of b_cmd 3938 */ 3939 if (LIST_FIRST(&bp->b_dep) != NULL) 3940 buf_complete(bp); 3941 3942 if (bp->b_flags & B_VMIO) { 3943 int i; 3944 vm_ooffset_t foff; 3945 vm_page_t m; 3946 vm_object_t obj; 3947 int iosize; 3948 struct vnode *vp = bp->b_vp; 3949 3950 obj = vp->v_object; 3951 3952 #if defined(VFS_BIO_DEBUG) 3953 if (vp->v_auxrefs == 0) 3954 panic("bpdone: zero vnode hold count"); 3955 if ((vp->v_flag & VOBJBUF) == 0) 3956 panic("bpdone: vnode is not setup for merged cache"); 3957 #endif 3958 3959 foff = bp->b_loffset; 3960 KASSERT(foff != NOOFFSET, ("bpdone: no buffer offset")); 3961 KASSERT(obj != NULL, ("bpdone: missing VM object")); 3962 3963 #if defined(VFS_BIO_DEBUG) 3964 if (obj->paging_in_progress < bp->b_xio.xio_npages) { 3965 kprintf("bpdone: paging in progress(%d) < " 3966 "bp->b_xio.xio_npages(%d)\n", 3967 obj->paging_in_progress, 3968 bp->b_xio.xio_npages); 3969 } 3970 #endif 3971 3972 /* 3973 * Set B_CACHE if the op was a normal read and no error 3974 * occured. B_CACHE is set for writes in the b*write() 3975 * routines. 3976 */ 3977 iosize = bp->b_bcount - bp->b_resid; 3978 if (cmd == BUF_CMD_READ && 3979 (bp->b_flags & (B_INVAL|B_NOCACHE|B_ERROR)) == 0) { 3980 bp->b_flags |= B_CACHE; 3981 } 3982 3983 vm_object_hold(obj); 3984 for (i = 0; i < bp->b_xio.xio_npages; i++) { 3985 int resid; 3986 int isbogus; 3987 3988 resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; 3989 if (resid > iosize) 3990 resid = iosize; 3991 3992 /* 3993 * cleanup bogus pages, restoring the originals. Since 3994 * the originals should still be wired, we don't have 3995 * to worry about interrupt/freeing races destroying 3996 * the VM object association. 3997 */ 3998 m = bp->b_xio.xio_pages[i]; 3999 if (m == bogus_page) { 4000 if ((bp->b_flags & B_HASBOGUS) == 0) 4001 panic("bpdone: bp %p corrupt bogus", bp); 4002 m = vm_page_lookup(obj, OFF_TO_IDX(foff)); 4003 if (m == NULL) 4004 panic("bpdone: page disappeared"); 4005 bp->b_xio.xio_pages[i] = m; 4006 isbogus = 1; 4007 } else { 4008 isbogus = 0; 4009 } 4010 #if defined(VFS_BIO_DEBUG) 4011 if (OFF_TO_IDX(foff) != m->pindex) { 4012 kprintf("bpdone: foff(%lu)/m->pindex(%ld) " 4013 "mismatch\n", 4014 (unsigned long)foff, (long)m->pindex); 4015 } 4016 #endif 4017 4018 /* 4019 * In the write case, the valid and clean bits are 4020 * already changed correctly (see bdwrite()), so we 4021 * only need to do this here in the read case. 4022 */ 4023 vm_page_busy_wait(m, FALSE, "bpdpgw"); 4024 if (cmd == BUF_CMD_READ && isbogus == 0 && resid > 0) 4025 vfs_clean_one_page(bp, i, m); 4026 4027 /* 4028 * when debugging new filesystems or buffer I/O 4029 * methods, this is the most common error that pops 4030 * up. if you see this, you have not set the page 4031 * busy flag correctly!!! 
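 *
 * (Sketch of the expected setup, assuming the usual path through
 * this file: the backing pages are soft-busied before the transfer
 * is initiated, normally by vfs_busy_pages(), e.g.
 *
 *	vfs_busy_pages(vp, bp);
 *	vn_strategy(vp, &bp->b_bio1);
 *
 * so that m->busy is still non-zero by the time bpdone() runs.)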
4032 */ 4033 if (m->busy == 0) { 4034 kprintf("bpdone: page busy < 0, " 4035 "pindex: %d, foff: 0x(%x,%x), " 4036 "resid: %d, index: %d\n", 4037 (int) m->pindex, (int)(foff >> 32), 4038 (int) foff & 0xffffffff, resid, i); 4039 if (!vn_isdisk(vp, NULL)) 4040 kprintf(" iosize: %ld, loffset: %lld, " 4041 "flags: 0x%08x, npages: %d\n", 4042 bp->b_vp->v_mount->mnt_stat.f_iosize, 4043 (long long)bp->b_loffset, 4044 bp->b_flags, bp->b_xio.xio_npages); 4045 else 4046 kprintf(" VDEV, loffset: %lld, flags: 0x%08x, npages: %d\n", 4047 (long long)bp->b_loffset, 4048 bp->b_flags, bp->b_xio.xio_npages); 4049 kprintf(" valid: 0x%x, dirty: 0x%x, " 4050 "wired: %d\n", 4051 m->valid, m->dirty, 4052 m->wire_count); 4053 panic("bpdone: page busy < 0"); 4054 } 4055 vm_page_io_finish(m); 4056 vm_page_wakeup(m); 4057 vm_object_pip_wakeup(obj); 4058 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 4059 iosize -= resid; 4060 } 4061 if (bp->b_flags & B_HASBOGUS) { 4062 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 4063 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 4064 bp->b_flags &= ~B_HASBOGUS; 4065 } 4066 vm_object_drop(obj); 4067 } 4068 4069 /* 4070 * Finish up by releasing the buffer. There are no more synchronous 4071 * or asynchronous completions, those were handled by bio_done 4072 * callbacks. 4073 */ 4074 if (elseit) { 4075 if (bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR|B_RELBUF)) 4076 brelse(bp); 4077 else 4078 bqrelse(bp); 4079 } 4080 } 4081 4082 /* 4083 * Normal biodone. 4084 */ 4085 void 4086 biodone(struct bio *bio) 4087 { 4088 struct buf *bp = bio->bio_buf; 4089 4090 runningbufwakeup(bp); 4091 4092 /* 4093 * Run up the chain of BIO's. Leave b_cmd intact for the duration. 4094 */ 4095 while (bio) { 4096 biodone_t *done_func; 4097 struct bio_track *track; 4098 4099 /* 4100 * BIO tracking. Most but not all BIOs are tracked. 4101 */ 4102 if ((track = bio->bio_track) != NULL) { 4103 bio_track_rel(track); 4104 bio->bio_track = NULL; 4105 } 4106 4107 /* 4108 * A bio_done function terminates the loop. The function 4109 * will be responsible for any further chaining and/or 4110 * buffer management. 4111 * 4112 * WARNING! The done function can deallocate the buffer! 4113 */ 4114 if ((done_func = bio->bio_done) != NULL) { 4115 bio->bio_done = NULL; 4116 done_func(bio); 4117 return; 4118 } 4119 bio = bio->bio_prev; 4120 } 4121 4122 /* 4123 * If we've run out of bio's do normal [a]synchronous completion. 4124 */ 4125 bpdone(bp, 1); 4126 } 4127 4128 /* 4129 * Synchronous biodone - this terminates a synchronous BIO. 4130 * 4131 * bpdone() is called with elseit=FALSE, leaving the buffer completed 4132 * but still locked. The caller must brelse() the buffer after waiting 4133 * for completion. 4134 */ 4135 void 4136 biodone_sync(struct bio *bio) 4137 { 4138 struct buf *bp = bio->bio_buf; 4139 int flags; 4140 int nflags; 4141 4142 KKASSERT(bio == &bp->b_bio1); 4143 bpdone(bp, 0); 4144 4145 for (;;) { 4146 flags = bio->bio_flags; 4147 nflags = (flags | BIO_DONE) & ~BIO_WANT; 4148 4149 if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) { 4150 if (flags & BIO_WANT) 4151 wakeup(bio); 4152 break; 4153 } 4154 } 4155 } 4156 4157 /* 4158 * vfs_unbusy_pages: 4159 * 4160 * This routine is called in lieu of iodone in the case of 4161 * incomplete I/O. This keeps the busy status for pages 4162 * consistant. 
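 *
 * Minimal usage sketch (hypothetical error path, not from a specific
 * driver): if the transfer cannot be initiated after the pages were
 * busied, unwind the soft-busy state instead of going through the
 * normal completion path:
 *
 *	vfs_busy_pages(vp, bp);
 *	if (...device refused the request, hypothetical check...) {
 *		vfs_unbusy_pages(bp);
 *		bp->b_flags |= B_ERROR;
 *		bp->b_error = ENXIO;
 *	}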
4163 */ 4164 void 4165 vfs_unbusy_pages(struct buf *bp) 4166 { 4167 int i; 4168 4169 runningbufwakeup(bp); 4170 4171 if (bp->b_flags & B_VMIO) { 4172 struct vnode *vp = bp->b_vp; 4173 vm_object_t obj; 4174 4175 obj = vp->v_object; 4176 vm_object_hold(obj); 4177 4178 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4179 vm_page_t m = bp->b_xio.xio_pages[i]; 4180 4181 /* 4182 * When restoring bogus changes the original pages 4183 * should still be wired, so we are in no danger of 4184 * losing the object association and do not need 4185 * critical section protection particularly. 4186 */ 4187 if (m == bogus_page) { 4188 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_loffset) + i); 4189 if (!m) { 4190 panic("vfs_unbusy_pages: page missing"); 4191 } 4192 bp->b_xio.xio_pages[i] = m; 4193 } 4194 vm_page_busy_wait(m, FALSE, "bpdpgw"); 4195 vm_page_io_finish(m); 4196 vm_page_wakeup(m); 4197 vm_object_pip_wakeup(obj); 4198 } 4199 if (bp->b_flags & B_HASBOGUS) { 4200 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 4201 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 4202 bp->b_flags &= ~B_HASBOGUS; 4203 } 4204 vm_object_drop(obj); 4205 } 4206 } 4207 4208 /* 4209 * vfs_busy_pages: 4210 * 4211 * This routine is called before a device strategy routine. 4212 * It is used to tell the VM system that paging I/O is in 4213 * progress, and treat the pages associated with the buffer 4214 * almost as being PG_BUSY. Also the object 'paging_in_progress' 4215 * flag is handled to make sure that the object doesn't become 4216 * inconsistant. 4217 * 4218 * Since I/O has not been initiated yet, certain buffer flags 4219 * such as B_ERROR or B_INVAL may be in an inconsistant state 4220 * and should be ignored. 4221 */ 4222 void 4223 vfs_busy_pages(struct vnode *vp, struct buf *bp) 4224 { 4225 int i, bogus; 4226 struct lwp *lp = curthread->td_lwp; 4227 4228 /* 4229 * The buffer's I/O command must already be set. If reading, 4230 * B_CACHE must be 0 (double check against callers only doing 4231 * I/O when B_CACHE is 0). 4232 */ 4233 KKASSERT(bp->b_cmd != BUF_CMD_DONE); 4234 KKASSERT(bp->b_cmd == BUF_CMD_WRITE || (bp->b_flags & B_CACHE) == 0); 4235 4236 if (bp->b_flags & B_VMIO) { 4237 vm_object_t obj; 4238 4239 obj = vp->v_object; 4240 KASSERT(bp->b_loffset != NOOFFSET, 4241 ("vfs_busy_pages: no buffer offset")); 4242 4243 /* 4244 * Busy all the pages. We have to busy them all at once 4245 * to avoid deadlocks. 4246 */ 4247 retry: 4248 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4249 vm_page_t m = bp->b_xio.xio_pages[i]; 4250 4251 if (vm_page_busy_try(m, FALSE)) { 4252 vm_page_sleep_busy(m, FALSE, "vbpage"); 4253 while (--i >= 0) 4254 vm_page_wakeup(bp->b_xio.xio_pages[i]); 4255 goto retry; 4256 } 4257 } 4258 4259 /* 4260 * Setup for I/O, soft-busy the page right now because 4261 * the next loop may block. 4262 */ 4263 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4264 vm_page_t m = bp->b_xio.xio_pages[i]; 4265 4266 if ((bp->b_flags & B_CLUSTER) == 0) { 4267 vm_object_pip_add(obj, 1); 4268 vm_page_io_start(m); 4269 } 4270 } 4271 4272 /* 4273 * Adjust protections for I/O and do bogus-page mapping. 4274 * Assume that vm_page_protect() can block (it can block 4275 * if VM_PROT_NONE, don't take any chances regardless). 4276 * 4277 * In particular note that for writes we must incorporate 4278 * page dirtyness from the VM system into the buffer's 4279 * dirty range. 
4280 * 4281 * For reads we theoretically must incorporate page dirtyness 4282 * from the VM system to determine if the page needs bogus 4283 * replacement, but we shortcut the test by simply checking 4284 * that all m->valid bits are set, indicating that the page 4285 * is fully valid and does not need to be re-read. For any 4286 * VM system dirtyness the page will also be fully valid 4287 * since it was mapped at one point. 4288 */ 4289 bogus = 0; 4290 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4291 vm_page_t m = bp->b_xio.xio_pages[i]; 4292 4293 if (bp->b_cmd == BUF_CMD_WRITE) { 4294 /* 4295 * When readying a vnode-backed buffer for 4296 * a write we must zero-fill any invalid 4297 * portions of the backing VM pages, mark 4298 * it valid and clear related dirty bits. 4299 * 4300 * vfs_clean_one_page() incorporates any 4301 * VM dirtyness and updates the b_dirtyoff 4302 * range (after we've made the page RO). 4303 * 4304 * It is also expected that the pmap modified 4305 * bit has already been cleared by the 4306 * vm_page_protect(). We may not be able 4307 * to clear all dirty bits for a page if it 4308 * was also memory mapped (NFS). 4309 * 4310 * Finally be sure to unassign any swap-cache 4311 * backing store as it is now stale. 4312 */ 4313 vm_page_protect(m, VM_PROT_READ); 4314 vfs_clean_one_page(bp, i, m); 4315 swap_pager_unswapped(m); 4316 } else if (m->valid == VM_PAGE_BITS_ALL) { 4317 /* 4318 * When readying a vnode-backed buffer for 4319 * read we must replace any dirty pages with 4320 * a bogus page so dirty data is not destroyed 4321 * when filling gaps. 4322 * 4323 * To avoid testing whether the page is 4324 * dirty we instead test that the page was 4325 * at some point mapped (m->valid fully 4326 * valid) with the understanding that 4327 * this also covers the dirty case. 4328 */ 4329 bp->b_xio.xio_pages[i] = bogus_page; 4330 bp->b_flags |= B_HASBOGUS; 4331 bogus++; 4332 } else if (m->valid & m->dirty) { 4333 /* 4334 * This case should not occur as partial 4335 * dirtyment can only happen if the buffer 4336 * is B_CACHE, and this code is not entered 4337 * if the buffer is B_CACHE. 4338 */ 4339 kprintf("Warning: vfs_busy_pages - page not " 4340 "fully valid! loff=%jx bpf=%08x " 4341 "idx=%d val=%02x dir=%02x\n", 4342 (uintmax_t)bp->b_loffset, bp->b_flags, 4343 i, m->valid, m->dirty); 4344 vm_page_protect(m, VM_PROT_NONE); 4345 } else { 4346 /* 4347 * The page is not valid and can be made 4348 * part of the read. 4349 */ 4350 vm_page_protect(m, VM_PROT_NONE); 4351 } 4352 vm_page_wakeup(m); 4353 } 4354 if (bogus) { 4355 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 4356 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 4357 } 4358 } 4359 4360 /* 4361 * This is the easiest place to put the process accounting for the I/O 4362 * for now. 4363 */ 4364 if (lp != NULL) { 4365 if (bp->b_cmd == BUF_CMD_READ) 4366 lp->lwp_ru.ru_inblock++; 4367 else 4368 lp->lwp_ru.ru_oublock++; 4369 } 4370 } 4371 4372 /* 4373 * Tell the VM system that the pages associated with this buffer 4374 * are clean. This is used for delayed writes where the data is 4375 * going to go to disk eventually without additional VM intevention. 4376 * 4377 * NOTE: While we only really need to clean through to b_bcount, we 4378 * just go ahead and clean through to b_bufsize. 
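 *
 * (Illustrative context, hedged: the delayed-write path is the
 * typical caller, conceptually
 *
 *	bdirty(bp);			buffer now carries the dirty data
 *	vfs_clean_pages(bp);		backing pages marked clean
 *	bqrelse(bp);
 *
 * so the VM system does not independently try to flush pages whose
 * dirty data is already tracked by the B_DELWRI buffer.)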
4379 */ 4380 static void 4381 vfs_clean_pages(struct buf *bp) 4382 { 4383 vm_page_t m; 4384 int i; 4385 4386 if ((bp->b_flags & B_VMIO) == 0) 4387 return; 4388 4389 KASSERT(bp->b_loffset != NOOFFSET, 4390 ("vfs_clean_pages: no buffer offset")); 4391 4392 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4393 m = bp->b_xio.xio_pages[i]; 4394 vfs_clean_one_page(bp, i, m); 4395 } 4396 } 4397 4398 /* 4399 * vfs_clean_one_page: 4400 * 4401 * Set the valid bits and clear the dirty bits in a page within a 4402 * buffer. The range is restricted to the buffer's size and the 4403 * buffer's logical offset might index into the first page. 4404 * 4405 * The caller has busied or soft-busied the page and it is not mapped, 4406 * test and incorporate the dirty bits into b_dirtyoff/end before 4407 * clearing them. Note that we need to clear the pmap modified bits 4408 * after determining the the page was dirty, vm_page_set_validclean() 4409 * does not do it for us. 4410 * 4411 * This routine is typically called after a read completes (dirty should 4412 * be zero in that case as we are not called on bogus-replace pages), 4413 * or before a write is initiated. 4414 */ 4415 static void 4416 vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m) 4417 { 4418 int bcount; 4419 int xoff; 4420 int soff; 4421 int eoff; 4422 4423 /* 4424 * Calculate offset range within the page but relative to buffer's 4425 * loffset. loffset might be offset into the first page. 4426 */ 4427 xoff = (int)bp->b_loffset & PAGE_MASK; /* loffset offset into pg 0 */ 4428 bcount = bp->b_bcount + xoff; /* offset adjusted */ 4429 4430 if (pageno == 0) { 4431 soff = xoff; 4432 eoff = PAGE_SIZE; 4433 } else { 4434 soff = (pageno << PAGE_SHIFT); 4435 eoff = soff + PAGE_SIZE; 4436 } 4437 if (eoff > bcount) 4438 eoff = bcount; 4439 if (soff >= eoff) 4440 return; 4441 4442 /* 4443 * Test dirty bits and adjust b_dirtyoff/end. 4444 * 4445 * If dirty pages are incorporated into the bp any prior 4446 * B_NEEDCOMMIT state (NFS) must be cleared because the 4447 * caller has not taken into account the new dirty data. 4448 * 4449 * If the page was memory mapped the dirty bits might go beyond the 4450 * end of the buffer, but we can't really make the assumption that 4451 * a file EOF straddles the buffer (even though this is the case for 4452 * NFS if B_NEEDCOMMIT is also set). So for the purposes of clearing 4453 * B_NEEDCOMMIT we only test the dirty bits covered by the buffer. 4454 * This also saves some console spam. 4455 * 4456 * When clearing B_NEEDCOMMIT we must also clear B_CLUSTEROK, 4457 * NFS can handle huge commits but not huge writes. 4458 */ 4459 vm_page_test_dirty(m); 4460 if (m->dirty) { 4461 if ((bp->b_flags & B_NEEDCOMMIT) && 4462 (m->dirty & vm_page_bits(soff & PAGE_MASK, eoff - soff))) { 4463 if (debug_commit) 4464 kprintf("Warning: vfs_clean_one_page: bp %p " 4465 "loff=%jx,%d flgs=%08x clr B_NEEDCOMMIT" 4466 " cmd %d vd %02x/%02x x/s/e %d %d %d " 4467 "doff/end %d %d\n", 4468 bp, (uintmax_t)bp->b_loffset, bp->b_bcount, 4469 bp->b_flags, bp->b_cmd, 4470 m->valid, m->dirty, xoff, soff, eoff, 4471 bp->b_dirtyoff, bp->b_dirtyend); 4472 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 4473 if (debug_commit) 4474 print_backtrace(-1); 4475 } 4476 /* 4477 * Only clear the pmap modified bits if ALL the dirty bits 4478 * are set, otherwise the system might mis-clear portions 4479 * of a page. 
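 *
 * (Example: with 512-byte DEV_BSIZE blocks a 4K page carries eight
 * dirty bits.  If only some of them are set and we cleared the pmap
 * modified bit anyway, dirty state for the blocks this buffer does
 * not cover could be lost.)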
4480 */ 4481 if (m->dirty == VM_PAGE_BITS_ALL && 4482 (bp->b_flags & B_NEEDCOMMIT) == 0) { 4483 pmap_clear_modify(m); 4484 } 4485 if (bp->b_dirtyoff > soff - xoff) 4486 bp->b_dirtyoff = soff - xoff; 4487 if (bp->b_dirtyend < eoff - xoff) 4488 bp->b_dirtyend = eoff - xoff; 4489 } 4490 4491 /* 4492 * Set related valid bits, clear related dirty bits. 4493 * Does not mess with the pmap modified bit. 4494 * 4495 * WARNING! We cannot just clear all of m->dirty here as the 4496 * buffer cache buffers may use a DEV_BSIZE'd aligned 4497 * block size, or have an odd size (e.g. NFS at file EOF). 4498 * The putpages code can clear m->dirty to 0. 4499 * 4500 * If a VOP_WRITE generates a buffer cache buffer which 4501 * covers the same space as mapped writable pages the 4502 * buffer flush might not be able to clear all the dirty 4503 * bits and still require a putpages from the VM system 4504 * to finish it off. 4505 * 4506 * WARNING! vm_page_set_validclean() currently assumes vm_token 4507 * is held. The page might not be busied (bdwrite() case). 4508 * XXX remove this comment once we've validated that this 4509 * is no longer an issue. 4510 */ 4511 vm_page_set_validclean(m, soff & PAGE_MASK, eoff - soff); 4512 } 4513 4514 #if 0 4515 /* 4516 * Similar to vfs_clean_one_page() but sets the bits to valid and dirty. 4517 * The page data is assumed to be valid (there is no zeroing here). 4518 */ 4519 static void 4520 vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m) 4521 { 4522 int bcount; 4523 int xoff; 4524 int soff; 4525 int eoff; 4526 4527 /* 4528 * Calculate offset range within the page but relative to buffer's 4529 * loffset. loffset might be offset into the first page. 4530 */ 4531 xoff = (int)bp->b_loffset & PAGE_MASK; /* loffset offset into pg 0 */ 4532 bcount = bp->b_bcount + xoff; /* offset adjusted */ 4533 4534 if (pageno == 0) { 4535 soff = xoff; 4536 eoff = PAGE_SIZE; 4537 } else { 4538 soff = (pageno << PAGE_SHIFT); 4539 eoff = soff + PAGE_SIZE; 4540 } 4541 if (eoff > bcount) 4542 eoff = bcount; 4543 if (soff >= eoff) 4544 return; 4545 vm_page_set_validdirty(m, soff & PAGE_MASK, eoff - soff); 4546 } 4547 #endif 4548 4549 /* 4550 * vfs_bio_clrbuf: 4551 * 4552 * Clear a buffer. This routine essentially fakes an I/O, so we need 4553 * to clear B_ERROR and B_INVAL. 4554 * 4555 * Note that while we only theoretically need to clear through b_bcount, 4556 * we go ahead and clear through b_bufsize. 
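 *
 * Hedged usage sketch (the real call sites live in the individual
 * filesystems, not in this file): a freshly allocated block has
 * nothing useful on disk to read back, so the buffer is simply
 * zeroed in place of an I/O:
 *
 *	bp = getblk(vp, loffset, bsize, 0, 0);
 *	vfs_bio_clrbuf(bp);	behaves like a successful read of zeroes
 *	...copy in the new data, then bdwrite(bp) or bwrite(bp)...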
4557 */ 4558 4559 void 4560 vfs_bio_clrbuf(struct buf *bp) 4561 { 4562 int i, mask = 0; 4563 caddr_t sa, ea; 4564 if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { 4565 bp->b_flags &= ~(B_INVAL | B_EINTR | B_ERROR); 4566 if ((bp->b_xio.xio_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && 4567 (bp->b_loffset & PAGE_MASK) == 0) { 4568 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; 4569 if ((bp->b_xio.xio_pages[0]->valid & mask) == mask) { 4570 bp->b_resid = 0; 4571 return; 4572 } 4573 if ((bp->b_xio.xio_pages[0]->valid & mask) == 0) { 4574 bzero(bp->b_data, bp->b_bufsize); 4575 bp->b_xio.xio_pages[0]->valid |= mask; 4576 bp->b_resid = 0; 4577 return; 4578 } 4579 } 4580 sa = bp->b_data; 4581 for(i=0;i<bp->b_xio.xio_npages;i++,sa=ea) { 4582 int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; 4583 ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); 4584 ea = (caddr_t)(vm_offset_t)ulmin( 4585 (u_long)(vm_offset_t)ea, 4586 (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); 4587 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; 4588 if ((bp->b_xio.xio_pages[i]->valid & mask) == mask) 4589 continue; 4590 if ((bp->b_xio.xio_pages[i]->valid & mask) == 0) { 4591 bzero(sa, ea - sa); 4592 } else { 4593 for (; sa < ea; sa += DEV_BSIZE, j++) { 4594 if ((bp->b_xio.xio_pages[i]->valid & 4595 (1<<j)) == 0) { 4596 bzero(sa, DEV_BSIZE); 4597 } 4598 } 4599 } 4600 bp->b_xio.xio_pages[i]->valid |= mask; 4601 } 4602 bp->b_resid = 0; 4603 } else { 4604 clrbuf(bp); 4605 } 4606 } 4607 4608 /* 4609 * vm_hold_load_pages: 4610 * 4611 * Load pages into the buffer's address space. The pages are 4612 * allocated from the kernel object in order to reduce interference 4613 * with the any VM paging I/O activity. The range of loaded 4614 * pages will be wired. 4615 * 4616 * If a page cannot be allocated, the 'pagedaemon' is woken up to 4617 * retrieve the full range (to - from) of pages. 4618 */ 4619 void 4620 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 4621 { 4622 vm_offset_t pg; 4623 vm_page_t p; 4624 int index; 4625 4626 to = round_page(to); 4627 from = round_page(from); 4628 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4629 4630 pg = from; 4631 while (pg < to) { 4632 /* 4633 * Note: must allocate system pages since blocking here 4634 * could intefere with paging I/O, no matter which 4635 * process we are. 4636 */ 4637 vm_object_hold(&kernel_object); 4638 p = bio_page_alloc(bp, &kernel_object, pg >> PAGE_SHIFT, 4639 (vm_pindex_t)((to - pg) >> PAGE_SHIFT)); 4640 vm_object_drop(&kernel_object); 4641 if (p) { 4642 vm_page_wire(p); 4643 p->valid = VM_PAGE_BITS_ALL; 4644 pmap_kenter_noinval(pg, VM_PAGE_TO_PHYS(p)); 4645 bp->b_xio.xio_pages[index] = p; 4646 vm_page_wakeup(p); 4647 4648 pg += PAGE_SIZE; 4649 ++index; 4650 } 4651 } 4652 pmap_invalidate_range(&kernel_pmap, from, to); 4653 bp->b_xio.xio_npages = index; 4654 } 4655 4656 /* 4657 * Allocate a page for a buffer cache buffer. 4658 * 4659 * If NULL is returned the caller is expected to retry (typically check if 4660 * the page already exists on retry before trying to allocate one). 4661 * 4662 * NOTE! Low-memory handling is dealt with in b[q]relse(), not here. This 4663 * function will use the system reserve with the hope that the page 4664 * allocations can be returned to PQ_CACHE/PQ_FREE when the caller 4665 * is done with the buffer. 4666 * 4667 * NOTE! However, TMPFS is a special case because flushing a dirty buffer 4668 * to TMPFS doesn't clean the page. 
For TMPFS, only the pagedaemon 4669 * is capable of retiring pages (to swap). For TMPFS we don't dig 4670 * into the system reserve because doing so could stall out pretty 4671 * much every process running on the system. 4672 */ 4673 static 4674 vm_page_t 4675 bio_page_alloc(struct buf *bp, vm_object_t obj, vm_pindex_t pg, int deficit) 4676 { 4677 int vmflags = VM_ALLOC_NORMAL | VM_ALLOC_NULL_OK; 4678 vm_page_t p; 4679 4680 ASSERT_LWKT_TOKEN_HELD(vm_object_token(obj)); 4681 4682 /* 4683 * Try a normal allocation first. 4684 */ 4685 p = vm_page_alloc(obj, pg, vmflags); 4686 if (p) 4687 return(p); 4688 if (vm_page_lookup(obj, pg)) 4689 return(NULL); 4690 vm_pageout_deficit += deficit; 4691 4692 /* 4693 * Try again, digging into the system reserve. 4694 * 4695 * Trying to recover pages from the buffer cache here can deadlock 4696 * against other threads trying to busy underlying pages so we 4697 * depend on the code in brelse() and bqrelse() to free/cache the 4698 * underlying buffer cache pages when memory is low. 4699 */ 4700 if (curthread->td_flags & TDF_SYSTHREAD) 4701 vmflags |= VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT; 4702 else if (bp->b_vp && bp->b_vp->v_tag == VT_TMPFS) 4703 vmflags |= 0; 4704 else 4705 vmflags |= VM_ALLOC_SYSTEM; 4706 4707 /*recoverbufpages();*/ 4708 p = vm_page_alloc(obj, pg, vmflags); 4709 if (p) 4710 return(p); 4711 if (vm_page_lookup(obj, pg)) 4712 return(NULL); 4713 4714 /* 4715 * Wait for memory to free up and try again 4716 */ 4717 if (vm_page_count_severe()) 4718 ++lowmempgallocs; 4719 vm_wait(hz / 20 + 1); 4720 4721 p = vm_page_alloc(obj, pg, vmflags); 4722 if (p) 4723 return(p); 4724 if (vm_page_lookup(obj, pg)) 4725 return(NULL); 4726 4727 /* 4728 * Ok, now we are really in trouble. 4729 */ 4730 { 4731 static struct krate biokrate = { .freq = 1 }; 4732 krateprintf(&biokrate, 4733 "Warning: bio_page_alloc: memory exhausted " 4734 "during buffer cache page allocation from %s\n", 4735 curthread->td_comm); 4736 } 4737 if (curthread->td_flags & TDF_SYSTHREAD) 4738 vm_wait(hz / 20 + 1); 4739 else 4740 vm_wait(hz / 2 + 1); 4741 return (NULL); 4742 } 4743 4744 /* 4745 * vm_hold_free_pages: 4746 * 4747 * Return pages associated with the buffer back to the VM system. 4748 * 4749 * The range of pages underlying the buffer's address space will 4750 * be unmapped and un-wired. 4751 */ 4752 void 4753 vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 4754 { 4755 vm_offset_t pg; 4756 vm_page_t p; 4757 int index, newnpages; 4758 4759 from = round_page(from); 4760 to = round_page(to); 4761 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4762 newnpages = index; 4763 4764 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 4765 p = bp->b_xio.xio_pages[index]; 4766 if (p && (index < bp->b_xio.xio_npages)) { 4767 if (p->busy) { 4768 kprintf("vm_hold_free_pages: doffset: %lld, " 4769 "loffset: %lld\n", 4770 (long long)bp->b_bio2.bio_offset, 4771 (long long)bp->b_loffset); 4772 } 4773 bp->b_xio.xio_pages[index] = NULL; 4774 pmap_kremove_noinval(pg); 4775 vm_page_busy_wait(p, FALSE, "vmhldpg"); 4776 vm_page_unwire(p, 0); 4777 vm_page_free(p); 4778 } 4779 } 4780 pmap_invalidate_range(&kernel_pmap, from, to); 4781 bp->b_xio.xio_npages = newnpages; 4782 } 4783 4784 /* 4785 * vmapbuf: 4786 * 4787 * Map a user buffer into KVM via a pbuf. On return the buffer's 4788 * b_data, b_bufsize, and b_bcount will be set, and its XIO page array 4789 * initialized. 
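 *
 * Rough usage sketch (hedged; the real callers are the raw-I/O /
 * physio paths, and getpbuf_kva() is an assumption here): wire and
 * map the user pages, run the transfer, then undo the mapping with
 * vunmapbuf():
 *
 *	bp = getpbuf_kva(NULL);			(assumed pbuf allocator)
 *	bp->b_cmd = BUF_CMD_READ;
 *	if (vmapbuf(bp, uaddr, bytes) < 0)
 *		...fail with EFAULT...
 *	...set up bp->b_bio1.bio_offset for the target device...
 *	vn_strategy(devvp, &bp->b_bio1);
 *	biowait(&bp->b_bio1, "physrd");
 *	vunmapbuf(bp);
 *	relpbuf(bp, NULL);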
4790 */ 4791 int 4792 vmapbuf(struct buf *bp, caddr_t udata, int bytes) 4793 { 4794 caddr_t addr; 4795 vm_offset_t va; 4796 vm_page_t m; 4797 int vmprot; 4798 int error; 4799 int pidx; 4800 int i; 4801 4802 /* 4803 * bp had better have a command and it better be a pbuf. 4804 */ 4805 KKASSERT(bp->b_cmd != BUF_CMD_DONE); 4806 KKASSERT(bp->b_flags & B_PAGING); 4807 KKASSERT(bp->b_kvabase); 4808 4809 if (bytes < 0) 4810 return (-1); 4811 4812 /* 4813 * Map the user data into KVM. Mappings have to be page-aligned. 4814 */ 4815 addr = (caddr_t)trunc_page((vm_offset_t)udata); 4816 pidx = 0; 4817 4818 vmprot = VM_PROT_READ; 4819 if (bp->b_cmd == BUF_CMD_READ) 4820 vmprot |= VM_PROT_WRITE; 4821 4822 while (addr < udata + bytes) { 4823 /* 4824 * Do the vm_fault if needed; do the copy-on-write thing 4825 * when reading stuff off device into memory. 4826 * 4827 * vm_fault_page*() returns a held VM page. 4828 */ 4829 va = (addr >= udata) ? (vm_offset_t)addr : (vm_offset_t)udata; 4830 va = trunc_page(va); 4831 4832 m = vm_fault_page_quick(va, vmprot, &error); 4833 if (m == NULL) { 4834 for (i = 0; i < pidx; ++i) { 4835 vm_page_unhold(bp->b_xio.xio_pages[i]); 4836 bp->b_xio.xio_pages[i] = NULL; 4837 } 4838 return(-1); 4839 } 4840 bp->b_xio.xio_pages[pidx] = m; 4841 addr += PAGE_SIZE; 4842 ++pidx; 4843 } 4844 4845 /* 4846 * Map the page array and set the buffer fields to point to 4847 * the mapped data buffer. 4848 */ 4849 if (pidx > btoc(MAXPHYS)) 4850 panic("vmapbuf: mapped more than MAXPHYS"); 4851 pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_xio.xio_pages, pidx); 4852 4853 bp->b_xio.xio_npages = pidx; 4854 bp->b_data = bp->b_kvabase + ((int)(intptr_t)udata & PAGE_MASK); 4855 bp->b_bcount = bytes; 4856 bp->b_bufsize = bytes; 4857 4858 return(0); 4859 } 4860 4861 /* 4862 * vunmapbuf: 4863 * 4864 * Free the io map PTEs associated with this IO operation. 4865 * We also invalidate the TLB entries and restore the original b_addr. 4866 */ 4867 void 4868 vunmapbuf(struct buf *bp) 4869 { 4870 int pidx; 4871 int npages; 4872 4873 KKASSERT(bp->b_flags & B_PAGING); 4874 4875 npages = bp->b_xio.xio_npages; 4876 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); 4877 for (pidx = 0; pidx < npages; ++pidx) { 4878 vm_page_unhold(bp->b_xio.xio_pages[pidx]); 4879 bp->b_xio.xio_pages[pidx] = NULL; 4880 } 4881 bp->b_xio.xio_npages = 0; 4882 bp->b_data = bp->b_kvabase; 4883 } 4884 4885 /* 4886 * Scan all buffers in the system and issue the callback. 4887 */ 4888 int 4889 scan_all_buffers(int (*callback)(struct buf *, void *), void *info) 4890 { 4891 int count = 0; 4892 int error; 4893 long n; 4894 4895 for (n = 0; n < nbuf; ++n) { 4896 if ((error = callback(&buf[n], info)) < 0) { 4897 count = error; 4898 break; 4899 } 4900 count += error; 4901 } 4902 return (count); 4903 } 4904 4905 /* 4906 * nestiobuf_iodone: biodone callback for nested buffers and propagate 4907 * completion to the master buffer. 4908 */ 4909 static void 4910 nestiobuf_iodone(struct bio *bio) 4911 { 4912 struct bio *mbio; 4913 struct buf *mbp, *bp; 4914 struct devstat *stats; 4915 int error; 4916 int donebytes; 4917 4918 bp = bio->bio_buf; 4919 mbio = bio->bio_caller_info1.ptr; 4920 stats = bio->bio_caller_info2.ptr; 4921 mbp = mbio->bio_buf; 4922 4923 KKASSERT(bp->b_bcount <= bp->b_bufsize); 4924 KKASSERT(mbp != bp); 4925 4926 error = bp->b_error; 4927 if (bp->b_error == 0 && 4928 (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) { 4929 /* 4930 * Not all got transfered, raise an error. 
We have no way to 4931 * propagate these conditions to mbp. 4932 */ 4933 error = EIO; 4934 } 4935 4936 donebytes = bp->b_bufsize; 4937 4938 relpbuf(bp, NULL); 4939 4940 nestiobuf_done(mbio, donebytes, error, stats); 4941 } 4942 4943 void 4944 nestiobuf_done(struct bio *mbio, int donebytes, int error, struct devstat *stats) 4945 { 4946 struct buf *mbp; 4947 4948 mbp = mbio->bio_buf; 4949 4950 KKASSERT((int)(intptr_t)mbio->bio_driver_info > 0); 4951 4952 /* 4953 * If an error occured, propagate it to the master buffer. 4954 * 4955 * Several biodone()s may wind up running concurrently so 4956 * use an atomic op to adjust b_flags. 4957 */ 4958 if (error) { 4959 mbp->b_error = error; 4960 atomic_set_int(&mbp->b_flags, B_ERROR); 4961 } 4962 4963 /* 4964 * Decrement the operations in progress counter and terminate the 4965 * I/O if this was the last bit. 4966 */ 4967 if (atomic_fetchadd_int((int *)&mbio->bio_driver_info, -1) == 1) { 4968 mbp->b_resid = 0; 4969 if (stats) 4970 devstat_end_transaction_buf(stats, mbp); 4971 biodone(mbio); 4972 } 4973 } 4974 4975 /* 4976 * Initialize a nestiobuf for use. Set an initial count of 1 to prevent 4977 * the mbio from being biodone()'d while we are still adding sub-bios to 4978 * it. 4979 */ 4980 void 4981 nestiobuf_init(struct bio *bio) 4982 { 4983 bio->bio_driver_info = (void *)1; 4984 } 4985 4986 /* 4987 * The BIOs added to the nestedio have already been started, remove the 4988 * count that placeheld our mbio and biodone() it if the count would 4989 * transition to 0. 4990 */ 4991 void 4992 nestiobuf_start(struct bio *mbio) 4993 { 4994 struct buf *mbp = mbio->bio_buf; 4995 4996 /* 4997 * Decrement the operations in progress counter and terminate the 4998 * I/O if this was the last bit. 4999 */ 5000 if (atomic_fetchadd_int((int *)&mbio->bio_driver_info, -1) == 1) { 5001 if (mbp->b_flags & B_ERROR) 5002 mbp->b_resid = mbp->b_bcount; 5003 else 5004 mbp->b_resid = 0; 5005 biodone(mbio); 5006 } 5007 } 5008 5009 /* 5010 * Set an intermediate error prior to calling nestiobuf_start() 5011 */ 5012 void 5013 nestiobuf_error(struct bio *mbio, int error) 5014 { 5015 struct buf *mbp = mbio->bio_buf; 5016 5017 if (error) { 5018 mbp->b_error = error; 5019 atomic_set_int(&mbp->b_flags, B_ERROR); 5020 } 5021 } 5022 5023 /* 5024 * nestiobuf_add: setup a "nested" buffer. 5025 * 5026 * => 'mbp' is a "master" buffer which is being divided into sub pieces. 5027 * => 'bp' should be a buffer allocated by getiobuf. 5028 * => 'offset' is a byte offset in the master buffer. 5029 * => 'size' is a size in bytes of this nested buffer. 
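 *
 * Hedged overall usage sketch (a hypothetical striping/concatenation
 * style driver; getpbuf() is an assumption here): place-hold the
 * master bio, carve it into pieces, then drop the place-holder so the
 * final sub-completion finishes the master:
 *
 *	nestiobuf_init(mbio);
 *	for (each piece at 'offset' of length 'chunk') {
 *		bp = getpbuf(NULL);
 *		nestiobuf_add(mbio, bp, offset, chunk, stats);
 *		...translate the offset for the target device...
 *		vn_strategy(devvp, &bp->b_bio1);
 *	}
 *	nestiobuf_start(mbio);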
5030 */ 5031 void 5032 nestiobuf_add(struct bio *mbio, struct buf *bp, int offset, size_t size, struct devstat *stats) 5033 { 5034 struct buf *mbp = mbio->bio_buf; 5035 struct vnode *vp = mbp->b_vp; 5036 5037 KKASSERT(mbp->b_bcount >= offset + size); 5038 5039 atomic_add_int((int *)&mbio->bio_driver_info, 1); 5040 5041 /* kernel needs to own the lock for it to be released in biodone */ 5042 BUF_KERNPROC(bp); 5043 bp->b_vp = vp; 5044 bp->b_cmd = mbp->b_cmd; 5045 bp->b_bio1.bio_done = nestiobuf_iodone; 5046 bp->b_data = (char *)mbp->b_data + offset; 5047 bp->b_resid = bp->b_bcount = size; 5048 bp->b_bufsize = bp->b_bcount; 5049 5050 bp->b_bio1.bio_track = NULL; 5051 bp->b_bio1.bio_caller_info1.ptr = mbio; 5052 bp->b_bio1.bio_caller_info2.ptr = stats; 5053 } 5054 5055 #ifdef DDB 5056 5057 DB_SHOW_COMMAND(buffer, db_show_buffer) 5058 { 5059 /* get args */ 5060 struct buf *bp = (struct buf *)addr; 5061 5062 if (!have_addr) { 5063 db_printf("usage: show buffer <addr>\n"); 5064 return; 5065 } 5066 5067 db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); 5068 db_printf("b_cmd = %d\n", bp->b_cmd); 5069 db_printf("b_error = %d, b_bufsize = %d, b_bcount = %d, " 5070 "b_resid = %d\n, b_data = %p, " 5071 "bio_offset(disk) = %lld, bio_offset(phys) = %lld\n", 5072 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, 5073 bp->b_data, 5074 (long long)bp->b_bio2.bio_offset, 5075 (long long)(bp->b_bio2.bio_next ? 5076 bp->b_bio2.bio_next->bio_offset : (off_t)-1)); 5077 if (bp->b_xio.xio_npages) { 5078 int i; 5079 db_printf("b_xio.xio_npages = %d, pages(OBJ, IDX, PA): ", 5080 bp->b_xio.xio_npages); 5081 for (i = 0; i < bp->b_xio.xio_npages; i++) { 5082 vm_page_t m; 5083 m = bp->b_xio.xio_pages[i]; 5084 db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, 5085 (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); 5086 if ((i + 1) < bp->b_xio.xio_npages) 5087 db_printf(","); 5088 } 5089 db_printf("\n"); 5090 } 5091 } 5092 #endif /* DDB */ 5093