1 /* 2 * Copyright (c) 1994,1997 John S. Dyson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice immediately at the beginning of the file, without modification, 10 * this list of conditions, and the following disclaimer. 11 * 2. Absolutely no warranty of function or purpose is made by the author 12 * John S. Dyson. 13 * 14 * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $ 15 * $DragonFly: src/sys/kern/vfs_bio.c,v 1.115 2008/08/13 11:02:31 swildner Exp $ 16 */ 17 18 /* 19 * this file contains a new buffer I/O scheme implementing a coherent 20 * VM object and buffer cache scheme. Pains have been taken to make 21 * sure that the performance degradation associated with schemes such 22 * as this is not realized. 23 * 24 * Author: John S. Dyson 25 * Significant help during the development and debugging phases 26 * had been provided by David Greenman, also of the FreeBSD core team. 27 * 28 * see man buf(9) for more info. 29 */ 30 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/buf.h> 34 #include <sys/conf.h> 35 #include <sys/devicestat.h> 36 #include <sys/eventhandler.h> 37 #include <sys/lock.h> 38 #include <sys/malloc.h> 39 #include <sys/mount.h> 40 #include <sys/kernel.h> 41 #include <sys/kthread.h> 42 #include <sys/proc.h> 43 #include <sys/reboot.h> 44 #include <sys/resourcevar.h> 45 #include <sys/sysctl.h> 46 #include <sys/vmmeter.h> 47 #include <sys/vnode.h> 48 #include <sys/dsched.h> 49 #include <sys/proc.h> 50 #include <vm/vm.h> 51 #include <vm/vm_param.h> 52 #include <vm/vm_kern.h> 53 #include <vm/vm_pageout.h> 54 #include <vm/vm_page.h> 55 #include <vm/vm_object.h> 56 #include <vm/vm_extern.h> 57 #include <vm/vm_map.h> 58 #include <vm/vm_pager.h> 59 #include <vm/swap_pager.h> 60 61 #include <sys/buf2.h> 62 #include <sys/thread2.h> 63 #include <sys/spinlock2.h> 64 #include <sys/mplock2.h> 65 #include <vm/vm_page2.h> 66 67 #include "opt_ddb.h" 68 #ifdef DDB 69 #include <ddb/ddb.h> 70 #endif 71 72 /* 73 * Buffer queues. 
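 *
 * A buffer resides on at most one of these queues at any given time;
 * the queue it currently sits on (or BQUEUE_NONE) is recorded in
 * bp->b_qindex.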
74 */ 75 enum bufq_type { 76 BQUEUE_NONE, /* not on any queue */ 77 BQUEUE_LOCKED, /* locked buffers */ 78 BQUEUE_CLEAN, /* non-B_DELWRI buffers */ 79 BQUEUE_DIRTY, /* B_DELWRI buffers */ 80 BQUEUE_DIRTY_HW, /* B_DELWRI buffers - heavy weight */ 81 BQUEUE_EMPTYKVA, /* empty buffer headers with KVA assignment */ 82 BQUEUE_EMPTY, /* empty buffer headers */ 83 84 BUFFER_QUEUES /* number of buffer queues */ 85 }; 86 87 typedef enum bufq_type bufq_type_t; 88 89 #define BD_WAKE_SIZE 16384 90 #define BD_WAKE_MASK (BD_WAKE_SIZE - 1) 91 92 TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; 93 static struct spinlock bufqspin = SPINLOCK_INITIALIZER(&bufqspin); 94 static struct spinlock bufcspin = SPINLOCK_INITIALIZER(&bufcspin); 95 96 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); 97 98 struct buf *buf; /* buffer header pool */ 99 100 static void vfs_clean_pages(struct buf *bp); 101 static void vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m); 102 static void vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m); 103 static void vfs_vmio_release(struct buf *bp); 104 static int flushbufqueues(bufq_type_t q); 105 static vm_page_t bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit); 106 107 static void bd_signal(int totalspace); 108 static void buf_daemon(void); 109 static void buf_daemon_hw(void); 110 111 /* 112 * bogus page -- for I/O to/from partially complete buffers 113 * this is a temporary solution to the problem, but it is not 114 * really that bad. it would be better to split the buffer 115 * for input in the case of buffers partially already in memory, 116 * but the code is intricate enough already. 117 */ 118 vm_page_t bogus_page; 119 120 /* 121 * These are all static, but make the ones we export globals so we do 122 * not need to use compiler magic. 123 */ 124 int bufspace; /* locked by buffer_map */ 125 int maxbufspace; 126 static int bufmallocspace; /* atomic ops */ 127 int maxbufmallocspace, lobufspace, hibufspace; 128 static int bufreusecnt, bufdefragcnt, buffreekvacnt; 129 static int lorunningspace; 130 static int hirunningspace; 131 static int runningbufreq; /* locked by bufcspin */ 132 static int dirtybufspace; /* locked by bufcspin */ 133 static int dirtybufcount; /* locked by bufcspin */ 134 static int dirtybufspacehw; /* locked by bufcspin */ 135 static int dirtybufcounthw; /* locked by bufcspin */ 136 static int runningbufspace; /* locked by bufcspin */ 137 static int runningbufcount; /* locked by bufcspin */ 138 int lodirtybufspace; 139 int hidirtybufspace; 140 static int getnewbufcalls; 141 static int getnewbufrestarts; 142 static int recoverbufcalls; 143 static int needsbuffer; /* locked by bufcspin */ 144 static int bd_request; /* locked by bufcspin */ 145 static int bd_request_hw; /* locked by bufcspin */ 146 static u_int bd_wake_ary[BD_WAKE_SIZE]; 147 static u_int bd_wake_index; 148 static u_int vm_cycle_point = 40; /* 23-36 will migrate more act->inact */ 149 static int debug_commit; 150 151 static struct thread *bufdaemon_td; 152 static struct thread *bufdaemonhw_td; 153 static u_int lowmempgallocs; 154 static u_int lowmempgfails; 155 156 /* 157 * Sysctls for operational control of the buffer cache. 
158 */
159 SYSCTL_INT(_vfs, OID_AUTO, lodirtybufspace, CTLFLAG_RW, &lodirtybufspace, 0,
160 "Amount of dirty buffer space to flush before bufdaemon becomes inactive");
161 SYSCTL_INT(_vfs, OID_AUTO, hidirtybufspace, CTLFLAG_RW, &hidirtybufspace, 0,
162 "High watermark used to trigger explicit flushing of dirty buffers");
163 SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
164 "Minimum amount of buffer space required for active I/O");
165 SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
166 "Maximum amount of buffer space usable for active I/O");
167 SYSCTL_UINT(_vfs, OID_AUTO, lowmempgallocs, CTLFLAG_RW, &lowmempgallocs, 0,
168 "Page allocations done during periods of very low free memory");
169 SYSCTL_UINT(_vfs, OID_AUTO, lowmempgfails, CTLFLAG_RW, &lowmempgfails, 0,
170 "Page allocations which failed during periods of very low free memory");
171 SYSCTL_UINT(_vfs, OID_AUTO, vm_cycle_point, CTLFLAG_RW, &vm_cycle_point, 0,
172 "Transition point (0-64) for recycling pages to the active or inactive queue");
173 /*
174 * Sysctls determining current state of the buffer cache.
175 */
176 SYSCTL_INT(_vfs, OID_AUTO, nbuf, CTLFLAG_RD, &nbuf, 0,
177 "Total number of buffers in buffer cache");
178 SYSCTL_INT(_vfs, OID_AUTO, dirtybufspace, CTLFLAG_RD, &dirtybufspace, 0,
179 "Pending bytes of dirty buffers (all)");
180 SYSCTL_INT(_vfs, OID_AUTO, dirtybufspacehw, CTLFLAG_RD, &dirtybufspacehw, 0,
181 "Pending bytes of dirty buffers (heavy weight)");
182 SYSCTL_INT(_vfs, OID_AUTO, dirtybufcount, CTLFLAG_RD, &dirtybufcount, 0,
183 "Pending number of dirty buffers");
184 SYSCTL_INT(_vfs, OID_AUTO, dirtybufcounthw, CTLFLAG_RD, &dirtybufcounthw, 0,
185 "Pending number of dirty buffers (heavy weight)");
186 SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
187 "I/O bytes currently in progress due to asynchronous writes");
188 SYSCTL_INT(_vfs, OID_AUTO, runningbufcount, CTLFLAG_RD, &runningbufcount, 0,
189 "I/O buffers currently in progress due to asynchronous writes");
190 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
191 "Hard limit on maximum amount of memory usable for buffer space");
192 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
193 "Soft limit on maximum amount of memory usable for buffer space");
194 SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
195 "Minimum amount of memory to reserve for system buffer space");
196 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
197 "Amount of memory available for buffers");
198 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RD, &maxbufmallocspace,
199 0, "Maximum amount of memory reserved for buffers using malloc");
200 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
201 "Amount of memory left for buffers using malloc-scheme");
202 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, 0,
203 "New buffer header acquisition requests");
204 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts,
205 0, "New buffer header acquisition restarts");
206 SYSCTL_INT(_vfs, OID_AUTO, recoverbufcalls, CTLFLAG_RD, &recoverbufcalls, 0,
207 "Number of calls made to recover VM space in an emergency");
208 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RD, &bufdefragcnt, 0,
209 "Buffer acquisition restarts due to fragmented buffer map");
210 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RD, &buffreekvacnt, 0,
211 "Number of times KVA space was deallocated from a buffer");
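/*
 * The read-only state variables above are exported under the vfs.*
 * sysctl tree and can be sampled from userland. A minimal illustrative
 * userland sketch (not part of this file) might look like:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int space;
 *	size_t len = sizeof(space);
 *
 *	if (sysctlbyname("vfs.dirtybufspace", &space, &len, NULL, 0) == 0)
 *		printf("dirty buffer bytes: %d\n", space);
 */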
SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RD, &bufreusecnt, 0, 213 "Amount of time buffer re-use operations were successful"); 214 SYSCTL_INT(_vfs, OID_AUTO, debug_commit, CTLFLAG_RW, &debug_commit, 0, ""); 215 SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf), 216 "sizeof(struct buf)"); 217 218 char *buf_wmesg = BUF_WMESG; 219 220 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ 221 #define VFS_BIO_NEED_UNUSED02 0x02 222 #define VFS_BIO_NEED_UNUSED04 0x04 223 #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ 224 225 /* 226 * bufspacewakeup: 227 * 228 * Called when buffer space is potentially available for recovery. 229 * getnewbuf() will block on this flag when it is unable to free 230 * sufficient buffer space. Buffer space becomes recoverable when 231 * bp's get placed back in the queues. 232 */ 233 static __inline void 234 bufspacewakeup(void) 235 { 236 /* 237 * If someone is waiting for BUF space, wake them up. Even 238 * though we haven't freed the kva space yet, the waiting 239 * process will be able to now. 240 */ 241 spin_lock(&bufcspin); 242 if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { 243 needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; 244 spin_unlock(&bufcspin); 245 wakeup(&needsbuffer); 246 } else { 247 spin_unlock(&bufcspin); 248 } 249 } 250 251 /* 252 * runningbufwakeup: 253 * 254 * Accounting for I/O in progress. 255 * 256 */ 257 static __inline void 258 runningbufwakeup(struct buf *bp) 259 { 260 int totalspace; 261 int limit; 262 263 if ((totalspace = bp->b_runningbufspace) != 0) { 264 spin_lock(&bufcspin); 265 runningbufspace -= totalspace; 266 --runningbufcount; 267 bp->b_runningbufspace = 0; 268 269 /* 270 * see waitrunningbufspace() for limit test. 271 */ 272 limit = hirunningspace * 4 / 6; 273 if (runningbufreq && runningbufspace <= limit) { 274 runningbufreq = 0; 275 spin_unlock(&bufcspin); 276 wakeup(&runningbufreq); 277 } else { 278 spin_unlock(&bufcspin); 279 } 280 bd_signal(totalspace); 281 } 282 } 283 284 /* 285 * bufcountwakeup: 286 * 287 * Called when a buffer has been added to one of the free queues to 288 * account for the buffer and to wakeup anyone waiting for free buffers. 289 * This typically occurs when large amounts of metadata are being handled 290 * by the buffer cache ( else buffer space runs out first, usually ). 291 * 292 * MPSAFE 293 */ 294 static __inline void 295 bufcountwakeup(void) 296 { 297 spin_lock(&bufcspin); 298 if (needsbuffer) { 299 needsbuffer &= ~VFS_BIO_NEED_ANY; 300 spin_unlock(&bufcspin); 301 wakeup(&needsbuffer); 302 } else { 303 spin_unlock(&bufcspin); 304 } 305 } 306 307 /* 308 * waitrunningbufspace() 309 * 310 * Wait for the amount of running I/O to drop to hirunningspace * 4 / 6. 311 * This is the point where write bursting stops so we don't want to wait 312 * for the running amount to drop below it (at least if we still want bioq 313 * to burst writes). 314 * 315 * The caller may be using this function to block in a tight loop, we 316 * must block while runningbufspace is greater then or equal to 317 * hirunningspace * 4 / 6. 318 * 319 * And even with that it may not be enough, due to the presence of 320 * B_LOCKED dirty buffers, so also wait for at least one running buffer 321 * to complete. 
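 *
 * Illustrative use (an assumption about a typical caller, not code
 * taken from this file): a path that has just queued a burst of
 * asynchronous writes could throttle itself with
 *
 *	if (buf_runningbufspace_severe())
 *		waitrunningbufspace();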
322 */ 323 void 324 waitrunningbufspace(void) 325 { 326 int limit = hirunningspace * 4 / 6; 327 int dummy; 328 329 spin_lock(&bufcspin); 330 if (runningbufspace > limit) { 331 while (runningbufspace > limit) { 332 ++runningbufreq; 333 ssleep(&runningbufreq, &bufcspin, 0, "wdrn1", 0); 334 } 335 spin_unlock(&bufcspin); 336 } else if (runningbufspace > limit / 2) { 337 ++runningbufreq; 338 spin_unlock(&bufcspin); 339 tsleep(&dummy, 0, "wdrn2", 1); 340 } else { 341 spin_unlock(&bufcspin); 342 } 343 } 344 345 /* 346 * buf_dirty_count_severe: 347 * 348 * Return true if we have too many dirty buffers. 349 */ 350 int 351 buf_dirty_count_severe(void) 352 { 353 return (runningbufspace + dirtybufspace >= hidirtybufspace || 354 dirtybufcount >= nbuf / 2); 355 } 356 357 /* 358 * Return true if the amount of running I/O is severe and BIOQ should 359 * start bursting. 360 */ 361 int 362 buf_runningbufspace_severe(void) 363 { 364 return (runningbufspace >= hirunningspace * 4 / 6); 365 } 366 367 /* 368 * vfs_buf_test_cache: 369 * 370 * Called when a buffer is extended. This function clears the B_CACHE 371 * bit if the newly extended portion of the buffer does not contain 372 * valid data. 373 * 374 * NOTE! Dirty VM pages are not processed into dirty (B_DELWRI) buffer 375 * cache buffers. The VM pages remain dirty, as someone had mmap()'d 376 * them while a clean buffer was present. 377 */ 378 static __inline__ 379 void 380 vfs_buf_test_cache(struct buf *bp, 381 vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, 382 vm_page_t m) 383 { 384 if (bp->b_flags & B_CACHE) { 385 int base = (foff + off) & PAGE_MASK; 386 if (vm_page_is_valid(m, base, size) == 0) 387 bp->b_flags &= ~B_CACHE; 388 } 389 } 390 391 /* 392 * bd_speedup() 393 * 394 * Spank the buf_daemon[_hw] if the total dirty buffer space exceeds the 395 * low water mark. 396 * 397 * MPSAFE 398 */ 399 static __inline__ 400 void 401 bd_speedup(void) 402 { 403 if (dirtybufspace < lodirtybufspace && dirtybufcount < nbuf / 2) 404 return; 405 406 if (bd_request == 0 && 407 (dirtybufspace - dirtybufspacehw > lodirtybufspace / 2 || 408 dirtybufcount - dirtybufcounthw >= nbuf / 2)) { 409 spin_lock(&bufcspin); 410 bd_request = 1; 411 spin_unlock(&bufcspin); 412 wakeup(&bd_request); 413 } 414 if (bd_request_hw == 0 && 415 (dirtybufspacehw > lodirtybufspace / 2 || 416 dirtybufcounthw >= nbuf / 2)) { 417 spin_lock(&bufcspin); 418 bd_request_hw = 1; 419 spin_unlock(&bufcspin); 420 wakeup(&bd_request_hw); 421 } 422 } 423 424 /* 425 * bd_heatup() 426 * 427 * Get the buf_daemon heated up when the number of running and dirty 428 * buffers exceeds the mid-point. 429 * 430 * Return the total number of dirty bytes past the second mid point 431 * as a measure of how much excess dirty data there is in the system. 432 * 433 * MPSAFE 434 */ 435 int 436 bd_heatup(void) 437 { 438 int mid1; 439 int mid2; 440 int totalspace; 441 442 mid1 = lodirtybufspace + (hidirtybufspace - lodirtybufspace) / 2; 443 444 totalspace = runningbufspace + dirtybufspace; 445 if (totalspace >= mid1 || dirtybufcount >= nbuf / 2) { 446 bd_speedup(); 447 mid2 = mid1 + (hidirtybufspace - mid1) / 2; 448 if (totalspace >= mid2) 449 return(totalspace - mid2); 450 } 451 return(0); 452 } 453 454 /* 455 * bd_wait() 456 * 457 * Wait for the buffer cache to flush (totalspace) bytes worth of 458 * buffers, then return. 459 * 460 * Regardless this function blocks while the number of dirty buffers 461 * exceeds hidirtybufspace. 
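 *
 * Illustrative pairing with bd_heatup() (an assumed caller pattern,
 * based on the return value bd_heatup() is documented to provide):
 *
 *	int excess = bd_heatup();
 *	if (excess > 0)
 *		bd_wait(excess);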
462 * 463 * MPSAFE 464 */ 465 void 466 bd_wait(int totalspace) 467 { 468 u_int i; 469 int count; 470 471 if (curthread == bufdaemonhw_td || curthread == bufdaemon_td) 472 return; 473 474 while (totalspace > 0) { 475 bd_heatup(); 476 if (totalspace > runningbufspace + dirtybufspace) 477 totalspace = runningbufspace + dirtybufspace; 478 count = totalspace / BKVASIZE; 479 if (count >= BD_WAKE_SIZE) 480 count = BD_WAKE_SIZE - 1; 481 482 spin_lock(&bufcspin); 483 i = (bd_wake_index + count) & BD_WAKE_MASK; 484 ++bd_wake_ary[i]; 485 486 /* 487 * This is not a strict interlock, so we play a bit loose 488 * with locking access to dirtybufspace* 489 */ 490 tsleep_interlock(&bd_wake_ary[i], 0); 491 spin_unlock(&bufcspin); 492 tsleep(&bd_wake_ary[i], PINTERLOCKED, "flstik", hz); 493 494 totalspace = runningbufspace + dirtybufspace - hidirtybufspace; 495 } 496 } 497 498 /* 499 * bd_signal() 500 * 501 * This function is called whenever runningbufspace or dirtybufspace 502 * is reduced. Track threads waiting for run+dirty buffer I/O 503 * complete. 504 * 505 * MPSAFE 506 */ 507 static void 508 bd_signal(int totalspace) 509 { 510 u_int i; 511 512 if (totalspace > 0) { 513 if (totalspace > BKVASIZE * BD_WAKE_SIZE) 514 totalspace = BKVASIZE * BD_WAKE_SIZE; 515 spin_lock(&bufcspin); 516 while (totalspace > 0) { 517 i = bd_wake_index++; 518 i &= BD_WAKE_MASK; 519 if (bd_wake_ary[i]) { 520 bd_wake_ary[i] = 0; 521 spin_unlock(&bufcspin); 522 wakeup(&bd_wake_ary[i]); 523 spin_lock(&bufcspin); 524 } 525 totalspace -= BKVASIZE; 526 } 527 spin_unlock(&bufcspin); 528 } 529 } 530 531 /* 532 * BIO tracking support routines. 533 * 534 * Release a ref on a bio_track. Wakeup requests are atomically released 535 * along with the last reference so bk_active will never wind up set to 536 * only 0x80000000. 537 * 538 * MPSAFE 539 */ 540 static 541 void 542 bio_track_rel(struct bio_track *track) 543 { 544 int active; 545 int desired; 546 547 /* 548 * Shortcut 549 */ 550 active = track->bk_active; 551 if (active == 1 && atomic_cmpset_int(&track->bk_active, 1, 0)) 552 return; 553 554 /* 555 * Full-on. Note that the wait flag is only atomically released on 556 * the 1->0 count transition. 557 * 558 * We check for a negative count transition using bit 30 since bit 31 559 * has a different meaning. 560 */ 561 for (;;) { 562 desired = (active & 0x7FFFFFFF) - 1; 563 if (desired) 564 desired |= active & 0x80000000; 565 if (atomic_cmpset_int(&track->bk_active, active, desired)) { 566 if (desired & 0x40000000) 567 panic("bio_track_rel: bad count: %p\n", track); 568 if (active & 0x80000000) 569 wakeup(track); 570 break; 571 } 572 active = track->bk_active; 573 } 574 } 575 576 /* 577 * Wait for the tracking count to reach 0. 578 * 579 * Use atomic ops such that the wait flag is only set atomically when 580 * bk_active is non-zero. 581 * 582 * MPSAFE 583 */ 584 int 585 bio_track_wait(struct bio_track *track, int slp_flags, int slp_timo) 586 { 587 int active; 588 int desired; 589 int error; 590 591 /* 592 * Shortcut 593 */ 594 if (track->bk_active == 0) 595 return(0); 596 597 /* 598 * Full-on. Note that the wait flag may only be atomically set if 599 * the active count is non-zero. 600 * 601 * NOTE: We cannot optimize active == desired since a wakeup could 602 * clear active prior to our tsleep_interlock(). 
603 */ 604 error = 0; 605 while ((active = track->bk_active) != 0) { 606 cpu_ccfence(); 607 desired = active | 0x80000000; 608 tsleep_interlock(track, slp_flags); 609 if (atomic_cmpset_int(&track->bk_active, active, desired)) { 610 error = tsleep(track, slp_flags | PINTERLOCKED, 611 "trwait", slp_timo); 612 if (error) 613 break; 614 } 615 } 616 return (error); 617 } 618 619 /* 620 * bufinit: 621 * 622 * Load time initialisation of the buffer cache, called from machine 623 * dependant initialization code. 624 */ 625 void 626 bufinit(void) 627 { 628 struct buf *bp; 629 vm_offset_t bogus_offset; 630 int i; 631 632 /* next, make a null set of free lists */ 633 for (i = 0; i < BUFFER_QUEUES; i++) 634 TAILQ_INIT(&bufqueues[i]); 635 636 /* finally, initialize each buffer header and stick on empty q */ 637 for (i = 0; i < nbuf; i++) { 638 bp = &buf[i]; 639 bzero(bp, sizeof *bp); 640 bp->b_flags = B_INVAL; /* we're just an empty header */ 641 bp->b_cmd = BUF_CMD_DONE; 642 bp->b_qindex = BQUEUE_EMPTY; 643 initbufbio(bp); 644 xio_init(&bp->b_xio); 645 buf_dep_init(bp); 646 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_EMPTY], bp, b_freelist); 647 } 648 649 /* 650 * maxbufspace is the absolute maximum amount of buffer space we are 651 * allowed to reserve in KVM and in real terms. The absolute maximum 652 * is nominally used by buf_daemon. hibufspace is the nominal maximum 653 * used by most other processes. The differential is required to 654 * ensure that buf_daemon is able to run when other processes might 655 * be blocked waiting for buffer space. 656 * 657 * maxbufspace is based on BKVASIZE. Allocating buffers larger then 658 * this may result in KVM fragmentation which is not handled optimally 659 * by the system. 660 */ 661 maxbufspace = nbuf * BKVASIZE; 662 hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); 663 lobufspace = hibufspace - MAXBSIZE; 664 665 lorunningspace = 512 * 1024; 666 /* hirunningspace -- see below */ 667 668 /* 669 * Limit the amount of malloc memory since it is wired permanently 670 * into the kernel space. Even though this is accounted for in 671 * the buffer allocation, we don't want the malloced region to grow 672 * uncontrolled. The malloc scheme improves memory utilization 673 * significantly on average (small) directories. 674 */ 675 maxbufmallocspace = hibufspace / 20; 676 677 /* 678 * Reduce the chance of a deadlock occuring by limiting the number 679 * of delayed-write dirty buffers we allow to stack up. 680 * 681 * We don't want too much actually queued to the device at once 682 * (XXX this needs to be per-mount!), because the buffers will 683 * wind up locked for a very long period of time while the I/O 684 * drains. 685 */ 686 hidirtybufspace = hibufspace / 2; /* dirty + running */ 687 hirunningspace = hibufspace / 16; /* locked & queued to device */ 688 if (hirunningspace < 1024 * 1024) 689 hirunningspace = 1024 * 1024; 690 691 dirtybufspace = 0; 692 dirtybufspacehw = 0; 693 694 lodirtybufspace = hidirtybufspace / 2; 695 696 /* 697 * Maximum number of async ops initiated per buf_daemon loop. This is 698 * somewhat of a hack at the moment, we really need to limit ourselves 699 * based on the number of bytes of I/O in-transit that were initiated 700 * from buf_daemon. 
701 */ 702 703 bogus_offset = kmem_alloc_pageable(&kernel_map, PAGE_SIZE); 704 bogus_page = vm_page_alloc(&kernel_object, 705 (bogus_offset >> PAGE_SHIFT), 706 VM_ALLOC_NORMAL); 707 vmstats.v_wire_count++; 708 709 } 710 711 /* 712 * Initialize the embedded bio structures, typically used by 713 * deprecated code which tries to allocate its own struct bufs. 714 */ 715 void 716 initbufbio(struct buf *bp) 717 { 718 bp->b_bio1.bio_buf = bp; 719 bp->b_bio1.bio_prev = NULL; 720 bp->b_bio1.bio_offset = NOOFFSET; 721 bp->b_bio1.bio_next = &bp->b_bio2; 722 bp->b_bio1.bio_done = NULL; 723 bp->b_bio1.bio_flags = 0; 724 725 bp->b_bio2.bio_buf = bp; 726 bp->b_bio2.bio_prev = &bp->b_bio1; 727 bp->b_bio2.bio_offset = NOOFFSET; 728 bp->b_bio2.bio_next = NULL; 729 bp->b_bio2.bio_done = NULL; 730 bp->b_bio2.bio_flags = 0; 731 732 BUF_LOCKINIT(bp); 733 } 734 735 /* 736 * Reinitialize the embedded bio structures as well as any additional 737 * translation cache layers. 738 */ 739 void 740 reinitbufbio(struct buf *bp) 741 { 742 struct bio *bio; 743 744 for (bio = &bp->b_bio1; bio; bio = bio->bio_next) { 745 bio->bio_done = NULL; 746 bio->bio_offset = NOOFFSET; 747 } 748 } 749 750 /* 751 * Undo the effects of an initbufbio(). 752 */ 753 void 754 uninitbufbio(struct buf *bp) 755 { 756 dsched_exit_buf(bp); 757 BUF_LOCKFREE(bp); 758 } 759 760 /* 761 * Push another BIO layer onto an existing BIO and return it. The new 762 * BIO layer may already exist, holding cached translation data. 763 */ 764 struct bio * 765 push_bio(struct bio *bio) 766 { 767 struct bio *nbio; 768 769 if ((nbio = bio->bio_next) == NULL) { 770 int index = bio - &bio->bio_buf->b_bio_array[0]; 771 if (index >= NBUF_BIO - 1) { 772 panic("push_bio: too many layers bp %p\n", 773 bio->bio_buf); 774 } 775 nbio = &bio->bio_buf->b_bio_array[index + 1]; 776 bio->bio_next = nbio; 777 nbio->bio_prev = bio; 778 nbio->bio_buf = bio->bio_buf; 779 nbio->bio_offset = NOOFFSET; 780 nbio->bio_done = NULL; 781 nbio->bio_next = NULL; 782 } 783 KKASSERT(nbio->bio_done == NULL); 784 return(nbio); 785 } 786 787 /* 788 * Pop a BIO translation layer, returning the previous layer. The 789 * must have been previously pushed. 790 */ 791 struct bio * 792 pop_bio(struct bio *bio) 793 { 794 return(bio->bio_prev); 795 } 796 797 void 798 clearbiocache(struct bio *bio) 799 { 800 while (bio) { 801 bio->bio_offset = NOOFFSET; 802 bio = bio->bio_next; 803 } 804 } 805 806 /* 807 * bfreekva: 808 * 809 * Free the KVA allocation for buffer 'bp'. 810 * 811 * Must be called from a critical section as this is the only locking for 812 * buffer_map. 813 * 814 * Since this call frees up buffer space, we call bufspacewakeup(). 815 * 816 * MPALMOSTSAFE 817 */ 818 static void 819 bfreekva(struct buf *bp) 820 { 821 int count; 822 823 if (bp->b_kvasize) { 824 ++buffreekvacnt; 825 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 826 vm_map_lock(&buffer_map); 827 bufspace -= bp->b_kvasize; 828 vm_map_delete(&buffer_map, 829 (vm_offset_t) bp->b_kvabase, 830 (vm_offset_t) bp->b_kvabase + bp->b_kvasize, 831 &count 832 ); 833 vm_map_unlock(&buffer_map); 834 vm_map_entry_release(count); 835 bp->b_kvasize = 0; 836 bp->b_kvabase = NULL; 837 bufspacewakeup(); 838 } 839 } 840 841 /* 842 * bremfree: 843 * 844 * Remove the buffer from the appropriate free list. 
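 *
 * The buffer must be locked by the caller. The usual pattern, seen
 * later in this file in vfs_bio_awrite(), is:
 *
 *	bremfree(bp);
 *	bawrite(bp);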
845 */
846 static __inline void
847 _bremfree(struct buf *bp)
848 {
849 if (bp->b_qindex != BQUEUE_NONE) {
850 KASSERT(BUF_REFCNTNB(bp) == 1,
851 ("bremfree: bp %p not locked",bp));
852 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
853 bp->b_qindex = BQUEUE_NONE;
854 } else {
855 if (BUF_REFCNTNB(bp) <= 1)
856 panic("bremfree: removing a buffer not on a queue");
857 }
858 }
859
860 void
861 bremfree(struct buf *bp)
862 {
863 spin_lock(&bufqspin);
864 _bremfree(bp);
865 spin_unlock(&bufqspin);
866 }
867
868 static void
869 bremfree_locked(struct buf *bp)
870 {
871 _bremfree(bp);
872 }
873
874 /*
875 * bread:
876 *
877 * Get a buffer with the specified data. Look in the cache first. We
878 * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
879 * is set, the buffer is valid and we do not have to do anything ( see
880 * getblk() ).
881 *
882 * MPALMOSTSAFE
883 */
884 int
885 bread(struct vnode *vp, off_t loffset, int size, struct buf **bpp)
886 {
887 struct buf *bp;
888
889 bp = getblk(vp, loffset, size, 0, 0);
890 *bpp = bp;
891
892 /* if not found in cache, do some I/O */
893 if ((bp->b_flags & B_CACHE) == 0) {
894 bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
895 bp->b_cmd = BUF_CMD_READ;
896 bp->b_bio1.bio_done = biodone_sync;
897 bp->b_bio1.bio_flags |= BIO_SYNC;
898 vfs_busy_pages(vp, bp);
899 vn_strategy(vp, &bp->b_bio1);
900 return (biowait(&bp->b_bio1, "biord"));
901 }
902 return (0);
903 }
904
905 /*
906 * This version of bread issues any required I/O asynchronously and
907 * makes a callback on completion.
908 *
909 * The callback must check whether BIO_DONE is set in the bio and issue
910 * bpdone(bp, 0) if it isn't. The callback is responsible for clearing
911 * BIO_DONE and disposing of the I/O (bqrelse()ing it).
912 */
913 void
914 breadcb(struct vnode *vp, off_t loffset, int size,
915 void (*func)(struct bio *), void *arg)
916 {
917 struct buf *bp;
918
919 bp = getblk(vp, loffset, size, 0, 0);
920
921 /* if not found in cache, do some I/O */
922 if ((bp->b_flags & B_CACHE) == 0) {
923 bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
924 bp->b_cmd = BUF_CMD_READ;
925 bp->b_bio1.bio_done = func;
926 bp->b_bio1.bio_caller_info1.ptr = arg;
927 vfs_busy_pages(vp, bp);
928 BUF_KERNPROC(bp);
929 vn_strategy(vp, &bp->b_bio1);
930 } else if (func) {
931 /*
932 * Since we are issuing the callback synchronously it cannot
933 * race the BIO_DONE, so no need for atomic ops here.
934 */
935 /*bp->b_bio1.bio_done = func;*/
936 bp->b_bio1.bio_caller_info1.ptr = arg;
937 bp->b_bio1.bio_flags |= BIO_DONE;
938 func(&bp->b_bio1);
939 } else {
940 bqrelse(bp);
941 }
942 }
943
944 /*
945 * breadn:
946 *
947 * Operates like bread, but also starts asynchronous I/O on
948 * read-ahead blocks. We must clear B_ERROR and B_INVAL prior
949 * to initiating I/O. If B_CACHE is set, the buffer is valid
950 * and we do not have to do anything.
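 *
 * Illustrative call (hypothetical offsets): read one block and start
 * read-ahead on the next two logical blocks of the same size:
 *
 *	off_t raoff[2] = { loffset + size, loffset + 2 * size };
 *	int rasize[2] = { size, size };
 *
 *	error = breadn(vp, loffset, size, raoff, rasize, 2, &bp);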
951 * 952 * MPALMOSTSAFE 953 */ 954 int 955 breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset, 956 int *rabsize, int cnt, struct buf **bpp) 957 { 958 struct buf *bp, *rabp; 959 int i; 960 int rv = 0, readwait = 0; 961 962 *bpp = bp = getblk(vp, loffset, size, 0, 0); 963 964 /* if not found in cache, do some I/O */ 965 if ((bp->b_flags & B_CACHE) == 0) { 966 bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); 967 bp->b_cmd = BUF_CMD_READ; 968 bp->b_bio1.bio_done = biodone_sync; 969 bp->b_bio1.bio_flags |= BIO_SYNC; 970 vfs_busy_pages(vp, bp); 971 vn_strategy(vp, &bp->b_bio1); 972 ++readwait; 973 } 974 975 for (i = 0; i < cnt; i++, raoffset++, rabsize++) { 976 if (inmem(vp, *raoffset)) 977 continue; 978 rabp = getblk(vp, *raoffset, *rabsize, 0, 0); 979 980 if ((rabp->b_flags & B_CACHE) == 0) { 981 rabp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); 982 rabp->b_cmd = BUF_CMD_READ; 983 vfs_busy_pages(vp, rabp); 984 BUF_KERNPROC(rabp); 985 vn_strategy(vp, &rabp->b_bio1); 986 } else { 987 brelse(rabp); 988 } 989 } 990 if (readwait) 991 rv = biowait(&bp->b_bio1, "biord"); 992 return (rv); 993 } 994 995 /* 996 * bwrite: 997 * 998 * Synchronous write, waits for completion. 999 * 1000 * Write, release buffer on completion. (Done by iodone 1001 * if async). Do not bother writing anything if the buffer 1002 * is invalid. 1003 * 1004 * Note that we set B_CACHE here, indicating that buffer is 1005 * fully valid and thus cacheable. This is true even of NFS 1006 * now so we set it generally. This could be set either here 1007 * or in biodone() since the I/O is synchronous. We put it 1008 * here. 1009 */ 1010 int 1011 bwrite(struct buf *bp) 1012 { 1013 int error; 1014 1015 if (bp->b_flags & B_INVAL) { 1016 brelse(bp); 1017 return (0); 1018 } 1019 if (BUF_REFCNTNB(bp) == 0) 1020 panic("bwrite: buffer is not busy???"); 1021 1022 /* Mark the buffer clean */ 1023 bundirty(bp); 1024 1025 bp->b_flags &= ~(B_ERROR | B_EINTR); 1026 bp->b_flags |= B_CACHE; 1027 bp->b_cmd = BUF_CMD_WRITE; 1028 bp->b_bio1.bio_done = biodone_sync; 1029 bp->b_bio1.bio_flags |= BIO_SYNC; 1030 vfs_busy_pages(bp->b_vp, bp); 1031 1032 /* 1033 * Normal bwrites pipeline writes. NOTE: b_bufsize is only 1034 * valid for vnode-backed buffers. 1035 */ 1036 bsetrunningbufspace(bp, bp->b_bufsize); 1037 vn_strategy(bp->b_vp, &bp->b_bio1); 1038 error = biowait(&bp->b_bio1, "biows"); 1039 brelse(bp); 1040 1041 return (error); 1042 } 1043 1044 /* 1045 * bawrite: 1046 * 1047 * Asynchronous write. Start output on a buffer, but do not wait for 1048 * it to complete. The buffer is released when the output completes. 1049 * 1050 * bwrite() ( or the VOP routine anyway ) is responsible for handling 1051 * B_INVAL buffers. Not us. 1052 */ 1053 void 1054 bawrite(struct buf *bp) 1055 { 1056 if (bp->b_flags & B_INVAL) { 1057 brelse(bp); 1058 return; 1059 } 1060 if (BUF_REFCNTNB(bp) == 0) 1061 panic("bwrite: buffer is not busy???"); 1062 1063 /* Mark the buffer clean */ 1064 bundirty(bp); 1065 1066 bp->b_flags &= ~(B_ERROR | B_EINTR); 1067 bp->b_flags |= B_CACHE; 1068 bp->b_cmd = BUF_CMD_WRITE; 1069 KKASSERT(bp->b_bio1.bio_done == NULL); 1070 vfs_busy_pages(bp->b_vp, bp); 1071 1072 /* 1073 * Normal bwrites pipeline writes. NOTE: b_bufsize is only 1074 * valid for vnode-backed buffers. 1075 */ 1076 bsetrunningbufspace(bp, bp->b_bufsize); 1077 BUF_KERNPROC(bp); 1078 vn_strategy(bp->b_vp, &bp->b_bio1); 1079 } 1080 1081 /* 1082 * bowrite: 1083 * 1084 * Ordered write. 
Start output on a buffer, and flag it so that the
1085 * device will write it in the order it was queued. The buffer is
1086 * released when the output completes. bwrite() ( or the VOP routine
1087 * anyway ) is responsible for handling B_INVAL buffers.
1088 */
1089 int
1090 bowrite(struct buf *bp)
1091 {
1092 bp->b_flags |= B_ORDERED;
1093 bawrite(bp);
1094 return (0);
1095 }
1096
1097 /*
1098 * bdwrite:
1099 *
1100 * Delayed write. (Buffer is marked dirty). Do not bother writing
1101 * anything if the buffer is marked invalid.
1102 *
1103 * Note that since the buffer must be completely valid, we can safely
1104 * set B_CACHE. In fact, we have to set B_CACHE here rather than in
1105 * biodone() in order to prevent getblk from writing the buffer
1106 * out synchronously.
1107 */
1108 void
1109 bdwrite(struct buf *bp)
1110 {
1111 if (BUF_REFCNTNB(bp) == 0)
1112 panic("bdwrite: buffer is not busy");
1113
1114 if (bp->b_flags & B_INVAL) {
1115 brelse(bp);
1116 return;
1117 }
1118 bdirty(bp);
1119
1120 if (dsched_is_clear_buf_priv(bp))
1121 dsched_new_buf(bp);
1122
1123 /*
1124 * Set B_CACHE, indicating that the buffer is fully valid. This is
1125 * true even of NFS now.
1126 */
1127 bp->b_flags |= B_CACHE;
1128
1129 /*
1130 * This bmap keeps the system from needing to do the bmap later,
1131 * perhaps when the system is attempting to do a sync. Since it
1132 * is likely that the indirect block -- or whatever other data structure
1133 * that the filesystem needs is still in memory now, it is a good
1134 * thing to do this. Note also, that if the pageout daemon is
1135 * requesting a sync -- there might not be enough memory to do
1136 * the bmap then... So, this is important to do.
1137 */
1138 if (bp->b_bio2.bio_offset == NOOFFSET) {
1139 VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
1140 NULL, NULL, BUF_CMD_WRITE);
1141 }
1142
1143 /*
1144 * Because the underlying pages may still be mapped and
1145 * writable, trying to set the dirty buffer (b_dirtyoff/end)
1146 * range here will be inaccurate.
1147 *
1148 * However, we must still clean the pages to satisfy the
1149 * vnode_pager and pageout daemon, so they think the pages
1150 * have been "cleaned". What has really occurred is that
1151 * they've been earmarked for later writing by the buffer
1152 * cache.
1153 *
1154 * So we get the b_dirtyoff/end update but will not actually
1155 * depend on it (NFS that is) until the pages are busied for
1156 * writing later on.
1157 */
1158 vfs_clean_pages(bp);
1159 bqrelse(bp);
1160
1161 /*
1162 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
1163 * due to the softdep code.
1164 */
1165 }
1166
1167 /*
1168 * Fake write - return pages to VM system as dirty, leave the buffer clean.
1169 * This is used by tmpfs.
1170 *
1171 * It is important for any VFS using this routine to NOT use it for
1172 * IO_SYNC or IO_ASYNC operations which occur when the system really
1173 * wants to flush VM pages to backing store.
1174 */
1175 void
1176 buwrite(struct buf *bp)
1177 {
1178 vm_page_t m;
1179 int i;
1180
1181 /*
1182 * Only works for VMIO buffers. If the buffer is already
1183 * marked for delayed-write we can't avoid the bdwrite().
1184 */
1185 if ((bp->b_flags & B_VMIO) == 0 || (bp->b_flags & B_DELWRI)) {
1186 bdwrite(bp);
1187 return;
1188 }
1189
1190 /*
1191 * Set valid & dirty.
1192 *
1193 * WARNING! vfs_dirty_one_page() assumes vm_token is held for now.
1194 */ 1195 lwkt_gettoken(&vm_token); 1196 for (i = 0; i < bp->b_xio.xio_npages; i++) { 1197 m = bp->b_xio.xio_pages[i]; 1198 vfs_dirty_one_page(bp, i, m); 1199 } 1200 lwkt_reltoken(&vm_token); 1201 bqrelse(bp); 1202 } 1203 1204 /* 1205 * bdirty: 1206 * 1207 * Turn buffer into delayed write request by marking it B_DELWRI. 1208 * B_RELBUF and B_NOCACHE must be cleared. 1209 * 1210 * We reassign the buffer to itself to properly update it in the 1211 * dirty/clean lists. 1212 * 1213 * Must be called from a critical section. 1214 * The buffer must be on BQUEUE_NONE. 1215 */ 1216 void 1217 bdirty(struct buf *bp) 1218 { 1219 KASSERT(bp->b_qindex == BQUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); 1220 if (bp->b_flags & B_NOCACHE) { 1221 kprintf("bdirty: clearing B_NOCACHE on buf %p\n", bp); 1222 bp->b_flags &= ~B_NOCACHE; 1223 } 1224 if (bp->b_flags & B_INVAL) { 1225 kprintf("bdirty: warning, dirtying invalid buffer %p\n", bp); 1226 } 1227 bp->b_flags &= ~B_RELBUF; 1228 1229 if ((bp->b_flags & B_DELWRI) == 0) { 1230 lwkt_gettoken(&bp->b_vp->v_token); 1231 bp->b_flags |= B_DELWRI; 1232 reassignbuf(bp); 1233 lwkt_reltoken(&bp->b_vp->v_token); 1234 1235 spin_lock(&bufcspin); 1236 ++dirtybufcount; 1237 dirtybufspace += bp->b_bufsize; 1238 if (bp->b_flags & B_HEAVY) { 1239 ++dirtybufcounthw; 1240 dirtybufspacehw += bp->b_bufsize; 1241 } 1242 spin_unlock(&bufcspin); 1243 1244 bd_heatup(); 1245 } 1246 } 1247 1248 /* 1249 * Set B_HEAVY, indicating that this is a heavy-weight buffer that 1250 * needs to be flushed with a different buf_daemon thread to avoid 1251 * deadlocks. B_HEAVY also imposes restrictions in getnewbuf(). 1252 */ 1253 void 1254 bheavy(struct buf *bp) 1255 { 1256 if ((bp->b_flags & B_HEAVY) == 0) { 1257 bp->b_flags |= B_HEAVY; 1258 if (bp->b_flags & B_DELWRI) { 1259 spin_lock(&bufcspin); 1260 ++dirtybufcounthw; 1261 dirtybufspacehw += bp->b_bufsize; 1262 spin_unlock(&bufcspin); 1263 } 1264 } 1265 } 1266 1267 /* 1268 * bundirty: 1269 * 1270 * Clear B_DELWRI for buffer. 1271 * 1272 * Must be called from a critical section. 1273 * 1274 * The buffer is typically on BQUEUE_NONE but there is one case in 1275 * brelse() that calls this function after placing the buffer on 1276 * a different queue. 1277 * 1278 * MPSAFE 1279 */ 1280 void 1281 bundirty(struct buf *bp) 1282 { 1283 if (bp->b_flags & B_DELWRI) { 1284 lwkt_gettoken(&bp->b_vp->v_token); 1285 bp->b_flags &= ~B_DELWRI; 1286 reassignbuf(bp); 1287 lwkt_reltoken(&bp->b_vp->v_token); 1288 1289 spin_lock(&bufcspin); 1290 --dirtybufcount; 1291 dirtybufspace -= bp->b_bufsize; 1292 if (bp->b_flags & B_HEAVY) { 1293 --dirtybufcounthw; 1294 dirtybufspacehw -= bp->b_bufsize; 1295 } 1296 spin_unlock(&bufcspin); 1297 1298 bd_signal(bp->b_bufsize); 1299 } 1300 /* 1301 * Since it is now being written, we can clear its deferred write flag. 1302 */ 1303 bp->b_flags &= ~B_DEFERRED; 1304 } 1305 1306 /* 1307 * Set the b_runningbufspace field, used to track how much I/O is 1308 * in progress at any given moment. 1309 */ 1310 void 1311 bsetrunningbufspace(struct buf *bp, int bytes) 1312 { 1313 bp->b_runningbufspace = bytes; 1314 if (bytes) { 1315 spin_lock(&bufcspin); 1316 runningbufspace += bytes; 1317 ++runningbufcount; 1318 spin_unlock(&bufcspin); 1319 } 1320 } 1321 1322 /* 1323 * brelse: 1324 * 1325 * Release a busy buffer and, if requested, free its resources. The 1326 * buffer will be stashed in the appropriate bufqueue[] allowing it 1327 * to be accessed later as a cache entity or reused for other purposes. 
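 *
 * A caller that wants the buffer thrown away rather than cached sets
 * B_INVAL before releasing it, as described for getnewbuf():
 *
 *	bp->b_flags |= B_INVAL;
 *	brelse(bp);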
1328 * 1329 * MPALMOSTSAFE 1330 */ 1331 void 1332 brelse(struct buf *bp) 1333 { 1334 #ifdef INVARIANTS 1335 int saved_flags = bp->b_flags; 1336 #endif 1337 1338 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1339 1340 /* 1341 * If B_NOCACHE is set we are being asked to destroy the buffer and 1342 * its backing store. Clear B_DELWRI. 1343 * 1344 * B_NOCACHE is set in two cases: (1) when the caller really wants 1345 * to destroy the buffer and backing store and (2) when the caller 1346 * wants to destroy the buffer and backing store after a write 1347 * completes. 1348 */ 1349 if ((bp->b_flags & (B_NOCACHE|B_DELWRI)) == (B_NOCACHE|B_DELWRI)) { 1350 bundirty(bp); 1351 } 1352 1353 if ((bp->b_flags & (B_INVAL | B_DELWRI)) == B_DELWRI) { 1354 /* 1355 * A re-dirtied buffer is only subject to destruction 1356 * by B_INVAL. B_ERROR and B_NOCACHE are ignored. 1357 */ 1358 /* leave buffer intact */ 1359 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || 1360 (bp->b_bufsize <= 0)) { 1361 /* 1362 * Either a failed read or we were asked to free or not 1363 * cache the buffer. This path is reached with B_DELWRI 1364 * set only if B_INVAL is already set. B_NOCACHE governs 1365 * backing store destruction. 1366 * 1367 * NOTE: HAMMER will set B_LOCKED in buf_deallocate if the 1368 * buffer cannot be immediately freed. 1369 */ 1370 bp->b_flags |= B_INVAL; 1371 if (LIST_FIRST(&bp->b_dep) != NULL) 1372 buf_deallocate(bp); 1373 if (bp->b_flags & B_DELWRI) { 1374 spin_lock(&bufcspin); 1375 --dirtybufcount; 1376 dirtybufspace -= bp->b_bufsize; 1377 if (bp->b_flags & B_HEAVY) { 1378 --dirtybufcounthw; 1379 dirtybufspacehw -= bp->b_bufsize; 1380 } 1381 spin_unlock(&bufcspin); 1382 1383 bd_signal(bp->b_bufsize); 1384 } 1385 bp->b_flags &= ~(B_DELWRI | B_CACHE); 1386 } 1387 1388 /* 1389 * We must clear B_RELBUF if B_DELWRI or B_LOCKED is set, 1390 * or if b_refs is non-zero. 1391 * 1392 * If vfs_vmio_release() is called with either bit set, the 1393 * underlying pages may wind up getting freed causing a previous 1394 * write (bdwrite()) to get 'lost' because pages associated with 1395 * a B_DELWRI bp are marked clean. Pages associated with a 1396 * B_LOCKED buffer may be mapped by the filesystem. 1397 * 1398 * If we want to release the buffer ourselves (rather then the 1399 * originator asking us to release it), give the originator a 1400 * chance to countermand the release by setting B_LOCKED. 1401 * 1402 * We still allow the B_INVAL case to call vfs_vmio_release(), even 1403 * if B_DELWRI is set. 1404 * 1405 * If B_DELWRI is not set we may have to set B_RELBUF if we are low 1406 * on pages to return pages to the VM page queues. 1407 */ 1408 if ((bp->b_flags & (B_DELWRI | B_LOCKED)) || bp->b_refs) { 1409 bp->b_flags &= ~B_RELBUF; 1410 } else if (vm_page_count_severe()) { 1411 if (LIST_FIRST(&bp->b_dep) != NULL) 1412 buf_deallocate(bp); /* can set B_LOCKED */ 1413 if (bp->b_flags & (B_DELWRI | B_LOCKED)) 1414 bp->b_flags &= ~B_RELBUF; 1415 else 1416 bp->b_flags |= B_RELBUF; 1417 } 1418 1419 /* 1420 * Make sure b_cmd is clear. It may have already been cleared by 1421 * biodone(). 1422 * 1423 * At this point destroying the buffer is governed by the B_INVAL 1424 * or B_RELBUF flags. 1425 */ 1426 bp->b_cmd = BUF_CMD_DONE; 1427 dsched_exit_buf(bp); 1428 1429 /* 1430 * VMIO buffer rundown. 
Make sure the VM page array is restored 1431 * after an I/O may have replaces some of the pages with bogus pages 1432 * in order to not destroy dirty pages in a fill-in read. 1433 * 1434 * Note that due to the code above, if a buffer is marked B_DELWRI 1435 * then the B_RELBUF and B_NOCACHE bits will always be clear. 1436 * B_INVAL may still be set, however. 1437 * 1438 * For clean buffers, B_INVAL or B_RELBUF will destroy the buffer 1439 * but not the backing store. B_NOCACHE will destroy the backing 1440 * store. 1441 * 1442 * Note that dirty NFS buffers contain byte-granular write ranges 1443 * and should not be destroyed w/ B_INVAL even if the backing store 1444 * is left intact. 1445 */ 1446 if (bp->b_flags & B_VMIO) { 1447 /* 1448 * Rundown for VMIO buffers which are not dirty NFS buffers. 1449 */ 1450 int i, j, resid; 1451 vm_page_t m; 1452 off_t foff; 1453 vm_pindex_t poff; 1454 vm_object_t obj; 1455 struct vnode *vp; 1456 1457 vp = bp->b_vp; 1458 1459 /* 1460 * Get the base offset and length of the buffer. Note that 1461 * in the VMIO case if the buffer block size is not 1462 * page-aligned then b_data pointer may not be page-aligned. 1463 * But our b_xio.xio_pages array *IS* page aligned. 1464 * 1465 * block sizes less then DEV_BSIZE (usually 512) are not 1466 * supported due to the page granularity bits (m->valid, 1467 * m->dirty, etc...). 1468 * 1469 * See man buf(9) for more information 1470 */ 1471 1472 resid = bp->b_bufsize; 1473 foff = bp->b_loffset; 1474 1475 lwkt_gettoken(&vm_token); 1476 for (i = 0; i < bp->b_xio.xio_npages; i++) { 1477 m = bp->b_xio.xio_pages[i]; 1478 vm_page_flag_clear(m, PG_ZERO); 1479 /* 1480 * If we hit a bogus page, fixup *all* of them 1481 * now. Note that we left these pages wired 1482 * when we removed them so they had better exist, 1483 * and they cannot be ripped out from under us so 1484 * no critical section protection is necessary. 1485 */ 1486 if (m == bogus_page) { 1487 obj = vp->v_object; 1488 poff = OFF_TO_IDX(bp->b_loffset); 1489 1490 for (j = i; j < bp->b_xio.xio_npages; j++) { 1491 vm_page_t mtmp; 1492 1493 mtmp = bp->b_xio.xio_pages[j]; 1494 if (mtmp == bogus_page) { 1495 mtmp = vm_page_lookup(obj, poff + j); 1496 if (!mtmp) { 1497 panic("brelse: page missing"); 1498 } 1499 bp->b_xio.xio_pages[j] = mtmp; 1500 } 1501 } 1502 bp->b_flags &= ~B_HASBOGUS; 1503 1504 if ((bp->b_flags & B_INVAL) == 0) { 1505 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 1506 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 1507 } 1508 m = bp->b_xio.xio_pages[i]; 1509 } 1510 1511 /* 1512 * Invalidate the backing store if B_NOCACHE is set 1513 * (e.g. used with vinvalbuf()). If this is NFS 1514 * we impose a requirement that the block size be 1515 * a multiple of PAGE_SIZE and create a temporary 1516 * hack to basically invalidate the whole page. The 1517 * problem is that NFS uses really odd buffer sizes 1518 * especially when tracking piecemeal writes and 1519 * it also vinvalbuf()'s a lot, which would result 1520 * in only partial page validation and invalidation 1521 * here. If the file page is mmap()'d, however, 1522 * all the valid bits get set so after we invalidate 1523 * here we would end up with weird m->valid values 1524 * like 0xfc. nfs_getpages() can't handle this so 1525 * we clear all the valid bits for the NFS case 1526 * instead of just some of them. 
1527 * 1528 * The real bug is the VM system having to set m->valid 1529 * to VM_PAGE_BITS_ALL for faulted-in pages, which 1530 * itself is an artifact of the whole 512-byte 1531 * granular mess that exists to support odd block 1532 * sizes and UFS meta-data block sizes (e.g. 6144). 1533 * A complete rewrite is required. 1534 * 1535 * XXX 1536 */ 1537 if (bp->b_flags & (B_NOCACHE|B_ERROR)) { 1538 int poffset = foff & PAGE_MASK; 1539 int presid; 1540 1541 presid = PAGE_SIZE - poffset; 1542 if (bp->b_vp->v_tag == VT_NFS && 1543 bp->b_vp->v_type == VREG) { 1544 ; /* entire page */ 1545 } else if (presid > resid) { 1546 presid = resid; 1547 } 1548 KASSERT(presid >= 0, ("brelse: extra page")); 1549 vm_page_set_invalid(m, poffset, presid); 1550 1551 /* 1552 * Also make sure any swap cache is removed 1553 * as it is now stale (HAMMER in particular 1554 * uses B_NOCACHE to deal with buffer 1555 * aliasing). 1556 */ 1557 swap_pager_unswapped(m); 1558 } 1559 resid -= PAGE_SIZE - (foff & PAGE_MASK); 1560 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 1561 } 1562 if (bp->b_flags & (B_INVAL | B_RELBUF)) 1563 vfs_vmio_release(bp); 1564 lwkt_reltoken(&vm_token); 1565 } else { 1566 /* 1567 * Rundown for non-VMIO buffers. 1568 */ 1569 if (bp->b_flags & (B_INVAL | B_RELBUF)) { 1570 if (bp->b_bufsize) 1571 allocbuf(bp, 0); 1572 KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); 1573 if (bp->b_vp) 1574 brelvp(bp); 1575 } 1576 } 1577 1578 if (bp->b_qindex != BQUEUE_NONE) 1579 panic("brelse: free buffer onto another queue???"); 1580 if (BUF_REFCNTNB(bp) > 1) { 1581 /* Temporary panic to verify exclusive locking */ 1582 /* This panic goes away when we allow shared refs */ 1583 panic("brelse: multiple refs"); 1584 /* NOT REACHED */ 1585 return; 1586 } 1587 1588 /* 1589 * Figure out the correct queue to place the cleaned up buffer on. 1590 * Buffers placed in the EMPTY or EMPTYKVA had better already be 1591 * disassociated from their vnode. 1592 */ 1593 spin_lock(&bufqspin); 1594 if (bp->b_flags & B_LOCKED) { 1595 /* 1596 * Buffers that are locked are placed in the locked queue 1597 * immediately, regardless of their state. 1598 */ 1599 bp->b_qindex = BQUEUE_LOCKED; 1600 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist); 1601 } else if (bp->b_bufsize == 0) { 1602 /* 1603 * Buffers with no memory. Due to conditionals near the top 1604 * of brelse() such buffers should probably already be 1605 * marked B_INVAL and disassociated from their vnode. 1606 */ 1607 bp->b_flags |= B_INVAL; 1608 KASSERT(bp->b_vp == NULL, ("bp1 %p flags %08x/%08x vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp)); 1609 KKASSERT((bp->b_flags & B_HASHED) == 0); 1610 if (bp->b_kvasize) { 1611 bp->b_qindex = BQUEUE_EMPTYKVA; 1612 } else { 1613 bp->b_qindex = BQUEUE_EMPTY; 1614 } 1615 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); 1616 } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) { 1617 /* 1618 * Buffers with junk contents. Again these buffers had better 1619 * already be disassociated from their vnode. 1620 */ 1621 KASSERT(bp->b_vp == NULL, ("bp2 %p flags %08x/%08x vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp)); 1622 KKASSERT((bp->b_flags & B_HASHED) == 0); 1623 bp->b_flags |= B_INVAL; 1624 bp->b_qindex = BQUEUE_CLEAN; 1625 TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); 1626 } else { 1627 /* 1628 * Remaining buffers. These buffers are still associated with 1629 * their vnode. 
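 *
 * Queue selection below follows the dirty/heavy flags:
 *
 *	B_DELWRI		-> BQUEUE_DIRTY
 *	B_DELWRI | B_HEAVY	-> BQUEUE_DIRTY_HW
 *	otherwise		-> BQUEUE_CLEAN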
1630 */ 1631 switch(bp->b_flags & (B_DELWRI|B_HEAVY)) { 1632 case B_DELWRI: 1633 bp->b_qindex = BQUEUE_DIRTY; 1634 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY], bp, b_freelist); 1635 break; 1636 case B_DELWRI | B_HEAVY: 1637 bp->b_qindex = BQUEUE_DIRTY_HW; 1638 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY_HW], bp, 1639 b_freelist); 1640 break; 1641 default: 1642 /* 1643 * NOTE: Buffers are always placed at the end of the 1644 * queue. If B_AGE is not set the buffer will cycle 1645 * through the queue twice. 1646 */ 1647 bp->b_qindex = BQUEUE_CLEAN; 1648 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); 1649 break; 1650 } 1651 } 1652 spin_unlock(&bufqspin); 1653 1654 /* 1655 * If B_INVAL, clear B_DELWRI. We've already placed the buffer 1656 * on the correct queue. 1657 */ 1658 if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) 1659 bundirty(bp); 1660 1661 /* 1662 * The bp is on an appropriate queue unless locked. If it is not 1663 * locked or dirty we can wakeup threads waiting for buffer space. 1664 * 1665 * We've already handled the B_INVAL case ( B_DELWRI will be clear 1666 * if B_INVAL is set ). 1667 */ 1668 if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) 1669 bufcountwakeup(); 1670 1671 /* 1672 * Something we can maybe free or reuse 1673 */ 1674 if (bp->b_bufsize || bp->b_kvasize) 1675 bufspacewakeup(); 1676 1677 /* 1678 * Clean up temporary flags and unlock the buffer. 1679 */ 1680 bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF | B_DIRECT); 1681 BUF_UNLOCK(bp); 1682 } 1683 1684 /* 1685 * bqrelse: 1686 * 1687 * Release a buffer back to the appropriate queue but do not try to free 1688 * it. The buffer is expected to be used again soon. 1689 * 1690 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by 1691 * biodone() to requeue an async I/O on completion. It is also used when 1692 * known good buffers need to be requeued but we think we may need the data 1693 * again soon. 1694 * 1695 * XXX we should be able to leave the B_RELBUF hint set on completion. 1696 * 1697 * MPSAFE 1698 */ 1699 void 1700 bqrelse(struct buf *bp) 1701 { 1702 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1703 1704 if (bp->b_qindex != BQUEUE_NONE) 1705 panic("bqrelse: free buffer onto another queue???"); 1706 if (BUF_REFCNTNB(bp) > 1) { 1707 /* do not release to free list */ 1708 panic("bqrelse: multiple refs"); 1709 return; 1710 } 1711 1712 buf_act_advance(bp); 1713 1714 spin_lock(&bufqspin); 1715 if (bp->b_flags & B_LOCKED) { 1716 /* 1717 * Locked buffers are released to the locked queue. However, 1718 * if the buffer is dirty it will first go into the dirty 1719 * queue and later on after the I/O completes successfully it 1720 * will be released to the locked queue. 1721 */ 1722 bp->b_qindex = BQUEUE_LOCKED; 1723 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist); 1724 } else if (bp->b_flags & B_DELWRI) { 1725 bp->b_qindex = (bp->b_flags & B_HEAVY) ? 1726 BQUEUE_DIRTY_HW : BQUEUE_DIRTY; 1727 TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); 1728 } else if (vm_page_count_severe()) { 1729 /* 1730 * We are too low on memory, we have to try to free the 1731 * buffer (most importantly: the wired pages making up its 1732 * backing store) *now*. 
1733 */ 1734 spin_unlock(&bufqspin); 1735 brelse(bp); 1736 return; 1737 } else { 1738 bp->b_qindex = BQUEUE_CLEAN; 1739 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); 1740 } 1741 spin_unlock(&bufqspin); 1742 1743 if ((bp->b_flags & B_LOCKED) == 0 && 1744 ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0)) { 1745 bufcountwakeup(); 1746 } 1747 1748 /* 1749 * Something we can maybe free or reuse. 1750 */ 1751 if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) 1752 bufspacewakeup(); 1753 1754 /* 1755 * Final cleanup and unlock. Clear bits that are only used while a 1756 * buffer is actively locked. 1757 */ 1758 bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF); 1759 dsched_exit_buf(bp); 1760 BUF_UNLOCK(bp); 1761 } 1762 1763 /* 1764 * Hold a buffer, preventing it from being reused. This will prevent 1765 * normal B_RELBUF operations on the buffer but will not prevent B_INVAL 1766 * operations. If a B_INVAL operation occurs the buffer will remain held 1767 * but the underlying pages may get ripped out. 1768 * 1769 * These functions are typically used in VOP_READ/VOP_WRITE functions 1770 * to hold a buffer during a copyin or copyout, preventing deadlocks 1771 * or recursive lock panics when read()/write() is used over mmap()'d 1772 * space. 1773 * 1774 * NOTE: bqhold() requires that the buffer be locked at the time of the 1775 * hold. bqdrop() has no requirements other than the buffer having 1776 * previously been held. 1777 */ 1778 void 1779 bqhold(struct buf *bp) 1780 { 1781 atomic_add_int(&bp->b_refs, 1); 1782 } 1783 1784 void 1785 bqdrop(struct buf *bp) 1786 { 1787 KKASSERT(bp->b_refs > 0); 1788 atomic_add_int(&bp->b_refs, -1); 1789 } 1790 1791 /* 1792 * vfs_vmio_release: 1793 * 1794 * Return backing pages held by the buffer 'bp' back to the VM system 1795 * if possible. The pages are freed if they are no longer valid or 1796 * attempt to free if it was used for direct I/O otherwise they are 1797 * sent to the page cache. 1798 * 1799 * Pages that were marked busy are left alone and skipped. 1800 * 1801 * The KVA mapping (b_data) for the underlying pages is removed by 1802 * this function. 1803 */ 1804 static void 1805 vfs_vmio_release(struct buf *bp) 1806 { 1807 int i; 1808 vm_page_t m; 1809 1810 lwkt_gettoken(&vm_token); 1811 for (i = 0; i < bp->b_xio.xio_npages; i++) { 1812 m = bp->b_xio.xio_pages[i]; 1813 bp->b_xio.xio_pages[i] = NULL; 1814 1815 /* 1816 * The VFS is telling us this is not a meta-data buffer 1817 * even if it is backed by a block device. 1818 */ 1819 if (bp->b_flags & B_NOTMETA) 1820 vm_page_flag_set(m, PG_NOTMETA); 1821 1822 /* 1823 * This is a very important bit of code. We try to track 1824 * VM page use whether the pages are wired into the buffer 1825 * cache or not. While wired into the buffer cache the 1826 * bp tracks the act_count. 1827 * 1828 * We can choose to place unwired pages on the inactive 1829 * queue (0) or active queue (1). If we place too many 1830 * on the active queue the queue will cycle the act_count 1831 * on pages we'd like to keep, just from single-use pages 1832 * (such as when doing a tar-up or file scan). 1833 */ 1834 if (bp->b_act_count < vm_cycle_point) 1835 vm_page_unwire(m, 0); 1836 else 1837 vm_page_unwire(m, 1); 1838 1839 /* 1840 * We don't mess with busy pages, it is the responsibility 1841 * of the process that busied the pages to deal with them. 1842 * 1843 * However, the caller may have marked the page invalid and 1844 * we must still make sure the page is no longer mapped. 
1845 */ 1846 if ((m->flags & PG_BUSY) || (m->busy != 0)) { 1847 vm_page_protect(m, VM_PROT_NONE); 1848 continue; 1849 } 1850 1851 if (m->wire_count == 0) { 1852 vm_page_flag_clear(m, PG_ZERO); 1853 /* 1854 * Might as well free the page if we can and it has 1855 * no valid data. We also free the page if the 1856 * buffer was used for direct I/O. 1857 */ 1858 #if 0 1859 if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && 1860 m->hold_count == 0) { 1861 vm_page_busy(m); 1862 vm_page_protect(m, VM_PROT_NONE); 1863 vm_page_free(m); 1864 } else 1865 #endif 1866 /* 1867 * Cache the page if we are really low on free 1868 * pages. 1869 * 1870 * Also bypass the active and inactive queues 1871 * if B_NOTMETA is set. This flag is set by HAMMER 1872 * on a regular file buffer when double buffering 1873 * is enabled or on a block device buffer representing 1874 * file data when double buffering is not enabled. 1875 * The flag prevents two copies of the same data from 1876 * being cached for long periods of time. 1877 */ 1878 if (bp->b_flags & B_DIRECT) { 1879 vm_page_try_to_free(m); 1880 } else if ((bp->b_flags & B_NOTMETA) || 1881 vm_page_count_severe()) { 1882 m->act_count = bp->b_act_count; 1883 vm_page_try_to_cache(m); 1884 } else { 1885 m->act_count = bp->b_act_count; 1886 } 1887 } 1888 } 1889 lwkt_reltoken(&vm_token); 1890 1891 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), 1892 bp->b_xio.xio_npages); 1893 if (bp->b_bufsize) { 1894 bufspacewakeup(); 1895 bp->b_bufsize = 0; 1896 } 1897 bp->b_xio.xio_npages = 0; 1898 bp->b_flags &= ~B_VMIO; 1899 KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); 1900 if (bp->b_vp) 1901 brelvp(bp); 1902 } 1903 1904 /* 1905 * vfs_bio_awrite: 1906 * 1907 * Implement clustered async writes for clearing out B_DELWRI buffers. 1908 * This is much better then the old way of writing only one buffer at 1909 * a time. Note that we may not be presented with the buffers in the 1910 * correct order, so we search for the cluster in both directions. 1911 * 1912 * The buffer is locked on call. 1913 */ 1914 int 1915 vfs_bio_awrite(struct buf *bp) 1916 { 1917 int i; 1918 int j; 1919 off_t loffset = bp->b_loffset; 1920 struct vnode *vp = bp->b_vp; 1921 int nbytes; 1922 struct buf *bpa; 1923 int nwritten; 1924 int size; 1925 1926 /* 1927 * right now we support clustered writing only to regular files. If 1928 * we find a clusterable block we could be in the middle of a cluster 1929 * rather then at the beginning. 1930 * 1931 * NOTE: b_bio1 contains the logical loffset and is aliased 1932 * to b_loffset. b_bio2 contains the translated block number. 
1933 */ 1934 if ((vp->v_type == VREG) && 1935 (vp->v_mount != 0) && /* Only on nodes that have the size info */ 1936 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { 1937 1938 size = vp->v_mount->mnt_stat.f_iosize; 1939 1940 for (i = size; i < MAXPHYS; i += size) { 1941 if ((bpa = findblk(vp, loffset + i, FINDBLK_TEST)) && 1942 BUF_REFCNT(bpa) == 0 && 1943 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == 1944 (B_DELWRI | B_CLUSTEROK)) && 1945 (bpa->b_bufsize == size)) { 1946 if ((bpa->b_bio2.bio_offset == NOOFFSET) || 1947 (bpa->b_bio2.bio_offset != 1948 bp->b_bio2.bio_offset + i)) 1949 break; 1950 } else { 1951 break; 1952 } 1953 } 1954 for (j = size; i + j <= MAXPHYS && j <= loffset; j += size) { 1955 if ((bpa = findblk(vp, loffset - j, FINDBLK_TEST)) && 1956 BUF_REFCNT(bpa) == 0 && 1957 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == 1958 (B_DELWRI | B_CLUSTEROK)) && 1959 (bpa->b_bufsize == size)) { 1960 if ((bpa->b_bio2.bio_offset == NOOFFSET) || 1961 (bpa->b_bio2.bio_offset != 1962 bp->b_bio2.bio_offset - j)) 1963 break; 1964 } else { 1965 break; 1966 } 1967 } 1968 j -= size; 1969 nbytes = (i + j); 1970 1971 /* 1972 * this is a possible cluster write 1973 */ 1974 if (nbytes != size) { 1975 BUF_UNLOCK(bp); 1976 nwritten = cluster_wbuild(vp, size, 1977 loffset - j, nbytes); 1978 return nwritten; 1979 } 1980 } 1981 1982 /* 1983 * default (old) behavior, writing out only one block 1984 * 1985 * XXX returns b_bufsize instead of b_bcount for nwritten? 1986 */ 1987 nwritten = bp->b_bufsize; 1988 bremfree(bp); 1989 bawrite(bp); 1990 1991 return nwritten; 1992 } 1993 1994 /* 1995 * getnewbuf: 1996 * 1997 * Find and initialize a new buffer header, freeing up existing buffers 1998 * in the bufqueues as necessary. The new buffer is returned locked. 1999 * 2000 * Important: B_INVAL is not set. If the caller wishes to throw the 2001 * buffer away, the caller must set B_INVAL prior to calling brelse(). 2002 * 2003 * We block if: 2004 * We have insufficient buffer headers 2005 * We have insufficient buffer space 2006 * buffer_map is too fragmented ( space reservation fails ) 2007 * If we have to flush dirty buffers ( but we try to avoid this ) 2008 * 2009 * To avoid VFS layer recursion we do not flush dirty buffers ourselves. 2010 * Instead we ask the buf daemon to do it for us. We attempt to 2011 * avoid piecemeal wakeups of the pageout daemon. 2012 * 2013 * MPALMOSTSAFE 2014 */ 2015 static struct buf * 2016 getnewbuf(int blkflags, int slptimeo, int size, int maxsize) 2017 { 2018 struct buf *bp; 2019 struct buf *nbp; 2020 int defrag = 0; 2021 int nqindex; 2022 int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0; 2023 static int flushingbufs; 2024 2025 /* 2026 * We can't afford to block since we might be holding a vnode lock, 2027 * which may prevent system daemons from running. We deal with 2028 * low-memory situations by proactively returning memory and running 2029 * async I/O rather then sync I/O. 2030 */ 2031 2032 ++getnewbufcalls; 2033 --getnewbufrestarts; 2034 restart: 2035 ++getnewbufrestarts; 2036 2037 /* 2038 * Setup for scan. If we do not have enough free buffers, 2039 * we setup a degenerate case that immediately fails. Note 2040 * that if we are specially marked process, we are allowed to 2041 * dip into our reserves. 2042 * 2043 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN 2044 * 2045 * We start with EMPTYKVA. If the list is empty we backup to EMPTY. 2046 * However, there are a number of cases (defragging, reusing, ...) 
2047 * where we cannot backup. 2048 */ 2049 nqindex = BQUEUE_EMPTYKVA; 2050 spin_lock(&bufqspin); 2051 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA]); 2052 2053 if (nbp == NULL) { 2054 /* 2055 * If no EMPTYKVA buffers and we are either 2056 * defragging or reusing, locate a CLEAN buffer 2057 * to free or reuse. If bufspace useage is low 2058 * skip this step so we can allocate a new buffer. 2059 */ 2060 if (defrag || bufspace >= lobufspace) { 2061 nqindex = BQUEUE_CLEAN; 2062 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]); 2063 } 2064 2065 /* 2066 * If we could not find or were not allowed to reuse a 2067 * CLEAN buffer, check to see if it is ok to use an EMPTY 2068 * buffer. We can only use an EMPTY buffer if allocating 2069 * its KVA would not otherwise run us out of buffer space. 2070 */ 2071 if (nbp == NULL && defrag == 0 && 2072 bufspace + maxsize < hibufspace) { 2073 nqindex = BQUEUE_EMPTY; 2074 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTY]); 2075 } 2076 } 2077 2078 /* 2079 * Run scan, possibly freeing data and/or kva mappings on the fly 2080 * depending. 2081 * 2082 * WARNING! bufqspin is held! 2083 */ 2084 while ((bp = nbp) != NULL) { 2085 int qindex = nqindex; 2086 2087 nbp = TAILQ_NEXT(bp, b_freelist); 2088 2089 /* 2090 * BQUEUE_CLEAN - B_AGE special case. If not set the bp 2091 * cycles through the queue twice before being selected. 2092 */ 2093 if (qindex == BQUEUE_CLEAN && 2094 (bp->b_flags & B_AGE) == 0 && nbp) { 2095 bp->b_flags |= B_AGE; 2096 TAILQ_REMOVE(&bufqueues[qindex], bp, b_freelist); 2097 TAILQ_INSERT_TAIL(&bufqueues[qindex], bp, b_freelist); 2098 continue; 2099 } 2100 2101 /* 2102 * Calculate next bp ( we can only use it if we do not block 2103 * or do other fancy things ). 2104 */ 2105 if (nbp == NULL) { 2106 switch(qindex) { 2107 case BQUEUE_EMPTY: 2108 nqindex = BQUEUE_EMPTYKVA; 2109 if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA]))) 2110 break; 2111 /* fall through */ 2112 case BQUEUE_EMPTYKVA: 2113 nqindex = BQUEUE_CLEAN; 2114 if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]))) 2115 break; 2116 /* fall through */ 2117 case BQUEUE_CLEAN: 2118 /* 2119 * nbp is NULL. 2120 */ 2121 break; 2122 } 2123 } 2124 2125 /* 2126 * Sanity Checks 2127 */ 2128 KASSERT(bp->b_qindex == qindex, 2129 ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); 2130 2131 /* 2132 * Note: we no longer distinguish between VMIO and non-VMIO 2133 * buffers. 2134 */ 2135 KASSERT((bp->b_flags & B_DELWRI) == 0, 2136 ("delwri buffer %p found in queue %d", bp, qindex)); 2137 2138 /* 2139 * Do not try to reuse a buffer with a non-zero b_refs. 2140 * This is an unsynchronized test. A synchronized test 2141 * is also performed after we lock the buffer. 2142 */ 2143 if (bp->b_refs) 2144 continue; 2145 2146 /* 2147 * If we are defragging then we need a buffer with 2148 * b_kvasize != 0. XXX this situation should no longer 2149 * occur, if defrag is non-zero the buffer's b_kvasize 2150 * should also be non-zero at this point. XXX 2151 */ 2152 if (defrag && bp->b_kvasize == 0) { 2153 kprintf("Warning: defrag empty buffer %p\n", bp); 2154 continue; 2155 } 2156 2157 /* 2158 * Start freeing the bp. This is somewhat involved. nbp 2159 * remains valid only for BQUEUE_EMPTY[KVA] bp's. Buffers 2160 * on the clean list must be disassociated from their 2161 * current vnode. Buffers on the empty[kva] lists have 2162 * already been disassociated. 2163 * 2164 * b_refs is checked after locking along with queue changes. 
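 *
 * Concretely, the sequence below is: try-lock the buffer with
 * LK_NOWAIT while bufqspin is still held; if that fails, drop the
 * spinlock, nap briefly, and restart the whole scan.  After a
 * successful lock both b_qindex and b_refs are re-verified before
 * the buffer is pulled off its queue.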
2165 * We must check here to deal with zero->nonzero transitions 2166 * made by the owner of the buffer lock, which is used by 2167 * VFS's to hold the buffer while issuing an unlocked 2168 * uiomove()s. We cannot invalidate the buffer's pages 2169 * for this case. Once we successfully lock a buffer the 2170 * only 0->1 transitions of b_refs will occur via findblk(). 2171 * 2172 * We must also check for queue changes after successful 2173 * locking as the current lock holder may dispose of the 2174 * buffer and change its queue. 2175 */ 2176 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 2177 spin_unlock(&bufqspin); 2178 tsleep(&bd_request, 0, "gnbxxx", (hz + 99) / 100); 2179 goto restart; 2180 } 2181 if (bp->b_qindex != qindex || bp->b_refs) { 2182 spin_unlock(&bufqspin); 2183 BUF_UNLOCK(bp); 2184 goto restart; 2185 } 2186 bremfree_locked(bp); 2187 spin_unlock(&bufqspin); 2188 2189 /* 2190 * Dependancies must be handled before we disassociate the 2191 * vnode. 2192 * 2193 * NOTE: HAMMER will set B_LOCKED if the buffer cannot 2194 * be immediately disassociated. HAMMER then becomes 2195 * responsible for releasing the buffer. 2196 * 2197 * NOTE: bufqspin is UNLOCKED now. 2198 */ 2199 if (LIST_FIRST(&bp->b_dep) != NULL) { 2200 buf_deallocate(bp); 2201 if (bp->b_flags & B_LOCKED) { 2202 bqrelse(bp); 2203 goto restart; 2204 } 2205 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2206 } 2207 2208 if (qindex == BQUEUE_CLEAN) { 2209 if (bp->b_flags & B_VMIO) 2210 vfs_vmio_release(bp); 2211 if (bp->b_vp) 2212 brelvp(bp); 2213 } 2214 2215 /* 2216 * NOTE: nbp is now entirely invalid. We can only restart 2217 * the scan from this point on. 2218 * 2219 * Get the rest of the buffer freed up. b_kva* is still 2220 * valid after this operation. 2221 */ 2222 KASSERT(bp->b_vp == NULL, 2223 ("bp3 %p flags %08x vnode %p qindex %d " 2224 "unexpectededly still associated!", 2225 bp, bp->b_flags, bp->b_vp, qindex)); 2226 KKASSERT((bp->b_flags & B_HASHED) == 0); 2227 2228 /* 2229 * critical section protection is not required when 2230 * scrapping a buffer's contents because it is already 2231 * wired. 2232 */ 2233 if (bp->b_bufsize) 2234 allocbuf(bp, 0); 2235 2236 bp->b_flags = B_BNOCLIP; 2237 bp->b_cmd = BUF_CMD_DONE; 2238 bp->b_vp = NULL; 2239 bp->b_error = 0; 2240 bp->b_resid = 0; 2241 bp->b_bcount = 0; 2242 bp->b_xio.xio_npages = 0; 2243 bp->b_dirtyoff = bp->b_dirtyend = 0; 2244 bp->b_act_count = ACT_INIT; 2245 reinitbufbio(bp); 2246 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2247 buf_dep_init(bp); 2248 if (blkflags & GETBLK_BHEAVY) 2249 bp->b_flags |= B_HEAVY; 2250 2251 /* 2252 * If we are defragging then free the buffer. 2253 */ 2254 if (defrag) { 2255 bp->b_flags |= B_INVAL; 2256 bfreekva(bp); 2257 brelse(bp); 2258 defrag = 0; 2259 goto restart; 2260 } 2261 2262 /* 2263 * If we are overcomitted then recover the buffer and its 2264 * KVM space. This occurs in rare situations when multiple 2265 * processes are blocked in getnewbuf() or allocbuf(). 2266 */ 2267 if (bufspace >= hibufspace) 2268 flushingbufs = 1; 2269 if (flushingbufs && bp->b_kvasize != 0) { 2270 bp->b_flags |= B_INVAL; 2271 bfreekva(bp); 2272 brelse(bp); 2273 goto restart; 2274 } 2275 if (bufspace < lobufspace) 2276 flushingbufs = 0; 2277 2278 /* 2279 * b_refs can transition to a non-zero value while we hold 2280 * the buffer locked due to a findblk(). Our brelvp() above 2281 * interlocked any future possible transitions due to 2282 * findblk()s. 
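 * Any ref still present at that point was therefore acquired before
 * the disassociation and its holder may be blocked on the buffer
 * lock expecting the old (vp,loffset) identity.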
2283 * 2284 * If we find b_refs to be non-zero we can destroy the 2285 * buffer's contents but we cannot yet reuse the buffer. 2286 */ 2287 if (bp->b_refs) { 2288 bp->b_flags |= B_INVAL; 2289 bfreekva(bp); 2290 brelse(bp); 2291 goto restart; 2292 } 2293 break; 2294 /* NOT REACHED, bufqspin not held */ 2295 } 2296 2297 /* 2298 * If we exhausted our list, sleep as appropriate. We may have to 2299 * wakeup various daemons and write out some dirty buffers. 2300 * 2301 * Generally we are sleeping due to insufficient buffer space. 2302 * 2303 * NOTE: bufqspin is held if bp is NULL, else it is not held. 2304 */ 2305 if (bp == NULL) { 2306 int flags; 2307 char *waitmsg; 2308 2309 spin_unlock(&bufqspin); 2310 if (defrag) { 2311 flags = VFS_BIO_NEED_BUFSPACE; 2312 waitmsg = "nbufkv"; 2313 } else if (bufspace >= hibufspace) { 2314 waitmsg = "nbufbs"; 2315 flags = VFS_BIO_NEED_BUFSPACE; 2316 } else { 2317 waitmsg = "newbuf"; 2318 flags = VFS_BIO_NEED_ANY; 2319 } 2320 2321 bd_speedup(); /* heeeelp */ 2322 spin_lock(&bufcspin); 2323 needsbuffer |= flags; 2324 while (needsbuffer & flags) { 2325 if (ssleep(&needsbuffer, &bufcspin, 2326 slpflags, waitmsg, slptimeo)) { 2327 spin_unlock(&bufcspin); 2328 return (NULL); 2329 } 2330 } 2331 spin_unlock(&bufcspin); 2332 } else { 2333 /* 2334 * We finally have a valid bp. We aren't quite out of the 2335 * woods, we still have to reserve kva space. In order 2336 * to keep fragmentation sane we only allocate kva in 2337 * BKVASIZE chunks. 2338 * 2339 * (bufqspin is not held) 2340 */ 2341 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; 2342 2343 if (maxsize != bp->b_kvasize) { 2344 vm_offset_t addr = 0; 2345 int count; 2346 2347 bfreekva(bp); 2348 2349 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 2350 vm_map_lock(&buffer_map); 2351 2352 if (vm_map_findspace(&buffer_map, 2353 vm_map_min(&buffer_map), maxsize, 2354 maxsize, 0, &addr)) { 2355 /* 2356 * Uh oh. Buffer map is too fragmented. We 2357 * must defragment the map. 2358 */ 2359 vm_map_unlock(&buffer_map); 2360 vm_map_entry_release(count); 2361 ++bufdefragcnt; 2362 defrag = 1; 2363 bp->b_flags |= B_INVAL; 2364 brelse(bp); 2365 goto restart; 2366 } 2367 if (addr) { 2368 vm_map_insert(&buffer_map, &count, 2369 NULL, 0, 2370 addr, addr + maxsize, 2371 VM_MAPTYPE_NORMAL, 2372 VM_PROT_ALL, VM_PROT_ALL, 2373 MAP_NOFAULT); 2374 2375 bp->b_kvabase = (caddr_t) addr; 2376 bp->b_kvasize = maxsize; 2377 bufspace += bp->b_kvasize; 2378 ++bufreusecnt; 2379 } 2380 vm_map_unlock(&buffer_map); 2381 vm_map_entry_release(count); 2382 } 2383 bp->b_data = bp->b_kvabase; 2384 } 2385 return(bp); 2386 } 2387 2388 /* 2389 * This routine is called in an emergency to recover VM pages from the 2390 * buffer cache by cashing in clean buffers. The idea is to recover 2391 * enough pages to be able to satisfy a stuck bio_page_alloc(). 2392 * 2393 * MPSAFE 2394 */ 2395 static int 2396 recoverbufpages(void) 2397 { 2398 struct buf *bp; 2399 int bytes = 0; 2400 2401 ++recoverbufcalls; 2402 2403 spin_lock(&bufqspin); 2404 while (bytes < MAXBSIZE) { 2405 bp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]); 2406 if (bp == NULL) 2407 break; 2408 2409 /* 2410 * BQUEUE_CLEAN - B_AGE special case. If not set the bp 2411 * cycles through the queue twice before being selected. 
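 *
 * That is, the first pass over a clean buffer merely tags it with
 * B_AGE and rotates it to the tail of the queue; only a buffer that
 * still carries B_AGE when it reaches the head again (or the last
 * buffer on the queue) is actually reclaimed.  This gives recently
 * released buffers a brief second chance before their pages are
 * recovered.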
2412 */ 2413 if ((bp->b_flags & B_AGE) == 0 && TAILQ_NEXT(bp, b_freelist)) { 2414 bp->b_flags |= B_AGE; 2415 TAILQ_REMOVE(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); 2416 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], 2417 bp, b_freelist); 2418 continue; 2419 } 2420 2421 /* 2422 * Sanity Checks 2423 */ 2424 KKASSERT(bp->b_qindex == BQUEUE_CLEAN); 2425 KKASSERT((bp->b_flags & B_DELWRI) == 0); 2426 2427 /* 2428 * Start freeing the bp. This is somewhat involved. 2429 * 2430 * Buffers on the clean list must be disassociated from 2431 * their current vnode 2432 */ 2433 2434 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 2435 kprintf("recoverbufpages: warning, locked buf %p, " 2436 "race corrected\n", 2437 bp); 2438 ssleep(&bd_request, &bufqspin, 0, "gnbxxx", hz / 100); 2439 continue; 2440 } 2441 if (bp->b_qindex != BQUEUE_CLEAN) { 2442 kprintf("recoverbufpages: warning, BUF_LOCK blocked " 2443 "unexpectedly on buf %p index %d, race " 2444 "corrected\n", 2445 bp, bp->b_qindex); 2446 BUF_UNLOCK(bp); 2447 continue; 2448 } 2449 bremfree_locked(bp); 2450 spin_unlock(&bufqspin); 2451 2452 /* 2453 * Dependancies must be handled before we disassociate the 2454 * vnode. 2455 * 2456 * NOTE: HAMMER will set B_LOCKED if the buffer cannot 2457 * be immediately disassociated. HAMMER then becomes 2458 * responsible for releasing the buffer. 2459 */ 2460 if (LIST_FIRST(&bp->b_dep) != NULL) { 2461 buf_deallocate(bp); 2462 if (bp->b_flags & B_LOCKED) { 2463 bqrelse(bp); 2464 spin_lock(&bufqspin); 2465 continue; 2466 } 2467 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2468 } 2469 2470 bytes += bp->b_bufsize; 2471 2472 if (bp->b_flags & B_VMIO) { 2473 bp->b_flags |= B_DIRECT; /* try to free pages */ 2474 vfs_vmio_release(bp); 2475 } 2476 if (bp->b_vp) 2477 brelvp(bp); 2478 2479 KKASSERT(bp->b_vp == NULL); 2480 KKASSERT((bp->b_flags & B_HASHED) == 0); 2481 2482 /* 2483 * critical section protection is not required when 2484 * scrapping a buffer's contents because it is already 2485 * wired. 2486 */ 2487 if (bp->b_bufsize) 2488 allocbuf(bp, 0); 2489 2490 bp->b_flags = B_BNOCLIP; 2491 bp->b_cmd = BUF_CMD_DONE; 2492 bp->b_vp = NULL; 2493 bp->b_error = 0; 2494 bp->b_resid = 0; 2495 bp->b_bcount = 0; 2496 bp->b_xio.xio_npages = 0; 2497 bp->b_dirtyoff = bp->b_dirtyend = 0; 2498 reinitbufbio(bp); 2499 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2500 buf_dep_init(bp); 2501 bp->b_flags |= B_INVAL; 2502 /* bfreekva(bp); */ 2503 brelse(bp); 2504 spin_lock(&bufqspin); 2505 } 2506 spin_unlock(&bufqspin); 2507 return(bytes); 2508 } 2509 2510 /* 2511 * buf_daemon: 2512 * 2513 * Buffer flushing daemon. Buffers are normally flushed by the 2514 * update daemon but if it cannot keep up this process starts to 2515 * take the load in an attempt to prevent getnewbuf() from blocking. 2516 * 2517 * Once a flush is initiated it does not stop until the number 2518 * of buffers falls below lodirtybuffers, but we will wake up anyone 2519 * waiting at the mid-point. 
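 *
 * Two instances of this daemon run: "bufdaemon" services the normal
 * BQUEUE_DIRTY queue while "bufdaemon_hw" services the heavy-weight
 * BQUEUE_DIRTY_HW queue.  The aggregate low water mark
 * (lodirtybufspace) is shared between them, so each thread flushes
 * down to half of it.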
2520 */ 2521 2522 static struct kproc_desc buf_kp = { 2523 "bufdaemon", 2524 buf_daemon, 2525 &bufdaemon_td 2526 }; 2527 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, 2528 kproc_start, &buf_kp) 2529 2530 static struct kproc_desc bufhw_kp = { 2531 "bufdaemon_hw", 2532 buf_daemon_hw, 2533 &bufdaemonhw_td 2534 }; 2535 SYSINIT(bufdaemon_hw, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, 2536 kproc_start, &bufhw_kp) 2537 2538 /* 2539 * MPSAFE thread 2540 */ 2541 static void 2542 buf_daemon(void) 2543 { 2544 int limit; 2545 2546 /* 2547 * This process needs to be suspended prior to shutdown sync. 2548 */ 2549 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, 2550 bufdaemon_td, SHUTDOWN_PRI_LAST); 2551 curthread->td_flags |= TDF_SYSTHREAD; 2552 2553 /* 2554 * This process is allowed to take the buffer cache to the limit 2555 */ 2556 for (;;) { 2557 kproc_suspend_loop(); 2558 2559 /* 2560 * Do the flush as long as the number of dirty buffers 2561 * (including those running) exceeds lodirtybufspace. 2562 * 2563 * When flushing limit running I/O to hirunningspace 2564 * Do the flush. Limit the amount of in-transit I/O we 2565 * allow to build up, otherwise we would completely saturate 2566 * the I/O system. Wakeup any waiting processes before we 2567 * normally would so they can run in parallel with our drain. 2568 * 2569 * Our aggregate normal+HW lo water mark is lodirtybufspace, 2570 * but because we split the operation into two threads we 2571 * have to cut it in half for each thread. 2572 */ 2573 waitrunningbufspace(); 2574 limit = lodirtybufspace / 2; 2575 while (runningbufspace + dirtybufspace > limit || 2576 dirtybufcount - dirtybufcounthw >= nbuf / 2) { 2577 if (flushbufqueues(BQUEUE_DIRTY) == 0) 2578 break; 2579 if (runningbufspace < hirunningspace) 2580 continue; 2581 waitrunningbufspace(); 2582 } 2583 2584 /* 2585 * We reached our low water mark, reset the 2586 * request and sleep until we are needed again. 2587 * The sleep is just so the suspend code works. 2588 */ 2589 spin_lock(&bufcspin); 2590 if (bd_request == 0) 2591 ssleep(&bd_request, &bufcspin, 0, "psleep", hz); 2592 bd_request = 0; 2593 spin_unlock(&bufcspin); 2594 } 2595 } 2596 2597 /* 2598 * MPSAFE thread 2599 */ 2600 static void 2601 buf_daemon_hw(void) 2602 { 2603 int limit; 2604 2605 /* 2606 * This process needs to be suspended prior to shutdown sync. 2607 */ 2608 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, 2609 bufdaemonhw_td, SHUTDOWN_PRI_LAST); 2610 curthread->td_flags |= TDF_SYSTHREAD; 2611 2612 /* 2613 * This process is allowed to take the buffer cache to the limit 2614 */ 2615 for (;;) { 2616 kproc_suspend_loop(); 2617 2618 /* 2619 * Do the flush. Limit the amount of in-transit I/O we 2620 * allow to build up, otherwise we would completely saturate 2621 * the I/O system. Wakeup any waiting processes before we 2622 * normally would so they can run in parallel with our drain. 2623 * 2624 * Once we decide to flush push the queued I/O up to 2625 * hirunningspace in order to trigger bursting by the bioq 2626 * subsystem. 2627 * 2628 * Our aggregate normal+HW lo water mark is lodirtybufspace, 2629 * but because we split the operation into two threads we 2630 * have to cut it in half for each thread. 
2631 */ 2632 waitrunningbufspace(); 2633 limit = lodirtybufspace / 2; 2634 while (runningbufspace + dirtybufspacehw > limit || 2635 dirtybufcounthw >= nbuf / 2) { 2636 if (flushbufqueues(BQUEUE_DIRTY_HW) == 0) 2637 break; 2638 if (runningbufspace < hirunningspace) 2639 continue; 2640 waitrunningbufspace(); 2641 } 2642 2643 /* 2644 * We reached our low water mark, reset the 2645 * request and sleep until we are needed again. 2646 * The sleep is just so the suspend code works. 2647 */ 2648 spin_lock(&bufcspin); 2649 if (bd_request_hw == 0) 2650 ssleep(&bd_request_hw, &bufcspin, 0, "psleep", hz); 2651 bd_request_hw = 0; 2652 spin_unlock(&bufcspin); 2653 } 2654 } 2655 2656 /* 2657 * flushbufqueues: 2658 * 2659 * Try to flush a buffer in the dirty queue. We must be careful to 2660 * free up B_INVAL buffers instead of write them, which NFS is 2661 * particularly sensitive to. 2662 * 2663 * B_RELBUF may only be set by VFSs. We do set B_AGE to indicate 2664 * that we really want to try to get the buffer out and reuse it 2665 * due to the write load on the machine. 2666 * 2667 * We must lock the buffer in order to check its validity before we 2668 * can mess with its contents. bufqspin isn't enough. 2669 */ 2670 static int 2671 flushbufqueues(bufq_type_t q) 2672 { 2673 struct buf *bp; 2674 int r = 0; 2675 int spun; 2676 2677 spin_lock(&bufqspin); 2678 spun = 1; 2679 2680 bp = TAILQ_FIRST(&bufqueues[q]); 2681 while (bp) { 2682 if ((bp->b_flags & B_DELWRI) == 0) { 2683 kprintf("Unexpected clean buffer %p\n", bp); 2684 bp = TAILQ_NEXT(bp, b_freelist); 2685 continue; 2686 } 2687 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 2688 bp = TAILQ_NEXT(bp, b_freelist); 2689 continue; 2690 } 2691 KKASSERT(bp->b_qindex == q); 2692 2693 /* 2694 * Must recheck B_DELWRI after successfully locking 2695 * the buffer. 2696 */ 2697 if ((bp->b_flags & B_DELWRI) == 0) { 2698 BUF_UNLOCK(bp); 2699 bp = TAILQ_NEXT(bp, b_freelist); 2700 continue; 2701 } 2702 2703 if (bp->b_flags & B_INVAL) { 2704 _bremfree(bp); 2705 spin_unlock(&bufqspin); 2706 spun = 0; 2707 brelse(bp); 2708 ++r; 2709 break; 2710 } 2711 2712 spin_unlock(&bufqspin); 2713 spun = 0; 2714 2715 if (LIST_FIRST(&bp->b_dep) != NULL && 2716 (bp->b_flags & B_DEFERRED) == 0 && 2717 buf_countdeps(bp, 0)) { 2718 spin_lock(&bufqspin); 2719 spun = 1; 2720 TAILQ_REMOVE(&bufqueues[q], bp, b_freelist); 2721 TAILQ_INSERT_TAIL(&bufqueues[q], bp, b_freelist); 2722 bp->b_flags |= B_DEFERRED; 2723 BUF_UNLOCK(bp); 2724 bp = TAILQ_FIRST(&bufqueues[q]); 2725 continue; 2726 } 2727 2728 /* 2729 * If the buffer has a dependancy, buf_checkwrite() must 2730 * also return 0 for us to be able to initate the write. 2731 * 2732 * If the buffer is flagged B_ERROR it may be requeued 2733 * over and over again, we try to avoid a live lock. 2734 * 2735 * NOTE: buf_checkwrite is MPSAFE. 2736 */ 2737 if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) { 2738 bremfree(bp); 2739 brelse(bp); 2740 } else if (bp->b_flags & B_ERROR) { 2741 tsleep(bp, 0, "bioer", 1); 2742 bp->b_flags &= ~B_AGE; 2743 vfs_bio_awrite(bp); 2744 } else { 2745 bp->b_flags |= B_AGE; 2746 vfs_bio_awrite(bp); 2747 } 2748 ++r; 2749 break; 2750 } 2751 if (spun) 2752 spin_unlock(&bufqspin); 2753 return (r); 2754 } 2755 2756 /* 2757 * inmem: 2758 * 2759 * Returns true if no I/O is needed to access the associated VM object. 2760 * This is like findblk except it also hunts around in the VM system for 2761 * the data. 
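 * For example (illustrative figures): with an f_iosize of 8KB and
 * 4KB pages, inmem() returns 1 only if a buffer is already cached
 * at loffset, or if both backing pages are present in the vnode's
 * VM object and are valid over the byte ranges they cover.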
2762 * 2763 * Note that we ignore vm_page_free() races from interrupts against our 2764 * lookup, since if the caller is not protected our return value will not 2765 * be any more valid then otherwise once we exit the critical section. 2766 */ 2767 int 2768 inmem(struct vnode *vp, off_t loffset) 2769 { 2770 vm_object_t obj; 2771 vm_offset_t toff, tinc, size; 2772 vm_page_t m; 2773 2774 if (findblk(vp, loffset, FINDBLK_TEST)) 2775 return 1; 2776 if (vp->v_mount == NULL) 2777 return 0; 2778 if ((obj = vp->v_object) == NULL) 2779 return 0; 2780 2781 size = PAGE_SIZE; 2782 if (size > vp->v_mount->mnt_stat.f_iosize) 2783 size = vp->v_mount->mnt_stat.f_iosize; 2784 2785 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { 2786 lwkt_gettoken(&vm_token); 2787 m = vm_page_lookup(obj, OFF_TO_IDX(loffset + toff)); 2788 lwkt_reltoken(&vm_token); 2789 if (m == NULL) 2790 return 0; 2791 tinc = size; 2792 if (tinc > PAGE_SIZE - ((toff + loffset) & PAGE_MASK)) 2793 tinc = PAGE_SIZE - ((toff + loffset) & PAGE_MASK); 2794 if (vm_page_is_valid(m, 2795 (vm_offset_t) ((toff + loffset) & PAGE_MASK), tinc) == 0) 2796 return 0; 2797 } 2798 return 1; 2799 } 2800 2801 /* 2802 * findblk: 2803 * 2804 * Locate and return the specified buffer. Unless flagged otherwise, 2805 * a locked buffer will be returned if it exists or NULL if it does not. 2806 * 2807 * findblk()'d buffers are still on the bufqueues and if you intend 2808 * to use your (locked NON-TEST) buffer you need to bremfree(bp) 2809 * and possibly do other stuff to it. 2810 * 2811 * FINDBLK_TEST - Do not lock the buffer. The caller is responsible 2812 * for locking the buffer and ensuring that it remains 2813 * the desired buffer after locking. 2814 * 2815 * FINDBLK_NBLOCK - Lock the buffer non-blocking. If we are unable 2816 * to acquire the lock we return NULL, even if the 2817 * buffer exists. 2818 * 2819 * FINDBLK_REF - Returns the buffer ref'd, which prevents normal 2820 * reuse by getnewbuf() but does not prevent 2821 * disassociation (B_INVAL). Used to avoid deadlocks 2822 * against random (vp,loffset)s due to reassignment. 2823 * 2824 * (0) - Lock the buffer blocking. 2825 * 2826 * MPSAFE 2827 */ 2828 struct buf * 2829 findblk(struct vnode *vp, off_t loffset, int flags) 2830 { 2831 struct buf *bp; 2832 int lkflags; 2833 2834 lkflags = LK_EXCLUSIVE; 2835 if (flags & FINDBLK_NBLOCK) 2836 lkflags |= LK_NOWAIT; 2837 2838 for (;;) { 2839 /* 2840 * Lookup. Ref the buf while holding v_token to prevent 2841 * reuse (but does not prevent diassociation). 2842 */ 2843 lwkt_gettoken(&vp->v_token); 2844 bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset); 2845 if (bp == NULL) { 2846 lwkt_reltoken(&vp->v_token); 2847 return(NULL); 2848 } 2849 bqhold(bp); 2850 lwkt_reltoken(&vp->v_token); 2851 2852 /* 2853 * If testing only break and return bp, do not lock. 2854 */ 2855 if (flags & FINDBLK_TEST) 2856 break; 2857 2858 /* 2859 * Lock the buffer, return an error if the lock fails. 2860 * (only FINDBLK_NBLOCK can cause the lock to fail). 2861 */ 2862 if (BUF_LOCK(bp, lkflags)) { 2863 atomic_subtract_int(&bp->b_refs, 1); 2864 /* bp = NULL; not needed */ 2865 return(NULL); 2866 } 2867 2868 /* 2869 * Revalidate the locked buf before allowing it to be 2870 * returned. 
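 *
 * While we slept on the buffer lock the previous owner may have
 * disassociated (brelvp()) or re-identified the buffer; the ref
 * taken under v_token prevents reuse but not disassociation, so
 * (b_vp, b_loffset) is re-verified here and, on a mismatch, the
 * ref is dropped, the buffer unlocked, and the lookup retried.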
2871 */ 2872 if (bp->b_vp == vp && bp->b_loffset == loffset) 2873 break; 2874 atomic_subtract_int(&bp->b_refs, 1); 2875 BUF_UNLOCK(bp); 2876 } 2877 2878 /* 2879 * Success 2880 */ 2881 if ((flags & FINDBLK_REF) == 0) 2882 atomic_subtract_int(&bp->b_refs, 1); 2883 return(bp); 2884 } 2885 2886 /* 2887 * getcacheblk: 2888 * 2889 * Similar to getblk() except only returns the buffer if it is 2890 * B_CACHE and requires no other manipulation. Otherwise NULL 2891 * is returned. 2892 * 2893 * If B_RAM is set the buffer might be just fine, but we return 2894 * NULL anyway because we want the code to fall through to the 2895 * cluster read. Otherwise read-ahead breaks. 2896 * 2897 * If blksize is 0 the buffer cache buffer must already be fully 2898 * cached. 2899 * 2900 * If blksize is non-zero getblk() will be used, allowing a buffer 2901 * to be reinstantiated from its VM backing store. The buffer must 2902 * still be fully cached after reinstantiation to be returned. 2903 */ 2904 struct buf * 2905 getcacheblk(struct vnode *vp, off_t loffset, int blksize) 2906 { 2907 struct buf *bp; 2908 2909 if (blksize) { 2910 bp = getblk(vp, loffset, blksize, 0, 0); 2911 if (bp) { 2912 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == 2913 B_CACHE) { 2914 bp->b_flags &= ~B_AGE; 2915 } else { 2916 brelse(bp); 2917 bp = NULL; 2918 } 2919 } 2920 } else { 2921 bp = findblk(vp, loffset, 0); 2922 if (bp) { 2923 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == 2924 B_CACHE) { 2925 bp->b_flags &= ~B_AGE; 2926 bremfree(bp); 2927 } else { 2928 BUF_UNLOCK(bp); 2929 bp = NULL; 2930 } 2931 } 2932 } 2933 return (bp); 2934 } 2935 2936 /* 2937 * getblk: 2938 * 2939 * Get a block given a specified block and offset into a file/device. 2940 * B_INVAL may or may not be set on return. The caller should clear 2941 * B_INVAL prior to initiating a READ. 2942 * 2943 * IT IS IMPORTANT TO UNDERSTAND THAT IF YOU CALL GETBLK() AND B_CACHE 2944 * IS NOT SET, YOU MUST INITIALIZE THE RETURNED BUFFER, ISSUE A READ, 2945 * OR SET B_INVAL BEFORE RETIRING IT. If you retire a getblk'd buffer 2946 * without doing any of those things the system will likely believe 2947 * the buffer to be valid (especially if it is not B_VMIO), and the 2948 * next getblk() will return the buffer with B_CACHE set. 2949 * 2950 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for 2951 * an existing buffer. 2952 * 2953 * For a VMIO buffer, B_CACHE is modified according to the backing VM. 2954 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set 2955 * and then cleared based on the backing VM. If the previous buffer is 2956 * non-0-sized but invalid, B_CACHE will be cleared. 2957 * 2958 * If getblk() must create a new buffer, the new buffer is returned with 2959 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which 2960 * case it is returned with B_INVAL clear and B_CACHE set based on the 2961 * backing VM. 2962 * 2963 * getblk() also forces a bwrite() for any B_DELWRI buffer whos 2964 * B_CACHE bit is clear. 2965 * 2966 * What this means, basically, is that the caller should use B_CACHE to 2967 * determine whether the buffer is fully valid or not and should clear 2968 * B_INVAL prior to issuing a read. If the caller intends to validate 2969 * the buffer by loading its data area with something, the caller needs 2970 * to clear B_INVAL. 
If the caller does this without issuing an I/O, 2971 * the caller should set B_CACHE ( as an optimization ), else the caller 2972 * should issue the I/O and biodone() will set B_CACHE if the I/O was 2973 * a write attempt or if it was a successfull read. If the caller 2974 * intends to issue a READ, the caller must clear B_INVAL and B_ERROR 2975 * prior to issuing the READ. biodone() will *not* clear B_INVAL. 2976 * 2977 * getblk flags: 2978 * 2979 * GETBLK_PCATCH - catch signal if blocked, can cause NULL return 2980 * GETBLK_BHEAVY - heavy-weight buffer cache buffer 2981 * 2982 * MPALMOSTSAFE 2983 */ 2984 struct buf * 2985 getblk(struct vnode *vp, off_t loffset, int size, int blkflags, int slptimeo) 2986 { 2987 struct buf *bp; 2988 int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0; 2989 int error; 2990 int lkflags; 2991 2992 if (size > MAXBSIZE) 2993 panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE); 2994 if (vp->v_object == NULL) 2995 panic("getblk: vnode %p has no object!", vp); 2996 2997 loop: 2998 if ((bp = findblk(vp, loffset, FINDBLK_REF | FINDBLK_TEST)) != NULL) { 2999 /* 3000 * The buffer was found in the cache, but we need to lock it. 3001 * We must acquire a ref on the bp to prevent reuse, but 3002 * this will not prevent disassociation (brelvp()) so we 3003 * must recheck (vp,loffset) after acquiring the lock. 3004 * 3005 * Without the ref the buffer could potentially be reused 3006 * before we acquire the lock and create a deadlock 3007 * situation between the thread trying to reuse the buffer 3008 * and us due to the fact that we would wind up blocking 3009 * on a random (vp,loffset). 3010 */ 3011 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 3012 if (blkflags & GETBLK_NOWAIT) { 3013 bqdrop(bp); 3014 return(NULL); 3015 } 3016 lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL; 3017 if (blkflags & GETBLK_PCATCH) 3018 lkflags |= LK_PCATCH; 3019 error = BUF_TIMELOCK(bp, lkflags, "getblk", slptimeo); 3020 if (error) { 3021 bqdrop(bp); 3022 if (error == ENOLCK) 3023 goto loop; 3024 return (NULL); 3025 } 3026 /* buffer may have changed on us */ 3027 } 3028 bqdrop(bp); 3029 3030 /* 3031 * Once the buffer has been locked, make sure we didn't race 3032 * a buffer recyclement. Buffers that are no longer hashed 3033 * will have b_vp == NULL, so this takes care of that check 3034 * as well. 3035 */ 3036 if (bp->b_vp != vp || bp->b_loffset != loffset) { 3037 kprintf("Warning buffer %p (vp %p loffset %lld) " 3038 "was recycled\n", 3039 bp, vp, (long long)loffset); 3040 BUF_UNLOCK(bp); 3041 goto loop; 3042 } 3043 3044 /* 3045 * If SZMATCH any pre-existing buffer must be of the requested 3046 * size or NULL is returned. The caller absolutely does not 3047 * want getblk() to bwrite() the buffer on a size mismatch. 3048 */ 3049 if ((blkflags & GETBLK_SZMATCH) && size != bp->b_bcount) { 3050 BUF_UNLOCK(bp); 3051 return(NULL); 3052 } 3053 3054 /* 3055 * All vnode-based buffers must be backed by a VM object. 3056 */ 3057 KKASSERT(bp->b_flags & B_VMIO); 3058 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 3059 bp->b_flags &= ~B_AGE; 3060 3061 /* 3062 * Make sure that B_INVAL buffers do not have a cached 3063 * block number translation. 3064 */ 3065 if ((bp->b_flags & B_INVAL) && (bp->b_bio2.bio_offset != NOOFFSET)) { 3066 kprintf("Warning invalid buffer %p (vp %p loffset %lld)" 3067 " did not have cleared bio_offset cache\n", 3068 bp, vp, (long long)loffset); 3069 clearbiocache(&bp->b_bio2); 3070 } 3071 3072 /* 3073 * The buffer is locked. B_CACHE is cleared if the buffer is 3074 * invalid. 
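		 *
		 * For illustration only (hypothetical caller of getblk(),
		 * not taken from this file; details such as page busying
		 * are elided), the read protocol described in the header
		 * above looks roughly like:
		 *
		 *	bp = getblk(vp, loffset, bsize, 0, 0);
		 *	if ((bp->b_flags & B_CACHE) == 0) {
		 *		bp->b_flags &= ~(B_ERROR | B_INVAL);
		 *		bp->b_cmd = BUF_CMD_READ;
		 *		bp->b_bio1.bio_done = biodone_sync;
		 *		bp->b_bio1.bio_flags |= BIO_SYNC;
		 *		vn_strategy(vp, &bp->b_bio1);
		 *		error = biowait(&bp->b_bio1, "example");
		 *	}
		 *
		 * This is essentially what the bread() family does on
		 * behalf of its callers.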
		 */
		if (bp->b_flags & B_INVAL)
			bp->b_flags &= ~B_CACHE;
		bremfree(bp);

		/*
		 * Any size inconsistency with a dirty buffer or a buffer
		 * with a softupdates dependency must be resolved.  Resizing
		 * the buffer in such circumstances can lead to problems.
		 *
		 * Dirty or dependent buffers are written synchronously.
		 * Other types of buffers are simply released and
		 * reconstituted as they may be backed by valid, dirty VM
		 * pages (but not marked B_DELWRI).
		 *
		 * NFS NOTE: NFS buffers which straddle EOF are oddly-sized
		 * and may be left over from a prior truncation (and thus
		 * no longer represent the actual EOF point), so we
		 * definitely do not want to B_NOCACHE the backing store.
		 */
		if (size != bp->b_bcount) {
			if (bp->b_flags & B_DELWRI) {
				bp->b_flags |= B_RELBUF;
				bwrite(bp);
			} else if (LIST_FIRST(&bp->b_dep)) {
				bp->b_flags |= B_RELBUF;
				bwrite(bp);
			} else {
				bp->b_flags |= B_RELBUF;
				brelse(bp);
			}
			goto loop;
		}
		KKASSERT(size <= bp->b_kvasize);
		KASSERT(bp->b_loffset != NOOFFSET,
			("getblk: no buffer offset"));

		/*
		 * A buffer with B_DELWRI set and B_CACHE clear must
		 * be committed before we can return the buffer in
		 * order to prevent the caller from issuing a read
		 * ( due to B_CACHE not being set ) and overwriting
		 * it.
		 *
		 * Most callers, including NFS and FFS, need this to
		 * operate properly either because they assume they
		 * can issue a read if B_CACHE is not set, or because
		 * ( for example ) an uncached B_DELWRI might loop due
		 * to softupdates re-dirtying the buffer.  In the latter
		 * case, B_CACHE is set after the first write completes,
		 * preventing further loops.
		 *
		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
		 * above while extending the buffer, we cannot allow the
		 * buffer to remain with B_CACHE set after the write
		 * completes or it will represent a corrupt state.  To
		 * deal with this we set B_NOCACHE to scrap the buffer
		 * after the write.
		 *
		 * XXX Should this be B_RELBUF instead of B_NOCACHE?
		 *	I'm not even sure this state is still possible
		 *	now that getblk() writes out any dirty buffers
		 *	on size changes.
		 *
		 * We might be able to do something fancy, like setting
		 * B_CACHE in bwrite() except if B_DELWRI is already set,
		 * so the below call doesn't set B_CACHE, but that gets real
		 * confusing.  This is much easier.
		 */

		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
			kprintf("getblk: Warning, bp %p loff=%jx DELWRI set "
				"and CACHE clear, b_flags %08x\n",
				bp, (intmax_t)bp->b_loffset, bp->b_flags);
			bp->b_flags |= B_NOCACHE;
			bwrite(bp);
			goto loop;
		}
	} else {
		/*
		 * Buffer is not in-core, create new buffer.  The buffer
		 * returned by getnewbuf() is locked.  Note that the returned
		 * buffer is also considered valid (not marked B_INVAL).
		 *
		 * Calculating the offset for the I/O requires figuring out
		 * the block size.  We use DEV_BSIZE for VBLK or VCHR and
		 * the mount's f_iosize otherwise.  If the vnode does not
		 * have an associated mount we assume that the passed size is
		 * the block size.
		 *
		 * Note that vn_isdisk() cannot be used here since it may
		 * return a failure for numerous reasons.
Note that the 3167 * buffer size may be larger then the block size (the caller 3168 * will use block numbers with the proper multiple). Beware 3169 * of using any v_* fields which are part of unions. In 3170 * particular, in DragonFly the mount point overloading 3171 * mechanism uses the namecache only and the underlying 3172 * directory vnode is not a special case. 3173 */ 3174 int bsize, maxsize; 3175 3176 if (vp->v_type == VBLK || vp->v_type == VCHR) 3177 bsize = DEV_BSIZE; 3178 else if (vp->v_mount) 3179 bsize = vp->v_mount->mnt_stat.f_iosize; 3180 else 3181 bsize = size; 3182 3183 maxsize = size + (loffset & PAGE_MASK); 3184 maxsize = imax(maxsize, bsize); 3185 3186 bp = getnewbuf(blkflags, slptimeo, size, maxsize); 3187 if (bp == NULL) { 3188 if (slpflags || slptimeo) 3189 return NULL; 3190 goto loop; 3191 } 3192 3193 /* 3194 * Atomically insert the buffer into the hash, so that it can 3195 * be found by findblk(). 3196 * 3197 * If bgetvp() returns non-zero a collision occured, and the 3198 * bp will not be associated with the vnode. 3199 * 3200 * Make sure the translation layer has been cleared. 3201 */ 3202 bp->b_loffset = loffset; 3203 bp->b_bio2.bio_offset = NOOFFSET; 3204 /* bp->b_bio2.bio_next = NULL; */ 3205 3206 if (bgetvp(vp, bp, size)) { 3207 bp->b_flags |= B_INVAL; 3208 brelse(bp); 3209 goto loop; 3210 } 3211 3212 /* 3213 * All vnode-based buffers must be backed by a VM object. 3214 */ 3215 KKASSERT(vp->v_object != NULL); 3216 bp->b_flags |= B_VMIO; 3217 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 3218 3219 allocbuf(bp, size); 3220 } 3221 KKASSERT(dsched_is_clear_buf_priv(bp)); 3222 return (bp); 3223 } 3224 3225 /* 3226 * regetblk(bp) 3227 * 3228 * Reacquire a buffer that was previously released to the locked queue, 3229 * or reacquire a buffer which is interlocked by having bioops->io_deallocate 3230 * set B_LOCKED (which handles the acquisition race). 3231 * 3232 * To this end, either B_LOCKED must be set or the dependancy list must be 3233 * non-empty. 3234 * 3235 * MPSAFE 3236 */ 3237 void 3238 regetblk(struct buf *bp) 3239 { 3240 KKASSERT((bp->b_flags & B_LOCKED) || LIST_FIRST(&bp->b_dep) != NULL); 3241 BUF_LOCK(bp, LK_EXCLUSIVE | LK_RETRY); 3242 bremfree(bp); 3243 } 3244 3245 /* 3246 * geteblk: 3247 * 3248 * Get an empty, disassociated buffer of given size. The buffer is 3249 * initially set to B_INVAL. 3250 * 3251 * critical section protection is not required for the allocbuf() 3252 * call because races are impossible here. 3253 * 3254 * MPALMOSTSAFE 3255 */ 3256 struct buf * 3257 geteblk(int size) 3258 { 3259 struct buf *bp; 3260 int maxsize; 3261 3262 maxsize = (size + BKVAMASK) & ~BKVAMASK; 3263 3264 while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) 3265 ; 3266 allocbuf(bp, size); 3267 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ 3268 KKASSERT(dsched_is_clear_buf_priv(bp)); 3269 return (bp); 3270 } 3271 3272 3273 /* 3274 * allocbuf: 3275 * 3276 * This code constitutes the buffer memory from either anonymous system 3277 * memory (in the case of non-VMIO operations) or from an associated 3278 * VM object (in the case of VMIO operations). This code is able to 3279 * resize a buffer up or down. 3280 * 3281 * Note that this code is tricky, and has many complications to resolve 3282 * deadlock or inconsistant data situations. Tread lightly!!! 3283 * There are B_CACHE and B_DELWRI interactions that must be dealt with by 3284 * the caller. Calling this code willy nilly can result in the loss of 3285 * data. 
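 *
 * For the non-VMIO case, small first-time allocations (up to half a
 * page, and only while bufmallocspace is below maxbufmallocspace)
 * are satisfied with kmalloc()ed memory and flagged B_MALLOC;
 * larger requests, or any subsequent growth, revert to page-backed
 * memory via vm_hold_load_pages().  VMIO buffers are always backed
 * by pages of the vnode's VM object.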
3286 * 3287 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with 3288 * B_CACHE for the non-VMIO case. 3289 * 3290 * This routine does not need to be called from a critical section but you 3291 * must own the buffer. 3292 * 3293 * MPSAFE 3294 */ 3295 int 3296 allocbuf(struct buf *bp, int size) 3297 { 3298 int newbsize, mbsize; 3299 int i; 3300 3301 if (BUF_REFCNT(bp) == 0) 3302 panic("allocbuf: buffer not busy"); 3303 3304 if (bp->b_kvasize < size) 3305 panic("allocbuf: buffer too small"); 3306 3307 if ((bp->b_flags & B_VMIO) == 0) { 3308 caddr_t origbuf; 3309 int origbufsize; 3310 /* 3311 * Just get anonymous memory from the kernel. Don't 3312 * mess with B_CACHE. 3313 */ 3314 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 3315 if (bp->b_flags & B_MALLOC) 3316 newbsize = mbsize; 3317 else 3318 newbsize = round_page(size); 3319 3320 if (newbsize < bp->b_bufsize) { 3321 /* 3322 * Malloced buffers are not shrunk 3323 */ 3324 if (bp->b_flags & B_MALLOC) { 3325 if (newbsize) { 3326 bp->b_bcount = size; 3327 } else { 3328 kfree(bp->b_data, M_BIOBUF); 3329 if (bp->b_bufsize) { 3330 atomic_subtract_int(&bufmallocspace, bp->b_bufsize); 3331 bufspacewakeup(); 3332 bp->b_bufsize = 0; 3333 } 3334 bp->b_data = bp->b_kvabase; 3335 bp->b_bcount = 0; 3336 bp->b_flags &= ~B_MALLOC; 3337 } 3338 return 1; 3339 } 3340 vm_hold_free_pages( 3341 bp, 3342 (vm_offset_t) bp->b_data + newbsize, 3343 (vm_offset_t) bp->b_data + bp->b_bufsize); 3344 } else if (newbsize > bp->b_bufsize) { 3345 /* 3346 * We only use malloced memory on the first allocation. 3347 * and revert to page-allocated memory when the buffer 3348 * grows. 3349 */ 3350 if ((bufmallocspace < maxbufmallocspace) && 3351 (bp->b_bufsize == 0) && 3352 (mbsize <= PAGE_SIZE/2)) { 3353 3354 bp->b_data = kmalloc(mbsize, M_BIOBUF, M_WAITOK); 3355 bp->b_bufsize = mbsize; 3356 bp->b_bcount = size; 3357 bp->b_flags |= B_MALLOC; 3358 atomic_add_int(&bufmallocspace, mbsize); 3359 return 1; 3360 } 3361 origbuf = NULL; 3362 origbufsize = 0; 3363 /* 3364 * If the buffer is growing on its other-than-first 3365 * allocation, then we revert to the page-allocation 3366 * scheme. 3367 */ 3368 if (bp->b_flags & B_MALLOC) { 3369 origbuf = bp->b_data; 3370 origbufsize = bp->b_bufsize; 3371 bp->b_data = bp->b_kvabase; 3372 if (bp->b_bufsize) { 3373 atomic_subtract_int(&bufmallocspace, 3374 bp->b_bufsize); 3375 bufspacewakeup(); 3376 bp->b_bufsize = 0; 3377 } 3378 bp->b_flags &= ~B_MALLOC; 3379 newbsize = round_page(newbsize); 3380 } 3381 vm_hold_load_pages( 3382 bp, 3383 (vm_offset_t) bp->b_data + bp->b_bufsize, 3384 (vm_offset_t) bp->b_data + newbsize); 3385 if (origbuf) { 3386 bcopy(origbuf, bp->b_data, origbufsize); 3387 kfree(origbuf, M_BIOBUF); 3388 } 3389 } 3390 } else { 3391 vm_page_t m; 3392 int desiredpages; 3393 3394 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 3395 desiredpages = ((int)(bp->b_loffset & PAGE_MASK) + 3396 newbsize + PAGE_MASK) >> PAGE_SHIFT; 3397 KKASSERT(desiredpages <= XIO_INTERNAL_PAGES); 3398 3399 if (bp->b_flags & B_MALLOC) 3400 panic("allocbuf: VMIO buffer can't be malloced"); 3401 /* 3402 * Set B_CACHE initially if buffer is 0 length or will become 3403 * 0-length. 3404 */ 3405 if (size == 0 || bp->b_bufsize == 0) 3406 bp->b_flags |= B_CACHE; 3407 3408 if (newbsize < bp->b_bufsize) { 3409 /* 3410 * DEV_BSIZE aligned new buffer size is less then the 3411 * DEV_BSIZE aligned existing buffer size. Figure out 3412 * if we have to remove any pages. 
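			 *
			 * Example (illustrative, assuming 4KB pages): a
			 * buffer whose b_loffset lands 2KB into a page and
			 * whose new DEV_BSIZE-aligned size is 8KB computes
			 * desiredpages = (2048 + 8192 + 4095) >> 12, i.e.
			 * 3 backing pages, because the mapping must also
			 * cover the leading partial page.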
3413 */ 3414 if (desiredpages < bp->b_xio.xio_npages) { 3415 for (i = desiredpages; i < bp->b_xio.xio_npages; i++) { 3416 /* 3417 * the page is not freed here -- it 3418 * is the responsibility of 3419 * vnode_pager_setsize 3420 */ 3421 m = bp->b_xio.xio_pages[i]; 3422 KASSERT(m != bogus_page, 3423 ("allocbuf: bogus page found")); 3424 while (vm_page_sleep_busy(m, TRUE, "biodep")) 3425 ; 3426 3427 bp->b_xio.xio_pages[i] = NULL; 3428 vm_page_unwire(m, 0); 3429 } 3430 pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + 3431 (desiredpages << PAGE_SHIFT), (bp->b_xio.xio_npages - desiredpages)); 3432 bp->b_xio.xio_npages = desiredpages; 3433 } 3434 } else if (size > bp->b_bcount) { 3435 /* 3436 * We are growing the buffer, possibly in a 3437 * byte-granular fashion. 3438 */ 3439 struct vnode *vp; 3440 vm_object_t obj; 3441 vm_offset_t toff; 3442 vm_offset_t tinc; 3443 3444 /* 3445 * Step 1, bring in the VM pages from the object, 3446 * allocating them if necessary. We must clear 3447 * B_CACHE if these pages are not valid for the 3448 * range covered by the buffer. 3449 * 3450 * critical section protection is required to protect 3451 * against interrupts unbusying and freeing pages 3452 * between our vm_page_lookup() and our 3453 * busycheck/wiring call. 3454 */ 3455 vp = bp->b_vp; 3456 obj = vp->v_object; 3457 3458 lwkt_gettoken(&vm_token); 3459 while (bp->b_xio.xio_npages < desiredpages) { 3460 vm_page_t m; 3461 vm_pindex_t pi; 3462 3463 pi = OFF_TO_IDX(bp->b_loffset) + bp->b_xio.xio_npages; 3464 if ((m = vm_page_lookup(obj, pi)) == NULL) { 3465 /* 3466 * note: must allocate system pages 3467 * since blocking here could intefere 3468 * with paging I/O, no matter which 3469 * process we are. 3470 */ 3471 m = bio_page_alloc(obj, pi, desiredpages - bp->b_xio.xio_npages); 3472 if (m) { 3473 vm_page_wire(m); 3474 vm_page_flag_clear(m, PG_ZERO); 3475 vm_page_wakeup(m); 3476 bp->b_flags &= ~B_CACHE; 3477 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; 3478 ++bp->b_xio.xio_npages; 3479 } 3480 continue; 3481 } 3482 3483 /* 3484 * We found a page. If we have to sleep on it, 3485 * retry because it might have gotten freed out 3486 * from under us. 3487 * 3488 * We can only test PG_BUSY here. Blocking on 3489 * m->busy might lead to a deadlock: 3490 * 3491 * vm_fault->getpages->cluster_read->allocbuf 3492 * 3493 */ 3494 3495 if (vm_page_sleep_busy(m, FALSE, "pgtblk")) 3496 continue; 3497 vm_page_flag_clear(m, PG_ZERO); 3498 vm_page_wire(m); 3499 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; 3500 ++bp->b_xio.xio_npages; 3501 if (bp->b_act_count < m->act_count) 3502 bp->b_act_count = m->act_count; 3503 } 3504 lwkt_reltoken(&vm_token); 3505 3506 /* 3507 * Step 2. We've loaded the pages into the buffer, 3508 * we have to figure out if we can still have B_CACHE 3509 * set. Note that B_CACHE is set according to the 3510 * byte-granular range ( bcount and size ), not the 3511 * aligned range ( newbsize ). 3512 * 3513 * The VM test is against m->valid, which is DEV_BSIZE 3514 * aligned. Needless to say, the validity of the data 3515 * needs to also be DEV_BSIZE aligned. Note that this 3516 * fails with NFS if the server or some other client 3517 * extends the file's EOF. If our buffer is resized, 3518 * B_CACHE may remain set! 
XXX 3519 */ 3520 3521 toff = bp->b_bcount; 3522 tinc = PAGE_SIZE - ((bp->b_loffset + toff) & PAGE_MASK); 3523 3524 while ((bp->b_flags & B_CACHE) && toff < size) { 3525 vm_pindex_t pi; 3526 3527 if (tinc > (size - toff)) 3528 tinc = size - toff; 3529 3530 pi = ((bp->b_loffset & PAGE_MASK) + toff) >> 3531 PAGE_SHIFT; 3532 3533 vfs_buf_test_cache( 3534 bp, 3535 bp->b_loffset, 3536 toff, 3537 tinc, 3538 bp->b_xio.xio_pages[pi] 3539 ); 3540 toff += tinc; 3541 tinc = PAGE_SIZE; 3542 } 3543 3544 /* 3545 * Step 3, fixup the KVM pmap. Remember that 3546 * bp->b_data is relative to bp->b_loffset, but 3547 * bp->b_loffset may be offset into the first page. 3548 */ 3549 3550 bp->b_data = (caddr_t) 3551 trunc_page((vm_offset_t)bp->b_data); 3552 pmap_qenter( 3553 (vm_offset_t)bp->b_data, 3554 bp->b_xio.xio_pages, 3555 bp->b_xio.xio_npages 3556 ); 3557 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 3558 (vm_offset_t)(bp->b_loffset & PAGE_MASK)); 3559 } 3560 } 3561 3562 /* adjust space use on already-dirty buffer */ 3563 if (bp->b_flags & B_DELWRI) { 3564 spin_lock(&bufcspin); 3565 dirtybufspace += newbsize - bp->b_bufsize; 3566 if (bp->b_flags & B_HEAVY) 3567 dirtybufspacehw += newbsize - bp->b_bufsize; 3568 spin_unlock(&bufcspin); 3569 } 3570 if (newbsize < bp->b_bufsize) 3571 bufspacewakeup(); 3572 bp->b_bufsize = newbsize; /* actual buffer allocation */ 3573 bp->b_bcount = size; /* requested buffer size */ 3574 return 1; 3575 } 3576 3577 /* 3578 * biowait: 3579 * 3580 * Wait for buffer I/O completion, returning error status. B_EINTR 3581 * is converted into an EINTR error but not cleared (since a chain 3582 * of biowait() calls may occur). 3583 * 3584 * On return bpdone() will have been called but the buffer will remain 3585 * locked and will not have been brelse()'d. 3586 * 3587 * NOTE! If a timeout is specified and ETIMEDOUT occurs the I/O is 3588 * likely still in progress on return. 3589 * 3590 * NOTE! This operation is on a BIO, not a BUF. 3591 * 3592 * NOTE! BIO_DONE is cleared by vn_strategy() 3593 * 3594 * MPSAFE 3595 */ 3596 static __inline int 3597 _biowait(struct bio *bio, const char *wmesg, int to) 3598 { 3599 struct buf *bp = bio->bio_buf; 3600 u_int32_t flags; 3601 u_int32_t nflags; 3602 int error; 3603 3604 KKASSERT(bio == &bp->b_bio1); 3605 for (;;) { 3606 flags = bio->bio_flags; 3607 if (flags & BIO_DONE) 3608 break; 3609 nflags = flags | BIO_WANT; 3610 tsleep_interlock(bio, 0); 3611 if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) { 3612 if (wmesg) 3613 error = tsleep(bio, PINTERLOCKED, wmesg, to); 3614 else if (bp->b_cmd == BUF_CMD_READ) 3615 error = tsleep(bio, PINTERLOCKED, "biord", to); 3616 else 3617 error = tsleep(bio, PINTERLOCKED, "biowr", to); 3618 if (error) { 3619 kprintf("tsleep error biowait %d\n", error); 3620 return (error); 3621 } 3622 } 3623 } 3624 3625 /* 3626 * Finish up. 3627 */ 3628 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 3629 bio->bio_flags &= ~(BIO_DONE | BIO_SYNC); 3630 if (bp->b_flags & B_EINTR) 3631 return (EINTR); 3632 if (bp->b_flags & B_ERROR) 3633 return (bp->b_error ? bp->b_error : EIO); 3634 return (0); 3635 } 3636 3637 int 3638 biowait(struct bio *bio, const char *wmesg) 3639 { 3640 return(_biowait(bio, wmesg, 0)); 3641 } 3642 3643 int 3644 biowait_timeout(struct bio *bio, const char *wmesg, int to) 3645 { 3646 return(_biowait(bio, wmesg, to)); 3647 } 3648 3649 /* 3650 * This associates a tracking count with an I/O. 
vn_strategy() and 3651 * dev_dstrategy() do this automatically but there are a few cases 3652 * where a vnode or device layer is bypassed when a block translation 3653 * is cached. In such cases bio_start_transaction() may be called on 3654 * the bypassed layers so the system gets an I/O in progress indication 3655 * for those higher layers. 3656 */ 3657 void 3658 bio_start_transaction(struct bio *bio, struct bio_track *track) 3659 { 3660 bio->bio_track = track; 3661 if (dsched_is_clear_buf_priv(bio->bio_buf)) 3662 dsched_new_buf(bio->bio_buf); 3663 bio_track_ref(track); 3664 } 3665 3666 /* 3667 * Initiate I/O on a vnode. 3668 * 3669 * SWAPCACHE OPERATION: 3670 * 3671 * Real buffer cache buffers have a non-NULL bp->b_vp. Unfortunately 3672 * devfs also uses b_vp for fake buffers so we also have to check 3673 * that B_PAGING is 0. In this case the passed 'vp' is probably the 3674 * underlying block device. The swap assignments are related to the 3675 * buffer cache buffer's b_vp, not the passed vp. 3676 * 3677 * The passed vp == bp->b_vp only in the case where the strategy call 3678 * is made on the vp itself for its own buffers (a regular file or 3679 * block device vp). The filesystem usually then re-calls vn_strategy() 3680 * after translating the request to an underlying device. 3681 * 3682 * Cluster buffers set B_CLUSTER and the passed vp is the vp of the 3683 * underlying buffer cache buffers. 3684 * 3685 * We can only deal with page-aligned buffers at the moment, because 3686 * we can't tell what the real dirty state for pages straddling a buffer 3687 * are. 3688 * 3689 * In order to call swap_pager_strategy() we must provide the VM object 3690 * and base offset for the underlying buffer cache pages so it can find 3691 * the swap blocks. 3692 */ 3693 void 3694 vn_strategy(struct vnode *vp, struct bio *bio) 3695 { 3696 struct bio_track *track; 3697 struct buf *bp = bio->bio_buf; 3698 3699 KKASSERT(bp->b_cmd != BUF_CMD_DONE); 3700 3701 /* 3702 * Set when an I/O is issued on the bp. Cleared by consumers 3703 * (aka HAMMER), allowing the consumer to determine if I/O had 3704 * actually occurred. 3705 */ 3706 bp->b_flags |= B_IODEBUG; 3707 3708 /* 3709 * Handle the swap cache intercept. 3710 */ 3711 if (vn_cache_strategy(vp, bio)) 3712 return; 3713 3714 /* 3715 * Otherwise do the operation through the filesystem 3716 */ 3717 if (bp->b_cmd == BUF_CMD_READ) 3718 track = &vp->v_track_read; 3719 else 3720 track = &vp->v_track_write; 3721 KKASSERT((bio->bio_flags & BIO_DONE) == 0); 3722 bio->bio_track = track; 3723 if (dsched_is_clear_buf_priv(bio->bio_buf)) 3724 dsched_new_buf(bio->bio_buf); 3725 bio_track_ref(track); 3726 vop_strategy(*vp->v_ops, vp, bio); 3727 } 3728 3729 static void vn_cache_strategy_callback(struct bio *bio); 3730 3731 int 3732 vn_cache_strategy(struct vnode *vp, struct bio *bio) 3733 { 3734 struct buf *bp = bio->bio_buf; 3735 struct bio *nbio; 3736 vm_object_t object; 3737 vm_page_t m; 3738 int i; 3739 3740 /* 3741 * Is this buffer cache buffer suitable for reading from 3742 * the swap cache? 3743 */ 3744 if (vm_swapcache_read_enable == 0 || 3745 bp->b_cmd != BUF_CMD_READ || 3746 ((bp->b_flags & B_CLUSTER) == 0 && 3747 (bp->b_vp == NULL || (bp->b_flags & B_PAGING))) || 3748 ((int)bp->b_loffset & PAGE_MASK) != 0 || 3749 (bp->b_bcount & PAGE_MASK) != 0) { 3750 return(0); 3751 } 3752 3753 /* 3754 * Figure out the original VM object (it will match the underlying 3755 * VM pages). 
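 * For a cluster buffer (B_CLUSTER) the passed vp already owns the
 * underlying pages so vp->v_object is used; otherwise the object is
 * taken from the component buffer's own b_vp.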
Note that swap cached data uses page indices relative 3756 * to that object, not relative to bio->bio_offset. 3757 */ 3758 if (bp->b_flags & B_CLUSTER) 3759 object = vp->v_object; 3760 else 3761 object = bp->b_vp->v_object; 3762 3763 /* 3764 * In order to be able to use the swap cache all underlying VM 3765 * pages must be marked as such, and we can't have any bogus pages. 3766 */ 3767 for (i = 0; i < bp->b_xio.xio_npages; ++i) { 3768 m = bp->b_xio.xio_pages[i]; 3769 if ((m->flags & PG_SWAPPED) == 0) 3770 break; 3771 if (m == bogus_page) 3772 break; 3773 } 3774 3775 /* 3776 * If we are good then issue the I/O using swap_pager_strategy(). 3777 */ 3778 if (i == bp->b_xio.xio_npages) { 3779 m = bp->b_xio.xio_pages[0]; 3780 nbio = push_bio(bio); 3781 nbio->bio_done = vn_cache_strategy_callback; 3782 nbio->bio_offset = ptoa(m->pindex); 3783 KKASSERT(m->object == object); 3784 swap_pager_strategy(object, nbio); 3785 return(1); 3786 } 3787 return(0); 3788 } 3789 3790 /* 3791 * This is a bit of a hack but since the vn_cache_strategy() function can 3792 * override a VFS's strategy function we must make sure that the bio, which 3793 * is probably bio2, doesn't leak an unexpected offset value back to the 3794 * filesystem. The filesystem (e.g. UFS) might otherwise assume that the 3795 * bio went through its own file strategy function and the the bio2 offset 3796 * is a cached disk offset when, in fact, it isn't. 3797 */ 3798 static void 3799 vn_cache_strategy_callback(struct bio *bio) 3800 { 3801 bio->bio_offset = NOOFFSET; 3802 biodone(pop_bio(bio)); 3803 } 3804 3805 /* 3806 * bpdone: 3807 * 3808 * Finish I/O on a buffer after all BIOs have been processed. 3809 * Called when the bio chain is exhausted or by biowait. If called 3810 * by biowait, elseit is typically 0. 3811 * 3812 * bpdone is also responsible for setting B_CACHE in a B_VMIO bp. 3813 * In a non-VMIO bp, B_CACHE will be set on the next getblk() 3814 * assuming B_INVAL is clear. 3815 * 3816 * For the VMIO case, we set B_CACHE if the op was a read and no 3817 * read error occured, or if the op was a write. B_CACHE is never 3818 * set if the buffer is invalid or otherwise uncacheable. 3819 * 3820 * bpdone does not mess with B_INVAL, allowing the I/O routine or the 3821 * initiator to leave B_INVAL set to brelse the buffer out of existance 3822 * in the biodone routine. 3823 */ 3824 void 3825 bpdone(struct buf *bp, int elseit) 3826 { 3827 buf_cmd_t cmd; 3828 3829 KASSERT(BUF_REFCNTNB(bp) > 0, 3830 ("biodone: bp %p not busy %d", bp, BUF_REFCNTNB(bp))); 3831 KASSERT(bp->b_cmd != BUF_CMD_DONE, 3832 ("biodone: bp %p already done!", bp)); 3833 3834 /* 3835 * No more BIOs are left. All completion functions have been dealt 3836 * with, now we clean up the buffer. 3837 */ 3838 cmd = bp->b_cmd; 3839 bp->b_cmd = BUF_CMD_DONE; 3840 3841 /* 3842 * Only reads and writes are processed past this point. 3843 */ 3844 if (cmd != BUF_CMD_READ && cmd != BUF_CMD_WRITE) { 3845 if (cmd == BUF_CMD_FREEBLKS) 3846 bp->b_flags |= B_NOCACHE; 3847 if (elseit) 3848 brelse(bp); 3849 return; 3850 } 3851 3852 /* 3853 * Warning: softupdates may re-dirty the buffer, and HAMMER can do 3854 * a lot worse. XXX - move this above the clearing of b_cmd 3855 */ 3856 if (LIST_FIRST(&bp->b_dep) != NULL) 3857 buf_complete(bp); /* MPSAFE */ 3858 3859 /* 3860 * A failed write must re-dirty the buffer unless B_INVAL 3861 * was set. Only applicable to normal buffers (with VPs). 3862 * vinum buffers may not have a vp. 
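 *
 * Clearing B_NOCACHE before the bdirty() below keeps the re-dirtied
 * buffer from being discarded when it is released, so the data
 * survives until the write can be retried.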
3863 */ 3864 if (cmd == BUF_CMD_WRITE && 3865 (bp->b_flags & (B_ERROR | B_INVAL)) == B_ERROR) { 3866 bp->b_flags &= ~B_NOCACHE; 3867 if (bp->b_vp) 3868 bdirty(bp); 3869 } 3870 3871 if (bp->b_flags & B_VMIO) { 3872 int i; 3873 vm_ooffset_t foff; 3874 vm_page_t m; 3875 vm_object_t obj; 3876 int iosize; 3877 struct vnode *vp = bp->b_vp; 3878 3879 obj = vp->v_object; 3880 3881 #if defined(VFS_BIO_DEBUG) 3882 if (vp->v_auxrefs == 0) 3883 panic("biodone: zero vnode hold count"); 3884 if ((vp->v_flag & VOBJBUF) == 0) 3885 panic("biodone: vnode is not setup for merged cache"); 3886 #endif 3887 3888 foff = bp->b_loffset; 3889 KASSERT(foff != NOOFFSET, ("biodone: no buffer offset")); 3890 KASSERT(obj != NULL, ("biodone: missing VM object")); 3891 3892 #if defined(VFS_BIO_DEBUG) 3893 if (obj->paging_in_progress < bp->b_xio.xio_npages) { 3894 kprintf("biodone: paging in progress(%d) < bp->b_xio.xio_npages(%d)\n", 3895 obj->paging_in_progress, bp->b_xio.xio_npages); 3896 } 3897 #endif 3898 3899 /* 3900 * Set B_CACHE if the op was a normal read and no error 3901 * occured. B_CACHE is set for writes in the b*write() 3902 * routines. 3903 */ 3904 iosize = bp->b_bcount - bp->b_resid; 3905 if (cmd == BUF_CMD_READ && 3906 (bp->b_flags & (B_INVAL|B_NOCACHE|B_ERROR)) == 0) { 3907 bp->b_flags |= B_CACHE; 3908 } 3909 3910 lwkt_gettoken(&vm_token); 3911 for (i = 0; i < bp->b_xio.xio_npages; i++) { 3912 int bogusflag = 0; 3913 int resid; 3914 3915 resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; 3916 if (resid > iosize) 3917 resid = iosize; 3918 3919 /* 3920 * cleanup bogus pages, restoring the originals. Since 3921 * the originals should still be wired, we don't have 3922 * to worry about interrupt/freeing races destroying 3923 * the VM object association. 3924 */ 3925 m = bp->b_xio.xio_pages[i]; 3926 if (m == bogus_page) { 3927 bogusflag = 1; 3928 m = vm_page_lookup(obj, OFF_TO_IDX(foff)); 3929 if (m == NULL) 3930 panic("biodone: page disappeared"); 3931 bp->b_xio.xio_pages[i] = m; 3932 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 3933 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 3934 } 3935 #if defined(VFS_BIO_DEBUG) 3936 if (OFF_TO_IDX(foff) != m->pindex) { 3937 kprintf("biodone: foff(%lu)/m->pindex(%ld) " 3938 "mismatch\n", 3939 (unsigned long)foff, (long)m->pindex); 3940 } 3941 #endif 3942 3943 /* 3944 * In the write case, the valid and clean bits are 3945 * already changed correctly (see bdwrite()), so we 3946 * only need to do this here in the read case. 3947 */ 3948 if (cmd == BUF_CMD_READ && !bogusflag && resid > 0) { 3949 vfs_clean_one_page(bp, i, m); 3950 } 3951 vm_page_flag_clear(m, PG_ZERO); 3952 3953 /* 3954 * when debugging new filesystems or buffer I/O 3955 * methods, this is the most common error that pops 3956 * up. if you see this, you have not set the page 3957 * busy flag correctly!!! 
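			 *
			 * For reference, the soft-busy being checked here normally
			 * comes from vfs_busy_pages(), which does roughly the
			 * following for each page (when B_CLUSTER is not set)
			 * before the I/O is issued.  This is an illustrative
			 * sketch only; see vfs_busy_pages() further below for
			 * the real code:
			 *
			 *	vm_object_pip_add(obj, 1);
			 *	vm_page_io_start(m);
			 *
			 * bpdone() undoes both via the vm_page_io_finish() and
			 * vm_object_pip_subtract() calls below once the I/O has
			 * completed.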
3958 */ 3959 if (m->busy == 0) { 3960 kprintf("biodone: page busy < 0, " 3961 "pindex: %d, foff: 0x(%x,%x), " 3962 "resid: %d, index: %d\n", 3963 (int) m->pindex, (int)(foff >> 32), 3964 (int) foff & 0xffffffff, resid, i); 3965 if (!vn_isdisk(vp, NULL)) 3966 kprintf(" iosize: %ld, loffset: %lld, " 3967 "flags: 0x%08x, npages: %d\n", 3968 bp->b_vp->v_mount->mnt_stat.f_iosize, 3969 (long long)bp->b_loffset, 3970 bp->b_flags, bp->b_xio.xio_npages); 3971 else 3972 kprintf(" VDEV, loffset: %lld, flags: 0x%08x, npages: %d\n", 3973 (long long)bp->b_loffset, 3974 bp->b_flags, bp->b_xio.xio_npages); 3975 kprintf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", 3976 m->valid, m->dirty, m->wire_count); 3977 panic("biodone: page busy < 0"); 3978 } 3979 vm_page_io_finish(m); 3980 vm_object_pip_subtract(obj, 1); 3981 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3982 iosize -= resid; 3983 } 3984 bp->b_flags &= ~B_HASBOGUS; 3985 if (obj) 3986 vm_object_pip_wakeupn(obj, 0); 3987 lwkt_reltoken(&vm_token); 3988 } 3989 3990 /* 3991 * Finish up by releasing the buffer. There are no more synchronous 3992 * or asynchronous completions, those were handled by bio_done 3993 * callbacks. 3994 */ 3995 if (elseit) { 3996 if (bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR|B_RELBUF)) 3997 brelse(bp); 3998 else 3999 bqrelse(bp); 4000 } 4001 } 4002 4003 /* 4004 * Normal biodone. 4005 */ 4006 void 4007 biodone(struct bio *bio) 4008 { 4009 struct buf *bp = bio->bio_buf; 4010 4011 runningbufwakeup(bp); 4012 4013 /* 4014 * Run up the chain of BIO's. Leave b_cmd intact for the duration. 4015 */ 4016 while (bio) { 4017 biodone_t *done_func; 4018 struct bio_track *track; 4019 4020 /* 4021 * BIO tracking. Most but not all BIOs are tracked. 4022 */ 4023 if ((track = bio->bio_track) != NULL) { 4024 bio_track_rel(track); 4025 bio->bio_track = NULL; 4026 } 4027 4028 /* 4029 * A bio_done function terminates the loop. The function 4030 * will be responsible for any further chaining and/or 4031 * buffer management. 4032 * 4033 * WARNING! The done function can deallocate the buffer! 4034 */ 4035 if ((done_func = bio->bio_done) != NULL) { 4036 bio->bio_done = NULL; 4037 done_func(bio); 4038 return; 4039 } 4040 bio = bio->bio_prev; 4041 } 4042 4043 /* 4044 * If we've run out of bio's do normal [a]synchronous completion. 4045 */ 4046 bpdone(bp, 1); 4047 } 4048 4049 /* 4050 * Synchronous biodone - this terminates a synchronous BIO. 4051 * 4052 * bpdone() is called with elseit=FALSE, leaving the buffer completed 4053 * but still locked. The caller must brelse() the buffer after waiting 4054 * for completion. 4055 */ 4056 void 4057 biodone_sync(struct bio *bio) 4058 { 4059 struct buf *bp = bio->bio_buf; 4060 int flags; 4061 int nflags; 4062 4063 KKASSERT(bio == &bp->b_bio1); 4064 bpdone(bp, 0); 4065 4066 for (;;) { 4067 flags = bio->bio_flags; 4068 nflags = (flags | BIO_DONE) & ~BIO_WANT; 4069 4070 if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) { 4071 if (flags & BIO_WANT) 4072 wakeup(bio); 4073 break; 4074 } 4075 } 4076 } 4077 4078 /* 4079 * vfs_unbusy_pages: 4080 * 4081 * This routine is called in lieu of iodone in the case of 4082 * incomplete I/O. This keeps the busy status for pages 4083 * consistant. 
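 *
 *	In other words, a caller that has already called vfs_busy_pages()
 *	but then finds it cannot issue the I/O undoes the page busying with
 *	this routine instead of going through biodone()/bpdone().  A
 *	minimal, hypothetical sketch (the failure condition is made up and
 *	disposing of the buffer afterwards remains the caller's job):
 *
 *		vfs_busy_pages(vp, bp);
 *		if (cannot_start_io) {
 *			vfs_unbusy_pages(bp);
 *			... caller then releases bp, e.g. via brelse() ...
 *		}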
4084 */ 4085 void 4086 vfs_unbusy_pages(struct buf *bp) 4087 { 4088 int i; 4089 4090 runningbufwakeup(bp); 4091 4092 lwkt_gettoken(&vm_token); 4093 if (bp->b_flags & B_VMIO) { 4094 struct vnode *vp = bp->b_vp; 4095 vm_object_t obj; 4096 4097 obj = vp->v_object; 4098 4099 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4100 vm_page_t m = bp->b_xio.xio_pages[i]; 4101 4102 /* 4103 * When restoring bogus changes the original pages 4104 * should still be wired, so we are in no danger of 4105 * losing the object association and do not need 4106 * critical section protection particularly. 4107 */ 4108 if (m == bogus_page) { 4109 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_loffset) + i); 4110 if (!m) { 4111 panic("vfs_unbusy_pages: page missing"); 4112 } 4113 bp->b_xio.xio_pages[i] = m; 4114 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 4115 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 4116 } 4117 vm_object_pip_subtract(obj, 1); 4118 vm_page_flag_clear(m, PG_ZERO); 4119 vm_page_io_finish(m); 4120 } 4121 bp->b_flags &= ~B_HASBOGUS; 4122 vm_object_pip_wakeupn(obj, 0); 4123 } 4124 lwkt_reltoken(&vm_token); 4125 } 4126 4127 /* 4128 * vfs_busy_pages: 4129 * 4130 * This routine is called before a device strategy routine. 4131 * It is used to tell the VM system that paging I/O is in 4132 * progress, and treat the pages associated with the buffer 4133 * almost as being PG_BUSY. Also the object 'paging_in_progress' 4134 * flag is handled to make sure that the object doesn't become 4135 * inconsistant. 4136 * 4137 * Since I/O has not been initiated yet, certain buffer flags 4138 * such as B_ERROR or B_INVAL may be in an inconsistant state 4139 * and should be ignored. 4140 * 4141 * MPSAFE 4142 */ 4143 void 4144 vfs_busy_pages(struct vnode *vp, struct buf *bp) 4145 { 4146 int i, bogus; 4147 struct lwp *lp = curthread->td_lwp; 4148 4149 /* 4150 * The buffer's I/O command must already be set. If reading, 4151 * B_CACHE must be 0 (double check against callers only doing 4152 * I/O when B_CACHE is 0). 4153 */ 4154 KKASSERT(bp->b_cmd != BUF_CMD_DONE); 4155 KKASSERT(bp->b_cmd == BUF_CMD_WRITE || (bp->b_flags & B_CACHE) == 0); 4156 4157 if (bp->b_flags & B_VMIO) { 4158 vm_object_t obj; 4159 4160 lwkt_gettoken(&vm_token); 4161 4162 obj = vp->v_object; 4163 KASSERT(bp->b_loffset != NOOFFSET, 4164 ("vfs_busy_pages: no buffer offset")); 4165 4166 /* 4167 * Loop until none of the pages are busy. 4168 */ 4169 retry: 4170 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4171 vm_page_t m = bp->b_xio.xio_pages[i]; 4172 4173 if (vm_page_sleep_busy(m, FALSE, "vbpage")) 4174 goto retry; 4175 } 4176 4177 /* 4178 * Setup for I/O, soft-busy the page right now because 4179 * the next loop may block. 4180 */ 4181 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4182 vm_page_t m = bp->b_xio.xio_pages[i]; 4183 4184 vm_page_flag_clear(m, PG_ZERO); 4185 if ((bp->b_flags & B_CLUSTER) == 0) { 4186 vm_object_pip_add(obj, 1); 4187 vm_page_io_start(m); 4188 } 4189 } 4190 4191 /* 4192 * Adjust protections for I/O and do bogus-page mapping. 4193 * Assume that vm_page_protect() can block (it can block 4194 * if VM_PROT_NONE, don't take any chances regardless). 4195 * 4196 * In particular note that for writes we must incorporate 4197 * page dirtyness from the VM system into the buffer's 4198 * dirty range. 
4199 * 4200 * For reads we theoretically must incorporate page dirtyness 4201 * from the VM system to determine if the page needs bogus 4202 * replacement, but we shortcut the test by simply checking 4203 * that all m->valid bits are set, indicating that the page 4204 * is fully valid and does not need to be re-read. For any 4205 * VM system dirtyness the page will also be fully valid 4206 * since it was mapped at one point. 4207 */ 4208 bogus = 0; 4209 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4210 vm_page_t m = bp->b_xio.xio_pages[i]; 4211 4212 vm_page_flag_clear(m, PG_ZERO); /* XXX */ 4213 if (bp->b_cmd == BUF_CMD_WRITE) { 4214 /* 4215 * When readying a vnode-backed buffer for 4216 * a write we must zero-fill any invalid 4217 * portions of the backing VM pages, mark 4218 * it valid and clear related dirty bits. 4219 * 4220 * vfs_clean_one_page() incorporates any 4221 * VM dirtyness and updates the b_dirtyoff 4222 * range (after we've made the page RO). 4223 * 4224 * It is also expected that the pmap modified 4225 * bit has already been cleared by the 4226 * vm_page_protect(). We may not be able 4227 * to clear all dirty bits for a page if it 4228 * was also memory mapped (NFS). 4229 * 4230 * Finally be sure to unassign any swap-cache 4231 * backing store as it is now stale. 4232 */ 4233 vm_page_protect(m, VM_PROT_READ); 4234 vfs_clean_one_page(bp, i, m); 4235 swap_pager_unswapped(m); 4236 } else if (m->valid == VM_PAGE_BITS_ALL) { 4237 /* 4238 * When readying a vnode-backed buffer for 4239 * read we must replace any dirty pages with 4240 * a bogus page so dirty data is not destroyed 4241 * when filling gaps. 4242 * 4243 * To avoid testing whether the page is 4244 * dirty we instead test that the page was 4245 * at some point mapped (m->valid fully 4246 * valid) with the understanding that 4247 * this also covers the dirty case. 4248 */ 4249 bp->b_xio.xio_pages[i] = bogus_page; 4250 bp->b_flags |= B_HASBOGUS; 4251 bogus++; 4252 } else if (m->valid & m->dirty) { 4253 /* 4254 * This case should not occur as partial 4255 * dirtyment can only happen if the buffer 4256 * is B_CACHE, and this code is not entered 4257 * if the buffer is B_CACHE. 4258 */ 4259 kprintf("Warning: vfs_busy_pages - page not " 4260 "fully valid! loff=%jx bpf=%08x " 4261 "idx=%d val=%02x dir=%02x\n", 4262 (intmax_t)bp->b_loffset, bp->b_flags, 4263 i, m->valid, m->dirty); 4264 vm_page_protect(m, VM_PROT_NONE); 4265 } else { 4266 /* 4267 * The page is not valid and can be made 4268 * part of the read. 4269 */ 4270 vm_page_protect(m, VM_PROT_NONE); 4271 } 4272 } 4273 if (bogus) { 4274 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 4275 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 4276 } 4277 lwkt_reltoken(&vm_token); 4278 } 4279 4280 /* 4281 * This is the easiest place to put the process accounting for the I/O 4282 * for now. 4283 */ 4284 if (lp != NULL) { 4285 if (bp->b_cmd == BUF_CMD_READ) 4286 lp->lwp_ru.ru_inblock++; 4287 else 4288 lp->lwp_ru.ru_oublock++; 4289 } 4290 } 4291 4292 /* 4293 * Tell the VM system that the pages associated with this buffer 4294 * are clean. This is used for delayed writes where the data is 4295 * going to go to disk eventually without additional VM intevention. 4296 * 4297 * NOTE: While we only really need to clean through to b_bcount, we 4298 * just go ahead and clean through to b_bufsize. 
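 *
 * The typical consumer is the delayed-write path: the buffer is marked
 * B_DELWRI (the data reaches disk later via the buf_daemon) and the VM
 * system is told the pages are clean so it does not try to flush them
 * itself.  An illustrative sketch of that path, not the literal
 * bdwrite() code:
 *
 *	bp->b_flags |= B_DELWRI;
 *	vfs_clean_pages(bp);
 *	bqrelse(bp);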
 */
static void
vfs_clean_pages(struct buf *bp)
{
	vm_page_t m;
	int i;

	if ((bp->b_flags & B_VMIO) == 0)
		return;

	KASSERT(bp->b_loffset != NOOFFSET,
		("vfs_clean_pages: no buffer offset"));

	/*
	 * vm_token must be held for vfs_clean_one_page() calls.
	 */
	lwkt_gettoken(&vm_token);
	for (i = 0; i < bp->b_xio.xio_npages; i++) {
		m = bp->b_xio.xio_pages[i];
		vfs_clean_one_page(bp, i, m);
	}
	lwkt_reltoken(&vm_token);
}

/*
 * vfs_clean_one_page:
 *
 *	Set the valid bits and clear the dirty bits in a page within a
 *	buffer.  The range is restricted to the buffer's size and the
 *	buffer's logical offset might index into the first page.
 *
 *	The caller has busied or soft-busied the page and it is not mapped.
 *	Test and incorporate the dirty bits into b_dirtyoff/end before
 *	clearing them.  Note that we need to clear the pmap modified bits
 *	after determining that the page was dirty; vm_page_set_validclean()
 *	does not do it for us.
 *
 *	This routine is typically called after a read completes (dirty should
 *	be zero in that case as we are not called on bogus-replace pages),
 *	or before a write is initiated.
 *
 * NOTE: vm_token must be held by the caller, and vm_page_set_validclean()
 *	 currently assumes the vm_token is held.
 */
static void
vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m)
{
	int bcount;
	int xoff;
	int soff;
	int eoff;

	/*
	 * Calculate offset range within the page but relative to buffer's
	 * loffset.  loffset might be offset into the first page.
	 */
	xoff = (int)bp->b_loffset & PAGE_MASK; /* loffset offset into pg 0 */
	bcount = bp->b_bcount + xoff;		/* offset adjusted */

	if (pageno == 0) {
		soff = xoff;
		eoff = PAGE_SIZE;
	} else {
		soff = (pageno << PAGE_SHIFT);
		eoff = soff + PAGE_SIZE;
	}
	if (eoff > bcount)
		eoff = bcount;
	if (soff >= eoff)
		return;

	/*
	 * Test dirty bits and adjust b_dirtyoff/end.
	 *
	 * If dirty pages are incorporated into the bp any prior
	 * B_NEEDCOMMIT state (NFS) must be cleared because the
	 * caller has not taken into account the new dirty data.
	 *
	 * If the page was memory mapped the dirty bits might go beyond the
	 * end of the buffer, but we can't really make the assumption that
	 * a file EOF straddles the buffer (even though this is the case for
	 * NFS if B_NEEDCOMMIT is also set).  So for the purposes of clearing
	 * B_NEEDCOMMIT we only test the dirty bits covered by the buffer.
	 * This also saves some console spam.
	 *
	 * When clearing B_NEEDCOMMIT we must also clear B_CLUSTEROK;
	 * NFS can handle huge commits but not huge writes.
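	 *
	 * Worked example (illustrative, assuming PAGE_SIZE is 4096): a
	 * buffer with b_loffset 0x1800 (so xoff = 0x800) and b_bcount 8192
	 * whose second page (pageno 1) is dirty gives soff = 4096 and
	 * eoff = 8192, so the buffer-relative dirty range is extended to
	 * cover at least
	 *
	 *	b_dirtyoff = soff - xoff = 0x0800
	 *	b_dirtyend = eoff - xoff = 0x1800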
4386 */ 4387 vm_page_test_dirty(m); 4388 if (m->dirty) { 4389 if ((bp->b_flags & B_NEEDCOMMIT) && 4390 (m->dirty & vm_page_bits(soff & PAGE_MASK, eoff - soff))) { 4391 if (debug_commit) 4392 kprintf("Warning: vfs_clean_one_page: bp %p " 4393 "loff=%jx,%d flgs=%08x clr B_NEEDCOMMIT" 4394 " cmd %d vd %02x/%02x x/s/e %d %d %d " 4395 "doff/end %d %d\n", 4396 bp, (intmax_t)bp->b_loffset, bp->b_bcount, 4397 bp->b_flags, bp->b_cmd, 4398 m->valid, m->dirty, xoff, soff, eoff, 4399 bp->b_dirtyoff, bp->b_dirtyend); 4400 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 4401 if (debug_commit) 4402 print_backtrace(-1); 4403 } 4404 /* 4405 * Only clear the pmap modified bits if ALL the dirty bits 4406 * are set, otherwise the system might mis-clear portions 4407 * of a page. 4408 */ 4409 if (m->dirty == VM_PAGE_BITS_ALL && 4410 (bp->b_flags & B_NEEDCOMMIT) == 0) { 4411 pmap_clear_modify(m); 4412 } 4413 if (bp->b_dirtyoff > soff - xoff) 4414 bp->b_dirtyoff = soff - xoff; 4415 if (bp->b_dirtyend < eoff - xoff) 4416 bp->b_dirtyend = eoff - xoff; 4417 } 4418 4419 /* 4420 * Set related valid bits, clear related dirty bits. 4421 * Does not mess with the pmap modified bit. 4422 * 4423 * WARNING! We cannot just clear all of m->dirty here as the 4424 * buffer cache buffers may use a DEV_BSIZE'd aligned 4425 * block size, or have an odd size (e.g. NFS at file EOF). 4426 * The putpages code can clear m->dirty to 0. 4427 * 4428 * If a VOP_WRITE generates a buffer cache buffer which 4429 * covers the same space as mapped writable pages the 4430 * buffer flush might not be able to clear all the dirty 4431 * bits and still require a putpages from the VM system 4432 * to finish it off. 4433 * 4434 * WARNING! vm_page_set_validclean() currently assumes vm_token 4435 * is held. The page might not be busied (bdwrite() case). 4436 */ 4437 vm_page_set_validclean(m, soff & PAGE_MASK, eoff - soff); 4438 } 4439 4440 /* 4441 * Similar to vfs_clean_one_page() but sets the bits to valid and dirty. 4442 * The page data is assumed to be valid (there is no zeroing here). 4443 */ 4444 static void 4445 vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m) 4446 { 4447 int bcount; 4448 int xoff; 4449 int soff; 4450 int eoff; 4451 4452 /* 4453 * Calculate offset range within the page but relative to buffer's 4454 * loffset. loffset might be offset into the first page. 4455 */ 4456 xoff = (int)bp->b_loffset & PAGE_MASK; /* loffset offset into pg 0 */ 4457 bcount = bp->b_bcount + xoff; /* offset adjusted */ 4458 4459 if (pageno == 0) { 4460 soff = xoff; 4461 eoff = PAGE_SIZE; 4462 } else { 4463 soff = (pageno << PAGE_SHIFT); 4464 eoff = soff + PAGE_SIZE; 4465 } 4466 if (eoff > bcount) 4467 eoff = bcount; 4468 if (soff >= eoff) 4469 return; 4470 vm_page_set_validdirty(m, soff & PAGE_MASK, eoff - soff); 4471 } 4472 4473 /* 4474 * vfs_bio_clrbuf: 4475 * 4476 * Clear a buffer. This routine essentially fakes an I/O, so we need 4477 * to clear B_ERROR and B_INVAL. 4478 * 4479 * Note that while we only theoretically need to clear through b_bcount, 4480 * we go ahead and clear through b_bufsize. 
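 *
 *	Example of the per-DEV_BSIZE valid mask computed below (assuming
 *	PAGE_SIZE is 4096 and DEV_BSIZE is 512): a page-aligned 2048-byte
 *	buffer covers chunks 0-3 of its single page, so
 *
 *		mask = (1 << (2048 / 512)) - 1 = 0x0f
 *
 *	and no zeroing is needed when (m->valid & mask) == mask.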
4481 */ 4482 4483 void 4484 vfs_bio_clrbuf(struct buf *bp) 4485 { 4486 int i, mask = 0; 4487 caddr_t sa, ea; 4488 if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { 4489 bp->b_flags &= ~(B_INVAL | B_EINTR | B_ERROR); 4490 if ((bp->b_xio.xio_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && 4491 (bp->b_loffset & PAGE_MASK) == 0) { 4492 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; 4493 if ((bp->b_xio.xio_pages[0]->valid & mask) == mask) { 4494 bp->b_resid = 0; 4495 return; 4496 } 4497 if (((bp->b_xio.xio_pages[0]->flags & PG_ZERO) == 0) && 4498 ((bp->b_xio.xio_pages[0]->valid & mask) == 0)) { 4499 bzero(bp->b_data, bp->b_bufsize); 4500 bp->b_xio.xio_pages[0]->valid |= mask; 4501 bp->b_resid = 0; 4502 return; 4503 } 4504 } 4505 sa = bp->b_data; 4506 for(i=0;i<bp->b_xio.xio_npages;i++,sa=ea) { 4507 int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; 4508 ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); 4509 ea = (caddr_t)(vm_offset_t)ulmin( 4510 (u_long)(vm_offset_t)ea, 4511 (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); 4512 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; 4513 if ((bp->b_xio.xio_pages[i]->valid & mask) == mask) 4514 continue; 4515 if ((bp->b_xio.xio_pages[i]->valid & mask) == 0) { 4516 if ((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) { 4517 bzero(sa, ea - sa); 4518 } 4519 } else { 4520 for (; sa < ea; sa += DEV_BSIZE, j++) { 4521 if (((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) && 4522 (bp->b_xio.xio_pages[i]->valid & (1<<j)) == 0) 4523 bzero(sa, DEV_BSIZE); 4524 } 4525 } 4526 bp->b_xio.xio_pages[i]->valid |= mask; 4527 vm_page_flag_clear(bp->b_xio.xio_pages[i], PG_ZERO); 4528 } 4529 bp->b_resid = 0; 4530 } else { 4531 clrbuf(bp); 4532 } 4533 } 4534 4535 /* 4536 * vm_hold_load_pages: 4537 * 4538 * Load pages into the buffer's address space. The pages are 4539 * allocated from the kernel object in order to reduce interference 4540 * with the any VM paging I/O activity. The range of loaded 4541 * pages will be wired. 4542 * 4543 * If a page cannot be allocated, the 'pagedaemon' is woken up to 4544 * retrieve the full range (to - from) of pages. 4545 * 4546 * MPSAFE 4547 */ 4548 void 4549 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 4550 { 4551 vm_offset_t pg; 4552 vm_page_t p; 4553 int index; 4554 4555 to = round_page(to); 4556 from = round_page(from); 4557 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4558 4559 pg = from; 4560 while (pg < to) { 4561 /* 4562 * Note: must allocate system pages since blocking here 4563 * could intefere with paging I/O, no matter which 4564 * process we are. 4565 */ 4566 p = bio_page_alloc(&kernel_object, pg >> PAGE_SHIFT, 4567 (vm_pindex_t)((to - pg) >> PAGE_SHIFT)); 4568 if (p) { 4569 vm_page_wire(p); 4570 p->valid = VM_PAGE_BITS_ALL; 4571 vm_page_flag_clear(p, PG_ZERO); 4572 pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); 4573 bp->b_xio.xio_pages[index] = p; 4574 vm_page_wakeup(p); 4575 4576 pg += PAGE_SIZE; 4577 ++index; 4578 } 4579 } 4580 bp->b_xio.xio_npages = index; 4581 } 4582 4583 /* 4584 * Allocate pages for a buffer cache buffer. 4585 * 4586 * Under extremely severe memory conditions even allocating out of the 4587 * system reserve can fail. If this occurs we must allocate out of the 4588 * interrupt reserve to avoid a deadlock with the pageout daemon. 4589 * 4590 * The pageout daemon can run (putpages -> VOP_WRITE -> getblk -> allocbuf). 
4591 * If the buffer cache's vm_page_alloc() fails a vm_wait() can deadlock 4592 * against the pageout daemon if pages are not freed from other sources. 4593 * 4594 * MPSAFE 4595 */ 4596 static 4597 vm_page_t 4598 bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit) 4599 { 4600 vm_page_t p; 4601 4602 /* 4603 * Try a normal allocation, allow use of system reserve. 4604 */ 4605 lwkt_gettoken(&vm_token); 4606 p = vm_page_alloc(obj, pg, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM); 4607 if (p) { 4608 lwkt_reltoken(&vm_token); 4609 return(p); 4610 } 4611 4612 /* 4613 * The normal allocation failed and we clearly have a page 4614 * deficit. Try to reclaim some clean VM pages directly 4615 * from the buffer cache. 4616 */ 4617 vm_pageout_deficit += deficit; 4618 recoverbufpages(); 4619 4620 /* 4621 * We may have blocked, the caller will know what to do if the 4622 * page now exists. 4623 */ 4624 if (vm_page_lookup(obj, pg)) { 4625 lwkt_reltoken(&vm_token); 4626 return(NULL); 4627 } 4628 4629 /* 4630 * Allocate and allow use of the interrupt reserve. 4631 * 4632 * If after all that we still can't allocate a VM page we are 4633 * in real trouble, but we slog on anyway hoping that the system 4634 * won't deadlock. 4635 */ 4636 p = vm_page_alloc(obj, pg, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | 4637 VM_ALLOC_INTERRUPT); 4638 if (p) { 4639 if (vm_page_count_severe()) { 4640 ++lowmempgallocs; 4641 vm_wait(hz / 20 + 1); 4642 } 4643 } else { 4644 kprintf("bio_page_alloc: Memory exhausted during bufcache " 4645 "page allocation\n"); 4646 ++lowmempgfails; 4647 vm_wait(hz); 4648 } 4649 lwkt_reltoken(&vm_token); 4650 return(p); 4651 } 4652 4653 /* 4654 * vm_hold_free_pages: 4655 * 4656 * Return pages associated with the buffer back to the VM system. 4657 * 4658 * The range of pages underlying the buffer's address space will 4659 * be unmapped and un-wired. 4660 * 4661 * MPSAFE 4662 */ 4663 void 4664 vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 4665 { 4666 vm_offset_t pg; 4667 vm_page_t p; 4668 int index, newnpages; 4669 4670 from = round_page(from); 4671 to = round_page(to); 4672 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4673 newnpages = index; 4674 4675 lwkt_gettoken(&vm_token); 4676 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 4677 p = bp->b_xio.xio_pages[index]; 4678 if (p && (index < bp->b_xio.xio_npages)) { 4679 if (p->busy) { 4680 kprintf("vm_hold_free_pages: doffset: %lld, " 4681 "loffset: %lld\n", 4682 (long long)bp->b_bio2.bio_offset, 4683 (long long)bp->b_loffset); 4684 } 4685 bp->b_xio.xio_pages[index] = NULL; 4686 pmap_kremove(pg); 4687 vm_page_busy(p); 4688 vm_page_unwire(p, 0); 4689 vm_page_free(p); 4690 } 4691 } 4692 bp->b_xio.xio_npages = newnpages; 4693 lwkt_reltoken(&vm_token); 4694 } 4695 4696 /* 4697 * vmapbuf: 4698 * 4699 * Map a user buffer into KVM via a pbuf. On return the buffer's 4700 * b_data, b_bufsize, and b_bcount will be set, and its XIO page array 4701 * initialized. 4702 */ 4703 int 4704 vmapbuf(struct buf *bp, caddr_t udata, int bytes) 4705 { 4706 caddr_t addr; 4707 vm_offset_t va; 4708 vm_page_t m; 4709 int vmprot; 4710 int error; 4711 int pidx; 4712 int i; 4713 4714 /* 4715 * bp had better have a command and it better be a pbuf. 4716 */ 4717 KKASSERT(bp->b_cmd != BUF_CMD_DONE); 4718 KKASSERT(bp->b_flags & B_PAGING); 4719 KKASSERT(bp->b_kvabase); 4720 4721 if (bytes < 0) 4722 return (-1); 4723 4724 /* 4725 * Map the user data into KVM. Mappings have to be page-aligned. 
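	 *
	 * For example (assuming PAGE_SIZE is 4096), udata == 0x803234 with
	 * bytes == 8192 truncates to addr == 0x803000; the loop below then
	 * faults in and records the three pages at 0x803000, 0x804000 and
	 * 0x805000, and b_data ends up pointing at b_kvabase + 0x234.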
4726 */ 4727 addr = (caddr_t)trunc_page((vm_offset_t)udata); 4728 pidx = 0; 4729 4730 vmprot = VM_PROT_READ; 4731 if (bp->b_cmd == BUF_CMD_READ) 4732 vmprot |= VM_PROT_WRITE; 4733 4734 while (addr < udata + bytes) { 4735 /* 4736 * Do the vm_fault if needed; do the copy-on-write thing 4737 * when reading stuff off device into memory. 4738 * 4739 * vm_fault_page*() returns a held VM page. 4740 */ 4741 va = (addr >= udata) ? (vm_offset_t)addr : (vm_offset_t)udata; 4742 va = trunc_page(va); 4743 4744 m = vm_fault_page_quick(va, vmprot, &error); 4745 if (m == NULL) { 4746 for (i = 0; i < pidx; ++i) { 4747 vm_page_unhold(bp->b_xio.xio_pages[i]); 4748 bp->b_xio.xio_pages[i] = NULL; 4749 } 4750 return(-1); 4751 } 4752 bp->b_xio.xio_pages[pidx] = m; 4753 addr += PAGE_SIZE; 4754 ++pidx; 4755 } 4756 4757 /* 4758 * Map the page array and set the buffer fields to point to 4759 * the mapped data buffer. 4760 */ 4761 if (pidx > btoc(MAXPHYS)) 4762 panic("vmapbuf: mapped more than MAXPHYS"); 4763 pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_xio.xio_pages, pidx); 4764 4765 bp->b_xio.xio_npages = pidx; 4766 bp->b_data = bp->b_kvabase + ((int)(intptr_t)udata & PAGE_MASK); 4767 bp->b_bcount = bytes; 4768 bp->b_bufsize = bytes; 4769 return(0); 4770 } 4771 4772 /* 4773 * vunmapbuf: 4774 * 4775 * Free the io map PTEs associated with this IO operation. 4776 * We also invalidate the TLB entries and restore the original b_addr. 4777 */ 4778 void 4779 vunmapbuf(struct buf *bp) 4780 { 4781 int pidx; 4782 int npages; 4783 4784 KKASSERT(bp->b_flags & B_PAGING); 4785 4786 npages = bp->b_xio.xio_npages; 4787 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); 4788 for (pidx = 0; pidx < npages; ++pidx) { 4789 vm_page_unhold(bp->b_xio.xio_pages[pidx]); 4790 bp->b_xio.xio_pages[pidx] = NULL; 4791 } 4792 bp->b_xio.xio_npages = 0; 4793 bp->b_data = bp->b_kvabase; 4794 } 4795 4796 /* 4797 * Scan all buffers in the system and issue the callback. 4798 */ 4799 int 4800 scan_all_buffers(int (*callback)(struct buf *, void *), void *info) 4801 { 4802 int count = 0; 4803 int error; 4804 int n; 4805 4806 for (n = 0; n < nbuf; ++n) { 4807 if ((error = callback(&buf[n], info)) < 0) { 4808 count = error; 4809 break; 4810 } 4811 count += error; 4812 } 4813 return (count); 4814 } 4815 4816 /* 4817 * nestiobuf_iodone: biodone callback for nested buffers and propagate 4818 * completion to the master buffer. 4819 */ 4820 static void 4821 nestiobuf_iodone(struct bio *bio) 4822 { 4823 struct bio *mbio; 4824 struct buf *mbp, *bp; 4825 struct devstat *stats; 4826 int error; 4827 int donebytes; 4828 4829 bp = bio->bio_buf; 4830 mbio = bio->bio_caller_info1.ptr; 4831 stats = bio->bio_caller_info2.ptr; 4832 mbp = mbio->bio_buf; 4833 4834 KKASSERT(bp->b_bcount <= bp->b_bufsize); 4835 KKASSERT(mbp != bp); 4836 4837 error = bp->b_error; 4838 if (bp->b_error == 0 && 4839 (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) { 4840 /* 4841 * Not all got transfered, raise an error. We have no way to 4842 * propagate these conditions to mbp. 4843 */ 4844 error = EIO; 4845 } 4846 4847 donebytes = bp->b_bufsize; 4848 4849 relpbuf(bp, NULL); 4850 4851 nestiobuf_done(mbio, donebytes, error, stats); 4852 } 4853 4854 void 4855 nestiobuf_done(struct bio *mbio, int donebytes, int error, struct devstat *stats) 4856 { 4857 struct buf *mbp; 4858 4859 mbp = mbio->bio_buf; 4860 4861 KKASSERT((int)(intptr_t)mbio->bio_driver_info > 0); 4862 4863 /* 4864 * If an error occured, propagate it to the master buffer. 
	 *
	 * Several biodone()s may wind up running concurrently so
	 * use an atomic op to adjust b_flags.
	 */
	if (error) {
		mbp->b_error = error;
		atomic_set_int(&mbp->b_flags, B_ERROR);
	}

	/*
	 * Decrement the operations in progress counter and terminate the
	 * I/O if this was the last bit.
	 */
	if (atomic_fetchadd_int((int *)&mbio->bio_driver_info, -1) == 1) {
		mbp->b_resid = 0;
		if (stats)
			devstat_end_transaction_buf(stats, mbp);
		biodone(mbio);
	}
}

/*
 * Initialize a nestiobuf for use.  Set an initial count of 1 to prevent
 * the mbio from being biodone()'d while we are still adding sub-bios to
 * it.
 */
void
nestiobuf_init(struct bio *bio)
{
	bio->bio_driver_info = (void *)1;
}

/*
 * The BIOs added to the nested I/O have already been started.  Remove
 * the count that was holding the mbio's place and biodone() the mbio
 * if the count would transition to 0.
 */
void
nestiobuf_start(struct bio *mbio)
{
	struct buf *mbp = mbio->bio_buf;

	/*
	 * Decrement the operations in progress counter and terminate the
	 * I/O if this was the last bit.
	 */
	if (atomic_fetchadd_int((int *)&mbio->bio_driver_info, -1) == 1) {
		if (mbp->b_flags & B_ERROR)
			mbp->b_resid = mbp->b_bcount;
		else
			mbp->b_resid = 0;
		biodone(mbio);
	}
}

/*
 * Set an intermediate error prior to calling nestiobuf_start().
 */
void
nestiobuf_error(struct bio *mbio, int error)
{
	struct buf *mbp = mbio->bio_buf;

	if (error) {
		mbp->b_error = error;
		atomic_set_int(&mbp->b_flags, B_ERROR);
	}
}

/*
 * nestiobuf_add: setup a "nested" buffer.
 *
 * => 'mbp' is a "master" buffer which is being divided into sub pieces.
 * => 'bp' should be a buffer allocated by getiobuf.
 * => 'offset' is a byte offset in the master buffer.
 * => 'size' is a size in bytes of this nested buffer.
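 *
 * A minimal usage sketch (hypothetical caller: the chunking, pbuf
 * source, target vnode 'devvp' and translated device offset 'doffset'
 * are made up for illustration, and b_bcount is assumed to be a
 * multiple of 'chunk'):
 *
 *	nestiobuf_init(mbio);
 *	for (off = 0; off < mbp->b_bcount; off += chunk) {
 *		bp = getpbuf(NULL);
 *		nestiobuf_add(mbio, bp, off, chunk, NULL);
 *		bp->b_bio1.bio_offset = doffset + off;
 *		vn_strategy(devvp, &bp->b_bio1);
 *	}
 *	nestiobuf_start(mbio);
 *
 * nestiobuf_iodone() runs for each sub-buffer; the master bio is
 * biodone()'d once the last sub-buffer and the placeholder count from
 * nestiobuf_init() have both been accounted for.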
4941 */ 4942 void 4943 nestiobuf_add(struct bio *mbio, struct buf *bp, int offset, size_t size, struct devstat *stats) 4944 { 4945 struct buf *mbp = mbio->bio_buf; 4946 struct vnode *vp = mbp->b_vp; 4947 4948 KKASSERT(mbp->b_bcount >= offset + size); 4949 4950 atomic_add_int((int *)&mbio->bio_driver_info, 1); 4951 4952 /* kernel needs to own the lock for it to be released in biodone */ 4953 BUF_KERNPROC(bp); 4954 bp->b_vp = vp; 4955 bp->b_cmd = mbp->b_cmd; 4956 bp->b_bio1.bio_done = nestiobuf_iodone; 4957 bp->b_data = (char *)mbp->b_data + offset; 4958 bp->b_resid = bp->b_bcount = size; 4959 bp->b_bufsize = bp->b_bcount; 4960 4961 bp->b_bio1.bio_track = NULL; 4962 bp->b_bio1.bio_caller_info1.ptr = mbio; 4963 bp->b_bio1.bio_caller_info2.ptr = stats; 4964 } 4965 4966 /* 4967 * print out statistics from the current status of the buffer pool 4968 * this can be toggeled by the system control option debug.syncprt 4969 */ 4970 #ifdef DEBUG 4971 void 4972 vfs_bufstats(void) 4973 { 4974 int i, j, count; 4975 struct buf *bp; 4976 struct bqueues *dp; 4977 int counts[(MAXBSIZE / PAGE_SIZE) + 1]; 4978 static char *bname[3] = { "LOCKED", "LRU", "AGE" }; 4979 4980 for (dp = bufqueues, i = 0; dp < &bufqueues[3]; dp++, i++) { 4981 count = 0; 4982 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++) 4983 counts[j] = 0; 4984 4985 spin_lock(&bufqspin); 4986 TAILQ_FOREACH(bp, dp, b_freelist) { 4987 counts[bp->b_bufsize/PAGE_SIZE]++; 4988 count++; 4989 } 4990 spin_unlock(&bufqspin); 4991 4992 kprintf("%s: total-%d", bname[i], count); 4993 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++) 4994 if (counts[j] != 0) 4995 kprintf(", %d-%d", j * PAGE_SIZE, counts[j]); 4996 kprintf("\n"); 4997 } 4998 } 4999 #endif 5000 5001 #ifdef DDB 5002 5003 DB_SHOW_COMMAND(buffer, db_show_buffer) 5004 { 5005 /* get args */ 5006 struct buf *bp = (struct buf *)addr; 5007 5008 if (!have_addr) { 5009 db_printf("usage: show buffer <addr>\n"); 5010 return; 5011 } 5012 5013 db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); 5014 db_printf("b_cmd = %d\n", bp->b_cmd); 5015 db_printf("b_error = %d, b_bufsize = %d, b_bcount = %d, " 5016 "b_resid = %d\n, b_data = %p, " 5017 "bio_offset(disk) = %lld, bio_offset(phys) = %lld\n", 5018 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, 5019 bp->b_data, 5020 (long long)bp->b_bio2.bio_offset, 5021 (long long)(bp->b_bio2.bio_next ? 5022 bp->b_bio2.bio_next->bio_offset : (off_t)-1)); 5023 if (bp->b_xio.xio_npages) { 5024 int i; 5025 db_printf("b_xio.xio_npages = %d, pages(OBJ, IDX, PA): ", 5026 bp->b_xio.xio_npages); 5027 for (i = 0; i < bp->b_xio.xio_npages; i++) { 5028 vm_page_t m; 5029 m = bp->b_xio.xio_pages[i]; 5030 db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, 5031 (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); 5032 if ((i + 1) < bp->b_xio.xio_npages) 5033 db_printf(","); 5034 } 5035 db_printf("\n"); 5036 } 5037 } 5038 #endif /* DDB */ 5039
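
/*
 * Example use of scan_all_buffers() (illustrative only; the callback
 * name is made up).  Non-negative callback returns are summed by
 * scan_all_buffers(), while a negative return aborts the scan and is
 * passed back as-is:
 *
 *	static int
 *	count_delwri(struct buf *bp, void *info __unused)
 *	{
 *		return ((bp->b_flags & B_DELWRI) ? 1 : 0);
 *	}
 *
 *	...
 *	ndelwri = scan_all_buffers(count_delwri, NULL);
 */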