/*-
 * Copyright (c) 1982, 1986, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vmmeter.h	8.2 (Berkeley) 7/10/94
 * $FreeBSD: src/sys/sys/vmmeter.h,v 1.21.2.2 2002/10/10 19:28:21 dillon Exp $
 */

#ifndef _VM_VM_PAGE2_H_
#define _VM_VM_PAGE2_H_

#ifdef _KERNEL

#ifndef _SYS_VMMETER_H_
#include <sys/vmmeter.h>
#endif
#ifndef _SYS_QUEUE_H_
#include <sys/queue.h>
#endif
#ifndef _VM_VM_PAGE_H_
#include <vm/vm_page.h>
#endif
#ifndef _SYS_SPINLOCK_H_
#include <sys/spinlock.h>
#endif
#ifndef _SYS_SPINLOCK2_H_
#include <sys/spinlock2.h>
#endif

/*
 * SMP NOTE
 *
 * VM fault rates are highly dependent on SMP locking conflicts and, on
 * multi-socket systems, cache mastership changes for globals due to atomic
 * ops (even simple atomic_add_*() calls).  Cache mastership changes can
 * limit the aggregate fault rate.
 *
 * For this reason we go through some hoops to access VM statistics for
 * low-memory handling, pageout, and other triggers.  Each cpu collects
 * adjustments in gd->gd_vmstats_adj.  These get rolled up into the global
 * vmstats structure.  The global vmstats structure is then pulled into
 * gd->gd_vmstats by each cpu when it needs it.  Critical path checks always
 * use the pcpu gd->gd_vmstats structure.
 */
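
/*
 * Illustrative sketch (not part of this header): critical-path code reads
 * the pcpu gd->gd_vmstats snapshot rather than the global vmstats, e.g.
 *
 *	globaldata_t gd = mycpu;
 *
 *	if (gd->gd_vmstats.v_free_reserved > gd->gd_vmstats.v_free_count)
 *		(take the low-memory slow path)
 *
 * This is the pattern every vm_paging_*() inline below follows; it avoids
 * cache-line ping-pong on the global structure on the hot path.
 */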

/*
 * Return TRUE if we are under our severe low-free-pages threshold.
 *
 * This causes user processes to stall to avoid exhausting memory that
 * the kernel might need.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_severe(void)
{
	globaldata_t gd = mycpu;

	if (__predict_false(gd->gd_vmstats.v_free_severe >
			    gd->gd_vmstats.v_free_count +
			    gd->gd_vmstats.v_cache_count))
	{
		return 1;
	}
	if (__predict_false(gd->gd_vmstats.v_free_reserved >
			    gd->gd_vmstats.v_free_count))
	{
		return 1;
	}
	return 0;
}

/*
 * Return TRUE if we are under our minimum low-free-pages threshold.  We
 * will not count (donotcount) free pages as being free (used mainly for
 * hysteresis tests).
 *
 * This will cause most normal page faults to block and activate the
 * pageout daemon.
 *
 * The pageout daemon should already be active due to vm_paging_start(n)
 * and will typically continue running until it hits target2.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_min_dnc(long donotcount)
{
	globaldata_t gd = mycpu;

	if (__predict_false(gd->gd_vmstats.v_free_min + donotcount >
			    (gd->gd_vmstats.v_free_count +
			     gd->gd_vmstats.v_cache_count)))
	{
		return 1;
	}
	if (__predict_false(gd->gd_vmstats.v_free_reserved >
			    gd->gd_vmstats.v_free_count))
	{
		return 1;
	}
	return 0;
}

/*
 * Returns TRUE if the number of FREE+CACHE pages falls below vm_paging_wait.
 * Based on the nice value, the trip point can be anywhere between
 * vm_paging_min and vm_paging_wait.
 *
 * Used by vm_fault (see vm_wait_pfault()) to block a process on low memory
 * based on the process 'nice' value (-20 to +20).
 */
static __inline
int
vm_paging_min_nice(int nice)
{
	long count;
	long delta;

	count = 0;
	if (nice) {
		delta = vmstats.v_paging_wait - vmstats.v_free_min - 1;
		delta = delta >> 1;
		if (delta > 0) {
			/* range 0-40, 0 is high priority, 40 is low */
			count = (nice + 20) * delta / 40;
		}
	}
	return vm_paging_min_dnc(count);
}

static __inline
int
vm_paging_min(void)
{
	return vm_paging_min_dnc(0);
}

/*
 * Return TRUE if nominal userland / VM-system allocations should slow
 * down (but not stop) due to low free pages in the system.  This is
 * typically 1/2 way between min and start.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_wait(void)
{
	globaldata_t gd = mycpu;

	if (__predict_false(gd->gd_vmstats.v_paging_wait >
			    (gd->gd_vmstats.v_free_count +
			     gd->gd_vmstats.v_cache_count)))
	{
		return 1;
	}
	if (__predict_false(gd->gd_vmstats.v_free_reserved >
			    gd->gd_vmstats.v_free_count))
	{
		return 1;
	}
	return 0;
}
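
/*
 * Illustrative sketch (not part of this header): an allocation or fault
 * path might consult the thresholds above roughly as follows; the actions
 * in parentheses are paraphrased behavior, not real function calls.
 *
 *	if (vm_paging_severe())
 *		(stall hard; the kernel itself may need the remaining pages)
 *	else if (vm_paging_min())
 *		(block the fault and let the pageout daemon catch up)
 *	else if (vm_paging_wait())
 *		(slow the allocation down but do not stop it)
 */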

/*
 * Return TRUE if the pageout daemon should be started up or continue
 * running.  Available pages have dropped to a level where we need to
 * think about freeing some up.
 *
 * Also handles edge cases for required 'actually-free' pages.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_start(int adj)
{
	globaldata_t gd = mycpu;

	if (__predict_false(gd->gd_vmstats.v_paging_start >
			    (gd->gd_vmstats.v_free_count +
			     gd->gd_vmstats.v_cache_count + adj)))
	{
		return 1;
	}
	if (__predict_false(gd->gd_vmstats.v_free_min >
			    gd->gd_vmstats.v_free_count + adj))
	{
		return 1;
	}
	if (__predict_false(gd->gd_vmstats.v_free_reserved >
			    gd->gd_vmstats.v_free_count))
	{
		return 1;
	}
	return 0;
}

/*
 * Return TRUE if the pageout daemon has not yet reached its initial target.
 * The pageout daemon works hard to reach target1.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_target1(void)
{
	globaldata_t gd = mycpu;

	if (__predict_false(gd->gd_vmstats.v_paging_target1 >
			    (gd->gd_vmstats.v_free_count +
			     gd->gd_vmstats.v_cache_count)))
	{
		return 1;
	}
	if (__predict_false(gd->gd_vmstats.v_free_reserved >
			    gd->gd_vmstats.v_free_count))
	{
		return 1;
	}
	return 0;
}

static __inline
long
vm_paging_target1_count(void)
{
	globaldata_t gd = mycpu;
	long delta;

	delta = gd->gd_vmstats.v_paging_target1 -
		(gd->gd_vmstats.v_free_count + gd->gd_vmstats.v_cache_count);
	return delta;
}

/*
 * Return TRUE if the pageout daemon has not yet reached its final target.
 * The pageout daemon takes it easy on its way between target1 and target2.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_target2(void)
{
	globaldata_t gd = mycpu;

	if (__predict_false(gd->gd_vmstats.v_paging_target2 >
			    (gd->gd_vmstats.v_free_count +
			     gd->gd_vmstats.v_cache_count)))
	{
		return 1;
	}
	if (__predict_false(gd->gd_vmstats.v_free_reserved >
			    gd->gd_vmstats.v_free_count))
	{
		return 1;
	}
	return 0;
}

static __inline
long
vm_paging_target2_count(void)
{
	globaldata_t gd = mycpu;
	long delta;

	delta = gd->gd_vmstats.v_paging_target2 -
		(gd->gd_vmstats.v_free_count + gd->gd_vmstats.v_cache_count);
	return delta;
}
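
/*
 * Illustrative sketch (not part of this header): how the pageout daemon
 * might use the two targets, paraphrasing the comments above.
 *
 *	if (vm_paging_start(0))
 *		(wake up or keep running)
 *	while (vm_paging_target1())
 *		(work hard, vm_paging_target1_count() pages still to go)
 *	while (vm_paging_target2())
 *		(take it easy, vm_paging_target2_count() pages still to go)
 */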

/*
 * Returns TRUE if additional pages must be deactivated, either during a
 * pageout operation or during the page stats scan.
 *
 * Inactive tests are used in two places.  During heavy paging the
 * inactive_target is used to refill the inactive queue in stages.
 * Those pages are then ultimately flushed and moved to the cache or free
 * queues.
 *
 * The inactive queue is also used to manage scans to update page stats
 * (m->act_count).  The page stats scan occurs lazily in small batches to
 * update m->act_count for pages in the active queue and to move pages
 * (limited by inactive_target) to the inactive queue.  Page stats scanning
 * and active deactivations only run while the inactive queue is below target.
 * After this, additional page stats scanning that just updates m->act_count
 * (but does no further deactivations) continues to run for a limited period
 * of time after any pageout daemon activity.
 */
static __inline
int
vm_paging_inactive(void)
{
	globaldata_t gd = mycpu;

	if (__predict_false((gd->gd_vmstats.v_free_count +
			     gd->gd_vmstats.v_cache_count +
			     gd->gd_vmstats.v_inactive_count) <
			    (gd->gd_vmstats.v_free_min +
			     gd->gd_vmstats.v_inactive_target)))
	{
		return 1;
	}
	return 0;
}

/*
 * Return the number of pages that need to be deactivated to achieve the
 * inactive target as a positive number.  A negative number indicates that
 * there are already a sufficient number of inactive pages.
 */
static __inline
long
vm_paging_inactive_count(void)
{
	globaldata_t gd = mycpu;
	long delta;

	delta = (gd->gd_vmstats.v_free_min + gd->gd_vmstats.v_inactive_target) -
		(gd->gd_vmstats.v_free_count + gd->gd_vmstats.v_cache_count +
		 gd->gd_vmstats.v_inactive_count);

	return delta;
}

/*
 * Clear dirty bits in the VM page but truncate the
 * end to a DEV_BSIZE'd boundary.
 *
 * Used when reading data in, typically via getpages.
 * The partial device block at the end of the truncation
 * range should not lose its dirty bit.
 *
 * NOTE: This function does not clear the pmap modified bit.
 */
static __inline
void
vm_page_clear_dirty_end_nonincl(vm_page_t m, int base, int size)
{
	size = (base + size) & ~DEV_BMASK;
	if (base < size)
		vm_page_clear_dirty(m, base, size - base);
}

/*
 * Clear dirty bits in the VM page but truncate the
 * beginning to a DEV_BSIZE'd boundary.
 *
 * Used when truncating a buffer.  The partial device
 * block at the beginning of the truncation range
 * should not lose its dirty bit.
 *
 * NOTE: This function does not clear the pmap modified bit.
 */
static __inline
void
vm_page_clear_dirty_beg_nonincl(vm_page_t m, int base, int size)
{
	size += base;
	base = (base + DEV_BMASK) & ~DEV_BMASK;
	if (base < size)
		vm_page_clear_dirty(m, base, size - base);
}

static __inline
void
vm_page_spin_lock(vm_page_t m)
{
	spin_lock(&m->spin);
}

static __inline
void
vm_page_spin_unlock(vm_page_t m)
{
	spin_unlock(&m->spin);
}

/*
 * Wire a vm_page that is already wired.  Does not require a busied
 * page.
 */
static __inline
void
vm_page_wire_quick(vm_page_t m)
{
	if (atomic_fetchadd_int(&m->wire_count, 1) == 0)
		panic("vm_page_wire_quick: wire_count was 0");
}
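
/*
 * Illustrative sketch (not part of this header): vm_page_wire_quick() and
 * vm_page_unwire_quick() (below) pair up to hold an extra wiring on an
 * already-wired page without busying it.
 *
 *	vm_page_wire_quick(m);
 *	(... use the page ...)
 *	if (vm_page_unwire_quick(m))
 *		(count would have hit 0: busy the page and finish the
 *		 unwire under the hard-busy)
 */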

/*
 * Unwire a vm_page quickly; does not require a busied page.
 *
 * This routine refuses to drop the wire_count to 0 and will return
 * TRUE if it would have had to (instead of decrementing it to 0).
 * The caller can then busy the page and deal with it.
 */
static __inline
int
vm_page_unwire_quick(vm_page_t m)
{
	KKASSERT(m->wire_count > 0);
	for (;;) {
		u_int wire_count = m->wire_count;

		cpu_ccfence();
		if (wire_count == 1)
			return TRUE;
		if (atomic_cmpset_int(&m->wire_count,
				      wire_count, wire_count - 1))
			return FALSE;
	}
}

/*
 * Functions implemented as macros
 */

static __inline void
vm_page_flag_set(vm_page_t m, unsigned int bits)
{
	atomic_set_int(&(m)->flags, bits);
}

static __inline void
vm_page_flag_clear(vm_page_t m, unsigned int bits)
{
	atomic_clear_int(&(m)->flags, bits);
}

/*
 * Wakeup anyone waiting for the page after potentially unbusying
 * (hard or soft) or doing other work on a page that might make a
 * waiter ready.  The setting of PBUSY_WANTED is integrated into the
 * related flags and it can't be set once the flags are already
 * clear, so there should be no races here.
 */
static __inline void
vm_page_flash(vm_page_t m)
{
	if (m->busy_count & PBUSY_WANTED) {
		atomic_clear_int(&m->busy_count, PBUSY_WANTED);
		wakeup(m);
	}
}

/*
 * Adjust the soft-busy count on a page.  The drop code will issue an
 * integrated wakeup if busy_count becomes 0.
 */
static __inline void
vm_page_sbusy_hold(vm_page_t m)
{
	atomic_add_int(&m->busy_count, 1);
}

static __inline void
vm_page_sbusy_drop(vm_page_t m)
{
	uint32_t ocount;

	ocount = atomic_fetchadd_int(&m->busy_count, -1);
	if (ocount - 1 == PBUSY_WANTED) {
		/* WANTED and no longer BUSY or SBUSY */
		atomic_clear_int(&m->busy_count, PBUSY_WANTED);
		wakeup(m);
	}
}

/*
 * Reduce the protection of a page.  This routine never raises the
 * protection and therefore can be safely called if the page is already
 * at VM_PROT_NONE (it will effectively be a NOP).
 *
 * VM_PROT_NONE will remove all user mappings of a page.  This is often
 * necessary when a page changes state (for example, turns into a copy-on-write
 * page or needs to be frozen for write I/O) in order to force a fault, or
 * to force a page's dirty bits to be synchronized and avoid hardware
 * (modified/accessed) bit update races with pmap changes.
 *
 * Since 'prot' is usually a constant, this inline usually winds up optimizing
 * out the primary conditional.
 *
 * Must be called with (m) hard-busied.
 *
 * WARNING: VM_PROT_NONE can block, but will loop until all mappings have
 *	    been cleared.  Callers should be aware that other page related
 *	    elements might have changed, however.
 */
static __inline void
vm_page_protect(vm_page_t m, int prot)
{
	KKASSERT(m->busy_count & PBUSY_LOCKED);
	if (prot == VM_PROT_NONE) {
		if (pmap_mapped_sync(m) & (PG_MAPPED | PG_WRITEABLE)) {
			pmap_page_protect(m, VM_PROT_NONE);
			/* PG_WRITEABLE & PG_MAPPED cleared by call */
		}
	} else if ((prot == VM_PROT_READ) &&
		   (m->flags & PG_WRITEABLE) &&
		   (pmap_mapped_sync(m) & PG_WRITEABLE)) {
		pmap_page_protect(m, VM_PROT_READ);
		/* PG_WRITEABLE cleared by call */
	}
}

/*
 * Zero-fill the specified page.  The entire contents of the page will be
 * zero'd out.
 */
static __inline boolean_t
vm_page_zero_fill(vm_page_t m)
{
	pmap_zero_page(VM_PAGE_TO_PHYS(m));
	return (TRUE);
}
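
/*
 * Illustrative sketch (not part of this header): per the vm_page_protect()
 * comments above, a writer preparing page I/O might freeze a hard-busied
 * page's user mappings so its dirty bits stabilize.
 *
 *	(m is hard-busied here)
 *	vm_page_protect(m, VM_PROT_READ);
 *	(start the write; a later write fault restores write access)
 */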

/*
 * Copy the contents of src_m to dest_m.  The pages must be stable but spl
 * and other protections depend on context.
 */
static __inline void
vm_page_copy(vm_page_t src_m, vm_page_t dest_m)
{
	pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dest_m));
	dest_m->valid = VM_PAGE_BITS_ALL;
	dest_m->dirty = VM_PAGE_BITS_ALL;
}

/*
 * Free a page.  The page must be marked BUSY.
 */
static __inline void
vm_page_free(vm_page_t m)
{
	vm_page_free_toq(m);
}

/*
 * Free a page to the zeroed-pages queue.  The caller must ensure that the
 * page has been zeroed.
 */
static __inline void
vm_page_free_zero(vm_page_t m)
{
#ifdef PMAP_DEBUG
#ifdef PHYS_TO_DMAP
	char *p = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
	int i;

	for (i = 0; i < PAGE_SIZE; i++) {
		if (p[i] != 0) {
			panic("non-zero page in vm_page_free_zero()");
		}
	}
#endif
#endif
	vm_page_free_toq(m);
}

/*
 * Set page to not be dirty.  Note: does not clear pmap modify bits.
 */
static __inline void
vm_page_undirty(vm_page_t m)
{
	m->dirty = 0;
}

#endif	/* _KERNEL */
#endif	/* _VM_VM_PAGE2_H_ */