/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
 * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
 */

/*
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */
/*
 * Resident memory management module.  The module manipulates 'VM pages'.
 * A VM page is the core building block for memory management.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/alist.h>
#include <sys/sysctl.h>
#include <sys/cpu_topology.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>

#include <machine/inttypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/bus_dma.h>

#include <vm/vm_page2.h>
#include <sys/spinlock2.h>

/*
 * SET - Minimum required set associative size, must be a power of 2.  We
 *	 want this to match or exceed the set-associativeness of the cpu.
 *
 * GRP - A larger set that allows bleed-over into the domains of other
 *	 nearby cpus.  Also must be a power of 2.  Used by the page zeroing
 *	 code to smooth things out a bit.
 */
#define PQ_SET_ASSOC		16
#define PQ_SET_ASSOC_MASK	(PQ_SET_ASSOC - 1)

#define PQ_GRP_ASSOC		(PQ_SET_ASSOC * 2)
#define PQ_GRP_ASSOC_MASK	(PQ_GRP_ASSOC - 1)

static void vm_page_queue_init(void);
static void vm_page_free_wakeup(void);
static vm_page_t vm_page_select_cache(u_short pg_color);
static vm_page_t _vm_page_list_find2(int basequeue, int index);
static void _vm_page_deactivate_locked(vm_page_t m, int athead);
static void vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes);

/*
 * Array of tailq lists
 */
__cachealign struct vpgqueues vm_page_queues[PQ_COUNT];

static volatile int vm_pages_waiting;
static struct alist vm_contig_alist;
static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
static struct spinlock vm_contig_spin =
	SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");

static u_long vm_dma_reserved = 0;
TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
	    "Memory reserved for DMA");
SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
	    &vm_contig_alist.bl_free, 0, "Memory reserved for DMA");

static int vm_contig_verbose = 0;
TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);

RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
	     vm_pindex_t, pindex);

static void
vm_page_queue_init(void)
{
	int i;

	for (i = 0; i < PQ_L2_SIZE; i++)
		vm_page_queues[PQ_FREE+i].cnt_offset =
			offsetof(struct vmstats, v_free_count);
	for (i = 0; i < PQ_L2_SIZE; i++)
		vm_page_queues[PQ_CACHE+i].cnt_offset =
			offsetof(struct vmstats, v_cache_count);
	for (i = 0; i < PQ_L2_SIZE; i++)
		vm_page_queues[PQ_INACTIVE+i].cnt_offset =
			offsetof(struct vmstats, v_inactive_count);
	for (i = 0; i < PQ_L2_SIZE; i++)
		vm_page_queues[PQ_ACTIVE+i].cnt_offset =
			offsetof(struct vmstats, v_active_count);
	for (i = 0; i < PQ_L2_SIZE; i++)
		vm_page_queues[PQ_HOLD+i].cnt_offset =
			offsetof(struct vmstats, v_active_count);
	/* PQ_NONE has no queue */

	for (i = 0; i < PQ_COUNT; i++) {
		TAILQ_INIT(&vm_page_queues[i].pl);
		spin_init(&vm_page_queues[i].spin, "vm_page_queue_init");
	}
}
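
/*
 * Illustrative sketch, not part of the original file: how a page color
 * selects one of the PQ_L2_SIZE queues within a base queue such as
 * PQ_FREE.  The index is always masked with PQ_L2_MASK, and colors in
 * the same PQ_SET_ASSOC-aligned group are preferred by the same cpu,
 * which keeps nearby cpus off each other's queue spinlocks.  The helper
 * name is hypothetical.
 */
static __inline struct vpgqueues *
example_colored_queue(int basequeue, u_short pg_color)
{
	return (&vm_page_queues[basequeue + (pg_color & PQ_L2_MASK)]);
}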

/*
 * note: place in initialized data section?
 * Is this necessary?
 */
vm_pindex_t first_page = 0;
vm_pindex_t vm_page_array_size = 0;
vm_page_t vm_page_array = NULL;
vm_paddr_t vm_low_phys_reserved;

/*
 * (low level boot)
 *
 * Sets the page size, perhaps based upon the memory size.
 * Must be called before any use of page-size dependent functions.
 */
void
vm_set_page_size(void)
{
	if (vmstats.v_page_size == 0)
		vmstats.v_page_size = PAGE_SIZE;
	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
		panic("vm_set_page_size: page size not a power of two");
}

/*
 * (low level boot)
 *
 * Add a new page to the freelist for use by the system.  New pages
 * are added to both the head and tail of the associated free page
 * queue in a bottom-up fashion, so both zero'd and non-zero'd page
 * requests pull 'recent' adds (higher physical addresses) first.
 *
 * Beware that the page zeroing daemon will also be running soon after
 * boot, moving pages from the head to the tail of the PQ_FREE queues.
 *
 * Must be called in a critical section.
 */
static void
vm_add_new_page(vm_paddr_t pa)
{
	struct vpgqueues *vpq;
	vm_page_t m;

	m = PHYS_TO_VM_PAGE(pa);
	m->phys_addr = pa;
	m->flags = 0;
	m->pat_mode = PAT_WRITE_BACK;
	m->pc = (pa >> PAGE_SHIFT);

	/*
	 * Twist for cpu localization in addition to page coloring, so
	 * different cpus selecting by m->queue get different page colors.
	 */
	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
	m->pc &= PQ_L2_MASK;

	/*
	 * Reserve a certain number of contiguous low memory pages for
	 * contigmalloc() to use.
	 */
	if (pa < vm_low_phys_reserved) {
		atomic_add_long(&vmstats.v_page_count, 1);
		atomic_add_long(&vmstats.v_dma_pages, 1);
		m->queue = PQ_NONE;
		m->wire_count = 1;
		atomic_add_long(&vmstats.v_wire_count, 1);
		alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
		return;
	}

	/*
	 * General page
	 */
	m->queue = m->pc + PQ_FREE;
	KKASSERT(m->dirty == 0);

	atomic_add_long(&vmstats.v_page_count, 1);
	atomic_add_long(&vmstats.v_free_count, 1);
	vpq = &vm_page_queues[m->queue];
	TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
	++vpq->lcnt;
}
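
/*
 * Illustrative sketch, not part of the original source: the color that
 * vm_add_new_page() computes for a given physical address.  The two xor
 * terms 'twist' the low-order color bits with higher-order page-number
 * bits so that physically contiguous runs of pages spread across the
 * cpu-localized queue sets instead of marching through them in
 * lock-step.  The helper name is hypothetical.
 */
static __inline u_short
example_page_color(vm_paddr_t pa)
{
	vm_paddr_t pn = pa >> PAGE_SHIFT;

	return ((u_short)((pn ^ (pn / PQ_L2_SIZE) ^
			   (pn / (PQ_L2_SIZE * PQ_L2_SIZE))) & PQ_L2_MASK));
}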

/*
 * (low level boot)
 *
 * Initializes the resident memory module.
 *
 * Preallocates memory for critical VM structures and arrays prior to
 * kernel_map becoming available.
 *
 * Memory is allocated from (virtual2_start, virtual2_end) if available,
 * otherwise memory is allocated from (virtual_start, virtual_end).
 *
 * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
 * large enough to hold vm_page_array & other structures for machines with
 * large amounts of ram, so we want to use virtual2* when available.
 */
void
vm_page_startup(void)
{
	vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
	vm_offset_t mapped;
	vm_pindex_t npages;
	vm_paddr_t page_range;
	vm_paddr_t new_end;
	int i;
	vm_paddr_t pa;
	vm_paddr_t last_pa;
	vm_paddr_t end;
	vm_paddr_t biggestone, biggestsize;
	vm_paddr_t total;
	vm_page_t m;

	total = 0;
	biggestsize = 0;
	biggestone = 0;
	vaddr = round_page(vaddr);

	/*
	 * Make sure ranges are page-aligned.
	 */
	for (i = 0; phys_avail[i].phys_end; ++i) {
		phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
		phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
		if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
			phys_avail[i].phys_end = phys_avail[i].phys_beg;
	}

	/*
	 * Locate largest block
	 */
	for (i = 0; phys_avail[i].phys_end; ++i) {
		vm_paddr_t size = phys_avail[i].phys_end -
				  phys_avail[i].phys_beg;

		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
		total += size;
	}
	--i;	/* adjust to last entry for use down below */

	end = phys_avail[biggestone].phys_end;
	end = trunc_page(end);

	/*
	 * Initialize the queue headers for the free queue, the active queue
	 * and the inactive queue.
	 */
	vm_page_queue_init();

#if !defined(_KERNEL_VIRTUAL)
	/*
	 * VKERNELs don't support minidumps and as such don't need
	 * vm_page_dump
	 *
	 * Allocate a bitmap to indicate that a random physical page
	 * needs to be included in a minidump.
	 *
	 * The amd64 port needs this to indicate which direct map pages
	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
	 *
	 * However, x86 still needs this workspace internally within the
	 * minidump code.  In theory, they are not needed on x86, but are
	 * included should the sf_buf code decide to use them.
	 */
	page_range = phys_avail[i].phys_end / PAGE_SIZE;
	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
	end -= vm_page_dump_size;
	vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
					VM_PROT_READ | VM_PROT_WRITE);
	bzero((void *)vm_page_dump, vm_page_dump_size);
#endif
	/*
	 * Compute the number of pages of memory that will be available for
	 * use (taking into account the overhead of a page structure per
	 * page).
	 */
	first_page = phys_avail[0].phys_beg / PAGE_SIZE;
	page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
	npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;

#ifndef _KERNEL_VIRTUAL
	/*
	 * (only applies to real kernels)
	 *
	 * Reserve a large amount of low memory for potential 32-bit DMA
	 * space allocations.  Once device initialization is complete we
	 * release most of it, but keep (vm_dma_reserved) memory reserved
	 * for later use.  Typically for X / graphics.  Through trial and
	 * error we find that GPUs usually require ~60-100MB or so.
	 *
	 * By default, 128M is left in reserve on machines with 2G+ of ram.
	 */
	vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
	if (vm_low_phys_reserved > total / 4)
		vm_low_phys_reserved = total / 4;
	if (vm_dma_reserved == 0) {
		vm_dma_reserved = 128 * 1024 * 1024;	/* 128MB */
		if (vm_dma_reserved > total / 16)
			vm_dma_reserved = total / 16;
	}
#endif
	alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
		   ALIST_RECORDS_65536);

	/*
	 * Initialize the mem entry structures now, and put them in the free
	 * queue.
	 */
	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
		kprintf("initializing vm_page_array ");
	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
	mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
	vm_page_array = (vm_page_t)mapped;

#if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
	/*
	 * since pmap_map on amd64 returns stuff out of a direct-map region,
	 * we have to manually add these pages to the minidump tracking so
	 * that they can be dumped, including the vm_page_array.
	 */
	for (pa = new_end;
	     pa < phys_avail[biggestone].phys_end;
	     pa += PAGE_SIZE) {
		dump_add_page(pa);
	}
#endif

	/*
	 * Clear all of the page structures, run basic initialization so
	 * PHYS_TO_VM_PAGE() operates properly even on pages not in the
	 * map.
	 */
	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
	vm_page_array_size = page_range;
	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
		kprintf("size = 0x%zx\n", vm_page_array_size);

	m = &vm_page_array[0];
	pa = ptoa(first_page);
	for (i = 0; i < page_range; ++i) {
		spin_init(&m->spin, "vm_page");
		m->phys_addr = pa;
		pa += PAGE_SIZE;
		++m;
	}

	/*
	 * Construct the free queue(s) in ascending order (by physical
	 * address) so that the first 16MB of physical memory is allocated
	 * last rather than first.  On large-memory machines, this avoids
	 * the exhaustion of low physical memory before isa_dma_init has run.
	 */
	vmstats.v_page_count = 0;
	vmstats.v_free_count = 0;
	for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
		pa = phys_avail[i].phys_beg;
		if (i == biggestone)
			last_pa = new_end;
		else
			last_pa = phys_avail[i].phys_end;
		while (pa < last_pa && npages-- > 0) {
			vm_add_new_page(pa);
			pa += PAGE_SIZE;
		}
	}
	if (virtual2_start)
		virtual2_start = vaddr;
	else
		virtual_start = vaddr;
	mycpu->gd_vmstats = vmstats;
}

/*
 * Reorganize VM pages based on numa data.  May be called as many times as
 * necessary.  Will reorganize the vm_page_t page color and related queue(s)
 * to allow vm_page_alloc() to choose pages based on socket affinity.
 *
 * NOTE: This function is only called while we are still in UP mode, so
 *	 we only need a critical section to protect the queues (which
 *	 saves a lot of time, there are likely a ton of pages).
 */
void
vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
{
	vm_paddr_t scan_beg;
	vm_paddr_t scan_end;
	vm_paddr_t ran_end;
	struct vpgqueues *vpq;
	vm_page_t m;
	vm_page_t mend;
	int i;
	int socket_mod;
	int socket_value;

	/*
	 * Check if no physical information, or there was only one socket
	 * (so don't waste time doing nothing!).
	 */
	if (cpu_topology_phys_ids <= 1 ||
	    cpu_topology_core_ids == 0) {
		return;
	}

	/*
	 * Setup for our iteration.  Note that ACPI may iterate CPU
	 * sockets starting at 0 or 1 or some other number.  The
	 * cpu_topology code mod's it against the socket count.
	 */
	ran_end = ran_beg + bytes;

	socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
	socket_value = (physid % cpu_topology_phys_ids) * socket_mod;
	mend = &vm_page_array[vm_page_array_size];

	crit_enter();

	/*
	 * Adjust cpu_topology's phys_mem parameter
	 */
	if (root_cpu_node)
		vm_numa_add_topology_mem(root_cpu_node, physid, (long)bytes);

	/*
	 * Adjust vm_page->pc and requeue all affected pages.  The
	 * allocator will then be able to localize memory allocations
	 * to some degree.
	 */
	for (i = 0; phys_avail[i].phys_end; ++i) {
		scan_beg = phys_avail[i].phys_beg;
		scan_end = phys_avail[i].phys_end;
		if (scan_end <= ran_beg)
			continue;
		if (scan_beg >= ran_end)
			continue;
		if (scan_beg < ran_beg)
			scan_beg = ran_beg;
		if (scan_end > ran_end)
			scan_end = ran_end;
		if (atop(scan_end) > first_page + vm_page_array_size)
			scan_end = ptoa(first_page + vm_page_array_size);

		m = PHYS_TO_VM_PAGE(scan_beg);
		while (scan_beg < scan_end) {
			KKASSERT(m < mend);
			if (m->queue != PQ_NONE) {
				vpq = &vm_page_queues[m->queue];
				TAILQ_REMOVE(&vpq->pl, m, pageq);
				--vpq->lcnt;
				/* queue doesn't change, no need to adj cnt */
				m->queue -= m->pc;
				m->pc %= socket_mod;
				m->pc += socket_value;
				m->pc &= PQ_L2_MASK;
				m->queue += m->pc;
				vpq = &vm_page_queues[m->queue];
				TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
				++vpq->lcnt;
				/* queue doesn't change, no need to adj cnt */
			} else {
				m->pc %= socket_mod;
				m->pc += socket_value;
				m->pc &= PQ_L2_MASK;
			}
			scan_beg += PAGE_SIZE;
			++m;
		}
	}
	crit_exit();
}

static
void
vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes)
{
	int cpuid;
	int i;

	switch(cpup->type) {
	case PACKAGE_LEVEL:
		cpup->phys_mem += bytes;
		break;
	case CHIP_LEVEL:
		/*
		 * All members should have the same chipid, so we only need
		 * to pull out one member.
		 */
		if (CPUMASK_TESTNZERO(cpup->members)) {
			cpuid = BSFCPUMASK(cpup->members);
			if (physid ==
			    get_chip_ID_from_APICID(CPUID_TO_APICID(cpuid))) {
				cpup->phys_mem += bytes;
			}
		}
		break;
	case CORE_LEVEL:
	case THREAD_LEVEL:
		/*
		 * Just inherit from the parent node
		 */
		cpup->phys_mem = cpup->parent_node->phys_mem;
		break;
	}
	for (i = 0; i < MAXCPU && cpup->child_node[i]; ++i)
		vm_numa_add_topology_mem(cpup->child_node[i], physid, bytes);
}

/*
 * We tended to reserve a ton of memory for contigmalloc().  Now that most
 * drivers have initialized we want to return most of the remaining free
 * reserve back to the VM page queues so they can be used for normal
 * allocations.
 *
 * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
 */
static void
vm_page_startup_finish(void *dummy __unused)
{
	alist_blk_t blk;
	alist_blk_t rblk;
	alist_blk_t count;
	alist_blk_t xcount;
	alist_blk_t bfree;
	vm_page_t m;

	spin_lock(&vm_contig_spin);
	for (;;) {
		bfree = alist_free_info(&vm_contig_alist, &blk, &count);
		if (bfree <= vm_dma_reserved / PAGE_SIZE)
			break;
		if (count == 0)
			break;

		/*
		 * Figure out how much of the initial reserve we have to
		 * free in order to reach our target.
		 */
		bfree -= vm_dma_reserved / PAGE_SIZE;
		if (count > bfree) {
			blk += count - bfree;
			count = bfree;
		}

		/*
		 * Calculate the nearest power of 2 <= count.
		 */
		for (xcount = 1; xcount <= count; xcount <<= 1)
			;
		xcount >>= 1;
		blk += count - xcount;
		count = xcount;

		/*
		 * Allocate the pages from the alist, then free them to
		 * the normal VM page queues.
		 *
		 * Pages allocated from the alist are wired.  We have to
		 * busy, unwire, and free them.  We must also adjust
		 * vm_low_phys_reserved before freeing any pages to prevent
		 * confusion.
		 */
		rblk = alist_alloc(&vm_contig_alist, blk, count);
		if (rblk != blk) {
			kprintf("vm_page_startup_finish: Unable to return "
				"dma space @0x%08x/%d -> 0x%08x\n",
				blk, count, rblk);
			break;
		}
		atomic_add_long(&vmstats.v_dma_pages, -(long)count);
		spin_unlock(&vm_contig_spin);

		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
		vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
		while (count) {
			vm_page_busy_wait(m, FALSE, "cpgfr");
			vm_page_unwire(m, 0);
			vm_page_free(m);
			--count;
			++m;
		}
		spin_lock(&vm_contig_spin);
	}
	spin_unlock(&vm_contig_spin);

	/*
	 * Print out how much DMA space drivers have already allocated and
	 * how much is left over.
	 */
	kprintf("DMA space used: %jdk, remaining available: %jdk\n",
		(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
		(PAGE_SIZE / 1024),
		(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
}
SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
	vm_page_startup_finish, NULL);

/*
 * Scan comparison function for Red-Black tree scans.  An inclusive
 * (start,end) is expected.  Other fields are not used.
 */
int
rb_vm_page_scancmp(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	if (p->pindex < info->start_pindex)
		return(-1);
	if (p->pindex > info->end_pindex)
		return(1);
	return(0);
}

int
rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
{
	if (p1->pindex < p2->pindex)
		return(-1);
	if (p1->pindex > p2->pindex)
		return(1);
	return(0);
}

void
vm_page_init(vm_page_t m)
{
	/* do nothing for now.  Called from pmap_page_init() */
}
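
/*
 * Illustrative sketch, not part of the original file: how the scan
 * comparator above is typically paired with the generated RB_SCAN
 * helper (assuming vm_page_rb_tree_RB_SCAN, as used by vm_object.c).
 * The callback runs for every resident page whose pindex falls within
 * the inclusive (start_pindex, end_pindex) range.  The callback and
 * wrapper names here are hypothetical.
 */
static int
example_scan_callback(vm_page_t p, void *data)
{
	/* inspect page 'p'; return 0 to continue the scan */
	return (0);
}

static void
example_scan_range(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
	struct rb_vm_page_scan_info info;

	info.start_pindex = start;
	info.end_pindex = end;
	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
				example_scan_callback, &info);
}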

/*
 * Each page queue has its own spin lock, which is fairly optimal for
 * allocating and freeing pages at least.
 *
 * The caller must hold the vm_page_spin_lock() before locking a vm_page's
 * queue spinlock via this function.  Also note that m->queue cannot change
 * unless both the page and queue are locked.
 */
static __inline
void
_vm_page_queue_spin_lock(vm_page_t m)
{
	u_short queue;

	queue = m->queue;
	if (queue != PQ_NONE) {
		spin_lock(&vm_page_queues[queue].spin);
		KKASSERT(queue == m->queue);
	}
}

static __inline
void
_vm_page_queue_spin_unlock(vm_page_t m)
{
	u_short queue;

	queue = m->queue;
	cpu_ccfence();
	if (queue != PQ_NONE)
		spin_unlock(&vm_page_queues[queue].spin);
}

static __inline
void
_vm_page_queues_spin_lock(u_short queue)
{
	cpu_ccfence();
	if (queue != PQ_NONE)
		spin_lock(&vm_page_queues[queue].spin);
}

static __inline
void
_vm_page_queues_spin_unlock(u_short queue)
{
	cpu_ccfence();
	if (queue != PQ_NONE)
		spin_unlock(&vm_page_queues[queue].spin);
}

void
vm_page_queue_spin_lock(vm_page_t m)
{
	_vm_page_queue_spin_lock(m);
}

void
vm_page_queues_spin_lock(u_short queue)
{
	_vm_page_queues_spin_lock(queue);
}

void
vm_page_queue_spin_unlock(vm_page_t m)
{
	_vm_page_queue_spin_unlock(m);
}

void
vm_page_queues_spin_unlock(u_short queue)
{
	_vm_page_queues_spin_unlock(queue);
}

/*
 * This locks the specified vm_page and its queue in the proper order
 * (page first, then queue).  The queue may change so the caller must
 * recheck on return.
 */
static __inline
void
_vm_page_and_queue_spin_lock(vm_page_t m)
{
	vm_page_spin_lock(m);
	_vm_page_queue_spin_lock(m);
}

static __inline
void
_vm_page_and_queue_spin_unlock(vm_page_t m)
{
	_vm_page_queues_spin_unlock(m->queue);
	vm_page_spin_unlock(m);
}

void
vm_page_and_queue_spin_unlock(vm_page_t m)
{
	_vm_page_and_queue_spin_unlock(m);
}

void
vm_page_and_queue_spin_lock(vm_page_t m)
{
	_vm_page_and_queue_spin_lock(m);
}

/*
 * Helper function removes vm_page from its current queue.
 * Returns the base queue the page used to be on.
 *
 * The vm_page and the queue must be spinlocked.
 * This function will unlock the queue but leave the page spinlocked.
 */
static __inline u_short
_vm_page_rem_queue_spinlocked(vm_page_t m)
{
	struct vpgqueues *pq;
	u_short queue;
	u_short oqueue;
	long *cnt;

	queue = m->queue;
	if (queue != PQ_NONE) {
		pq = &vm_page_queues[queue];
		TAILQ_REMOVE(&pq->pl, m, pageq);

		/*
		 * Adjust our pcpu stats.  In order for the nominal
		 * low-memory algorithms to work properly we don't let
		 * any pcpu stat get too negative before we force it to
		 * be rolled-up into the global stats.  Otherwise our
		 * pageout and vm_wait tests will fail badly.
		 *
		 * The idea here is to reduce unnecessary SMP cache
		 * mastership changes in the global vmstats, which can be
		 * particularly bad in multi-socket systems.
		 */
		cnt = (long *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
		atomic_add_long(cnt, -1);
		if (*cnt < -VMMETER_SLOP_COUNT) {
			u_long copy = atomic_swap_long(cnt, 0);
			cnt = (long *)((char *)&vmstats + pq->cnt_offset);
			atomic_add_long(cnt, copy);
			cnt = (long *)((char *)&mycpu->gd_vmstats +
				       pq->cnt_offset);
			atomic_add_long(cnt, copy);
		}
		pq->lcnt--;
		m->queue = PQ_NONE;
		oqueue = queue;
		queue -= m->pc;
		vm_page_queues_spin_unlock(oqueue);	/* intended */
	}
	return queue;
}

/*
 * Helper function places the vm_page on the specified queue.  Generally
 * speaking only PQ_FREE pages are placed at the head, to allow them to
 * be allocated sooner rather than later on the assumption that they
 * are cache-hot.
 *
 * The vm_page must be spinlocked.
 * This function will return with both the page and the queue locked.
 */
static __inline void
_vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
{
	struct vpgqueues *pq;
	long *cnt;

	KKASSERT(m->queue == PQ_NONE);

	if (queue != PQ_NONE) {
		vm_page_queues_spin_lock(queue);
		pq = &vm_page_queues[queue];
		++pq->lcnt;

		/*
		 * Adjust our pcpu stats.  If a system entity really needs
		 * to incorporate the count it will call vmstats_rollup()
		 * to roll it all up into the global vmstats structure.
		 */
		cnt = (long *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
		atomic_add_long(cnt, 1);

		/*
		 * PQ_FREE is always handled LIFO style to try to provide
		 * cache-hot pages to programs.
		 */
		m->queue = queue;
		if (queue - m->pc == PQ_FREE) {
			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
		} else if (athead) {
			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
		} else {
			TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
		}
		/* leave the queue spinlocked */
	}
}

/*
 * Wait until page is no longer BUSY.  If also_m_busy is TRUE we wait
 * until the page is no longer BUSY or SBUSY (busy_count field is 0).
 *
 * Returns TRUE if it had to sleep, FALSE if we did not.  Only one sleep
 * call will be made before returning.
 *
 * This function does NOT busy the page and on return the page is not
 * guaranteed to be available.
 */
void
vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
{
	u_int32_t busy_count;

	for (;;) {
		busy_count = m->busy_count;
		cpu_ccfence();

		if ((busy_count & PBUSY_LOCKED) == 0 &&
		    (also_m_busy == 0 || (busy_count & PBUSY_MASK) == 0)) {
			break;
		}
		tsleep_interlock(m, 0);
		if (atomic_cmpset_int(&m->busy_count, busy_count,
				      busy_count | PBUSY_WANTED)) {
			atomic_set_int(&m->flags, PG_REFERENCED);
			tsleep(m, PINTERLOCKED, msg, 0);
			break;
		}
	}
}
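
/*
 * Illustrative sketch, not part of the original file: the typical
 * lookup-and-wait retry loop built on vm_page_sleep_busy().  Because
 * the sleep does not busy the page, the caller must re-lookup and
 * re-verify after waking up.  The helper name is hypothetical.
 */
static vm_page_t
example_lookup_wait(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	while ((m = vm_page_lookup(object, pindex)) != NULL) {
		if ((m->busy_count & PBUSY_LOCKED) == 0)
			break;		/* not busy, caller may busy-try it */
		vm_page_sleep_busy(m, TRUE, "exwait");
		/* page may have been freed or reused, loop and re-lookup */
	}
	return (m);
}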

/*
 * This calculates and returns a page color given an optional VM object and
 * either a pindex or an iterator.  We attempt to return a cpu-localized
 * pg_color that is still roughly 16-way set-associative.  The CPU topology
 * is used if it was probed.
 *
 * The caller may use the returned value to index into e.g. PQ_FREE when
 * allocating a page in order to nominally obtain pages that are hopefully
 * already localized to the requesting cpu.  This function is not able to
 * provide any sort of guarantee of this, but does its best to improve
 * hardware cache management performance.
 *
 * WARNING! The caller must mask the returned value with PQ_L2_MASK.
 */
u_short
vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
{
	u_short pg_color;
	int phys_id;
	int core_id;
	int object_pg_color;

	phys_id = get_cpu_phys_id(cpuid);
	core_id = get_cpu_core_id(cpuid);
	object_pg_color = object ? object->pg_color : 0;

	if (cpu_topology_phys_ids && cpu_topology_core_ids) {
		int grpsize;

		/*
		 * Break us down by socket and cpu
		 */
		pg_color = phys_id * PQ_L2_SIZE / cpu_topology_phys_ids;
		pg_color += core_id * PQ_L2_SIZE /
			    (cpu_topology_core_ids * cpu_topology_phys_ids);

		/*
		 * Calculate remaining component for object/queue color
		 */
		grpsize = PQ_L2_SIZE / (cpu_topology_core_ids *
					cpu_topology_phys_ids);
		if (grpsize >= 8) {
			pg_color += (pindex + object_pg_color) % grpsize;
		} else {
			if (grpsize <= 2) {
				grpsize = 8;
			} else {
				/* 3->9, 4->8, 5->10, 6->12, 7->14 */
				grpsize += grpsize;
				if (grpsize < 8)
					grpsize += grpsize;
			}
			pg_color += (pindex + object_pg_color) % grpsize;
		}
	} else {
		/*
		 * Unknown topology, distribute things evenly.
		 */
		pg_color = cpuid * PQ_L2_SIZE / ncpus;
		pg_color += pindex + object_pg_color;
	}
	return (pg_color & PQ_L2_MASK);
}

/*
 * Wait until BUSY can be set, then set it.  If also_m_busy is TRUE we
 * also wait for m->busy_count to become 0 before setting PBUSY_LOCKED.
 */
void
VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
				     int also_m_busy, const char *msg
				     VM_PAGE_DEBUG_ARGS)
{
	u_int32_t busy_count;

	for (;;) {
		busy_count = m->busy_count;
		cpu_ccfence();
		if (busy_count & PBUSY_LOCKED) {
			tsleep_interlock(m, 0);
			if (atomic_cmpset_int(&m->busy_count, busy_count,
					      busy_count | PBUSY_WANTED)) {
				atomic_set_int(&m->flags, PG_REFERENCED);
				tsleep(m, PINTERLOCKED, msg, 0);
			}
		} else if (also_m_busy && busy_count) {
			tsleep_interlock(m, 0);
			if (atomic_cmpset_int(&m->busy_count, busy_count,
					      busy_count | PBUSY_WANTED)) {
				atomic_set_int(&m->flags, PG_REFERENCED);
				tsleep(m, PINTERLOCKED, msg, 0);
			}
		} else {
			if (atomic_cmpset_int(&m->busy_count, busy_count,
					      busy_count | PBUSY_LOCKED)) {
#ifdef VM_PAGE_DEBUG
				m->busy_func = func;
				m->busy_line = lineno;
#endif
				break;
			}
		}
	}
}

/*
 * Attempt to set BUSY.  If also_m_busy is TRUE we only succeed if
 * m->busy_count is also 0.
 *
 * Returns non-zero on failure.
 */
int
VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
				    VM_PAGE_DEBUG_ARGS)
{
	u_int32_t busy_count;

	for (;;) {
		busy_count = m->busy_count;
		cpu_ccfence();
		if (busy_count & PBUSY_LOCKED)
			return TRUE;
		if (also_m_busy && (busy_count & PBUSY_MASK) != 0)
			return TRUE;
		if (atomic_cmpset_int(&m->busy_count, busy_count,
				      busy_count | PBUSY_LOCKED)) {
#ifdef VM_PAGE_DEBUG
			m->busy_func = func;
			m->busy_line = lineno;
#endif
			return FALSE;
		}
	}
}
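
/*
 * Illustrative sketch, not part of the original file: using the color
 * helper when pulling from a queue, honoring the WARNING above that the
 * return value must be masked with PQ_L2_MASK before indexing.  The
 * page returned by vm_page_list_find() is spinlocked and removed from
 * its queue, but not busied.  The helper name is hypothetical.
 */
static vm_page_t
example_colored_free_page(vm_object_t object, vm_pindex_t pindex)
{
	u_short pg_color;

	pg_color = vm_get_pg_color(mycpu->gd_cpuid, object, pindex);
	return (vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK));
}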

/*
 * Clear the BUSY flag and return non-zero to indicate to the caller
 * that a wakeup() should be performed.
 *
 * The vm_page must be spinlocked and will remain spinlocked on return.
 * The related queue must NOT be spinlocked (which could deadlock us).
 *
 * (inline version)
 */
static __inline
int
_vm_page_wakeup(vm_page_t m)
{
	u_int32_t busy_count;

	for (;;) {
		busy_count = m->busy_count;
		cpu_ccfence();
		if (atomic_cmpset_int(&m->busy_count, busy_count,
				      busy_count &
				      ~(PBUSY_LOCKED | PBUSY_WANTED))) {
			break;
		}
	}
	return((int)(busy_count & PBUSY_WANTED));
}

/*
 * Clear the BUSY flag and wakeup anyone waiting for the page.  This
 * is typically the last call you make on a page before moving onto
 * other things.
 */
void
vm_page_wakeup(vm_page_t m)
{
	KASSERT(m->busy_count & PBUSY_LOCKED,
		("vm_page_wakeup: page not busy!!!"));
	vm_page_spin_lock(m);
	if (_vm_page_wakeup(m)) {
		vm_page_spin_unlock(m);
		wakeup(m);
	} else {
		vm_page_spin_unlock(m);
	}
}

/*
 * Holding a page keeps it from being reused.  Other parts of the system
 * can still disassociate the page from its current object and free it, or
 * perform read or write I/O on it and/or otherwise manipulate the page,
 * but if the page is held the VM system will leave the page and its data
 * intact and not reuse the page for other purposes until the last hold
 * reference is released.  (see vm_page_wire() if you want to prevent the
 * page from being disassociated from its object too).
 *
 * The caller must still validate the contents of the page and, if necessary,
 * wait for any pending I/O (e.g. vm_page_sleep_busy() loop) to complete
 * before manipulating the page.
 *
 * XXX get vm_page_spin_lock() here and move FREE->HOLD if necessary
 */
void
vm_page_hold(vm_page_t m)
{
	vm_page_spin_lock(m);
	atomic_add_int(&m->hold_count, 1);
	if (m->queue - m->pc == PQ_FREE) {
		_vm_page_queue_spin_lock(m);
		_vm_page_rem_queue_spinlocked(m);
		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
		_vm_page_queue_spin_unlock(m);
	}
	vm_page_spin_unlock(m);
}

/*
 * The opposite of vm_page_hold().  If the page is on the HOLD queue
 * it was freed while held and must be moved back to the FREE queue.
 */
void
vm_page_unhold(vm_page_t m)
{
	KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
		("vm_page_unhold: pg %p illegal hold_count (%d) or "
		 "on FREE queue (%d)",
		 m, m->hold_count, m->queue - m->pc));
	vm_page_spin_lock(m);
	atomic_add_int(&m->hold_count, -1);
	if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
		_vm_page_queue_spin_lock(m);
		_vm_page_rem_queue_spinlocked(m);
		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
		_vm_page_queue_spin_unlock(m);
	}
	vm_page_spin_unlock(m);
}

/*
 * vm_page_getfake:
 *
 *	Create a fictitious page with the specified physical address and
 *	memory attribute.  The memory attribute is the only machine-
 *	dependent aspect of a fictitious page that must be initialized.
 */
void
vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{
	if ((m->flags & PG_FICTITIOUS) != 0) {
		/*
		 * The page's memattr might have changed since the
		 * previous initialization.  Update the pmap to the
		 * new memattr.
		 */
		goto memattr;
	}
	m->phys_addr = paddr;
	m->queue = PQ_NONE;
	/* Fictitious pages don't use "segind". */
	/* Fictitious pages don't use "order" or "pool". */
	m->flags = PG_FICTITIOUS | PG_UNMANAGED;
	m->busy_count = PBUSY_LOCKED;
	m->wire_count = 1;
	spin_init(&m->spin, "fake_page");
	pmap_page_init(m);
memattr:
	pmap_page_set_memattr(m, memattr);
}

/*
 * Inserts the given vm_page into the object and object list.
 *
 * The pagetables are not updated but will presumably fault the page
 * in if necessary, or if a kernel page the caller will at some point
 * enter the page into the kernel's pmap.  We are not allowed to block
 * here so we *can't* do this anyway.
 *
 * This routine may not block.
 * This routine must be called with the vm_object held.
 * This routine must be called with a critical section held.
 *
 * This routine returns TRUE if the page was inserted into the object
 * successfully, and FALSE if the page already exists in the object.
 */
int
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
	if (m->object != NULL)
		panic("vm_page_insert: already inserted");

	atomic_add_int(&object->generation, 1);

	/*
	 * Record the object/offset pair in this page and add the
	 * pv_list_count of the page to the object.
	 *
	 * The vm_page spin lock is required for interactions with the pmap.
	 */
	vm_page_spin_lock(m);
	m->object = object;
	m->pindex = pindex;
	if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
		m->object = NULL;
		m->pindex = 0;
		vm_page_spin_unlock(m);
		return FALSE;
	}
	++object->resident_page_count;
	++mycpu->gd_vmtotal.t_rm;
	vm_page_spin_unlock(m);

	/*
	 * Since we are inserting a new and possibly dirty page,
	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
	 */
	if ((m->valid & m->dirty) ||
	    (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
		vm_object_set_writeable_dirty(object);

	/*
	 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
	 */
	swap_pager_page_inserted(m);
	return TRUE;
}

/*
 * Removes the given vm_page_t from the (object,index) table
 *
 * The underlying pmap entry (if any) is NOT removed here.
 * This routine may not block.
 *
 * The page must be BUSY and will remain BUSY on return.
 * No other requirements.
 *
 * NOTE: FreeBSD side effect was to unbusy the page on return.  We leave
 *	 it busy.
 */
void
vm_page_remove(vm_page_t m)
{
	vm_object_t object;

	if (m->object == NULL) {
		return;
	}

	if ((m->busy_count & PBUSY_LOCKED) == 0)
		panic("vm_page_remove: page not busy");

	object = m->object;

	vm_object_hold(object);

	/*
	 * Remove the page from the object and update the object.
	 *
	 * The vm_page spin lock is required for interactions with the pmap.
	 */
	vm_page_spin_lock(m);
	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
	--object->resident_page_count;
	--mycpu->gd_vmtotal.t_rm;
	m->object = NULL;
	atomic_add_int(&object->generation, 1);
	vm_page_spin_unlock(m);

	vm_object_drop(object);
}
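
/*
 * Illustrative sketch, not part of the original file: inserting a busied
 * page while tolerating a collision, the same pattern vm_page_alloc()
 * uses further below.  vm_page_insert() returns FALSE if another page
 * already lives at (object, pindex); the caller decides how to dispose
 * of its page in that case.  The helper name is hypothetical.
 */
static int
example_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
	if (vm_page_insert(m, object, pindex) == FALSE) {
		vm_page_free(m);	/* collision, discard our page */
		return (FALSE);
	}
	return (TRUE);
}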

/*
 * Locate and return the page at (object, pindex), or NULL if the
 * page could not be found.
 *
 * The caller must hold the vm_object token.
 */
vm_page_t
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	/*
	 * Search the hash table for this object/offset pair
	 */
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
	KKASSERT(m == NULL || (m->object == object && m->pindex == pindex));
	return(m);
}

vm_page_t
VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
					    vm_pindex_t pindex,
					    int also_m_busy, const char *msg
					    VM_PAGE_DEBUG_ARGS)
{
	u_int32_t busy_count;
	vm_page_t m;

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
	while (m) {
		KKASSERT(m->object == object && m->pindex == pindex);
		busy_count = m->busy_count;
		cpu_ccfence();
		if (busy_count & PBUSY_LOCKED) {
			tsleep_interlock(m, 0);
			if (atomic_cmpset_int(&m->busy_count, busy_count,
					      busy_count | PBUSY_WANTED)) {
				atomic_set_int(&m->flags, PG_REFERENCED);
				tsleep(m, PINTERLOCKED, msg, 0);
				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
							      pindex);
			}
		} else if (also_m_busy && busy_count) {
			tsleep_interlock(m, 0);
			if (atomic_cmpset_int(&m->busy_count, busy_count,
					      busy_count | PBUSY_WANTED)) {
				atomic_set_int(&m->flags, PG_REFERENCED);
				tsleep(m, PINTERLOCKED, msg, 0);
				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
							      pindex);
			}
		} else if (atomic_cmpset_int(&m->busy_count, busy_count,
					     busy_count | PBUSY_LOCKED)) {
#ifdef VM_PAGE_DEBUG
			m->busy_func = func;
			m->busy_line = lineno;
#endif
			break;
		}
	}
	return m;
}

/*
 * Attempt to lookup and busy a page.
 *
 * Returns NULL if the page could not be found
 *
 * Returns a vm_page and error == TRUE if the page exists but could not
 * be busied.
 *
 * Returns a vm_page and error == FALSE on success.
 */
vm_page_t
VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
					   vm_pindex_t pindex,
					   int also_m_busy, int *errorp
					   VM_PAGE_DEBUG_ARGS)
{
	u_int32_t busy_count;
	vm_page_t m;

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
	*errorp = FALSE;
	while (m) {
		KKASSERT(m->object == object && m->pindex == pindex);
		busy_count = m->busy_count;
		cpu_ccfence();
		if (busy_count & PBUSY_LOCKED) {
			*errorp = TRUE;
			break;
		}
		if (also_m_busy && busy_count) {
			*errorp = TRUE;
			break;
		}
		if (atomic_cmpset_int(&m->busy_count, busy_count,
				      busy_count | PBUSY_LOCKED)) {
#ifdef VM_PAGE_DEBUG
			m->busy_func = func;
			m->busy_line = lineno;
#endif
			break;
		}
	}
	return m;
}
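
/*
 * Illustrative sketch, not part of the original file: distinguishing
 * the three outcomes of vm_page_lookup_busy_try().  The helper name is
 * hypothetical.
 */
static vm_page_t
example_try_busy(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;
	int error;

	m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
	if (m == NULL)
		return (NULL);		/* no such page */
	if (error) {
		/* page exists but is busy; caller may retry or sleep */
		return (NULL);
	}
	return (m);			/* returned BUSY */
}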

/*
 * Returns a page that is only soft-busied for use by the caller in
 * a read-only fashion.  Returns NULL if the page could not be found,
 * the soft busy could not be obtained, or the page data is invalid.
 */
vm_page_t
vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex,
			 int pgoff, int pgbytes)
{
	vm_page_t m;

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
	if (m) {
		if ((m->valid != VM_PAGE_BITS_ALL &&
		     !vm_page_is_valid(m, pgoff, pgbytes)) ||
		    (m->flags & PG_FICTITIOUS)) {
			m = NULL;
		} else if (vm_page_sbusy_try(m)) {
			m = NULL;
		} else if ((m->valid != VM_PAGE_BITS_ALL &&
			    !vm_page_is_valid(m, pgoff, pgbytes)) ||
			   (m->flags & PG_FICTITIOUS)) {
			vm_page_sbusy_drop(m);
			m = NULL;
		}
	}
	return m;
}

/*
 * Caller must hold the related vm_object
 */
vm_page_t
vm_page_next(vm_page_t m)
{
	vm_page_t next;

	next = vm_page_rb_tree_RB_NEXT(m);
	if (next && next->pindex != m->pindex + 1)
		next = NULL;
	return (next);
}

/*
 * vm_page_rename()
 *
 * Move the given vm_page from its current object to the specified
 * target object/offset.  The page must be busy and will remain so
 * on return.
 *
 * new_object must be held.
 * This routine might block. XXX ?
 *
 * NOTE: Swap associated with the page must be invalidated by the move.  We
 *       have to do this for several reasons: (1) we aren't freeing the
 *       page, (2) we are dirtying the page, (3) the VM system is probably
 *       moving the page from object A to B, and will then later move
 *       the backing store from A to B and we can't have a conflict.
 *
 * NOTE: We *always* dirty the page.  It is necessary both for the
 *       fact that we moved it, and because we may be invalidating
 *	 swap.  If the page is on the cache, we have to deactivate it
 *	 or vm_page_dirty() will panic.  Dirty pages are not allowed
 *	 on the cache.
 */
void
vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{
	KKASSERT(m->busy_count & PBUSY_LOCKED);
	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
	if (m->object) {
		ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
		vm_page_remove(m);
	}
	if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
		panic("vm_page_rename: target exists (%p,%"PRIu64")",
		      new_object, new_pindex);
	}
	if (m->queue - m->pc == PQ_CACHE)
		vm_page_deactivate(m);
	vm_page_dirty(m);
}

/*
 * vm_page_unqueue() without any wakeup.  This routine is used when a page
 * is to remain BUSYied by the caller.
 *
 * This routine may not block.
 */
void
vm_page_unqueue_nowakeup(vm_page_t m)
{
	vm_page_and_queue_spin_lock(m);
	(void)_vm_page_rem_queue_spinlocked(m);
	vm_page_spin_unlock(m);
}

/*
 * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedemon
 * if necessary.
 *
 * This routine may not block.
 */
void
vm_page_unqueue(vm_page_t m)
{
	u_short queue;

	vm_page_and_queue_spin_lock(m);
	queue = _vm_page_rem_queue_spinlocked(m);
	if (queue == PQ_FREE || queue == PQ_CACHE) {
		vm_page_spin_unlock(m);
		pagedaemon_wakeup();
	} else {
		vm_page_spin_unlock(m);
	}
}
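
/*
 * Illustrative sketch, not part of the original file: walking a run of
 * contiguous pindices with vm_page_next(), which returns NULL at the
 * first hole.  The object token must be held across the walk.  The
 * helper name is hypothetical.
 */
static vm_pindex_t
example_count_run(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;
	vm_pindex_t n = 0;

	for (m = vm_page_lookup(object, pindex); m; m = vm_page_next(m))
		++n;
	return (n);
}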

/*
 * vm_page_list_find()
 *
 * Find a page on the specified queue with color optimization.
 *
 * The page coloring optimization attempts to locate a page that does
 * not overload other nearby pages in the object in the cpu's L1 or L2
 * caches.  We need this optimization because cpu caches tend to be
 * physical caches, while object spaces tend to be virtual.
 *
 * The page coloring optimization also, very importantly, tries to localize
 * memory to cpus and physical sockets.
 *
 * On MP systems each PQ_FREE and PQ_CACHE color queue has its own spinlock
 * and the algorithm is adjusted to localize allocations on a per-core basis.
 * This is done by 'twisting' the colors.
 *
 * The page is returned spinlocked and removed from its queue (it will
 * be on PQ_NONE), or NULL. The page is not BUSY'd.  The caller
 * is responsible for dealing with the busy-page case (usually by
 * deactivating the page and looping).
 *
 * NOTE: This routine is carefully inlined.  A non-inlined version
 *	 is available for outside callers but the only critical path is
 *	 from within this source file.
 *
 * NOTE: This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
 *	 represent stable storage, allowing us to order our locks vm_page
 *	 first, then queue.
 */
static __inline
vm_page_t
_vm_page_list_find(int basequeue, int index)
{
	vm_page_t m;

	for (;;) {
		m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl);
		if (m == NULL) {
			m = _vm_page_list_find2(basequeue, index);
			return(m);
		}
		vm_page_and_queue_spin_lock(m);
		if (m->queue == basequeue + index) {
			_vm_page_rem_queue_spinlocked(m);
			/* vm_page_t spin held, no queue spin */
			break;
		}
		vm_page_and_queue_spin_unlock(m);
	}
	return(m);
}

/*
 * If we could not find the page in the desired queue try to find it in
 * a nearby queue.
 */
static vm_page_t
_vm_page_list_find2(int basequeue, int index)
{
	struct vpgqueues *pq;
	vm_page_t m = NULL;
	int pqmask = PQ_SET_ASSOC_MASK >> 1;
	int pqi;
	int i;

	index &= PQ_L2_MASK;
	pq = &vm_page_queues[basequeue];

	/*
	 * Run local sets of 16, 32, 64, 128, and the whole queue if all
	 * else fails (PQ_L2_MASK which is 255).
	 */
	do {
		pqmask = (pqmask << 1) | 1;
		for (i = 0; i <= pqmask; ++i) {
			pqi = (index & ~pqmask) | ((index + i) & pqmask);
			m = TAILQ_FIRST(&pq[pqi].pl);
			if (m) {
				_vm_page_and_queue_spin_lock(m);
				if (m->queue == basequeue + pqi) {
					_vm_page_rem_queue_spinlocked(m);
					return(m);
				}
				_vm_page_and_queue_spin_unlock(m);
				--i;
				continue;
			}
		}
	} while (pqmask != PQ_L2_MASK);

	return(m);
}

/*
 * Returns a vm_page candidate for allocation.  The page is not busied so
 * it can move around.  The caller must busy the page (and typically
 * deactivate it if it cannot be busied!)
 *
 * Returns a spinlocked vm_page that has been removed from its queue.
 */
vm_page_t
vm_page_list_find(int basequeue, int index)
{
	return(_vm_page_list_find(basequeue, index));
}
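
/*
 * Illustrative sketch, not part of the original file: the index pattern
 * _vm_page_list_find2() scans.  For index 5 and pqmask 15 it probes
 * queues 5,6,...,15,0,...,4 within the 16-queue set, then doubles the
 * set size and repeats until the entire PQ_L2_MASK range is covered.
 * The helper name is hypothetical.
 */
static __inline int
example_probe_index(int index, int i, int pqmask)
{
	return ((index & ~pqmask) | ((index + i) & pqmask));
}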

/*
 * Find a page on the cache queue with color optimization, remove it
 * from the queue, and busy it.  The returned page will not be spinlocked.
 *
 * Candidates can fail due to being busied by someone else, in which
 * case they will be deactivated.
 *
 * This routine may not block.
 */
static vm_page_t
vm_page_select_cache(u_short pg_color)
{
	vm_page_t m;

	for (;;) {
		m = _vm_page_list_find(PQ_CACHE, pg_color & PQ_L2_MASK);
		if (m == NULL)
			break;
		/*
		 * (m) has been removed from its queue and spinlocked
		 */
		if (vm_page_busy_try(m, TRUE)) {
			_vm_page_deactivate_locked(m, 0);
			vm_page_spin_unlock(m);
		} else {
			/*
			 * We successfully busied the page
			 */
			if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) == 0 &&
			    m->hold_count == 0 &&
			    m->wire_count == 0 &&
			    (m->dirty & m->valid) == 0) {
				vm_page_spin_unlock(m);
				pagedaemon_wakeup();
				return(m);
			}

			/*
			 * The page cannot be recycled, deactivate it.
			 */
			_vm_page_deactivate_locked(m, 0);
			if (_vm_page_wakeup(m)) {
				vm_page_spin_unlock(m);
				wakeup(m);
			} else {
				vm_page_spin_unlock(m);
			}
		}
	}
	return (m);
}

/*
 * Find a free page.  We attempt to inline the nominal case and fall back
 * to _vm_page_select_free() otherwise.  A busied page is removed from
 * the queue and returned.
 *
 * This routine may not block.
 */
static __inline vm_page_t
vm_page_select_free(u_short pg_color)
{
	vm_page_t m;

	for (;;) {
		m = _vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK);
		if (m == NULL)
			break;
		if (vm_page_busy_try(m, TRUE)) {
			/*
			 * Various mechanisms such as a pmap_collect can
			 * result in a busy page on the free queue.  We
			 * have to move the page out of the way so we can
			 * retry the allocation.  If the other thread is not
			 * allocating the page then m->valid will remain 0 and
			 * the pageout daemon will free the page later on.
			 *
			 * Since we could not busy the page, however, we
			 * cannot make assumptions as to whether the page
			 * will be allocated by the other thread or not,
			 * so all we can do is deactivate it to move it out
			 * of the way.  In particular, if the other thread
			 * wires the page it may wind up on the inactive
			 * queue and the pageout daemon will have to deal
			 * with that case too.
			 */
			_vm_page_deactivate_locked(m, 0);
			vm_page_spin_unlock(m);
		} else {
			/*
			 * Theoretically if we are able to busy the page
			 * atomic with the queue removal (using the vm_page
			 * lock) nobody else should be able to mess with the
			 * page before us.
			 */
			KKASSERT((m->flags & (PG_UNMANAGED |
					      PG_NEED_COMMIT)) == 0);
			KASSERT(m->hold_count == 0,
				("m->hold_count is not zero "
				 "pg %p q=%d flags=%08x hold=%d wire=%d",
				 m, m->queue, m->flags,
				 m->hold_count, m->wire_count));
			KKASSERT(m->wire_count == 0);
			vm_page_spin_unlock(m);
			pagedaemon_wakeup();

			/* return busied and removed page */
			return(m);
		}
	}
	return(m);
}

/*
 * vm_page_alloc()
 *
 * Allocate and return a memory cell associated with this VM object/offset
 * pair.  If object is NULL an unassociated page will be allocated.
 *
 * The returned page will be busied and removed from its queues.  This
 * routine can block and may return NULL if a race occurs and the page
 * is found to already exist at the specified (object, pindex).
 *
 *	VM_ALLOC_NORMAL		allow use of cache pages, nominal free drain
 *	VM_ALLOC_QUICK		like normal but cannot use cache
 *	VM_ALLOC_SYSTEM		greater free drain
 *	VM_ALLOC_INTERRUPT	allow free list to be completely drained
 *	VM_ALLOC_ZERO		advisory request for pre-zero'd page only
 *	VM_ALLOC_FORCE_ZERO	advisory request for pre-zero'd page only
 *	VM_ALLOC_NULL_OK	ok to return NULL on insertion collision
 *				(see vm_page_grab())
 *	VM_ALLOC_USE_GD		ok to use per-gd cache
 *
 *	VM_ALLOC_CPU(n)		allocate using specified cpu localization
 *
 * The object must be held if not NULL
 * This routine may not block
 *
 * Additional special handling is required when called from an interrupt
 * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
 * in this case.
 */
vm_page_t
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
{
	globaldata_t gd;
	vm_object_t obj;
	vm_page_t m;
	u_short pg_color;
	int cpuid_local;

#if 0
	/*
	 * Special per-cpu free VM page cache.  The pages are pre-busied
	 * and pre-zeroed for us.
	 */
	if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
		crit_enter_gd(gd);
		if (gd->gd_vmpg_count) {
			m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
			crit_exit_gd(gd);
			goto done;
		}
		crit_exit_gd(gd);
	}
#endif
	m = NULL;

	/*
	 * CPU LOCALIZATION
	 *
	 * CPU localization algorithm.  Break the page queues up by physical
	 * id and core id (note that two cpu threads will have the same core
	 * id, and core_id != gd_cpuid).
	 *
	 * This is nowhere near perfect, for example the last pindex in a
	 * subgroup will overflow into the next cpu or package.  But this
	 * should get us good page reuse locality in heavy mixed loads.
	 *
	 * (may be executed before the APs are started, so other GDs might
	 *  not exist!)
	 */
	if (page_req & VM_ALLOC_CPU_SPEC)
		cpuid_local = VM_ALLOC_GETCPU(page_req);
	else
		cpuid_local = mycpu->gd_cpuid;

	pg_color = vm_get_pg_color(cpuid_local, object, pindex);

	KKASSERT(page_req &
		 (VM_ALLOC_NORMAL|VM_ALLOC_QUICK|
		  VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));

	/*
	 * Certain system threads (pageout daemon, buf_daemon's) are
	 * allowed to eat deeper into the free page list.
	 */
	if (curthread->td_flags & TDF_SYSTHREAD)
		page_req |= VM_ALLOC_SYSTEM;

	/*
	 * Impose various limitations.  Note that the v_free_reserved test
	 * must match the opposite of vm_page_count_target() to avoid
	 * livelocks, be careful.
	 */
loop:
	gd = mycpu;
	if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
	    ((page_req & VM_ALLOC_INTERRUPT) &&
	     gd->gd_vmstats.v_free_count > 0) ||
	    ((page_req & VM_ALLOC_SYSTEM) &&
	     gd->gd_vmstats.v_cache_count == 0 &&
	     gd->gd_vmstats.v_free_count >
	     gd->gd_vmstats.v_interrupt_free_min)
	) {
		/*
		 * The free queue has sufficient free pages to take one out.
		 */
		m = vm_page_select_free(pg_color);
	} else if (page_req & VM_ALLOC_NORMAL) {
		/*
		 * Allocatable from the cache (non-interrupt only).  On
		 * success, we must free the page and try again, thus
		 * ensuring that vmstats.v_*_free_min counters are replenished.
		 */
#ifdef INVARIANTS
		if (curthread->td_preempted) {
			kprintf("vm_page_alloc(): warning, attempt to allocate"
				" cache page from preempting interrupt\n");
			m = NULL;
		} else {
			m = vm_page_select_cache(pg_color);
		}
#else
		m = vm_page_select_cache(pg_color);
#endif
		/*
		 * On success move the page into the free queue and loop.
		 *
		 * Only do this if we can safely acquire the vm_object lock,
		 * because this is effectively a random page and the caller
		 * might be holding the lock shared, we don't want to
		 * deadlock.
		 */
		if (m != NULL) {
			KASSERT(m->dirty == 0,
				("Found dirty cache page %p", m));
			if ((obj = m->object) != NULL) {
				if (vm_object_hold_try(obj)) {
					vm_page_protect(m, VM_PROT_NONE);
					vm_page_free(m);
					/* m->object NULL here */
					vm_object_drop(obj);
				} else {
					vm_page_deactivate(m);
					vm_page_wakeup(m);
				}
			} else {
				vm_page_protect(m, VM_PROT_NONE);
				vm_page_free(m);
			}
			goto loop;
		}

		/*
		 * On failure return NULL
		 */
		atomic_add_int(&vm_pageout_deficit, 1);
		pagedaemon_wakeup();
		return (NULL);
	} else {
		/*
		 * No pages available, wakeup the pageout daemon and give up.
		 */
		atomic_add_int(&vm_pageout_deficit, 1);
		pagedaemon_wakeup();
		return (NULL);
	}

	/*
	 * v_free_count can race so loop if we don't find the expected
	 * page.
	 */
	if (m == NULL) {
		vmstats_rollup();
		goto loop;
	}

	/*
	 * Good page found.  The page has already been busied for us and
	 * removed from its queues.
	 */
	KASSERT(m->dirty == 0,
		("vm_page_alloc: free/cache page %p was dirty", m));
	KKASSERT(m->queue == PQ_NONE);

#if 0
done:
#endif
	/*
	 * Initialize the structure, inheriting some flags but clearing
	 * all the rest.  The page has already been busied for us.
	 */
	vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);

	KKASSERT(m->wire_count == 0);
	KKASSERT((m->busy_count & PBUSY_MASK) == 0);
	m->act_count = 0;
	m->valid = 0;

	/*
	 * Caller must be holding the object lock (asserted by
	 * vm_page_insert()).
	 *
	 * NOTE: Inserting a page here does not insert it into any pmaps
	 *	 (which could cause us to block allocating memory).
	 *
	 * NOTE: If no object an unassociated page is allocated, m->pindex
	 *	 can be used by the caller for any purpose.
	 */
	if (object) {
		if (vm_page_insert(m, object, pindex) == FALSE) {
			vm_page_free(m);
			if ((page_req & VM_ALLOC_NULL_OK) == 0)
				panic("PAGE RACE %p[%ld]/%p",
				      object, (long)pindex, m);
			m = NULL;
		}
	} else {
		m->pindex = pindex;
	}

	/*
	 * Don't wakeup too often - wakeup the pageout daemon when
	 * we would be nearly out of memory.
	 */
	pagedaemon_wakeup();

	/*
	 * A BUSY page is returned.
	 */
	return (m);
}
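
/*
 * Illustrative sketch, not part of the original file: a typical
 * vm_page_alloc() call inserting a new page into an object.  The object
 * token must be held exclusively; VM_ALLOC_NULL_OK turns an insertion
 * race into a NULL return instead of a panic.  VM_ALLOC_ZERO is only
 * advisory, so the sketch zeros the page itself.  The helper name is
 * hypothetical.
 */
static vm_page_t
example_grab_zeroed(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_page_alloc(object, pindex,
			  VM_ALLOC_NORMAL | VM_ALLOC_NULL_OK);
	if (m == NULL)
		return (NULL);		/* insertion race lost or no memory */
	pmap_zero_page(VM_PAGE_TO_PHYS(m));
	m->valid = VM_PAGE_BITS_ALL;
	vm_page_wakeup(m);		/* drop BUSY once initialized */
	return (m);
}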
/*
 * Returns the number of pages available in our DMA memory reserve
 * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf).
 */
vm_size_t
vm_contig_avail_pages(void)
{
	alist_blk_t blk;
	alist_blk_t count;
	alist_blk_t bfree;

	spin_lock(&vm_contig_spin);
	bfree = alist_free_info(&vm_contig_alist, &blk, &count);
	spin_unlock(&vm_contig_spin);

	return bfree;
}

/*
 * Attempt to allocate contiguous physical memory with the specified
 * requirements.
 */
vm_page_t
vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
		     unsigned long alignment, unsigned long boundary,
		     unsigned long size, vm_memattr_t memattr)
{
	alist_blk_t blk;
	vm_page_t m;
	vm_pindex_t i;
#if 0
	static vm_pindex_t contig_rover;
#endif

	alignment >>= PAGE_SHIFT;
	if (alignment == 0)
		alignment = 1;
	boundary >>= PAGE_SHIFT;
	if (boundary == 0)
		boundary = 1;
	size = (size + PAGE_MASK) >> PAGE_SHIFT;

#if 0
	/*
	 * Disabled temporarily until we find a solution for DRM (a flag
	 * to always use the free space reserve, for performance).
	 */
	if (high == BUS_SPACE_MAXADDR && alignment <= PAGE_SIZE &&
	    boundary <= PAGE_SIZE && size == 1 &&
	    memattr == VM_MEMATTR_DEFAULT) {
		/*
		 * Any page will work, use vm_page_alloc()
		 * (e.g. when used from kmem_alloc_attr())
		 */
		m = vm_page_alloc(NULL, (contig_rover++) & 0x7FFFFFFF,
				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
				  VM_ALLOC_INTERRUPT);
		m->valid = VM_PAGE_BITS_ALL;
		vm_page_wire(m);
		vm_page_wakeup(m);
	} else
#endif
	{
		/*
		 * Use the low-memory dma reserve
		 */
		spin_lock(&vm_contig_spin);
		blk = alist_alloc(&vm_contig_alist, 0, size);
		if (blk == ALIST_BLOCK_NONE) {
			spin_unlock(&vm_contig_spin);
			if (bootverbose) {
				kprintf("vm_page_alloc_contig: %ldk nospace\n",
					(size << PAGE_SHIFT) / 1024);
				print_backtrace(5);
			}
			return(NULL);
		}
		if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
			alist_free(&vm_contig_alist, blk, size);
			spin_unlock(&vm_contig_spin);
			if (bootverbose) {
				kprintf("vm_page_alloc_contig: %ldk high "
					"%016jx failed\n",
					(size << PAGE_SHIFT) / 1024,
					(intmax_t)high);
			}
			return(NULL);
		}
		spin_unlock(&vm_contig_spin);
		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
	}
	if (vm_contig_verbose) {
		kprintf("vm_page_alloc_contig: %016jx/%ldk "
			"(%016jx-%016jx al=%lu bo=%lu pgs=%lu attr=%d)\n",
			(intmax_t)m->phys_addr,
			(size << PAGE_SHIFT) / 1024,
			low, high, alignment, boundary, size, memattr);
	}
	if (memattr != VM_MEMATTR_DEFAULT) {
		for (i = 0; i < size; i++)
			pmap_page_set_memattr(&m[i], memattr);
	}
	return m;
}
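/*
 * Usage sketch (illustrative only): sizes are given in bytes and the
 * pages come back wired and not busy.  An allocation is paired with
 * vm_page_free_contig() below:
 *
 *	m = vm_page_alloc_contig(0, BUS_SPACE_MAXADDR, PAGE_SIZE, 0,
 *				 4 * PAGE_SIZE, VM_MEMATTR_DEFAULT);
 *	if (m) {
 *		...use the four physically contiguous pages...
 *		vm_page_free_contig(m, 4 * PAGE_SIZE);
 *	}
 *
 * Note that this path draws from the vm.dma_reserved alist, so it can
 * fail even when plenty of ordinary memory is free.
 */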
/*
 * Free contiguously allocated pages.  The pages will be wired but not
 * busy.  When freeing to the alist we leave them wired and not busy.
 */
void
vm_page_free_contig(vm_page_t m, unsigned long size)
{
	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
	vm_pindex_t start = pa >> PAGE_SHIFT;
	vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;

	if (vm_contig_verbose) {
		kprintf("vm_page_free_contig:  %016jx/%ldk\n",
			(intmax_t)pa, size / 1024);
	}
	if (pa < vm_low_phys_reserved) {
		KKASSERT(pa + size <= vm_low_phys_reserved);
		spin_lock(&vm_contig_spin);
		alist_free(&vm_contig_alist, start, pages);
		spin_unlock(&vm_contig_spin);
	} else {
		while (pages) {
			vm_page_busy_wait(m, FALSE, "cpgfr");
			vm_page_unwire(m, 0);
			vm_page_free(m);
			--pages;
			++m;
		}
	}
}

/*
 * Wait for sufficient free memory for nominal heavy memory use kernel
 * operations.
 *
 * WARNING!  Be sure never to call this in any vm_pageout code path, which
 *	     will trivially deadlock the system.
 */
void
vm_wait_nominal(void)
{
	while (vm_page_count_min(0))
		vm_wait(0);
}

/*
 * Test if vm_wait_nominal() would block.
 */
int
vm_test_nominal(void)
{
	if (vm_page_count_min(0))
		return(1);
	return(0);
}

/*
 * Block until free pages are available for allocation, called in various
 * places before memory allocations.
 *
 * The caller may loop if vm_page_count_min() == FALSE so we cannot be
 * more generous than that.
 */
void
vm_wait(int timo)
{
	/*
	 * never wait forever
	 */
	if (timo == 0)
		timo = hz;
	lwkt_gettoken(&vm_token);

	if (curthread == pagethread ||
	    curthread == emergpager) {
		/*
		 * The pageout daemon itself needs pages; this is bad.
		 */
		if (vm_page_count_min(0)) {
			vm_pageout_pages_needed = 1;
			tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
		}
	} else {
		/*
		 * Wakeup the pageout daemon if necessary and wait.
		 *
		 * Do not wait indefinitely for the target to be reached,
		 * as load might prevent it from being reached any time soon.
		 * But wait a little to try to slow down page allocations
		 * and to give more important threads (the pagedaemon)
		 * allocation priority.
		 */
		if (vm_page_count_target()) {
			if (vm_pages_needed == 0) {
				vm_pages_needed = 1;
				wakeup(&vm_pages_needed);
			}
			++vm_pages_waiting;	/* SMP race ok */
			tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
		}
	}
	lwkt_reltoken(&vm_token);
}
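/*
 * Typical pattern (sketch): retry an allocation after giving the
 * pagedaemon a chance to make progress.  vm_wait(0) converts the
 * timeout to hz so the sleep is never unbounded:
 *
 *	while ((m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL)) == NULL)
 *		vm_wait(0);
 */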
/*
 * Block until free pages are available for allocation.
 *
 * Called only from vm_fault so that processes page faulting can be
 * easily tracked.
 */
void
vm_wait_pfault(void)
{
	/*
	 * Wakeup the pageout daemon if necessary and wait.
	 *
	 * Do not wait indefinitely for the target to be reached,
	 * as load might prevent it from being reached any time soon.
	 * But wait a little to try to slow down page allocations
	 * and to give more important threads (the pagedaemon)
	 * allocation priority.
	 */
	if (vm_page_count_min(0)) {
		lwkt_gettoken(&vm_token);
		while (vm_page_count_severe()) {
			if (vm_page_count_target()) {
				thread_t td;

				if (vm_pages_needed == 0) {
					vm_pages_needed = 1;
					wakeup(&vm_pages_needed);
				}
				++vm_pages_waiting;	/* SMP race ok */
				tsleep(&vmstats.v_free_count, 0,
				       "pfault", hz);

				/*
				 * Do not stay stuck in the loop if the
				 * system is trying to kill the process.
				 */
				td = curthread;
				if (td->td_proc &&
				    (td->td_proc->p_flags & P_LOWMEMKILL))
					break;
			}
		}
		lwkt_reltoken(&vm_token);
	}
}

/*
 * Put the specified page on the active list (if appropriate).  Ensure
 * that act_count is at least ACT_INIT but do not otherwise mess with it.
 *
 * The caller should be holding the page busied?  XXX
 * This routine may not block.
 */
void
vm_page_activate(vm_page_t m)
{
	u_short oqueue;

	vm_page_spin_lock(m);
	if (m->queue - m->pc != PQ_ACTIVE) {
		_vm_page_queue_spin_lock(m);
		oqueue = _vm_page_rem_queue_spinlocked(m);
		/* page is left spinlocked, queue is unlocked */

		if (oqueue == PQ_CACHE)
			mycpu->gd_cnt.v_reactivated++;
		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
			if (m->act_count < ACT_INIT)
				m->act_count = ACT_INIT;
			_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
		}
		_vm_page_and_queue_spin_unlock(m);
		if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
			pagedaemon_wakeup();
	} else {
		if (m->act_count < ACT_INIT)
			m->act_count = ACT_INIT;
		vm_page_spin_unlock(m);
	}
}
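/*
 * Example (sketch): a caller that has just referenced a busied page
 * typically activates it and then releases the busy:
 *
 *	vm_page_activate(m);
 *	vm_page_wakeup(m);
 */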
/*
 * Helper routine for vm_page_free_toq() and vm_page_cache().  This
 * routine is called when a page has been added to the cache or free
 * queues.
 *
 * This routine may not block.
 */
static __inline void
vm_page_free_wakeup(void)
{
	globaldata_t gd = mycpu;

	/*
	 * If the pageout daemon itself needs pages, then tell it that
	 * there are some free.
	 */
	if (vm_pageout_pages_needed &&
	    gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
	    gd->gd_vmstats.v_pageout_free_min
	) {
		vm_pageout_pages_needed = 0;
		wakeup(&vm_pageout_pages_needed);
	}

	/*
	 * Wakeup processes that are waiting on memory.
	 *
	 * Generally speaking we want to wakeup stuck processes as soon as
	 * possible.  !vm_page_count_min(0) is the absolute minimum point
	 * where we can do this.  Wait a bit longer to reduce degenerate
	 * re-blocking (vm_page_free_hysteresis).  The target check is just
	 * to make sure the min-check w/hysteresis does not exceed the
	 * normal target.
	 */
	if (vm_pages_waiting) {
		if (!vm_page_count_min(vm_page_free_hysteresis) ||
		    !vm_page_count_target()) {
			vm_pages_waiting = 0;
			wakeup(&vmstats.v_free_count);
			++mycpu->gd_cnt.v_ppwakeups;
		}
#if 0
		if (!vm_page_count_target()) {
			/*
			 * Plenty of pages are free, wakeup everyone.
			 */
			vm_pages_waiting = 0;
			wakeup(&vmstats.v_free_count);
			++mycpu->gd_cnt.v_ppwakeups;
		} else if (!vm_page_count_min(0)) {
			/*
			 * Some pages are free, wakeup someone.
			 */
			int wcount = vm_pages_waiting;
			if (wcount > 0)
				--wcount;
			vm_pages_waiting = wcount;
			wakeup_one(&vmstats.v_free_count);
			++mycpu->gd_cnt.v_ppwakeups;
		}
#endif
	}
}

/*
 * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
 * it from its VM object.
 *
 * The vm_page must be BUSY on entry.  BUSY will be released on
 * return (the page will have been freed).
 */
void
vm_page_free_toq(vm_page_t m)
{
	mycpu->gd_cnt.v_tfree++;
	KKASSERT((m->flags & PG_MAPPED) == 0);
	KKASSERT(m->busy_count & PBUSY_LOCKED);

	if ((m->busy_count & PBUSY_MASK) || ((m->queue - m->pc) == PQ_FREE)) {
		kprintf("vm_page_free: pindex(%lu), busy %08x, "
			"hold(%d)\n",
			(u_long)m->pindex, m->busy_count, m->hold_count);
		if ((m->queue - m->pc) == PQ_FREE)
			panic("vm_page_free: freeing free page");
		else
			panic("vm_page_free: freeing busy page");
	}

	/*
	 * Remove from object, spinlock the page and its queues and
	 * remove from any queue.  No queue spinlock will be held
	 * after this section (because the page was removed from any
	 * queue).
	 */
	vm_page_remove(m);
	vm_page_and_queue_spin_lock(m);
	_vm_page_rem_queue_spinlocked(m);

	/*
	 * No further management of fictitious pages occurs beyond object
	 * and queue removal.
	 */
	if ((m->flags & PG_FICTITIOUS) != 0) {
		vm_page_spin_unlock(m);
		vm_page_wakeup(m);
		return;
	}

	m->valid = 0;
	vm_page_undirty(m);

	if (m->wire_count != 0) {
		if (m->wire_count > 1) {
			panic("vm_page_free: invalid wire count (%d), "
			      "pindex: 0x%lx",
			      m->wire_count, (long)m->pindex);
		}
		panic("vm_page_free: freeing wired page");
	}

	/*
	 * Clear the UNMANAGED flag when freeing an unmanaged page.
	 * Clear the NEED_COMMIT flag.
	 */
	if (m->flags & PG_UNMANAGED)
		vm_page_flag_clear(m, PG_UNMANAGED);
	if (m->flags & PG_NEED_COMMIT)
		vm_page_flag_clear(m, PG_NEED_COMMIT);

	if (m->hold_count != 0) {
		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
	} else {
		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
	}

	/*
	 * This sequence allows us to clear BUSY while still holding
	 * its spin lock, which reduces contention vs allocators.  We
	 * must not leave the queue locked or _vm_page_wakeup() may
	 * deadlock.
	 */
	_vm_page_queue_spin_unlock(m);
	if (_vm_page_wakeup(m)) {
		vm_page_spin_unlock(m);
		wakeup(m);
	} else {
		vm_page_spin_unlock(m);
	}
	vm_page_free_wakeup();
}
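/*
 * Example (sketch): vm_page_free() is the usual entry point and expects
 * a hard-busied, unmapped page; BUSY is consumed by the free:
 *
 *	vm_page_busy_wait(m, FALSE, "pgfre");
 *	vm_page_protect(m, VM_PROT_NONE);	(guarantees PG_MAPPED clear)
 *	vm_page_free(m);
 */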
/*
 * vm_page_unmanage()
 *
 * Prevent PV management from being done on the page.  The page is
 * removed from the paging queues as if it were wired, and as a
 * consequence of no longer being managed the pageout daemon will not
 * touch it (since there is no way to locate the pte mappings for the
 * page).  madvise() calls that mess with the pmap will also no longer
 * operate on the page.
 *
 * Beyond that the page is still reasonably 'normal'.  Freeing the page
 * will clear the flag.
 *
 * This routine is used by OBJT_PHYS objects - objects using unswappable
 * physical memory as backing store rather than swap-backed memory and
 * will eventually be extended to support 4MB unmanaged physical
 * mappings.
 *
 * Caller must be holding the page busy.
 */
void
vm_page_unmanage(vm_page_t m)
{
	KKASSERT(m->busy_count & PBUSY_LOCKED);
	if ((m->flags & PG_UNMANAGED) == 0) {
		if (m->wire_count == 0)
			vm_page_unqueue(m);
	}
	vm_page_flag_set(m, PG_UNMANAGED);
}

/*
 * Mark this page as wired down by yet another map, removing it from
 * paging queues as necessary.
 *
 * Caller must be holding the page busy.
 */
void
vm_page_wire(vm_page_t m)
{
	/*
	 * Only bump the wire statistics if the page is not already wired,
	 * and only unqueue the page if it is on some queue (if it is
	 * unmanaged it is already off the queues).  Don't do anything with
	 * fictitious pages because they are always wired.
	 */
	KKASSERT(m->busy_count & PBUSY_LOCKED);
	if ((m->flags & PG_FICTITIOUS) == 0) {
		if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
			if ((m->flags & PG_UNMANAGED) == 0)
				vm_page_unqueue(m);
			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count,
					1);
		}
		KASSERT(m->wire_count != 0,
			("vm_page_wire: wire_count overflow m=%p", m));
	}
}
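/*
 * Example (sketch): wirings must be balanced and both operations are
 * performed with the page hard-busied:
 *
 *	vm_page_busy_wait(m, FALSE, "pgwire");
 *	vm_page_wire(m);
 *	vm_page_wakeup(m);
 *	...
 *	vm_page_busy_wait(m, FALSE, "pgunwr");
 *	vm_page_unwire(m, 0);	(0 = inactive queue, 1 = active queue)
 *	vm_page_wakeup(m);
 */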
/*
 * Release one wiring of this page, potentially enabling it to be paged
 * again.
 *
 * Many pages placed on the inactive queue should actually go
 * into the cache, but it is difficult to figure out which.  What
 * we do instead, if the inactive target is well met, is to put
 * clean pages at the head of the inactive queue instead of the tail.
 * This will cause them to be moved to the cache more quickly and
 * if not actively re-referenced, freed more quickly.  If we just
 * stick these pages at the end of the inactive queue, heavy filesystem
 * meta-data accesses can cause an unnecessary paging load on memory bound
 * processes.  This optimization causes one-time-use metadata to be
 * reused more quickly.
 *
 * Pages marked PG_NEED_COMMIT are always activated and never placed on
 * the inactive queue.  This helps the pageout daemon determine memory
 * pressure and act on out-of-memory situations more quickly.
 *
 * BUT, if we are in a low-memory situation we have no choice but to
 * put clean pages on the cache queue.
 *
 * A number of routines use vm_page_unwire() to guarantee that the page
 * will go into either the inactive or active queues, and will NEVER
 * be placed in the cache - for example, just after dirtying a page.
 * Dirty pages in the cache are not allowed.
 *
 * This routine may not block.
 */
void
vm_page_unwire(vm_page_t m, int activate)
{
	KKASSERT(m->busy_count & PBUSY_LOCKED);
	if (m->flags & PG_FICTITIOUS) {
		/* do nothing */
	} else if (m->wire_count <= 0) {
		panic("vm_page_unwire: invalid wire count: %d",
		      m->wire_count);
	} else {
		if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count,
					-1);
			if (m->flags & PG_UNMANAGED) {
				;
			} else if (activate || (m->flags & PG_NEED_COMMIT)) {
				vm_page_spin_lock(m);
				_vm_page_add_queue_spinlocked(m,
						PQ_ACTIVE + m->pc, 0);
				_vm_page_and_queue_spin_unlock(m);
			} else {
				vm_page_spin_lock(m);
				vm_page_flag_clear(m, PG_WINATCFLS);
				_vm_page_add_queue_spinlocked(m,
						PQ_INACTIVE + m->pc, 0);
				++vm_swapcache_inactive_heuristic;
				_vm_page_and_queue_spin_unlock(m);
			}
		}
	}
}

/*
 * Move the specified page to the inactive queue.  If the page has
 * any associated swap, the swap is deallocated.
 *
 * Normally athead is 0 resulting in LRU operation.  athead is set
 * to 1 if we want this page to be 'as if it were placed in the cache',
 * except without unmapping it from the process address space.
 *
 * vm_page's spinlock must be held on entry and will remain held on
 * return.  This routine may not block.
 */
static void
_vm_page_deactivate_locked(vm_page_t m, int athead)
{
	u_short oqueue;

	/*
	 * Ignore if already inactive.
	 */
	if (m->queue - m->pc == PQ_INACTIVE)
		return;
	_vm_page_queue_spin_lock(m);
	oqueue = _vm_page_rem_queue_spinlocked(m);

	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
		if (oqueue == PQ_CACHE)
			mycpu->gd_cnt.v_reactivated++;
		vm_page_flag_clear(m, PG_WINATCFLS);
		_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
		if (athead == 0)
			++vm_swapcache_inactive_heuristic;
	}
	/* NOTE: PQ_NONE if condition not taken */
	_vm_page_queue_spin_unlock(m);
	/* leaves vm_page spinlocked */
}

/*
 * Attempt to deactivate a page.
 *
 * No requirements.
 */
void
vm_page_deactivate(vm_page_t m)
{
	vm_page_spin_lock(m);
	_vm_page_deactivate_locked(m, 0);
	vm_page_spin_unlock(m);
}

void
vm_page_deactivate_locked(vm_page_t m)
{
	_vm_page_deactivate_locked(m, 0);
}

/*
 * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy
 * it.
 *
 * This function returns non-zero if it successfully moved the page to
 * PQ_CACHE.
 *
 * This function unconditionally unbusies the page on return.
 */
int
vm_page_try_to_cache(vm_page_t m)
{
	vm_page_spin_lock(m);
	if (m->dirty || m->hold_count || m->wire_count ||
	    (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT))) {
		if (_vm_page_wakeup(m)) {
			vm_page_spin_unlock(m);
			wakeup(m);
		} else {
			vm_page_spin_unlock(m);
		}
		return(0);
	}
	vm_page_spin_unlock(m);

	/*
	 * Page busied by us and no longer spinlocked.  Dirty pages cannot
	 * be moved to the cache.
	 */
	vm_page_test_dirty(m);
	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
		vm_page_wakeup(m);
		return(0);
	}
	vm_page_cache(m);
	return(1);
}
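/*
 * Example (sketch): an opportunistic scan can try to cache any page it
 * manages to busy without blocking; the helper unbusies the page no
 * matter what happens:
 *
 *	if (vm_page_busy_try(m, TRUE) == 0)
 *		(void)vm_page_try_to_cache(m);
 */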
/*
 * Attempt to free the page.  If we cannot free it, we do nothing.
 * 1 is returned on success, 0 on failure.
 *
 * No requirements.
 */
int
vm_page_try_to_free(vm_page_t m)
{
	vm_page_spin_lock(m);
	if (vm_page_busy_try(m, TRUE)) {
		vm_page_spin_unlock(m);
		return(0);
	}

	/*
	 * The page can be in any state, including already being on the
	 * free queue.  Check to see if it really can be freed.
	 */
	if (m->dirty ||				/* can't free if it is dirty */
	    m->hold_count ||			/* or held (XXX may be wrong) */
	    m->wire_count ||			/* or wired */
	    (m->flags & (PG_UNMANAGED |		/* or unmanaged */
			 PG_NEED_COMMIT)) ||	/* or needs a commit */
	    m->queue - m->pc == PQ_FREE ||	/* already on PQ_FREE */
	    m->queue - m->pc == PQ_HOLD) {	/* already on PQ_HOLD */
		if (_vm_page_wakeup(m)) {
			vm_page_spin_unlock(m);
			wakeup(m);
		} else {
			vm_page_spin_unlock(m);
		}
		return(0);
	}
	vm_page_spin_unlock(m);

	/*
	 * We can probably free the page.
	 *
	 * Page busied by us and no longer spinlocked.  Dirty pages will
	 * not be freed by this function.  We have to re-test the
	 * dirty bit after cleaning out the pmaps.
	 */
	vm_page_test_dirty(m);
	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
		vm_page_wakeup(m);
		return(0);
	}
	vm_page_protect(m, VM_PROT_NONE);
	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
		vm_page_wakeup(m);
		return(0);
	}
	vm_page_free(m);
	return(1);
}

/*
 * vm_page_cache
 *
 * Put the specified page onto the page cache queue (if appropriate).
 *
 * The page must be busy, and this routine will release the busy and
 * possibly even free the page.
 */
void
vm_page_cache(vm_page_t m)
{
	/*
	 * Not suitable for the cache
	 */
	if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
	    (m->busy_count & PBUSY_MASK) ||
	    m->wire_count || m->hold_count) {
		vm_page_wakeup(m);
		return;
	}

	/*
	 * Already in the cache (and thus not mapped)
	 */
	if ((m->queue - m->pc) == PQ_CACHE) {
		KKASSERT((m->flags & PG_MAPPED) == 0);
		vm_page_wakeup(m);
		return;
	}

	/*
	 * Caller is required to test m->dirty, but note that the act of
	 * removing the page from its maps can cause it to become dirty
	 * on an SMP system due to another cpu running in usermode.
	 */
	if (m->dirty) {
		panic("vm_page_cache: caching a dirty page, pindex: %ld",
		      (long)m->pindex);
	}

	/*
	 * Remove all pmaps and indicate that the page is not
	 * writeable or mapped.  Our vm_page_protect() call may
	 * have blocked (especially w/ VM_PROT_NONE), so recheck
	 * everything.
	 */
	vm_page_protect(m, VM_PROT_NONE);
	if ((m->flags & (PG_UNMANAGED | PG_MAPPED)) ||
	    (m->busy_count & PBUSY_MASK) ||
	    m->wire_count || m->hold_count) {
		vm_page_wakeup(m);
	} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
		vm_page_deactivate(m);
		vm_page_wakeup(m);
	} else {
		_vm_page_and_queue_spin_lock(m);
		_vm_page_rem_queue_spinlocked(m);
		_vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
		_vm_page_queue_spin_unlock(m);
		if (_vm_page_wakeup(m)) {
			vm_page_spin_unlock(m);
			wakeup(m);
		} else {
			vm_page_spin_unlock(m);
		}
		vm_page_free_wakeup();
	}
}
/*
 * vm_page_dontneed()
 *
 * Cache, deactivate, or do nothing as appropriate.  This routine
 * is typically used by madvise() MADV_DONTNEED.
 *
 * Generally speaking we want to move the page into the cache so
 * it gets reused quickly.  However, this can result in a silly syndrome
 * due to the page recycling too quickly.  Small objects will not be
 * fully cached.  On the other hand, if we move the page to the inactive
 * queue we wind up with a problem whereby very large objects
 * unnecessarily blow away our inactive and cache queues.
 *
 * The solution is to move the pages based on a fixed weighting.  We
 * either leave them alone, deactivate them, or move them to the cache,
 * where moving them to the cache has the highest weighting.
 * By forcing some pages into other queues we eventually force the
 * system to balance the queues, potentially recovering other unrelated
 * space from active.  The idea is to not force this to happen too
 * often.
 *
 * The page must be busied.
 */
void
vm_page_dontneed(vm_page_t m)
{
	static int dnweight;
	int dnw;
	int head;

	dnw = ++dnweight;

	/*
	 * Occasionally leave the page alone.  Since dnweight increments
	 * once per call, (dnw & 0x01F0) == 0 is true for 16 out of every
	 * 512 calls, i.e. 1 time out of 32.
	 */
	if ((dnw & 0x01F0) == 0 ||
	    m->queue - m->pc == PQ_INACTIVE ||
	    m->queue - m->pc == PQ_CACHE
	) {
		if (m->act_count >= ACT_INIT)
			--m->act_count;
		return;
	}

	/*
	 * If vm_page_dontneed() is inactivating a page, it must clear
	 * the referenced flag; otherwise the pagedaemon will see references
	 * on the page in the inactive queue and reactivate it.  Until the
	 * page can move to the cache queue, madvise's job is not done.
	 */
	vm_page_flag_clear(m, PG_REFERENCED);
	pmap_clear_reference(m);

	if (m->dirty == 0)
		vm_page_test_dirty(m);

	if (m->dirty || (dnw & 0x0070) == 0) {
		/*
		 * Deactivate the page 3 times out of 32.
		 */
		head = 0;
	} else {
		/*
		 * Cache the page 28 times out of every 32.  Note that
		 * the page is deactivated instead of cached, but placed
		 * at the head of the queue instead of the tail.
		 */
		head = 1;
	}
	vm_page_spin_lock(m);
	_vm_page_deactivate_locked(m, head);
	vm_page_spin_unlock(m);
}

/*
 * These routines manipulate the 'soft busy' count for a page.  A soft busy
 * is almost like a hard BUSY except that it allows certain compatible
 * operations to occur on the page while it is busy.  For example, a page
 * undergoing a write can still be mapped read-only.
 *
 * We also use soft-busy to quickly pmap_enter shared read-only pages
 * without having to hold the page locked.
 *
 * The soft-busy count can be > 1 in situations where multiple threads
 * are pmap_enter()ing the same page simultaneously, or when two buffer
 * cache buffers overlap the same page.
 *
 * The caller must hold the page BUSY when making these two calls.
 */
void
vm_page_io_start(vm_page_t m)
{
	uint32_t ocount;

	ocount = atomic_fetchadd_int(&m->busy_count, 1);
	KKASSERT(ocount & PBUSY_LOCKED);
}

void
vm_page_io_finish(vm_page_t m)
{
	uint32_t ocount;

	ocount = atomic_fetchadd_int(&m->busy_count, -1);
	KKASSERT(ocount & PBUSY_MASK);
#if 0
	if (((ocount - 1) & (PBUSY_LOCKED | PBUSY_MASK)) == 0)
		wakeup(m);
#endif
}
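/*
 * Example (sketch): bracketing an I/O with a soft-busy lets the page
 * stay mapped read-only while the hard-busy is dropped:
 *
 *	(page is hard-busied here)
 *	vm_page_io_start(m);
 *	vm_page_wakeup(m);	(drop hard-busy, soft-busy remains)
 *	...issue the write to backing store...
 *	vm_page_io_finish(m);
 */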
/*
 * Attempt to soft-busy a page.  The page must not be PBUSY_LOCKED.
 *
 * We can't use fetchadd here because we might race a hard-busy and the
 * page freeing code asserts on a non-zero soft-busy count (even if only
 * temporary).
 *
 * Returns 0 on success, non-zero on failure.
 */
int
vm_page_sbusy_try(vm_page_t m)
{
	uint32_t ocount;

	for (;;) {
		ocount = m->busy_count;
		cpu_ccfence();
		if (ocount & PBUSY_LOCKED)
			return 1;
		if (atomic_cmpset_int(&m->busy_count, ocount, ocount + 1))
			break;
	}
	return 0;
#if 0
	if (m->busy_count & PBUSY_LOCKED)
		return 1;
	ocount = atomic_fetchadd_int(&m->busy_count, 1);
	if (ocount & PBUSY_LOCKED) {
		vm_page_sbusy_drop(m);
		return 1;
	}
	return 0;
#endif
}

/*
 * Indicate that a clean VM page requires a filesystem commit and cannot
 * be reused.  Used by tmpfs.
 */
void
vm_page_need_commit(vm_page_t m)
{
	vm_page_flag_set(m, PG_NEED_COMMIT);
	vm_object_set_writeable_dirty(m->object);
}

void
vm_page_clear_commit(vm_page_t m)
{
	vm_page_flag_clear(m, PG_NEED_COMMIT);
}

/*
 * Grab a page, blocking if it is busy and allocating a page if necessary.
 * A busy page is returned or NULL.  The page may or may not be valid and
 * might not be on a queue (the caller is responsible for the disposition
 * of the page).
 *
 * If VM_ALLOC_ZERO is specified and the grab must allocate a new page,
 * the page will be zero'd and marked valid.
 *
 * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
 * valid even if it already exists.
 *
 * If VM_ALLOC_RETRY is specified this routine will never return NULL.
 * Also note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is
 * specified.  VM_ALLOC_NULL_OK is implied when VM_ALLOC_RETRY is
 * specified.
 *
 * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
 * always returned if we had to block.
 *
 * This routine may not be called from an interrupt.
 *
 * No other requirements.
 */
vm_page_t
vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
{
	vm_page_t m;
	int error;
	int shared = 1;

	KKASSERT(allocflags &
		 (VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
	vm_object_hold_shared(object);
	for (;;) {
		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
		if (error) {
			vm_page_sleep_busy(m, TRUE, "pgrbwt");
			if ((allocflags & VM_ALLOC_RETRY) == 0) {
				m = NULL;
				break;
			}
			/* retry */
		} else if (m == NULL) {
			if (shared) {
				vm_object_upgrade(object);
				shared = 0;
			}
			if (allocflags & VM_ALLOC_RETRY)
				allocflags |= VM_ALLOC_NULL_OK;
			m = vm_page_alloc(object, pindex,
					  allocflags & ~VM_ALLOC_RETRY);
			if (m)
				break;
			vm_wait(0);
			if ((allocflags & VM_ALLOC_RETRY) == 0)
				goto failed;
		} else {
			/* m found */
			break;
		}
	}

	/*
	 * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
	 *
	 * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
	 * valid even if already valid.
	 *
	 * NOTE!  We have removed all of the PG_ZERO optimizations and also
	 *	  removed the idle zeroing code.  These optimizations
	 *	  actually slow things down on modern cpus because the
	 *	  zeroed area is likely uncached, placing a memory-access
	 *	  burden on the accessors taking the fault.
	 *
	 *	  By always zeroing the page in-line with the fault, no
	 *	  dynamic ram reads are needed and the caches are hot, ready
	 *	  for userland to access the memory.
	 *
	 * NOTE!  m can be NULL here if we blocked and VM_ALLOC_RETRY was
	 *	  not specified, so guard the dereference.
	 */
	if (m) {
		if (m->valid == 0) {
			if (allocflags &
			    (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
				pmap_zero_page(VM_PAGE_TO_PHYS(m));
				m->valid = VM_PAGE_BITS_ALL;
			}
		} else if (allocflags & VM_ALLOC_FORCE_ZERO) {
			pmap_zero_page(VM_PAGE_TO_PHYS(m));
			m->valid = VM_PAGE_BITS_ALL;
		}
	}
failed:
	vm_object_drop(object);
	return(m);
}
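/*
 * Usage sketch (illustrative only): with VM_ALLOC_RETRY the call cannot
 * return NULL, and a freshly allocated page comes back zeroed and valid
 * when VM_ALLOC_ZERO is given:
 *
 *	m = vm_page_grab(obj, pindex,
 *			 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
 *	...use the hard-busied page...
 *	vm_page_wakeup(m);
 */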
/*
 * Mapping function for valid bits or for dirty bits in
 * a page.  May not block.
 *
 * Inputs are required to range within a page.
 *
 * No other requirements.
 */
int
vm_page_bits(int base, int size)
{
	int first_bit;
	int last_bit;

	KASSERT(
		base + size <= PAGE_SIZE,
		("vm_page_bits: illegal base/size %d/%d", base, size)
	);

	if (size == 0)		/* handle degenerate case */
		return(0);

	first_bit = base >> DEV_BSHIFT;
	last_bit = (base + size - 1) >> DEV_BSHIFT;

	return ((2 << last_bit) - (1 << first_bit));
}
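/*
 * Worked example (assuming DEV_BSIZE == 512, i.e. DEV_BSHIFT == 9):
 * base = 512 and size = 1024 span device blocks 1 and 2, so
 * first_bit = 512 >> 9 = 1 and last_bit = 1535 >> 9 = 2, giving
 * (2 << 2) - (1 << 1) = 8 - 2 = 6 (binary 110), i.e. bits 1 and 2 set.
 */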
/*
 * Sets portions of a page valid and clean.  The arguments are expected
 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 * of any partial chunks touched by the range.  The invalid portion of
 * such chunks will be zero'd.
 *
 * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
 *	 align base to DEV_BSIZE so as not to mark clean a partially
 *	 truncated device block.  Otherwise the dirty page status might be
 *	 lost.
 *
 * This routine may not block.
 *
 * (base + size) must be less than or equal to PAGE_SIZE.
 */
static void
_vm_page_zero_valid(vm_page_t m, int base, int size)
{
	int frag;
	int endoff;

	if (size == 0)	/* handle degenerate case */
		return;

	/*
	 * If the base is not DEV_BSIZE aligned and the valid
	 * bit is clear, we have to zero out a portion of the
	 * first block.
	 */
	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
	) {
		pmap_zero_page_area(
			VM_PAGE_TO_PHYS(m),
			frag,
			base - frag
		);
	}

	/*
	 * If the ending offset is not DEV_BSIZE aligned and the
	 * valid bit is clear, we have to zero out a portion of
	 * the last block.
	 */
	endoff = base + size;
	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
	) {
		pmap_zero_page_area(
			VM_PAGE_TO_PHYS(m),
			endoff,
			DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
		);
	}
}

/*
 * Set valid, clear dirty bits.  If validating the entire
 * page we can safely clear the pmap modify bit.  We also
 * use this opportunity to clear the PG_NOSYNC flag.  If a process
 * takes a write fault on a MAP_NOSYNC memory area the flag will
 * be set again.
 *
 * We set valid bits inclusive of any overlap, but we can only
 * clear dirty bits for DEV_BSIZE chunks that are fully within
 * the range.
 *
 * Page must be busied?
 * No other requirements.
 */
void
vm_page_set_valid(vm_page_t m, int base, int size)
{
	_vm_page_zero_valid(m, base, size);
	m->valid |= vm_page_bits(base, size);
}

/*
 * Set valid bits and clear dirty bits.
 *
 * Page must be busied by caller.
 *
 * NOTE: This function does not clear the pmap modified bit.
 *	 Also note that e.g. NFS may use a byte-granular base
 *	 and size.
 *
 * No other requirements.
 */
void
vm_page_set_validclean(vm_page_t m, int base, int size)
{
	int pagebits;

	_vm_page_zero_valid(m, base, size);
	pagebits = vm_page_bits(base, size);
	m->valid |= pagebits;
	m->dirty &= ~pagebits;
	if (base == 0 && size == PAGE_SIZE) {
		/*pmap_clear_modify(m);*/
		vm_page_flag_clear(m, PG_NOSYNC);
	}
}
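/*
 * Example (sketch): a filesystem that has just read 700 bytes into the
 * start of a page marks the covered range valid and clean.  With
 * DEV_BSIZE == 512 this sets valid bits 0 and 1, and
 * _vm_page_zero_valid() zeroes the invalid tail of block 1
 * (bytes 700-1023):
 *
 *	vm_page_set_validclean(m, 0, 700);
 */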
/*
 * Set valid & dirty.  Used by buwrite().
 *
 * Page must be busied by caller.
 */
void
vm_page_set_validdirty(vm_page_t m, int base, int size)
{
	int pagebits;

	pagebits = vm_page_bits(base, size);
	m->valid |= pagebits;
	m->dirty |= pagebits;
	if (m->object)
		vm_object_set_writeable_dirty(m->object);
}

/*
 * Clear dirty bits.
 *
 * NOTE: This function does not clear the pmap modified bit.
 *	 Also note that e.g. NFS may use a byte-granular base
 *	 and size.
 *
 * Page must be busied?
 * No other requirements.
 */
void
vm_page_clear_dirty(vm_page_t m, int base, int size)
{
	m->dirty &= ~vm_page_bits(base, size);
	if (base == 0 && size == PAGE_SIZE) {
		/*pmap_clear_modify(m);*/
		vm_page_flag_clear(m, PG_NOSYNC);
	}
}

/*
 * Make the page all-dirty.
 *
 * Also make sure the related object and vnode reflect the fact that the
 * object may now contain a dirty page.
 *
 * Page must be busied?
 * No other requirements.
 */
void
vm_page_dirty(vm_page_t m)
{
#ifdef INVARIANTS
	int pqtype = m->queue - m->pc;
#endif
	KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
		("vm_page_dirty: page in free/cache queue!"));
	if (m->dirty != VM_PAGE_BITS_ALL) {
		m->dirty = VM_PAGE_BITS_ALL;
		if (m->object)
			vm_object_set_writeable_dirty(m->object);
	}
}

/*
 * Invalidates DEV_BSIZE'd chunks within a page.  Both the
 * valid and dirty bits for the affected areas are cleared.
 *
 * Page must be busied?
 * Does not block.
 * No other requirements.
 */
void
vm_page_set_invalid(vm_page_t m, int base, int size)
{
	int bits;

	bits = vm_page_bits(base, size);
	m->valid &= ~bits;
	m->dirty &= ~bits;
	atomic_add_int(&m->object->generation, 1);
}

/*
 * The kernel assumes that the invalid portions of a page contain
 * garbage, but such pages can be mapped into memory by user code.
 * When this occurs, we must zero out the non-valid portions of the
 * page so user code sees what it expects.
 *
 * Pages are most often semi-valid when the end of a file is mapped
 * into memory and the file's size is not page aligned.
 *
 * Page must be busied?
 * No other requirements.
 */
void
vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
{
	int b;
	int i;

	/*
	 * Scan the valid bits looking for invalid sections that
	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
	 * valid bit may be set) have already been zeroed by
	 * vm_page_set_validclean().
	 */
	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
		if (i == (PAGE_SIZE / DEV_BSIZE) ||
		    (m->valid & (1 << i))
		) {
			if (i > b) {
				pmap_zero_page_area(
					VM_PAGE_TO_PHYS(m),
					b << DEV_BSHIFT,
					(i - b) << DEV_BSHIFT
				);
			}
			b = i + 1;
		}
	}

	/*
	 * setvalid is TRUE when we can safely set the zero'd areas
	 * as being valid.  We can do this if there are no cache consistency
	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
	 */
	if (setvalid)
		m->valid = VM_PAGE_BITS_ALL;
}

/*
 * Is a (partial) page valid?  Note that the case where size == 0
 * will return FALSE in the degenerate case where the page is entirely
 * invalid, and TRUE otherwise.
 *
 * Does not block.
 * No other requirements.
 */
int
vm_page_is_valid(vm_page_t m, int base, int size)
{
	int bits = vm_page_bits(base, size);

	if (m->valid && ((m->valid & bits) == bits))
		return 1;
	else
		return 0;
}

/*
 * Update dirty bits from pmap/mmu.  May not block.
 *
 * Caller must hold the page busy.
 */
void
vm_page_test_dirty(vm_page_t m)
{
	if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
		vm_page_dirty(m);
	}
}

#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>

/*
 * 'show page' from the in-kernel debugger: dump the global vmstats
 * counters.
 */
DB_SHOW_COMMAND(page, vm_page_print_page_info)
{
	db_printf("vmstats.v_free_count: %ld\n", vmstats.v_free_count);
	db_printf("vmstats.v_cache_count: %ld\n", vmstats.v_cache_count);
	db_printf("vmstats.v_inactive_count: %ld\n", vmstats.v_inactive_count);
	db_printf("vmstats.v_active_count: %ld\n", vmstats.v_active_count);
	db_printf("vmstats.v_wire_count: %ld\n", vmstats.v_wire_count);
	db_printf("vmstats.v_free_reserved: %ld\n", vmstats.v_free_reserved);
	db_printf("vmstats.v_free_min: %ld\n", vmstats.v_free_min);
	db_printf("vmstats.v_free_target: %ld\n", vmstats.v_free_target);
	db_printf("vmstats.v_cache_min: %ld\n", vmstats.v_cache_min);
	db_printf("vmstats.v_inactive_target: %ld\n",
		  vmstats.v_inactive_target);
}

/*
 * 'show pageq' from the in-kernel debugger: dump the per-color page
 * queue counts.
 */
DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{
	int i;

	db_printf("PQ_FREE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
	}
	db_printf("\n");

	db_printf("PQ_CACHE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
	}
	db_printf("\n");

	db_printf("PQ_ACTIVE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %d", vm_page_queues[PQ_ACTIVE + i].lcnt);
	}
	db_printf("\n");

	db_printf("PQ_INACTIVE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %d", vm_page_queues[PQ_INACTIVE + i].lcnt);
	}
	db_printf("\n");
}
#endif /* DDB */