1 /*
2 * Copyright (c) 2003-2019 The DragonFly Project. All rights reserved.
3 * Copyright (c) 1991 Regents of the University of California.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * The Mach Operating System project at Carnegie-Mellon University.
8 *
9 * This code is derived from software contributed to The DragonFly Project
10 * by Matthew Dillon <dillon@backplane.com>
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
37 * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
38 */
39
40 /*
41 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42 * All rights reserved.
43 *
44 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45 *
46 * Permission to use, copy, modify and distribute this software and
47 * its documentation is hereby granted, provided that both the copyright
48 * notice and this permission notice appear in all copies of the
49 * software, derivative works or modified versions, and any portions
50 * thereof, and that both notices appear in supporting documentation.
51 *
52 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55 *
56 * Carnegie Mellon requests users of this software to return to
57 *
58 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
59 * School of Computer Science
60 * Carnegie Mellon University
61 * Pittsburgh PA 15213-3890
62 *
63 * any improvements or extensions that they make and grant Carnegie the
64 * rights to redistribute these changes.
65 */
66 /*
67 * Resident memory management module. The module manipulates 'VM pages'.
68 * A VM page is the core building block for memory management.
69 */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/malloc.h>
74 #include <sys/proc.h>
75 #include <sys/vmmeter.h>
76 #include <sys/vnode.h>
77 #include <sys/kernel.h>
78 #include <sys/alist.h>
79 #include <sys/sysctl.h>
80 #include <sys/cpu_topology.h>
81
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <sys/lock.h>
85 #include <vm/vm_kern.h>
86 #include <vm/pmap.h>
87 #include <vm/vm_map.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_pageout.h>
91 #include <vm/vm_pager.h>
92 #include <vm/vm_extern.h>
93 #include <vm/swap_pager.h>
94
95 #include <machine/inttypes.h>
96 #include <machine/md_var.h>
97 #include <machine/specialreg.h>
98 #include <machine/bus_dma.h>
99
100 #include <vm/vm_page2.h>
101 #include <sys/spinlock2.h>
102
103 /*
104 * Cache necessary elements in the hash table itself to avoid indirecting
105 * through random vm_page's when doing a lookup. The hash table is
106 * heuristical and it is ok for races to mess up any or all fields.
107 */
108 struct vm_page_hash_elm {
109 vm_page_t m;
110 vm_object_t object; /* heuristical */
111 vm_pindex_t pindex; /* heuristical */
112 int ticks;
113 int unused;
114 };
115
116 #define VM_PAGE_HASH_SET 4 /* power of 2, set-assoc */
117 #define VM_PAGE_HASH_MAX (8 * 1024 * 1024) /* power of 2, max size */
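/*
 * Illustrative sketch (not part of the build): how one set is probed.
 * vm_page_hash_hash() (below) returns the base of a VM_PAGE_HASH_SET
 * entry run; matching on the cached (object, pindex) avoids touching
 * the vm_page until final verification.  example_probe_set() is a
 * hypothetical helper, shown only to make the layout concrete.
 */
#if 0
static vm_page_t
example_probe_set(struct vm_page_hash_elm *base, vm_object_t object,
		  vm_pindex_t pindex)
{
	int i;

	for (i = 0; i < VM_PAGE_HASH_SET; ++i) {
		if (base[i].object == object && base[i].pindex == pindex)
			return (base[i].m);	/* heuristical hit */
	}
	return (NULL);		/* caller falls back to a locked lookup */
}
#endif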
118
119 /*
120 * SET - Minimum required set associative size, must be a power of 2. We
121 * want this to match or exceed the set-associativeness of the cpu,
122 * up to a reasonable limit (we will use 16).
123 */
124 __read_mostly static int set_assoc_mask = 16 - 1;
125
126 static void vm_page_queue_init(void);
127 static void vm_page_free_wakeup(void);
128 static vm_page_t vm_page_select_cache(u_short pg_color);
129 static vm_page_t _vm_page_list_find_wide(int basequeue, int index, int *lastp);
130 static vm_page_t _vm_page_list_find2_wide(int bq1, int bq2, int index,
131 int *lastp1, int *lastp2);
132 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
133 static void vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes);
134
135 /*
136 * Array of tailq lists
137 */
138 struct vpgqueues vm_page_queues[PQ_COUNT];
139
140 static volatile int vm_pages_waiting;
141 static struct alist vm_contig_alist;
142 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
143 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
144
145 __read_mostly static int vm_page_hash_vnode_only;
146 __read_mostly static int vm_page_hash_size;
147 __read_mostly static struct vm_page_hash_elm *vm_page_hash;
148
149 static u_long vm_dma_reserved = 0;
150 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
151 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
152 "Memory reserved for DMA");
153 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
154 &vm_contig_alist.bl_free, 0, "Free DMA pages");
155
156 SYSCTL_INT(_vm, OID_AUTO, page_hash_vnode_only, CTLFLAG_RW,
157 &vm_page_hash_vnode_only, 0, "Only hash vnode pages");
158 #if 0
159 static int vm_page_hash_debug;
160 SYSCTL_INT(_vm, OID_AUTO, page_hash_debug, CTLFLAG_RW,
161 &vm_page_hash_debug, 0, "Debug the vm_page hash table");
162 #endif
163
164 static int vm_contig_verbose = 0;
165 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
166
167 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
168 vm_pindex_t, pindex);
169
170 static void
171 vm_page_queue_init(void)
172 {
173 int i;
174
175 for (i = 0; i < PQ_L2_SIZE; i++)
176 vm_page_queues[PQ_FREE+i].cnt_offset =
177 offsetof(struct vmstats, v_free_count);
178 for (i = 0; i < PQ_L2_SIZE; i++)
179 vm_page_queues[PQ_CACHE+i].cnt_offset =
180 offsetof(struct vmstats, v_cache_count);
181 for (i = 0; i < PQ_L2_SIZE; i++)
182 vm_page_queues[PQ_INACTIVE+i].cnt_offset =
183 offsetof(struct vmstats, v_inactive_count);
184 for (i = 0; i < PQ_L2_SIZE; i++)
185 vm_page_queues[PQ_ACTIVE+i].cnt_offset =
186 offsetof(struct vmstats, v_active_count);
187 for (i = 0; i < PQ_L2_SIZE; i++)
188 vm_page_queues[PQ_HOLD+i].cnt_offset =
189 offsetof(struct vmstats, v_active_count);
190 /* PQ_NONE has no queue */
191
192 for (i = 0; i < PQ_COUNT; i++) {
193 struct vpgqueues *vpq;
194
195 vpq = &vm_page_queues[i];
196 vpq->lastq = -1;
197 TAILQ_INIT(&vpq->pl);
198 spin_init(&vpq->spin, "vm_page_queue_init");
199 }
200 }
201
202 /*
203 * note: place in initialized data section? Is this necessary?
204 */
205 vm_pindex_t first_page = 0;
206 vm_pindex_t vm_page_array_size = 0;
207 vm_page_t vm_page_array = NULL;
208 vm_paddr_t vm_low_phys_reserved;
209
210 /*
211 * (low level boot)
212 *
213 * Sets the page size, perhaps based upon the memory size.
214 * Must be called before any use of page-size dependent functions.
215 */
216 void
217 vm_set_page_size(void)
218 {
219 if (vmstats.v_page_size == 0)
220 vmstats.v_page_size = PAGE_SIZE;
221 if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
222 panic("vm_set_page_size: page size not a power of two");
223 }
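/*
 * Worked example of the test above: ((x - 1) & x) clears the lowest
 * set bit, so it is zero only for powers of two.  For 4096,
 * 4095 & 4096 == 0 (accepted); for 4097, 4096 & 4097 == 4096 (panic).
 */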
224
225 /*
226 * (low level boot)
227 *
228 * Add a new page to the freelist for use by the system. New pages
229 * are added to both the head and tail of the associated free page
230 * queue in a bottom-up fashion, so both zero'd and non-zero'd page
231 * requests pull 'recent' adds (higher physical addresses) first.
232 *
233 * Beware that the page zeroing daemon will also be running soon after
234 * boot, moving pages from the head to the tail of the PQ_FREE queues.
235 *
236 * Must be called in a critical section.
237 */
238 static void
239 vm_add_new_page(vm_paddr_t pa, int *badcountp)
240 {
241 struct vpgqueues *vpq;
242 vm_page_t m;
243
244 m = PHYS_TO_VM_PAGE(pa);
245
246 /*
247 * Make sure it isn't a duplicate (due to BIOS page range overlaps,
248 * which we consider bugs... but don't crash). Note that m->phys_addr
249 * is pre-initialized, so use m->queue as a check.
250 */
251 if (m->queue) {
252 if (*badcountp < 10) {
253 kprintf("vm_add_new_page: duplicate pa %016jx\n",
254 (intmax_t)pa);
255 ++*badcountp;
256 } else if (*badcountp == 10) {
257 kprintf("vm_add_new_page: duplicate pa (many more)\n");
258 ++*badcountp;
259 }
260 return;
261 }
262
263 m->phys_addr = pa;
264 m->flags = 0;
265 m->pat_mode = PAT_WRITE_BACK;
266 m->pc = (pa >> PAGE_SHIFT);
267
268 /*
269 * Twist for cpu localization in addition to page coloring, so
270 * different cpus selecting by m->queue get different page colors.
271 */
272 m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
273 m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
274 m->pc &= PQ_L2_MASK;
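	/*
	 * Worked example (assuming PQ_L2_SIZE = 1024, PQ_L2_MASK = 1023,
	 * 4K pages): pa = 0x40001000 is page 0x40001, whose untwisted
	 * color would be 0x40001 & 1023 = 1.  The twist XORs in
	 * 0x40001 / 1024 = 0x100 and 0x40001 / 1048576 = 0, yielding
	 * (0x40001 ^ 0x100) & 1023 = 0x101, so pages 4MB apart that
	 * would otherwise collide get different colors.
	 */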
275
276 /*
277 * Reserve a certain number of contiguous low memory pages for
278 * contigmalloc() to use.
279 *
280 * Even though these pages represent real ram and can be
281 * reverse-mapped, we set PG_FICTITIOUS and PG_UNQUEUED
282 * because their use is special-cased.
283 *
284 * WARNING! Once PG_FICTITIOUS is set, vm_page_wire*()
285 * and vm_page_unwire*() calls have no effect.
286 */
287 if (pa < vm_low_phys_reserved) {
288 atomic_add_long(&vmstats.v_page_count, 1);
289 atomic_add_long(&vmstats.v_dma_pages, 1);
290 m->flags |= PG_FICTITIOUS | PG_UNQUEUED;
291 m->queue = PQ_NONE;
292 m->wire_count = 1;
293 atomic_add_long(&vmstats.v_wire_count, 1);
294 alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
295 return;
296 }
297
298 /*
299 * General page
300 */
301 m->queue = m->pc + PQ_FREE;
302 KKASSERT(m->dirty == 0);
303
304 atomic_add_long(&vmstats.v_page_count, 1);
305 atomic_add_long(&vmstats.v_free_count, 1);
306 vpq = &vm_page_queues[m->queue];
307 TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
308 ++vpq->lcnt;
309 }
310
311 /*
312 * (low level boot)
313 *
314 * Initializes the resident memory module.
315 *
316 * Preallocates memory for critical VM structures and arrays prior to
317 * kernel_map becoming available.
318 *
319 * Memory is allocated from (virtual2_start, virtual2_end) if available,
320 * otherwise memory is allocated from (virtual_start, virtual_end).
321 *
322 * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
323 * large enough to hold vm_page_array & other structures for machines with
324 * large amounts of ram, so we want to use virtual2* when available.
325 */
326 void
327 vm_page_startup(void)
328 {
329 vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
330 vm_offset_t mapped;
331 vm_pindex_t npages;
332 vm_paddr_t page_range;
333 vm_paddr_t new_end;
334 int i;
335 vm_paddr_t pa;
336 vm_paddr_t last_pa;
337 vm_paddr_t end;
338 vm_paddr_t biggestone, biggestsize;
339 vm_paddr_t total;
340 vm_page_t m;
341 int badcount;
342
343 total = 0;
344 badcount = 0;
345 biggestsize = 0;
346 biggestone = 0;
347 vaddr = round_page(vaddr);
348
349 /*
350 * Make sure ranges are page-aligned.
351 */
352 for (i = 0; phys_avail[i].phys_end; ++i) {
353 phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
354 phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
355 if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
356 phys_avail[i].phys_end = phys_avail[i].phys_beg;
357 }
358
359 /*
360 * Locate largest block
361 */
362 for (i = 0; phys_avail[i].phys_end; ++i) {
363 vm_paddr_t size = phys_avail[i].phys_end -
364 phys_avail[i].phys_beg;
365
366 if (size > biggestsize) {
367 biggestone = i;
368 biggestsize = size;
369 }
370 total += size;
371 }
372 --i; /* adjust to last entry for use down below */
373
374 end = phys_avail[biggestone].phys_end;
375 end = trunc_page(end);
376
377 /*
378 * Initialize the queue headers for the free queue, the active queue
379 * and the inactive queue.
380 */
381 vm_page_queue_init();
382
383 #if !defined(_KERNEL_VIRTUAL)
384 /*
385 * VKERNELs don't support minidumps and as such don't need
386 * vm_page_dump
387 *
388 * Allocate a bitmap to indicate that a random physical page
389 * needs to be included in a minidump.
390 *
391 * The amd64 port needs this to indicate which direct map pages
392 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
393 *
394 * However, x86 still needs this workspace internally within the
395 * minidump code. In theory, they are not needed on x86, but are
396 * included should the sf_buf code decide to use them.
397 */
398 page_range = phys_avail[i].phys_end / PAGE_SIZE;
399 vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
400 end -= vm_page_dump_size;
401 vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
402 VM_PROT_READ | VM_PROT_WRITE);
403 bzero((void *)vm_page_dump, vm_page_dump_size);
404 #endif
405 /*
406 * Compute the number of pages of memory that will be available for
407 * use (taking into account the overhead of a page structure per
408 * page).
409 */
410 first_page = phys_avail[0].phys_beg / PAGE_SIZE;
411 page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
412 npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
413
414 #ifndef _KERNEL_VIRTUAL
415 /*
416 * (only applies to real kernels)
417 *
418 * Reserve a large amount of low memory for potential 32-bit DMA
419 * space allocations. Once device initialization is complete we
420 * release most of it, but keep (vm_dma_reserved) memory reserved
421 * for later use. Typically for X / graphics. Through trial and
422 * error we find that GPUs usually require ~60-100MB or so.
423 *
424 * By default, 128M is left in reserve on machines with 2G+ of ram.
425 */
426 vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
427 if (vm_low_phys_reserved > total / 4)
428 vm_low_phys_reserved = total / 4;
429 if (vm_dma_reserved == 0) {
430 vm_dma_reserved = 128 * 1024 * 1024; /* 128MB */
431 if (vm_dma_reserved > total / 16)
432 vm_dma_reserved = total / 16;
433 }
434 #endif
435 alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
436 ALIST_RECORDS_65536);
437
438 /*
439 * Initialize the mem entry structures now, and put them in the free
440 * queue.
441 */
442 if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
443 kprintf("initializing vm_page_array ");
444 new_end = trunc_page(end - page_range * sizeof(struct vm_page));
445 mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
446 vm_page_array = (vm_page_t)mapped;
447
448 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
449 /*
450 * since pmap_map on amd64 returns stuff out of a direct-map region,
451 * we have to manually add these pages to the minidump tracking so
452 * that they can be dumped, including the vm_page_array.
453 */
454 for (pa = new_end;
455 pa < phys_avail[biggestone].phys_end;
456 pa += PAGE_SIZE) {
457 dump_add_page(pa);
458 }
459 #endif
460
461 /*
462 * Clear all of the page structures, run basic initialization so
463 * PHYS_TO_VM_PAGE() operates properly even on pages not in the
464 * map.
465 */
466 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
467 vm_page_array_size = page_range;
468 if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
469 kprintf("size = 0x%zx\n", vm_page_array_size);
470
471 m = &vm_page_array[0];
472 pa = ptoa(first_page);
473 for (i = 0; i < page_range; ++i) {
474 spin_init(&m->spin, "vm_page");
475 m->phys_addr = pa;
476 pa += PAGE_SIZE;
477 ++m;
478 }
479
480 /*
481 * Construct the free queue(s) in ascending order (by physical
482 * address) so that the first 16MB of physical memory is allocated
483 * last rather than first. On large-memory machines, this avoids
484 * the exhaustion of low physical memory before isa_dma_init has run.
485 */
486 vmstats.v_page_count = 0;
487 vmstats.v_free_count = 0;
488 for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
489 pa = phys_avail[i].phys_beg;
490 if (i == biggestone)
491 last_pa = new_end;
492 else
493 last_pa = phys_avail[i].phys_end;
494 while (pa < last_pa && npages-- > 0) {
495 vm_add_new_page(pa, &badcount);
496 pa += PAGE_SIZE;
497 }
498 }
499 if (virtual2_start)
500 virtual2_start = vaddr;
501 else
502 virtual_start = vaddr;
503 mycpu->gd_vmstats = vmstats;
504 }
505
506 /*
507 * (called from early boot only)
508 *
509 * Reorganize VM pages based on numa data. May be called as many times as
510 * necessary. Will reorganize the vm_page_t page color and related queue(s)
511 * to allow vm_page_alloc() to choose pages based on socket affinity.
512 *
513 * NOTE: This function is only called while we are still in UP mode, so
514 * we only need a critical section to protect the queues (which
515 * saves a lot of time, there are likely a ton of pages).
516 */
517 void
518 vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
519 {
520 vm_paddr_t scan_beg;
521 vm_paddr_t scan_end;
522 vm_paddr_t ran_end;
523 struct vpgqueues *vpq;
524 vm_page_t m;
525 vm_page_t mend;
526 int socket_mod;
527 int socket_value;
528 int i;
529
530 /*
531 * Check if no physical information, or there was only one socket
532 * (so don't waste time doing nothing!).
533 */
534 if (cpu_topology_phys_ids <= 1 ||
535 cpu_topology_core_ids == 0) {
536 return;
537 }
538
539 /*
540 * Setup for our iteration. Note that ACPI may iterate CPU
541 * sockets starting at 0 or 1 or some other number. The
542 * cpu_topology code mod's it against the socket count.
543 */
544 ran_end = ran_beg + bytes;
545
546 socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
547 socket_value = (physid % cpu_topology_phys_ids) * socket_mod;
548 mend = &vm_page_array[vm_page_array_size];
549
550 crit_enter();
551
552 /*
553 * Adjust cpu_topology's phys_mem parameter
554 */
555 if (root_cpu_node)
556 vm_numa_add_topology_mem(root_cpu_node, physid, (long)bytes);
557
558 /*
559 * Adjust vm_page->pc and requeue all affected pages. The
560 * allocator will then be able to localize memory allocations
561 * to some degree.
562 */
563 for (i = 0; phys_avail[i].phys_end; ++i) {
564 scan_beg = phys_avail[i].phys_beg;
565 scan_end = phys_avail[i].phys_end;
566 if (scan_end <= ran_beg)
567 continue;
568 if (scan_beg >= ran_end)
569 continue;
570 if (scan_beg < ran_beg)
571 scan_beg = ran_beg;
572 if (scan_end > ran_end)
573 scan_end = ran_end;
574 if (atop(scan_end) > first_page + vm_page_array_size)
575 scan_end = ptoa(first_page + vm_page_array_size);
576
577 m = PHYS_TO_VM_PAGE(scan_beg);
578 while (scan_beg < scan_end) {
579 KKASSERT(m < mend);
580 if (m->queue != PQ_NONE) {
581 vpq = &vm_page_queues[m->queue];
582 TAILQ_REMOVE(&vpq->pl, m, pageq);
583 --vpq->lcnt;
584 /* queue doesn't change, no need to adj cnt */
585 m->queue -= m->pc;
586 m->pc %= socket_mod;
587 m->pc += socket_value;
588 m->pc &= PQ_L2_MASK;
589 m->queue += m->pc;
590 vpq = &vm_page_queues[m->queue];
591 TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
592 ++vpq->lcnt;
593 /* queue doesn't change, no need to adj cnt */
594 } else {
595 m->pc %= socket_mod;
596 m->pc += socket_value;
597 m->pc &= PQ_L2_MASK;
598 }
599 scan_beg += PAGE_SIZE;
600 ++m;
601 }
602 }
603
604 crit_exit();
605 }
606
607 /*
608 * (called from early boot only)
609 *
610 * Don't allow the NUMA organization to leave vm_page_queues[] nodes
611 * completely empty for a logical cpu. Doing so would force allocations
612 * on that cpu to always borrow from a nearby cpu, create unnecessary
613 * contention, and cause vm_page_alloc() to iterate more queues and run more
614 * slowly.
615 *
616 * This situation can occur when memory sticks are not entirely populated,
617 * populated at different densities, or in naturally asymmetric systems
618 * such as the 2990WX. There could very well be many vm_page_queues[]
619 * entries with *NO* pages assigned to them.
620 *
621 * Fixing this up ensures that each logical CPU has roughly the same
622 * sized memory pool, and more importantly ensures that logical CPUs
623 * do not wind up with an empty memory pool.
624 *
625 * At the moment we just iterate the other queues and borrow pages,
626 * moving them into the queues for cpus with severe deficits even though
627 * the memory might not be local to those cpus. I am not doing this in
628 * a 'smart' way, it's effectively UMA style (sorta, since it's page-by-page
629 * whereas real UMA typically exchanges address bits 8-10 with high address
630 * bits). But it works extremely well and gives us fairly good deterministic
631 * results on the cpu cores associated with these secondary nodes.
632 */
633 void
634 vm_numa_organize_finalize(void)
635 {
636 struct vpgqueues *vpq;
637 vm_page_t m;
638 long lcnt_lo;
639 long lcnt_hi;
640 int iter;
641 int i;
642 int scale_lim;
643
644 crit_enter();
645
646 /*
647 * Machines might not use an exact power of 2 for phys_ids,
648 * core_ids, ht_ids, etc. This can slightly reduce the actual
649 * range of indices in vm_page_queues[] that are nominally used.
650 */
651 if (cpu_topology_ht_ids) {
652 scale_lim = PQ_L2_SIZE / cpu_topology_phys_ids;
653 scale_lim = scale_lim / cpu_topology_core_ids;
654 scale_lim = scale_lim / cpu_topology_ht_ids;
655 scale_lim = scale_lim * cpu_topology_ht_ids;
656 scale_lim = scale_lim * cpu_topology_core_ids;
657 scale_lim = scale_lim * cpu_topology_phys_ids;
658 } else {
659 scale_lim = PQ_L2_SIZE;
660 }
661
662 /*
663 * Calculate an average, set hysteresis for balancing from
664 * 10% below the average to the average.
665 */
666 lcnt_hi = 0;
667 for (i = 0; i < scale_lim; ++i) {
668 lcnt_hi += vm_page_queues[i].lcnt;
669 }
670 lcnt_hi /= scale_lim;
671 lcnt_lo = lcnt_hi - lcnt_hi / 10;
672
673 kprintf("vm_page: avg %ld pages per queue, %d queues\n",
674 lcnt_hi, scale_lim);
675
676 iter = 0;
677 for (i = 0; i < scale_lim; ++i) {
678 vpq = &vm_page_queues[PQ_FREE + i];
679 while (vpq->lcnt < lcnt_lo) {
680 struct vpgqueues *vptmp;
681
682 iter = (iter + 1) & PQ_L2_MASK;
683 vptmp = &vm_page_queues[PQ_FREE + iter];
684 if (vptmp->lcnt < lcnt_hi)
685 continue;
686 m = TAILQ_FIRST(&vptmp->pl);
687 KKASSERT(m->queue == PQ_FREE + iter);
688 TAILQ_REMOVE(&vptmp->pl, m, pageq);
689 --vptmp->lcnt;
690 /* queue doesn't change, no need to adj cnt */
691 m->queue -= m->pc;
692 m->pc = i;
693 m->queue += m->pc;
694 TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
695 ++vpq->lcnt;
696 }
697 }
698 crit_exit();
699 }
700
701 static
702 void
703 vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes)
704 {
705 int cpuid;
706 int i;
707
708 switch(cpup->type) {
709 case PACKAGE_LEVEL:
710 cpup->phys_mem += bytes;
711 break;
712 case CHIP_LEVEL:
713 /*
714 * All members should have the same chipid, so we only need
715 * to pull out one member.
716 */
717 if (CPUMASK_TESTNZERO(cpup->members)) {
718 cpuid = BSFCPUMASK(cpup->members);
719 if (physid ==
720 get_chip_ID_from_APICID(CPUID_TO_APICID(cpuid))) {
721 cpup->phys_mem += bytes;
722 }
723 }
724 break;
725 case CORE_LEVEL:
726 case THREAD_LEVEL:
727 /*
728 * Just inherit from the parent node
729 */
730 cpup->phys_mem = cpup->parent_node->phys_mem;
731 break;
732 }
733 for (i = 0; i < MAXCPU && cpup->child_node[i]; ++i)
734 vm_numa_add_topology_mem(cpup->child_node[i], physid, bytes);
735 }
736
737 /*
738 * We tended to reserve a ton of memory for contigmalloc(). Now that most
739 * drivers have initialized we want to return most of the remaining free
740 * reserve back to the VM page queues so they can be used for normal
741 * allocations.
742 *
743 * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
744 */
745 static void
746 vm_page_startup_finish(void *dummy __unused)
747 {
748 alist_blk_t blk;
749 alist_blk_t rblk;
750 alist_blk_t count;
751 alist_blk_t xcount;
752 alist_blk_t bfree;
753 vm_page_t m;
754 struct vm_page_hash_elm *mp;
755 int mask;
756
757 /*
758 * Set the set_assoc_mask based on the fitted number of CPUs.
759 * This is a mask, so we subtract 1.
760 *
761 * With PQ_L2_SIZE = 1024, don't let the associativity drop below 8.
762 * So if we have 256 CPUs, two hyper-threads will wind up sharing.
763 *
764 * The maximum is PQ_L2_SIZE. However, we limit the starting
765 * maximum to 16 (mask = 15) in order to improve the cache locality
766 * of related kernel data structures.
767 */
768 mask = PQ_L2_SIZE / ncpus_fit - 1;
769 if (mask < 7) /* minimum is 8-way w/256 CPU threads */
770 mask = 7;
771 if (mask < 15)
772 mask = 15;
773 cpu_ccfence();
774 set_assoc_mask = mask;
775
776 /*
777 * Return part of the initial reserve back to the system
778 */
779 spin_lock(&vm_contig_spin);
780 for (;;) {
781 bfree = alist_free_info(&vm_contig_alist, &blk, &count);
782 if (bfree <= vm_dma_reserved / PAGE_SIZE)
783 break;
784 if (count == 0)
785 break;
786
787 /*
788 * Figure out how much of the initial reserve we have to
789 * free in order to reach our target.
790 */
791 bfree -= vm_dma_reserved / PAGE_SIZE;
792 if (count > bfree) {
793 blk += count - bfree;
794 count = bfree;
795 }
796
797 /*
798 * Calculate the nearest power of 2 <= count.
799 */
800 for (xcount = 1; xcount <= count; xcount <<= 1)
801 ;
802 xcount >>= 1;
803 blk += count - xcount;
804 count = xcount;
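		/*
		 * Worked example: count = 1000 -> xcount doubles past it
		 * to 1024 and shifts back to 512, so we free a 512-page
		 * power-of-2 run taken from the top of the range.
		 */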
805
806 /*
807 * Allocate the pages from the alist, then free them to
808 * the normal VM page queues.
809 *
810 * Pages allocated from the alist are wired. We have to
811 * busy, unwire, and free them. We must also adjust
812 * vm_low_phys_reserved before freeing any pages to prevent
813 * confusion.
814 */
815 rblk = alist_alloc(&vm_contig_alist, blk, count);
816 if (rblk != blk) {
817 kprintf("vm_page_startup_finish: Unable to return "
818 "dma space @0x%08x/%d -> 0x%08x\n",
819 blk, count, rblk);
820 break;
821 }
822 atomic_add_long(&vmstats.v_dma_pages, -(long)count);
823 spin_unlock(&vm_contig_spin);
824
825 m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
826 vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
827 while (count) {
828 vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);
829 vm_page_busy_wait(m, FALSE, "cpgfr");
830 vm_page_unwire(m, 0);
831 vm_page_free(m);
832 --count;
833 ++m;
834 }
835 spin_lock(&vm_contig_spin);
836 }
837 spin_unlock(&vm_contig_spin);
838
839 /*
840 * Print out how much DMA space drivers have already allocated and
841 * how much is left over.
842 */
843 kprintf("DMA space used: %jdk, remaining available: %jdk\n",
844 (intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
845 (PAGE_SIZE / 1024),
846 (intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
847
848 /*
849 * Power of 2
850 */
851 vm_page_hash_size = 4096;
852 while (vm_page_hash_size < (vm_page_array_size / 16))
853 vm_page_hash_size <<= 1;
854 if (vm_page_hash_size > VM_PAGE_HASH_MAX)
855 vm_page_hash_size = VM_PAGE_HASH_MAX;
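	/*
	 * Sizing example: a 64GB machine with 4K pages has ~16M
	 * vm_page_t's, so the loop above doubles the table from 4096
	 * up to 1M entries (16M / 16), still well below
	 * VM_PAGE_HASH_MAX.
	 */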
856
857 /*
858 * hash table for vm_page_lookup_quick()
859 */
860 mp = (void *)kmem_alloc3(kernel_map,
861 (vm_page_hash_size + VM_PAGE_HASH_SET) *
862 sizeof(*vm_page_hash),
863 VM_SUBSYS_VMPGHASH, KM_CPU(0));
864 bzero(mp, (vm_page_hash_size + VM_PAGE_HASH_SET) * sizeof(*mp));
865 cpu_sfence();
866 vm_page_hash = mp;
867 }
868 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
869 vm_page_startup_finish, NULL);
870
871
872 /*
873 * Scan comparison function for Red-Black tree scans. An inclusive
874 * (start,end) is expected. Other fields are not used.
875 */
876 int
877 rb_vm_page_scancmp(struct vm_page *p, void *data)
878 {
879 struct rb_vm_page_scan_info *info = data;
880
881 if (p->pindex < info->start_pindex)
882 return(-1);
883 if (p->pindex > info->end_pindex)
884 return(1);
885 return(0);
886 }
887
888 int
889 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
890 {
891 if (p1->pindex < p2->pindex)
892 return(-1);
893 if (p1->pindex > p2->pindex)
894 return(1);
895 return(0);
896 }
897
898 void
899 vm_page_init(vm_page_t m)
900 {
901 /* do nothing for now. Called from pmap_page_init() */
902 }
903
904 /*
905 * Each page queue has its own spin lock, which is fairly optimal for
906 * allocating and freeing pages at least.
907 *
908 * The caller must hold the vm_page_spin_lock() before locking a vm_page's
909 * queue spinlock via this function. Also note that m->queue cannot change
910 * unless both the page and queue are locked.
911 */
912 static __inline
913 void
914 _vm_page_queue_spin_lock(vm_page_t m)
915 {
916 u_short queue;
917
918 queue = m->queue;
919 if (queue != PQ_NONE) {
920 spin_lock(&vm_page_queues[queue].spin);
921 KKASSERT(queue == m->queue);
922 }
923 }
924
925 static __inline
926 void
927 _vm_page_queue_spin_unlock(vm_page_t m)
928 {
929 u_short queue;
930
931 queue = m->queue;
932 cpu_ccfence();
933 if (queue != PQ_NONE)
934 spin_unlock(&vm_page_queues[queue].spin);
935 }
936
937 static __inline
938 void
939 _vm_page_queues_spin_lock(u_short queue)
940 {
941 cpu_ccfence();
942 if (queue != PQ_NONE)
943 spin_lock(&vm_page_queues[queue].spin);
944 }
945
946
947 static __inline
948 void
949 _vm_page_queues_spin_unlock(u_short queue)
950 {
951 cpu_ccfence();
952 if (queue != PQ_NONE)
953 spin_unlock(&vm_page_queues[queue].spin);
954 }
955
956 void
957 vm_page_queue_spin_lock(vm_page_t m)
958 {
959 _vm_page_queue_spin_lock(m);
960 }
961
962 void
963 vm_page_queues_spin_lock(u_short queue)
964 {
965 _vm_page_queues_spin_lock(queue);
966 }
967
968 void
969 vm_page_queue_spin_unlock(vm_page_t m)
970 {
971 _vm_page_queue_spin_unlock(m);
972 }
973
974 void
975 vm_page_queues_spin_unlock(u_short queue)
976 {
977 _vm_page_queues_spin_unlock(queue);
978 }
979
980 /*
981 * This locks the specified vm_page and its queue in the proper order
982 * (page first, then queue). The queue may change so the caller must
983 * recheck on return.
984 */
985 static __inline
986 void
987 _vm_page_and_queue_spin_lock(vm_page_t m)
988 {
989 vm_page_spin_lock(m);
990 _vm_page_queue_spin_lock(m);
991 }
992
993 static __inline
994 void
995 _vm_page_and_queue_spin_unlock(vm_page_t m)
996 {
997 _vm_page_queues_spin_unlock(m->queue);
998 vm_page_spin_unlock(m);
999 }
1000
1001 void
1002 vm_page_and_queue_spin_unlock(vm_page_t m)
1003 {
1004 _vm_page_and_queue_spin_unlock(m);
1005 }
1006
1007 void
1008 vm_page_and_queue_spin_lock(vm_page_t m)
1009 {
1010 _vm_page_and_queue_spin_lock(m);
1011 }
1012
1013 /*
1014 * Helper function removes vm_page from its current queue.
1015 * Returns the base queue the page used to be on.
1016 *
1017 * The vm_page and the queue must be spinlocked.
1018 * This function will unlock the queue but leave the page spinlocked.
1019 */
1020 static __inline u_short
1021 _vm_page_rem_queue_spinlocked(vm_page_t m)
1022 {
1023 struct vpgqueues *pq;
1024 u_short queue;
1025 u_short oqueue;
1026 long *cnt_adj;
1027 long *cnt_gd;
1028
1029 queue = m->queue;
1030 if (queue != PQ_NONE) {
1031 pq = &vm_page_queues[queue];
1032 TAILQ_REMOVE(&pq->pl, m, pageq);
1033
1034 /*
1035 * Primarily adjust our pcpu stats for rollup, which is
1036 * (mycpu->gd_vmstats_adj + offset). This is normally
1037 * synchronized on every hardclock().
1038 *
1039 * However, in order for the nominal low-memory algorithms
1040 * to work properly if the unsynchronized adjustment gets
1041 * too negative and might trigger the pageout daemon, we
1042 * immediately synchronize with the global structure.
1043 *
1044 * The idea here is to reduce unnecessary SMP cache mastership
1045 * changes in the global vmstats, which can be particularly
1046 * bad in multi-socket systems.
1047 *
1048 * WARNING! In systems with low amounts of memory the
1049 * vm_paging_needed(-1024 * ncpus) test could
1050 * wind up testing a value above the paging target,
1051 * meaning it would almost always return TRUE. In
1052 * that situation we synchronize every time the
1053 * cumulative adjustment falls below -1024.
1054 */
1055 cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1056 pq->cnt_offset);
1057 cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1058 pq->cnt_offset);
1059 atomic_add_long(cnt_adj, -1);
1060 atomic_add_long(cnt_gd, -1);
1061
1062 if (*cnt_adj < -1024 && vm_paging_start(-1024 * ncpus)) {
1063 u_long copy = atomic_swap_long(cnt_adj, 0);
1064 cnt_adj = (long *)((char *)&vmstats + pq->cnt_offset);
1065 atomic_add_long(cnt_adj, copy);
1066 }
1067 pq->lcnt--;
1068 m->queue = PQ_NONE;
1069 oqueue = queue;
1070 queue -= m->pc;
1071 vm_page_queues_spin_unlock(oqueue); /* intended */
1072 }
1073 return queue;
1074 }
1075
1076 /*
1077 * Helper function places the vm_page on the specified queue. Generally
1078 * speaking only PQ_FREE pages are placed at the head, to allow them to
1079 * be allocated sooner rather than later on the assumption that they
1080 * are cache-hot.
1081 *
1082 * The vm_page must be spinlocked.
1083 * The vm_page must NOT be FICTITIOUS (that would be a disaster)
1084 * This function will return with both the page and the queue locked.
1085 */
1086 static __inline void
1087 _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
1088 {
1089 struct vpgqueues *pq;
1090 u_long *cnt_adj;
1091 u_long *cnt_gd;
1092
1093 KKASSERT(m->queue == PQ_NONE &&
1094 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0);
1095
1096 if (queue != PQ_NONE) {
1097 vm_page_queues_spin_lock(queue);
1098 pq = &vm_page_queues[queue];
1099 ++pq->lcnt;
1100
1101 /*
1102 * Adjust our pcpu stats. If a system entity really needs
1103 * to incorporate the count it will call vmstats_rollup()
1104 * to roll it all up into the global vmstats structure.
1105 */
1106 cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1107 pq->cnt_offset);
1108 cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1109 pq->cnt_offset);
1110 atomic_add_long(cnt_adj, 1);
1111 atomic_add_long(cnt_gd, 1);
1112
1113 /*
1114 * PQ_FREE is always handled LIFO style to try to provide
1115 * cache-hot pages to programs.
1116 */
1117 m->queue = queue;
1118 if (queue - m->pc == PQ_FREE) {
1119 TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1120 } else if (athead) {
1121 TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1122 } else {
1123 TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
1124 }
1125 /* leave the queue spinlocked */
1126 }
1127 }
1128
1129 /*
1130 * Wait until page is no longer BUSY. If also_m_busy is TRUE we wait
1131 * until the page is no longer BUSY or SBUSY (busy_count field is 0).
1132 *
1133 * Returns TRUE if it had to sleep, FALSE if we did not. Only one sleep
1134 * call will be made before returning.
1135 *
1136 * This function does NOT busy the page and on return the page is not
1137 * guaranteed to be available.
1138 */
1139 void
1140 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
1141 {
1142 u_int32_t busy_count;
1143
1144 for (;;) {
1145 busy_count = m->busy_count;
1146 cpu_ccfence();
1147
1148 if ((busy_count & PBUSY_LOCKED) == 0 &&
1149 (also_m_busy == 0 || (busy_count & PBUSY_MASK) == 0)) {
1150 break;
1151 }
1152 tsleep_interlock(m, 0);
1153 if (atomic_cmpset_int(&m->busy_count, busy_count,
1154 busy_count | PBUSY_WANTED)) {
1155 atomic_set_int(&m->flags, PG_REFERENCED);
1156 tsleep(m, PINTERLOCKED, msg, 0);
1157 break;
1158 }
1159 }
1160 }
1161
1162 /*
1163 * This calculates and returns a page color given an optional VM object and
1164 * either a pindex or an iterator. We attempt to return a cpu-localized
1165 * pg_color that is still roughly 16-way set-associative. The CPU topology
1166 * is used if it was probed.
1167 *
1168 * The caller may use the returned value to index into e.g. PQ_FREE when
1169 * allocating a page in order to nominally obtain pages that are hopefully
1170 * already localized to the requesting cpu. This function is not able to
1171 * provide any sort of guarantee of this, but does its best to improve
1172 * hardware cache management performance.
1173 *
1174 * WARNING! The caller must mask the returned value with PQ_L2_MASK.
1175 */
1176 u_short
1177 vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
1178 {
1179 u_short pg_color;
1180 int object_pg_color;
1181
1182 /*
1183 * WARNING! cpu_topology_core_ids might not be a power of two.
1184 * We also shouldn't make assumptions about
1185 * cpu_topology_phys_ids either.
1186 *
1187 * WARNING! ncpus might not be known at this time (during early
1188 * boot), and might be set to 1.
1189 *
1190 * General format: [phys_id][core_id][cpuid][set-associativity]
1191 * (but uses modulo, so not necessarily precise bit masks)
1192 */
1193 object_pg_color = object ? object->pg_color : 0;
1194
1195 if (cpu_topology_ht_ids) {
1196 int phys_id;
1197 int core_id;
1198 int ht_id;
1199 int physcale;
1200 int grpscale;
1201 int cpuscale;
1202
1203 /*
1204 * Translate cpuid to socket, core, and hyperthread id.
1205 */
1206 phys_id = get_cpu_phys_id(cpuid);
1207 core_id = get_cpu_core_id(cpuid);
1208 ht_id = get_cpu_ht_id(cpuid);
1209
1210 /*
1211 * Calculate pg_color for our array index.
1212 *
1213 * physcale - socket multiplier.
1214 * grpscale - core multiplier (cores per socket)
1215 * cpu* - cpus per core
1216 *
1217 * WARNING! In early boot, ncpus has not yet been
1218 * initialized and may be set to (1).
1219 *
1220 * WARNING! physcale must match the organization that
1221 * vm_numa_organize() creates to ensure that
1222 * we properly localize allocations to the
1223 * requested cpuid.
1224 */
1225 physcale = PQ_L2_SIZE / cpu_topology_phys_ids;
1226 grpscale = physcale / cpu_topology_core_ids;
1227 cpuscale = grpscale / cpu_topology_ht_ids;
1228
1229 pg_color = phys_id * physcale;
1230 pg_color += core_id * grpscale;
1231 pg_color += ht_id * cpuscale;
1232 pg_color += (pindex + object_pg_color) % cpuscale;
1233
1234 #if 0
1235 if (grpsize >= 8) {
1236 pg_color += (pindex + object_pg_color) % grpsize;
1237 } else {
1238 if (grpsize <= 2) {
1239 grpsize = 8;
1240 } else {
1241 /* 3->9, 4->8, 5->10, 6->12, 7->14 */
1242 grpsize += grpsize;
1243 if (grpsize < 8)
1244 grpsize += grpsize;
1245 }
1246 pg_color += (pindex + object_pg_color) % grpsize;
1247 }
1248 #endif
1249 } else {
1250 /*
1251 * Unknown topology, distribute things evenly.
1252 *
1253 * WARNING! In early boot, ncpus has not yet been
1254 * initialized and may be set to (1).
1255 */
1256 int cpuscale;
1257
1258 cpuscale = PQ_L2_SIZE / ncpus;
1259
1260 pg_color = cpuid * cpuscale;
1261 pg_color += (pindex + object_pg_color) % cpuscale;
1262 }
1263 return (pg_color & PQ_L2_MASK);
1264 }
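/*
 * Illustrative sketch (not part of the build): a typical caller masks
 * the returned color per the WARNING above and indexes a per-color
 * queue such as PQ_FREE.  example_pick_free_queue() is hypothetical.
 */
#if 0
static struct vpgqueues *
example_pick_free_queue(vm_object_t object, vm_pindex_t pindex)
{
	u_short pg_color;

	pg_color = vm_get_pg_color(mycpuid, object, pindex) & PQ_L2_MASK;
	return (&vm_page_queues[PQ_FREE + pg_color]);
}
#endif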
1265
1266 /*
1267 * Wait until BUSY can be set, then set it. If also_m_busy is TRUE we
1268 * also wait for m->busy_count to become 0 before setting PBUSY_LOCKED.
1269 */
1270 void
1271 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
1272 int also_m_busy, const char *msg
1273 VM_PAGE_DEBUG_ARGS)
1274 {
1275 u_int32_t busy_count;
1276
1277 for (;;) {
1278 busy_count = m->busy_count;
1279 cpu_ccfence();
1280 if (busy_count & PBUSY_LOCKED) {
1281 tsleep_interlock(m, 0);
1282 if (atomic_cmpset_int(&m->busy_count, busy_count,
1283 busy_count | PBUSY_WANTED)) {
1284 atomic_set_int(&m->flags, PG_REFERENCED);
1285 tsleep(m, PINTERLOCKED, msg, 0);
1286 }
1287 } else if (also_m_busy && busy_count) {
1288 tsleep_interlock(m, 0);
1289 if (atomic_cmpset_int(&m->busy_count, busy_count,
1290 busy_count | PBUSY_WANTED)) {
1291 atomic_set_int(&m->flags, PG_REFERENCED);
1292 tsleep(m, PINTERLOCKED, msg, 0);
1293 }
1294 } else {
1295 if (atomic_cmpset_int(&m->busy_count, busy_count,
1296 busy_count | PBUSY_LOCKED)) {
1297 #ifdef VM_PAGE_DEBUG
1298 m->busy_func = func;
1299 m->busy_line = lineno;
1300 #endif
1301 break;
1302 }
1303 }
1304 }
1305 }
1306
1307 /*
1308 * Attempt to set BUSY. If also_m_busy is TRUE we only succeed if
1309 * m->busy_count is also 0.
1310 *
1311 * Returns non-zero on failure.
1312 */
1313 int
1314 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
1315 VM_PAGE_DEBUG_ARGS)
1316 {
1317 u_int32_t busy_count;
1318
1319 for (;;) {
1320 busy_count = m->busy_count;
1321 cpu_ccfence();
1322 if (busy_count & PBUSY_LOCKED)
1323 return TRUE;
1324 if (also_m_busy && (busy_count & PBUSY_MASK) != 0)
1325 return TRUE;
1326 if (atomic_cmpset_int(&m->busy_count, busy_count,
1327 busy_count | PBUSY_LOCKED)) {
1328 #ifdef VM_PAGE_DEBUG
1329 m->busy_func = func;
1330 m->busy_line = lineno;
1331 #endif
1332 return FALSE;
1333 }
1334 }
1335 }
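/*
 * Illustrative sketch (not part of the build): callers commonly pair
 * vm_page_busy_try() with vm_page_sleep_busy(), re-validating the page
 * after any sleep since its identity may have changed while unlocked.
 */
#if 0
	while (vm_page_busy_try(m, TRUE)) {
		vm_page_sleep_busy(m, TRUE, "pgbsy");
		/* re-check (m->object, m->pindex) here before retrying */
	}
	/* ... m is hard-busied; operate on it, then release ... */
	vm_page_wakeup(m);
#endif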
1336
1337 /*
1338 * Clear the BUSY flag and return non-zero to indicate to the caller
1339 * that a wakeup() should be performed.
1340 *
1341 * (inline version)
1342 */
1343 static __inline
1344 int
1345 _vm_page_wakeup(vm_page_t m)
1346 {
1347 u_int32_t busy_count;
1348
1349 busy_count = m->busy_count;
1350 cpu_ccfence();
1351 for (;;) {
1352 if (atomic_fcmpset_int(&m->busy_count, &busy_count,
1353 busy_count &
1354 ~(PBUSY_LOCKED | PBUSY_WANTED))) {
1355 return((int)(busy_count & PBUSY_WANTED));
1356 }
1357 }
1358 /* not reached */
1359 }
1360
1361 /*
1362 * Clear the BUSY flag and wakeup anyone waiting for the page. This
1363 * is typically the last call you make on a page before moving onto
1364 * other things.
1365 */
1366 void
1367 vm_page_wakeup(vm_page_t m)
1368 {
1369 KASSERT(m->busy_count & PBUSY_LOCKED,
1370 ("vm_page_wakeup: page not busy!!!"));
1371 if (_vm_page_wakeup(m))
1372 wakeup(m);
1373 }
1374
1375 /*
1376 * Hold a page, preventing reuse. This is typically only called on pages
1377 * in a known state (either held busy, special, or interlocked in some
1378 * manner). Holding a page does not ensure that it remains valid, it only
1379 * prevents reuse. The page must not already be on the FREE queue or in
1380 * any danger of being moved to the FREE queue concurrent with this call.
1381 *
1382 * Other parts of the system can still disassociate the page from its object
1383 * and attempt to free it, or perform read or write I/O on it and/or otherwise
1384 * manipulate the page, but if the page is held the VM system will leave the
1385 * page and its data intact and not cycle it through the FREE queue until
1386 * the last hold has been released.
1387 *
1388 * (see vm_page_wire() if you want to prevent the page from being
1389 * disassociated from its object too).
1390 */
1391 void
1392 vm_page_hold(vm_page_t m)
1393 {
1394 atomic_add_int(&m->hold_count, 1);
1395 KKASSERT(m->queue - m->pc != PQ_FREE);
1396 }
1397
1398 /*
1399 * The opposite of vm_page_hold(). If the page is on the HOLD queue
1400 * it was freed while held and must be moved back to the FREE queue.
1401 *
1402 * To avoid racing against vm_page_free*() we must re-test conditions
1403 * after obtaining the spin-lock. The initial test can also race a
1404 * vm_page_free*() that is in the middle of moving a page to PQ_HOLD,
1405 * leaving the page on PQ_HOLD with hold_count == 0. Rather than
1406 * throw a spin-lock in the critical path, we rely on the pageout
1407 * daemon to clean-up these loose ends.
1408 *
1409 * More critically, the 'easy movement' between queues without busying
1410 * a vm_page is only allowed for PQ_FREE<->PQ_HOLD.
1411 */
1412 void
1413 vm_page_unhold(vm_page_t m)
1414 {
1415 KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
1416 ("vm_page_unhold: pg %p illegal hold_count (%d) or "
1417 "on FREE queue (%d)",
1418 m, m->hold_count, m->queue - m->pc));
1419
1420 if (atomic_fetchadd_int(&m->hold_count, -1) == 1 &&
1421 m->queue - m->pc == PQ_HOLD) {
1422 vm_page_spin_lock(m);
1423 if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
1424 _vm_page_queue_spin_lock(m);
1425 _vm_page_rem_queue_spinlocked(m);
1426 _vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
1427 _vm_page_queue_spin_unlock(m);
1428 }
1429 vm_page_spin_unlock(m);
1430 }
1431 }
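/*
 * Illustrative sketch (not part of the build): hold/unhold brackets a
 * window in which the page's data must survive, without blocking other
 * manipulation the way a hard busy would.
 */
#if 0
	vm_page_hold(m);
	vm_page_wakeup(m);	/* may drop BUSY; data remains intact */
	/* ... read the page contents ... */
	vm_page_unhold(m);
#endif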
1432
1433 /*
1434 * Create a fictitious page with the specified physical address and
1435 * memory attribute. The memory attribute is the only machine-
1436 * dependent aspect of a fictitious page that must be initialized.
1437 */
1438 void
1439 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1440 {
1441 /*
1442 * The page's memattr might have changed since the
1443 * previous initialization. Update the pmap to the
1444 * new memattr.
1445 */
1446 if ((m->flags & PG_FICTITIOUS) != 0)
1447 goto memattr;
1448 m->phys_addr = paddr;
1449 m->queue = PQ_NONE;
1450 /* Fictitious pages don't use "segind". */
1451 /* Fictitious pages don't use "order" or "pool". */
1452 m->flags = PG_FICTITIOUS | PG_UNQUEUED;
1453 m->busy_count = PBUSY_LOCKED;
1454 m->wire_count = 1;
1455 spin_init(&m->spin, "fake_page");
1456 pmap_page_init(m);
1457 memattr:
1458 pmap_page_set_memattr(m, memattr);
1459 }
1460
1461 /*
1462 * Inserts the given vm_page into the object and object list.
1463 *
1464 * The pagetables are not updated but will presumably fault the page
1465 * in if necessary, or if a kernel page the caller will at some point
1466 * enter the page into the kernel's pmap. We are not allowed to block
1467 * here so we *can't* do this anyway.
1468 *
1469 * This routine may not block.
1470 * This routine must be called with the vm_object held.
1471 * This routine must be called with a critical section held.
1472 *
1473 * This routine returns TRUE if the page was inserted into the object
1474 * successfully, and FALSE if the page already exists in the object.
1475 */
1476 int
1477 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1478 {
1479 ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
1480 if (m->object != NULL)
1481 panic("vm_page_insert: already inserted");
1482
1483 atomic_add_int(&object->generation, 1);
1484
1485 /*
1486 * Associate the VM page with an (object, offset).
1487 *
1488 * The vm_page spin lock is required for interactions with the pmap.
1489 * XXX vm_page_spin_lock() might not be needed for this any more.
1490 */
1491 vm_page_spin_lock(m);
1492 m->object = object;
1493 m->pindex = pindex;
1494 if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
1495 m->object = NULL;
1496 m->pindex = 0;
1497 vm_page_spin_unlock(m);
1498 return FALSE;
1499 }
1500 ++object->resident_page_count;
1501 ++mycpu->gd_vmtotal.t_rm;
1502 vm_page_spin_unlock(m);
1503
1504 /*
1505 * Since we are inserting a new and possibly dirty page,
1506 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
1507 */
1508 if ((m->valid & m->dirty) ||
1509 (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
1510 vm_object_set_writeable_dirty(object);
1511
1512 /*
1513 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
1514 */
1515 swap_pager_page_inserted(m);
1516 return TRUE;
1517 }
1518
1519 /*
1520 * Removes the given vm_page_t from the (object,index) table
1521 *
1522 * The page must be BUSY and will remain BUSY on return.
1523 * No other requirements.
1524 *
1525 * NOTE: FreeBSD side effect was to unbusy the page on return. We leave
1526 * it busy.
1527 *
1528 * NOTE: Caller is responsible for any pmap disposition prior to the
1529 * rename (as the pmap code will not be able to find the entries
1530 * once the object has been disassociated). The caller may choose
1531 * to leave the pmap association intact if this routine is being
1532 * called as part of a rename between shadowed objects.
1533 *
1534 * This routine may not block.
1535 */
1536 void
1537 vm_page_remove(vm_page_t m)
1538 {
1539 vm_object_t object;
1540
1541 if (m->object == NULL) {
1542 return;
1543 }
1544
1545 if ((m->busy_count & PBUSY_LOCKED) == 0)
1546 panic("vm_page_remove: page not busy");
1547
1548 object = m->object;
1549
1550 vm_object_hold(object);
1551
1552 /*
1553 * Remove the page from the object and update the object.
1554 *
1555 * The vm_page spin lock is required for interactions with the pmap.
1556 * XXX vm_page_spin_lock() might not be needed for this any more.
1557 */
1558 vm_page_spin_lock(m);
1559 vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
1560 --object->resident_page_count;
1561 --mycpu->gd_vmtotal.t_rm;
1562 m->object = NULL;
1563 atomic_add_int(&object->generation, 1);
1564 vm_page_spin_unlock(m);
1565
1566 vm_object_drop(object);
1567 }
1568
1569 /*
1570 * Calculate the hash position for the vm_page hash heuristic. Generally
1571 * speaking we want to localize sequential lookups to reduce memory stalls.
1572 *
1573 * Lookups scan a run of VM_PAGE_HASH_SET (4) consecutive entries from the hash point, i.e. 4-way set-assoc
1574 */
1575 static __inline
1576 struct vm_page_hash_elm *
1577 vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
1578 {
1579 size_t hi;
1580
1581 hi = iscsi_crc32(&object, sizeof(object)) << 2;
1582 hi ^= hi >> (23 - 2);
1583 hi += pindex * VM_PAGE_HASH_SET;
1584 #if 0
1585 /* mix it up */
1586 hi = (intptr_t)object ^ object->pg_color ^ pindex;
1587 hi += object->pg_color * pindex;
1588 hi = hi ^ (hi >> 20);
1589 #endif
1590 hi &= vm_page_hash_size - 1; /* bounds */
1591
1592 return (&vm_page_hash[hi]);
1593 }
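/*
 * Locality example for the function above: within one object the crc32
 * term is constant, so pindex and pindex + 1 differ by exactly
 * VM_PAGE_HASH_SET and land in adjacent 4-entry runs; sequential
 * lookups therefore walk the table linearly.
 */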
1594
1595 /*
1596 * Heuristical page lookup that does not require any locks. Returns
1597 * a soft-busied page on success, NULL on failure.
1598 *
1599 * Caller must lookup the page the slow way if NULL is returned.
1600 */
1601 vm_page_t
1602 vm_page_hash_get(vm_object_t object, vm_pindex_t pindex)
1603 {
1604 struct vm_page_hash_elm *mp;
1605 vm_page_t m;
1606 int i;
1607
1608 if (__predict_false(vm_page_hash == NULL))
1609 return NULL;
1610 mp = vm_page_hash_hash(object, pindex);
1611 for (i = 0; i < VM_PAGE_HASH_SET; ++i, ++mp) {
1612 if (mp->object != object ||
1613 mp->pindex != pindex) {
1614 continue;
1615 }
1616 m = mp->m;
1617 cpu_ccfence();
1618 if (m == NULL)
1619 continue;
1620 if (m->object != object || m->pindex != pindex)
1621 continue;
1622 if (vm_page_sbusy_try(m))
1623 continue;
1624 if (m->object == object && m->pindex == pindex) {
1625 /*
1626 * On-match optimization - do not update ticks
1627 * unless we have to (reduce cache coherency traffic)
1628 */
1629 if (mp->ticks != ticks)
1630 mp->ticks = ticks;
1631 return m;
1632 }
1633 vm_page_sbusy_drop(m);
1634 }
1635 return NULL;
1636 }
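/*
 * Illustrative sketch (not part of the build): callers try the lockless
 * hash first and fall back to a locked lookup only on a miss.
 */
#if 0
	m = vm_page_hash_get(object, pindex);
	if (m == NULL) {
		vm_object_hold_shared(object);
		m = vm_page_lookup_sbusy_try(object, pindex, 0, PAGE_SIZE);
		vm_object_drop(object);
	}
	/* on success m is returned soft-busied */
#endif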
1637
1638 /*
1639 * Enter page onto vm_page_hash[]. This is a heuristic, SMP collisions
1640 * are allowed.
1641 */
1642 static __inline
1643 void
1644 vm_page_hash_enter(vm_page_t m)
1645 {
1646 struct vm_page_hash_elm *mp;
1647 struct vm_page_hash_elm *best;
1648 vm_object_t object;
1649 vm_pindex_t pindex;
1650 int best_delta;
1651 int delta;
1652 int i;
1653
1654 /*
1655 * Only enter type-stable vm_pages with well-shared objects.
1656 */
1657 if ((m->flags & PG_MAPPEDMULTI) == 0)
1658 return;
1659 if (__predict_false(vm_page_hash == NULL ||
1660 m < &vm_page_array[0] ||
1661 m >= &vm_page_array[vm_page_array_size])) {
1662 return;
1663 }
1664 if (__predict_false(m->object == NULL))
1665 return;
1666 #if 0
1667 /*
1668 * Disabled at the moment, there are some degenerate conditions
1669 * with often-exec'd programs that get ignored. In particular,
1670 * the kernel's elf loader does a vn_rdwr() on the first page of
1671 * a binary.
1672 */
1673 if (m->object->ref_count <= 2 || (m->object->flags & OBJ_ONEMAPPING))
1674 return;
1675 #endif
1676 if (vm_page_hash_vnode_only && m->object->type != OBJT_VNODE)
1677 return;
1678
1679 /*
1680 * Find best entry
1681 */
1682 object = m->object;
1683 pindex = m->pindex;
1684
1685 mp = vm_page_hash_hash(object, pindex);
1686 best = mp;
1687 best_delta = ticks - best->ticks;
1688
1689 for (i = 0; i < VM_PAGE_HASH_SET; ++i, ++mp) {
1690 if (mp->m == m &&
1691 mp->object == object &&
1692 mp->pindex == pindex) {
1693 /*
1694 * On-match optimization - do not update ticks
1695 * unless we have to (reduce cache coherency traffic)
1696 */
1697 if (mp->ticks != ticks)
1698 mp->ticks = ticks;
1699 return;
1700 }
1701
1702 /*
1703 * The best choice is the oldest entry.
1704 *
1705 * Also check for a field overflow, using -1 instead of 0
1706 * to deal with SMP races on accessing the 'ticks' global.
1707 */
1708 delta = ticks - mp->ticks;
1709 if (delta < -1)
1710 best = mp;
1711 if (best_delta < delta)
1712 best = mp;
1713 }
1714
1715 /*
1716 * Load the entry. Copy a few elements to the hash entry itself
1717 * to reduce memory stalls due to memory indirects on lookups.
1718 */
1719 best->m = m;
1720 best->object = object;
1721 best->pindex = pindex;
1722 best->ticks = ticks;
1723 }
1724
1725 /*
1726 * Locate and return the page at (object, pindex), or NULL if the
1727 * page could not be found.
1728 *
1729 * The caller must hold the vm_object token.
1730 */
1731 vm_page_t
1732 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1733 {
1734 vm_page_t m;
1735
1736 /*
1737 * Search the hash table for this object/offset pair
1738 */
1739 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1740 m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1741 if (m) {
1742 KKASSERT(m->object == object && m->pindex == pindex);
1743 vm_page_hash_enter(m);
1744 }
1745 return(m);
1746 }
1747
1748 vm_page_t
1749 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
1750 vm_pindex_t pindex,
1751 int also_m_busy, const char *msg
1752 VM_PAGE_DEBUG_ARGS)
1753 {
1754 u_int32_t busy_count;
1755 vm_page_t m;
1756
1757 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1758 m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1759 while (m) {
1760 KKASSERT(m->object == object && m->pindex == pindex);
1761 busy_count = m->busy_count;
1762 cpu_ccfence();
1763 if (busy_count & PBUSY_LOCKED) {
1764 tsleep_interlock(m, 0);
1765 if (atomic_cmpset_int(&m->busy_count, busy_count,
1766 busy_count | PBUSY_WANTED)) {
1767 atomic_set_int(&m->flags, PG_REFERENCED);
1768 tsleep(m, PINTERLOCKED, msg, 0);
1769 m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1770 pindex);
1771 }
1772 } else if (also_m_busy && busy_count) {
1773 tsleep_interlock(m, 0);
1774 if (atomic_cmpset_int(&m->busy_count, busy_count,
1775 busy_count | PBUSY_WANTED)) {
1776 atomic_set_int(&m->flags, PG_REFERENCED);
1777 tsleep(m, PINTERLOCKED, msg, 0);
1778 m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1779 pindex);
1780 }
1781 } else if (atomic_cmpset_int(&m->busy_count, busy_count,
1782 busy_count | PBUSY_LOCKED)) {
1783 #ifdef VM_PAGE_DEBUG
1784 m->busy_func = func;
1785 m->busy_line = lineno;
1786 #endif
1787 vm_page_hash_enter(m);
1788 break;
1789 }
1790 }
1791 return m;
1792 }
1793
1794 /*
1795 * Attempt to lookup and busy a page.
1796 *
1797 * Returns NULL if the page could not be found
1798 *
1799 * Returns a vm_page and error == TRUE if the page exists but could not
1800 * be busied.
1801 *
1802 * Returns a vm_page and error == FALSE on success.
1803 */
1804 vm_page_t
1805 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
1806 vm_pindex_t pindex,
1807 int also_m_busy, int *errorp
1808 VM_PAGE_DEBUG_ARGS)
1809 {
1810 u_int32_t busy_count;
1811 vm_page_t m;
1812
1813 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1814 m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1815 *errorp = FALSE;
1816 while (m) {
1817 KKASSERT(m->object == object && m->pindex == pindex);
1818 busy_count = m->busy_count;
1819 cpu_ccfence();
1820 if (busy_count & PBUSY_LOCKED) {
1821 *errorp = TRUE;
1822 break;
1823 }
1824 if (also_m_busy && busy_count) {
1825 *errorp = TRUE;
1826 break;
1827 }
1828 if (atomic_cmpset_int(&m->busy_count, busy_count,
1829 busy_count | PBUSY_LOCKED)) {
1830 #ifdef VM_PAGE_DEBUG
1831 m->busy_func = func;
1832 m->busy_line = lineno;
1833 #endif
1834 vm_page_hash_enter(m);
1835 break;
1836 }
1837 }
1838 return m;
1839 }
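
/*
 * Illustrative caller sketch (hypothetical 'obj' and 'idx') showing how
 * the three documented outcomes of vm_page_lookup_busy_try() are
 * distinguished:
 */
#if 0
	vm_page_t m;
	int error;

	m = vm_page_lookup_busy_try(obj, idx, TRUE, &error);
	if (m == NULL) {
		/* no page at (obj, idx) */
	} else if (error) {
		/* page exists but could not be busied; back off or retry */
	} else {
		/* page is hard-busied by us */
		vm_page_wakeup(m);
	}
#endif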
1840
1841 /*
1842 * Returns a page that is only soft-busied for use by the caller in
1843 * a read-only fashion. Returns NULL if the page could not be found,
1844 * the soft busy could not be obtained, or the page data is invalid.
1845 *
1846 * XXX Doesn't handle PG_FICTITIOUS pages at the moment, but there is
1847 * no reason why we couldn't.
1848 */
1849 vm_page_t
1850 vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex,
1851 int pgoff, int pgbytes)
1852 {
1853 vm_page_t m;
1854
1855 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1856 m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1857 if (m) {
1858 if ((m->valid != VM_PAGE_BITS_ALL &&
1859 !vm_page_is_valid(m, pgoff, pgbytes)) ||
1860 (m->flags & PG_FICTITIOUS)) {
1861 m = NULL;
1862 } else if (vm_page_sbusy_try(m)) {
1863 m = NULL;
1864 } else if ((m->valid != VM_PAGE_BITS_ALL &&
1865 !vm_page_is_valid(m, pgoff, pgbytes)) ||
1866 (m->flags & PG_FICTITIOUS)) {
1867 vm_page_sbusy_drop(m);
1868 m = NULL;
1869 } else {
1870 vm_page_hash_enter(m);
1871 }
1872 }
1873 return m;
1874 }
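
/*
 * Illustrative sketch (hypothetical 'obj' and 'idx'): soft-busy a fully
 * valid page for a read-only peek, then drop the soft-busy:
 */
#if 0
	vm_page_t m;

	m = vm_page_lookup_sbusy_try(obj, idx, 0, PAGE_SIZE);
	if (m) {
		/* read-only access to the page data is safe here */
		vm_page_sbusy_drop(m);
	}
#endif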
1875
1876 /*
1877 * Caller must hold the related vm_object
1878 */
1879 vm_page_t
1880 vm_page_next(vm_page_t m)
1881 {
1882 vm_page_t next;
1883
1884 next = vm_page_rb_tree_RB_NEXT(m);
1885 if (next && next->pindex != m->pindex + 1)
1886 next = NULL;
1887 return (next);
1888 }
1889
1890 /*
1891 * vm_page_rename()
1892 *
1893 * Move the given vm_page from its current object to the specified
1894 * target object/offset. The page must be busy and will remain so
1895 * on return.
1896 *
1897 * new_object must be held.
1898 * This routine might block. XXX ?
1899 *
1900 * NOTE: Swap associated with the page must be invalidated by the move. We
1901 * have to do this for several reasons: (1) we aren't freeing the
1902 * page, (2) we are dirtying the page, (3) the VM system is probably
1903 * moving the page from object A to B, and will then later move
1904 * the backing store from A to B and we can't have a conflict.
1905 *
1906 * NOTE: We *always* dirty the page. It is necessary both for the
1907 * fact that we moved it, and because we may be invalidating
1908 * swap. If the page is on the cache, we have to deactivate it
1909 * or vm_page_dirty() will panic. Dirty pages are not allowed
1910 * on the cache.
1911 *
1912 * NOTE: Caller is responsible for any pmap disposition prior to the
1913 * rename (as the pmap code will not be able to find the entries
1914 * once the object has been disassociated or changed). Nominally
1915 * the caller is moving a page between shadowed objects and so the
1916 * pmap association is retained without having to remove the page
1917 * from it.
1918 */
1919 void
1920 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1921 {
1922 KKASSERT(m->busy_count & PBUSY_LOCKED);
1923 ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
1924 if (m->object) {
1925 ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
1926 vm_page_remove(m);
1927 }
1928 if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
1929 panic("vm_page_rename: target exists (%p,%"PRIu64")",
1930 new_object, new_pindex);
1931 }
1932 if (m->queue - m->pc == PQ_CACHE)
1933 vm_page_deactivate(m);
1934 vm_page_dirty(m);
1935 }
1936
1937 /*
1938 * vm_page_unqueue() without any wakeup. This routine is used when a page
1939 * is to remain BUSY'd by the caller.
1940 *
1941 * This routine may not block.
1942 */
1943 void
1944 vm_page_unqueue_nowakeup(vm_page_t m)
1945 {
1946 vm_page_and_queue_spin_lock(m);
1947 (void)_vm_page_rem_queue_spinlocked(m);
1948 vm_page_spin_unlock(m);
1949 }
1950
1951 /*
1952 * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedaemon
1953 * if necessary.
1954 *
1955 * This routine may not block.
1956 */
1957 void
1958 vm_page_unqueue(vm_page_t m)
1959 {
1960 u_short queue;
1961
1962 vm_page_and_queue_spin_lock(m);
1963 queue = _vm_page_rem_queue_spinlocked(m);
1964 if (queue == PQ_FREE || queue == PQ_CACHE) {
1965 vm_page_spin_unlock(m);
1966 pagedaemon_wakeup();
1967 } else {
1968 vm_page_spin_unlock(m);
1969 }
1970 }
1971
1972 /*
1973 * vm_page_list_find()
1974 *
1975 * Find a page on the specified queue with color optimization.
1976 *
1977 * The page coloring optimization attempts to locate a page that does
1978 * not overload other nearby pages in the object in the cpu's L1 or L2
1979 * caches. We need this optimization because cpu caches tend to be
1980 * physical caches, while object spaces tend to be virtual.
1981 *
1982 * The page coloring optimization also, very importantly, tries to localize
1983 * memory to cpus and physical sockets.
1984 *
1985 * Each PQ_FREE and PQ_CACHE color queue has its own spinlock and the
1986 * algorithm is adjusted to localize allocations on a per-core basis.
1987 * This is done by 'twisting' the colors.
1988 *
1989 * The page is returned spinlocked and removed from its queue (it will
1990 * be on PQ_NONE), or NULL. The page is not BUSY'd. The caller
1991 * is responsible for dealing with the busy-page case (usually by
1992 * deactivating the page and looping).
1993 *
1994 * NOTE: This routine is carefully inlined. A non-inlined version
1995 * is available for outside callers but the only critical path is
1996 * from within this source file.
1997 *
1998 * NOTE: This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
1999 * represent stable storage, allowing us to order our locks vm_page
2000 * first, then queue.
2001 *
2002 * WARNING! The returned page is not busied and may race other busying
2003 * operations; callers must check that the page is in the state they
2004 * want after busying.
2005 */
2006 static __inline
2007 vm_page_t
2008 _vm_page_list_find(int basequeue, int index)
2009 {
2010 struct vpgqueues *pq;
2011 vm_page_t m;
2012
2013 index &= PQ_L2_MASK;
2014 pq = &vm_page_queues[basequeue + index];
2015
2016 /*
2017 * Try this cpu's colored queue first. Test for a page unlocked,
2018 * then lock the queue and locate a page. Note that the lock order
2019 * is reversed, but we do not want to dawdle on the page spinlock
2020 * anyway as it is held significantly longer than the queue spinlock.
2021 */
2022 if (TAILQ_FIRST(&pq->pl)) {
2023 spin_lock(&pq->spin);
2024 TAILQ_FOREACH(m, &pq->pl, pageq) {
2025 if (spin_trylock(&m->spin) == 0)
2026 continue;
2027 KKASSERT(m->queue == basequeue + index);
2028 pq->lastq = -1;
2029 return(m);
2030 }
2031 spin_unlock(&pq->spin);
2032 }
2033
2034 m = _vm_page_list_find_wide(basequeue, index, &pq->lastq);
2035
2036 return(m);
2037 }
2038
2039 /*
2040 * If we could not find the page in the desired queue try to find it in
2041 * a nearby (NUMA-aware) queue, spreading out as we go.
2042 */
2043 static vm_page_t
2044 _vm_page_list_find_wide(int basequeue, int index, int *lastp)
2045 {
2046 struct vpgqueues *pq;
2047 vm_page_t m = NULL;
2048 int pqmask = set_assoc_mask >> 1;
2049 int pqi;
2050 int range;
2051 int skip_start;
2052 int skip_next;
2053 int count;
2054
2055 /*
2056 * Avoid re-searching empty queues over and over again; skip to
2057 * pq->lastq if appropriate.
2058 */
2059 if (*lastp >= 0)
2060 index = *lastp;
2061
2062 index &= PQ_L2_MASK;
2063 pq = &vm_page_queues[basequeue];
2064 count = 0;
2065 skip_start = -1;
2066 skip_next = -1;
2067
2068 /*
2069 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2070 * else fails (PQ_L2_MASK).
2071 *
2072 * pqmask is a mask, 15, 31, 63, etc.
2073 *
2074 * Test each queue unlocked first, then lock the queue and locate
2075 * a page. Note that the lock order is reversed, but we do not want
2076 * to dawdle on the page spinlock anyway as it is held significantly
2077 * longer than the queue spinlock.
2078 */
2079 do {
2080 pqmask = (pqmask << 1) | 1;
2081
2082 pqi = index;
2083 range = pqmask + 1;
2084
2085 while (range > 0) {
2086 if (pqi >= skip_start && pqi < skip_next) {
2087 range -= skip_next - pqi;
2088 pqi = (pqi & ~pqmask) | (skip_next & pqmask);
2089 }
2090 if (range > 0 && TAILQ_FIRST(&pq[pqi].pl)) {
2091 spin_lock(&pq[pqi].spin);
2092 TAILQ_FOREACH(m, &pq[pqi].pl, pageq) {
2093 if (spin_trylock(&m->spin) == 0)
2094 continue;
2095 KKASSERT(m->queue == basequeue + pqi);
2096
2097 /*
2098 * If we had to wander too far, set
2099 * *lastp to skip past empty queues.
2100 */
2101 if (count >= 8)
2102 *lastp = pqi & PQ_L2_MASK;
2103 return(m);
2104 }
2105 spin_unlock(&pq[pqi].spin);
2106 }
2107 --range;
2108 ++count;
2109 pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
2110 }
2111 skip_start = pqi & ~pqmask;
2112 skip_next = (pqi | pqmask) + 1;
2113 } while (pqmask != PQ_L2_MASK);
2114
2115 return(m);
2116 }
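
/*
 * Worked example of the scan above, assuming set_assoc_mask = 15 and
 * index = 13: the first pass uses pqmask = 15 and the stepping
 * expression (pqi & ~pqmask) | ((pqi + 1) & pqmask) visits queues
 * 13, 14, 15, 0, 1, ..., 12 within the 16-queue set.  The next pass
 * widens pqmask to 31 and skip_start/skip_next step over the 16-queue
 * block already scanned, so only the adjacent 16 queues are newly
 * visited, and so on until pqmask reaches PQ_L2_MASK.
 */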
2117
2118 static __inline
2119 vm_page_t
2120 _vm_page_list_find2(int bq1, int bq2, int index)
2121 {
2122 struct vpgqueues *pq1;
2123 struct vpgqueues *pq2;
2124 vm_page_t m;
2125
2126 index &= PQ_L2_MASK;
2127 pq1 = &vm_page_queues[bq1 + index];
2128 pq2 = &vm_page_queues[bq2 + index];
2129
2130 /*
2131 * Try this cpu's colored queue first. Test for a page unlocked,
2132 * then lock the queue and locate a page. Note that the lock order
2133 * is reversed, but we do not want to dawdle on the page spinlock
2134 * anyway as it is held significantly longer than the queue spinlock.
2135 */
2136 if (TAILQ_FIRST(&pq1->pl)) {
2137 spin_lock(&pq1->spin);
2138 TAILQ_FOREACH(m, &pq1->pl, pageq) {
2139 if (spin_trylock(&m->spin) == 0)
2140 continue;
2141 KKASSERT(m->queue == bq1 + index);
2142 pq1->lastq = -1;
2143 pq2->lastq = -1;
2144 return(m);
2145 }
2146 spin_unlock(&pq1->spin);
2147 }
2148
2149 m = _vm_page_list_find2_wide(bq1, bq2, index, &pq1->lastq, &pq2->lastq);
2150
2151 return(m);
2152 }
2153
2154
2155 /*
2156 * This version checks two queues at the same time, widening its search
2157 * as it progresses, preferring basequeue1
2158 * and starting on basequeue2 after exhausting the first set. The idea
2159 * is to try to stay localized to the cpu.
2160 */
2161 static vm_page_t
2162 _vm_page_list_find2_wide(int basequeue1, int basequeue2, int index,
2163 int *lastp1, int *lastp2)
2164 {
2165 struct vpgqueues *pq1;
2166 struct vpgqueues *pq2;
2167 vm_page_t m = NULL;
2168 int pqmask1, pqmask2;
2169 int pqi;
2170 int range;
2171 int skip_start1, skip_start2;
2172 int skip_next1, skip_next2;
2173 int count1, count2;
2174
2175 /*
2176 * Avoid re-searching empty queues over and over again; skip to
2177 * pq->lastq if appropriate.
2178 */
2179 if (*lastp1 >= 0)
2180 index = *lastp1;
2181
2182 index &= PQ_L2_MASK;
2183
2184 pqmask1 = set_assoc_mask >> 1;
2185 pq1 = &vm_page_queues[basequeue1];
2186 count1 = 0;
2187 skip_start1 = -1;
2188 skip_next1 = -1;
2189
2190 pqmask2 = set_assoc_mask >> 1;
2191 pq2 = &vm_page_queues[basequeue2];
2192 count2 = 0;
2193 skip_start2 = -1;
2194 skip_next2 = -1;
2195
2196 /*
2197 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2198 * else fails (PQ_L2_MASK).
2199 *
2200 * pqmask is a mask, 15, 31, 63, etc.
2201 *
2202 * Test each queue unlocked first, then lock the queue and locate
2203 * a page. Note that the lock order is reversed, but we do not want
2204 * to dawdle on the page spinlock anyway as it is held significantly
2205 * longer than the queue spinlock.
2206 */
2207 do {
2208 if (pqmask1 == PQ_L2_MASK)
2209 goto skip2;
2210
2211 pqmask1 = (pqmask1 << 1) | 1;
2212 pqi = index;
2213 range = pqmask1 + 1;
2214
2215 while (range > 0) {
2216 if (pqi >= skip_start1 && pqi < skip_next1) {
2217 range -= skip_next1 - pqi;
2218 pqi = (pqi & ~pqmask1) | (skip_next1 & pqmask1);
2219 }
2220 if (range > 0 && TAILQ_FIRST(&pq1[pqi].pl)) {
2221 spin_lock(&pq1[pqi].spin);
2222 TAILQ_FOREACH(m, &pq1[pqi].pl, pageq) {
2223 if (spin_trylock(&m->spin) == 0)
2224 continue;
2225 KKASSERT(m->queue == basequeue1 + pqi);
2226
2227 /*
2228 * If we had to wander too far, set
2229 * *lastp to skip past empty queues.
2230 */
2231 if (count1 >= 8)
2232 *lastp1 = pqi & PQ_L2_MASK;
2233 return(m);
2234 }
2235 spin_unlock(&pq1[pqi].spin);
2236 }
2237 --range;
2238 ++count1;
2239 pqi = (pqi & ~pqmask1) | ((pqi + 1) & pqmask1);
2240 }
2241 skip_start1 = pqi & ~pqmask1;
2242 skip_next1 = (pqi | pqmask1) + 1;
2243 skip2:
2244 if (pqmask1 < ((set_assoc_mask << 1) | 1))
2245 continue;
2246
2247 pqmask2 = (pqmask2 << 1) | 1;
2248 pqi = index;
2249 range = pqmask2 + 1;
2250
2251 while (range > 0) {
2252 if (pqi >= skip_start2 && pqi < skip_next2) {
2253 range -= skip_next2 - pqi;
2254 pqi = (pqi & ~pqmask2) | (skip_next2 & pqmask2);
2255 }
2256 if (range > 0 && TAILQ_FIRST(&pq2[pqi].pl)) {
2257 spin_lock(&pq2[pqi].spin);
2258 TAILQ_FOREACH(m, &pq2[pqi].pl, pageq) {
2259 if (spin_trylock(&m->spin) == 0)
2260 continue;
2261 KKASSERT(m->queue == basequeue2 + pqi);
2262
2263 /*
2264 * If we had to wander too far, set
2265 * *lastp to skip past empty queues.
2266 */
2267 if (count2 >= 8)
2268 *lastp2 = pqi & PQ_L2_MASK;
2269 return(m);
2270 }
2271 spin_unlock(&pq2[pqi].spin);
2272 }
2273 --range;
2274 ++count2;
2275 pqi = (pqi & ~pqmask2) | ((pqi + 1) & pqmask2);
2276 }
2277 skip_start2 = pqi & ~pqmask2;
2278 skip_next2 = (pqi | pqmask2) + 1;
2279 } while (pqmask1 != PQ_L2_MASK && pqmask2 != PQ_L2_MASK);
2280
2281 return(m);
2282 }
2283
2284 /*
2285 * Returns a vm_page candidate for allocation. The page is not busied so
2286 * it can move around. The caller must busy the page (and typically
2287 * deactivate it if it cannot be busied!)
2288 *
2289 * Returns a spinlocked vm_page that has been removed from its queue.
2290 * (note that _vm_page_list_find() does not remove the page from its
2291 * queue).
2292 */
2293 vm_page_t
2294 vm_page_list_find(int basequeue, int index)
2295 {
2296 vm_page_t m;
2297
2298 m = _vm_page_list_find(basequeue, index);
2299 if (m)
2300 _vm_page_rem_queue_spinlocked(m);
2301 return m;
2302 }
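
/*
 * Illustrative caller sketch ('pg_color' hypothetical): busy-try the
 * returned spinlocked page and deactivate it on failure, mirroring
 * vm_page_select_free() below:
 */
#if 0
	vm_page_t m;

	for (;;) {
		m = vm_page_list_find(PQ_FREE, pg_color);
		if (m == NULL)
			break;
		/* m is spinlocked and already removed from its queue */
		if (vm_page_busy_try(m, TRUE)) {
			_vm_page_deactivate_locked(m, 0);
			vm_page_spin_unlock(m);
			continue;
		}
		vm_page_spin_unlock(m);
		break;			/* m is busied */
	}
#endif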
2303
2304 /*
2305 * Find a page on the cache queue with color optimization, remove it
2306 * from the queue, and busy it. The returned page will not be spinlocked.
2307 *
2308 * Candidates can fail due to being busied by someone else, in which
2309 * case they will be deactivated and the scan continues.
2310 *
2311 * This routine may not block.
2312 *
2313 */
2314 static vm_page_t
2315 vm_page_select_cache(u_short pg_color)
2316 {
2317 vm_page_t m;
2318
2319 for (;;) {
2320 m = _vm_page_list_find(PQ_CACHE, pg_color);
2321 if (m == NULL)
2322 break;
2323 /*
2324 * (m) has been spinlocked
2325 */
2326 _vm_page_rem_queue_spinlocked(m);
2327 if (vm_page_busy_try(m, TRUE)) {
2328 _vm_page_deactivate_locked(m, 0);
2329 vm_page_spin_unlock(m);
2330 } else {
2331 /*
2332 * We successfully busied the page. This can race
2333 * vm_page_lookup() + busy ops so make sure the
2334 * page is in the state we want.
2335 */
2336 if ((m->flags & (PG_NEED_COMMIT | PG_MAPPED)) == 0 &&
2337 m->hold_count == 0 &&
2338 m->wire_count == 0 &&
2339 (m->dirty & m->valid) == 0) {
2340 vm_page_spin_unlock(m);
2341 KKASSERT((m->flags & PG_UNQUEUED) == 0);
2342 pagedaemon_wakeup();
2343 return(m);
2344 }
2345
2346 /*
2347 * The page cannot be recycled, deactivate it.
2348 */
2349 _vm_page_deactivate_locked(m, 0);
2350 if (_vm_page_wakeup(m)) {
2351 vm_page_spin_unlock(m);
2352 wakeup(m);
2353 } else {
2354 vm_page_spin_unlock(m);
2355 }
2356 }
2357 }
2358 return (m);
2359 }
2360
2361 /*
2362 * Find a free page. We attempt to inline the nominal case and fall back
2363 * to _vm_page_select_free() otherwise. A busied page is removed from
2364 * the queue and returned.
2365 *
2366 * This routine may not block.
2367 */
2368 static __inline vm_page_t
2369 vm_page_select_free(u_short pg_color)
2370 {
2371 vm_page_t m;
2372
2373 for (;;) {
2374 m = _vm_page_list_find(PQ_FREE, pg_color);
2375 if (m == NULL)
2376 break;
2377 _vm_page_rem_queue_spinlocked(m);
2378 if (vm_page_busy_try(m, TRUE)) {
2379 /*
2380 * Various mechanisms such as a pmap_collect can
2381 * result in a busy page on the free queue. We
2382 * have to move the page out of the way so we can
2383 * retry the allocation. If the other thread is not
2384 * allocating the page then m->valid will remain 0 and
2385 * the pageout daemon will free the page later on.
2386 *
2387 * Since we could not busy the page, however, we
2388 * cannot make assumptions as to whether the page
2389 * will be allocated by the other thread or not,
2390 * so all we can do is deactivate it to move it out
2391 * of the way. In particular, if the other thread
2392 * wires the page it may wind up on the inactive
2393 * queue and the pageout daemon will have to deal
2394 * with that case too.
2395 */
2396 _vm_page_deactivate_locked(m, 0);
2397 vm_page_spin_unlock(m);
2398 } else {
2399 /*
2400 * Theoretically if we are able to busy the page
2401 * atomically with the queue removal (using the vm_page
2402 * lock) nobody else should have been able to mess
2403 * with the page before us.
2404 *
2405 * Assert the page state. Note that even though
2406 * wiring doesn't adjust queues, a page on the free
2407 * queue should never be wired at this point.
2408 */
2409 KKASSERT((m->flags & (PG_UNQUEUED |
2410 PG_NEED_COMMIT)) == 0);
2411 KASSERT(m->hold_count == 0,
2412 ("m->hold_count is not zero "
2413 "pg %p q=%d flags=%08x hold=%d wire=%d",
2414 m, m->queue, m->flags,
2415 m->hold_count, m->wire_count));
2416 KKASSERT(m->wire_count == 0);
2417 vm_page_spin_unlock(m);
2418 pagedaemon_wakeup();
2419
2420 /* return busied and removed page */
2421 return(m);
2422 }
2423 }
2424 return(m);
2425 }
2426
2427 static __inline vm_page_t
2428 vm_page_select_free_or_cache(u_short pg_color, int *fromcachep)
2429 {
2430 vm_page_t m;
2431
2432 *fromcachep = 0;
2433 for (;;) {
2434 m = _vm_page_list_find2(PQ_FREE, PQ_CACHE, pg_color);
2435 if (m == NULL)
2436 break;
2437 if (vm_page_busy_try(m, TRUE)) {
2438 _vm_page_rem_queue_spinlocked(m);
2439 _vm_page_deactivate_locked(m, 0);
2440 vm_page_spin_unlock(m);
2441 } else if (m->queue - m->pc == PQ_FREE) {
2442 /*
2443 * We successfully busied the page, PQ_FREE case
2444 */
2445 _vm_page_rem_queue_spinlocked(m);
2446 KKASSERT((m->flags & (PG_UNQUEUED |
2447 PG_NEED_COMMIT)) == 0);
2448 KASSERT(m->hold_count == 0,
2449 ("m->hold_count is not zero "
2450 "pg %p q=%d flags=%08x hold=%d wire=%d",
2451 m, m->queue, m->flags,
2452 m->hold_count, m->wire_count));
2453 KKASSERT(m->wire_count == 0);
2454 vm_page_spin_unlock(m);
2455 pagedaemon_wakeup();
2456
2457 /* return busied and removed page */
2458 return(m);
2459 } else {
2460 /*
2461 * We successfully busied the page, PQ_CACHE case
2462 *
2463 * This can race vm_page_lookup() + busy ops, so make
2464 * sure the page is in the state we want.
2465 */
2466 _vm_page_rem_queue_spinlocked(m);
2467 if ((m->flags & (PG_NEED_COMMIT | PG_MAPPED)) == 0 &&
2468 m->hold_count == 0 &&
2469 m->wire_count == 0 &&
2470 (m->dirty & m->valid) == 0) {
2471 vm_page_spin_unlock(m);
2472 KKASSERT((m->flags & PG_UNQUEUED) == 0);
2473 pagedaemon_wakeup();
2474 *fromcachep = 1;
2475 return(m);
2476 }
2477
2478 /*
2479 * The page cannot be recycled, deactivate it.
2480 */
2481 _vm_page_deactivate_locked(m, 0);
2482 if (_vm_page_wakeup(m)) {
2483 vm_page_spin_unlock(m);
2484 wakeup(m);
2485 } else {
2486 vm_page_spin_unlock(m);
2487 }
2488 }
2489 }
2490 return(m);
2491 }
2492
2493 /*
2494 * vm_page_alloc()
2495 *
2496 * Allocate and return a memory cell associated with this VM object/offset
2497 * pair. If object is NULL an unassociated page will be allocated.
2498 *
2499 * The returned page will be busied and removed from its queues. This
2500 * routine may not block and may return NULL if a race occurs and the
2501 * page is found to already exist at the specified (object, pindex).
2502 *
2503 * VM_ALLOC_NORMAL - Allow use of cache pages, nominal free drain
2504 * VM_ALLOC_QUICK - Like normal but cannot use cache
2505 * VM_ALLOC_SYSTEM - Greater free drain
2506 * VM_ALLOC_INTERRUPT - Allow free list to be completely drained
2507 *
2508 * VM_ALLOC_CPU(n) - Allocate using specified cpu localization
2509 *
2510 * VM_ALLOC_ZERO - Zero the page if we have to allocate it.
2511 * (vm_page_grab() and vm_page_alloczwq() ONLY!)
2512 *
2513 * VM_ALLOC_FORCE_ZERO - Zero the page unconditionally.
2514 * (vm_page_grab() and vm_page_alloczwq() ONLY!)
2515 *
2516 * VM_ALLOC_NULL_OK - Return NULL on insertion collision, else
2517 * panic on insertion collisions.
2518 * (vm_page_grab() and vm_page_alloczwq() ONLY!)
2519 *
2520 * The object must be held if not NULL
2521 *
2522 * This routine may not block
2523 *
2524 * Additional special handling is required when called from an interrupt
2525 * (VM_ALLOC_INTERRUPT). We are not allowed to mess with the page cache
2526 * in this case.
2527 */
2528 vm_page_t
2529 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
2530 {
2531 globaldata_t gd;
2532 vm_object_t obj;
2533 vm_page_t m;
2534 u_short pg_color;
2535 int cpuid_local;
2536 int fromcache;
2537
2538 #if 0
2539 /*
2540 * Special per-cpu free VM page cache. The pages are pre-busied
2541 * and pre-zeroed for us.
2542 */
2543 if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
2544 crit_enter_gd(gd);
2545 if (gd->gd_vmpg_count) {
2546 m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
2547 crit_exit_gd(gd);
2548 goto done;
2549 }
2550 crit_exit_gd(gd);
2551 }
2552 #endif
2553 m = NULL;
2554
2555 /*
2556 * CPU LOCALIZATION
2557 *
2558 * CPU localization algorithm. Break the page queues up by physical
2559 * id and core id (note that two cpu threads will have the same core
2560 * id, and core_id != gd_cpuid).
2561 *
2562 * This is nowhere near perfect, for example the last pindex in a
2563 * subgroup will overflow into the next cpu or package. But this
2564 * should get us good page reuse locality in heavy mixed loads.
2565 *
2566 * (may be executed before the APs are started, so other GDs might
2567 * not exist!)
2568 */
2569 if (page_req & VM_ALLOC_CPU_SPEC)
2570 cpuid_local = VM_ALLOC_GETCPU(page_req);
2571 else
2572 cpuid_local = mycpu->gd_cpuid;
2573
2574 pg_color = vm_get_pg_color(cpuid_local, object, pindex);
2575
2576 KKASSERT(page_req & (VM_ALLOC_NORMAL | VM_ALLOC_QUICK |
2577 VM_ALLOC_INTERRUPT | VM_ALLOC_SYSTEM));
2578
2579 /*
2580 * Certain system threads (pageout daemon, buf_daemon's) are
2581 * allowed to eat deeper into the free page list.
2582 */
2583 if (curthread->td_flags & TDF_SYSTHREAD)
2584 page_req |= VM_ALLOC_SYSTEM;
2585
2586 /*
2587 * To avoid live-locks only compare against v_free_reserved. The
2588 * pageout daemon has extra tests for this.
2589 */
2590 loop:
2591 gd = mycpu;
2592 if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
2593 ((page_req & VM_ALLOC_INTERRUPT) &&
2594 gd->gd_vmstats.v_free_count > 0) ||
2595 ((page_req & VM_ALLOC_SYSTEM) &&
2596 gd->gd_vmstats.v_cache_count == 0 &&
2597 gd->gd_vmstats.v_free_count >
2598 gd->gd_vmstats.v_interrupt_free_min)
2599 ) {
2600 /*
2601 * The free queue has sufficient free pages to take one out.
2602 *
2603 * However, if the free queue is strained the scan may widen
2604 * to the entire queue and cause a great deal of SMP
2605 * contention, so we use a double-queue-scan if we can
2606 * to avoid this.
2607 */
2608 if (page_req & VM_ALLOC_NORMAL) {
2609 m = vm_page_select_free_or_cache(pg_color, &fromcache);
2610 if (m && fromcache)
2611 goto found_cache;
2612 } else {
2613 m = vm_page_select_free(pg_color);
2614 }
2615 } else if (page_req & VM_ALLOC_NORMAL) {
2616 /*
2617 * Allocatable from the cache (non-interrupt only). On
2618 * success, we must free the page and try again, thus
2619 * ensuring that vmstats.v_*_free_min counters are replenished.
2620 */
2621 #ifdef INVARIANTS
2622 if (curthread->td_preempted) {
2623 kprintf("vm_page_alloc(): warning, attempt to allocate"
2624 " cache page from preempting interrupt\n");
2625 m = NULL;
2626 } else {
2627 m = vm_page_select_cache(pg_color);
2628 }
2629 #else
2630 m = vm_page_select_cache(pg_color);
2631 #endif
2632 /*
2633 * On success move the page into the free queue and loop.
2634 *
2635 * Only do this if we can safely acquire the vm_object lock,
2636 * because this is effectively a random page and the caller
2637 * might be holding the lock shared, we don't want to
2638 * deadlock.
2639 */
2640 if (m != NULL) {
2641 found_cache:
2642 KASSERT(m->dirty == 0,
2643 ("Found dirty cache page %p", m));
2644 if ((obj = m->object) != NULL) {
2645 if (vm_object_hold_try(obj)) {
2646 if (__predict_false((m->flags & (PG_MAPPED|PG_WRITEABLE)) != 0))
2647 vm_page_protect(m, VM_PROT_NONE);
2648 vm_page_free(m);
2649 /* m->object NULL here */
2650 vm_object_drop(obj);
2651 } else {
2652 vm_page_deactivate(m);
2653 vm_page_wakeup(m);
2654 }
2655 } else {
2656 if (__predict_false((m->flags & (PG_MAPPED|PG_WRITEABLE)) != 0))
2657 vm_page_protect(m, VM_PROT_NONE);
2658 vm_page_free(m);
2659 }
2660 goto loop;
2661 }
2662
2663 /*
2664 * On failure return NULL
2665 */
2666 atomic_add_int(&vm_pageout_deficit, 1);
2667 pagedaemon_wakeup();
2668 return (NULL);
2669 } else {
2670 /*
2671 * No pages available, wakeup the pageout daemon and give up.
2672 */
2673 atomic_add_int(&vm_pageout_deficit, 1);
2674 pagedaemon_wakeup();
2675 return (NULL);
2676 }
2677
2678 /*
2679 * v_free_count can race so loop if we don't find the expected
2680 * page.
2681 */
2682 if (m == NULL) {
2683 vmstats_rollup();
2684 goto loop;
2685 }
2686
2687 /*
2688 * Good page found. The page has already been busied for us and
2689 * removed from its queues.
2690 */
2691 KASSERT(m->dirty == 0,
2692 ("vm_page_alloc: free/cache page %p was dirty", m));
2693 KKASSERT(m->queue == PQ_NONE);
2694
2695 #if 0
2696 done:
2697 #endif
2698 /*
2699 * Initialize the structure, inheriting some flags but clearing
2700 * all the rest. The page has already been busied for us.
2701 */
2702 vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
2703
2704 KKASSERT(m->wire_count == 0);
2705 KKASSERT((m->busy_count & PBUSY_MASK) == 0);
2706 m->act_count = 0;
2707 m->valid = 0;
2708
2709 /*
2710 * Caller must be holding the object lock (asserted by
2711 * vm_page_insert()).
2712 *
2713 * NOTE: Inserting a page here does not insert it into any pmaps
2714 * (which could cause us to block allocating memory).
2715 *
2716 * NOTE: If there is no object, an unassociated page is allocated
2717 * and m->pindex can be used by the caller for any purpose.
2718 */
2719 if (object) {
2720 if (vm_page_insert(m, object, pindex) == FALSE) {
2721 vm_page_free(m);
2722 if ((page_req & VM_ALLOC_NULL_OK) == 0)
2723 panic("PAGE RACE %p[%ld]/%p",
2724 object, (long)pindex, m);
2725 m = NULL;
2726 }
2727 } else {
2728 m->pindex = pindex;
2729 }
2730
2731 /*
2732 * Don't wakeup too often - wakeup the pageout daemon when
2733 * we would be nearly out of memory.
2734 */
2735 pagedaemon_wakeup();
2736
2737 /*
2738 * A BUSY page is returned.
2739 */
2740 return (m);
2741 }
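
/*
 * Illustrative allocation sketch (hypothetical 'obj' and 'idx'):
 * allocate at the normal drain level and wait/retry on shortage.
 * Without VM_ALLOC_NULL_OK an insertion collision panics, so this
 * assumes no other thread can insert a page at 'idx' concurrently.
 */
#if 0
static vm_page_t
example_alloc(vm_object_t obj, vm_pindex_t idx)
{
	vm_page_t m;

	vm_object_hold(obj);
	while ((m = vm_page_alloc(obj, idx, VM_ALLOC_NORMAL)) == NULL)
		vm_wait(0);
	/* m is busied, removed from its queues, and m->valid == 0 */
	vm_page_wakeup(m);
	vm_object_drop(obj);
	return m;
}
#endif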
2742
2743 /*
2744 * Returns the number of pages available in our DMA memory reserve
2745 * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf)
2746 */
2747 vm_size_t
2748 vm_contig_avail_pages(void)
2749 {
2750 alist_blk_t blk;
2751 alist_blk_t count;
2752 alist_blk_t bfree;
2753 spin_lock(&vm_contig_spin);
2754 bfree = alist_free_info(&vm_contig_alist, &blk, &count);
2755 spin_unlock(&vm_contig_spin);
2756
2757 return bfree;
2758 }
2759
2760 /*
2761 * Attempt to allocate contiguous physical memory with the specified
2762 * requirements.
2763 */
2764 vm_page_t
2765 vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
2766 unsigned long alignment, unsigned long boundary,
2767 unsigned long size, vm_memattr_t memattr)
2768 {
2769 alist_blk_t blk;
2770 vm_page_t m;
2771 vm_pindex_t i;
2772 #if 0
2773 static vm_pindex_t contig_rover;
2774 #endif
2775
2776 alignment >>= PAGE_SHIFT;
2777 if (alignment == 0)
2778 alignment = 1;
2779 boundary >>= PAGE_SHIFT;
2780 if (boundary == 0)
2781 boundary = 1;
2782 size = (size + PAGE_MASK) >> PAGE_SHIFT;
2783
2784 #if 0
2785 /*
2786 * Disabled temporarily until we find a solution for DRM (a flag
2787 * to always use the free space reserve, for performance).
2788 */
2789 if (high == BUS_SPACE_MAXADDR && alignment <= PAGE_SIZE &&
2790 boundary <= PAGE_SIZE && size == 1 &&
2791 memattr == VM_MEMATTR_DEFAULT) {
2792 /*
2793 * Any page will work, use vm_page_alloc()
2794 * (e.g. when used from kmem_alloc_attr())
2795 */
2796 m = vm_page_alloc(NULL, (contig_rover++) & 0x7FFFFFFF,
2797 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
2798 VM_ALLOC_INTERRUPT);
2799 m->valid = VM_PAGE_BITS_ALL;
2800 vm_page_wire(m);
2801 vm_page_wakeup(m);
2802 } else
2803 #endif
2804 {
2805 /*
2806 * Use the low-memory dma reserve
2807 */
2808 spin_lock(&vm_contig_spin);
2809 blk = alist_alloc(&vm_contig_alist, 0, size);
2810 if (blk == ALIST_BLOCK_NONE) {
2811 spin_unlock(&vm_contig_spin);
2812 if (bootverbose) {
2813 kprintf("vm_page_alloc_contig: %ldk nospace\n",
2814 (size << PAGE_SHIFT) / 1024);
2815 print_backtrace(5);
2816 }
2817 return(NULL);
2818 }
2819 if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
2820 alist_free(&vm_contig_alist, blk, size);
2821 spin_unlock(&vm_contig_spin);
2822 if (bootverbose) {
2823 kprintf("vm_page_alloc_contig: %ldk high "
2824 "%016jx failed\n",
2825 (size << PAGE_SHIFT) / 1024,
2826 (intmax_t)high);
2827 }
2828 return(NULL);
2829 }
2830 spin_unlock(&vm_contig_spin);
2831
2832 /*
2833 * Base vm_page_t of range
2834 */
2835 m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
2836 }
2837 if (vm_contig_verbose) {
2838 kprintf("vm_page_alloc_contig: %016jx/%ldk "
2839 "(%016jx-%016jx al=%lu bo=%lu pgs=%lu attr=%d\n",
2840 (intmax_t)m->phys_addr,
2841 (size << PAGE_SHIFT) / 1024,
2842 low, high, alignment, boundary, size, memattr);
2843 }
2844 if (memattr != VM_MEMATTR_DEFAULT) {
2845 for (i = 0; i < size; ++i) {
2846 KKASSERT(m[i].flags & PG_FICTITIOUS);
2847 pmap_page_set_memattr(&m[i], memattr);
2848 }
2849 }
2850 return m;
2851 }
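
/*
 * Illustrative sketch: allocate 64KB of physically contiguous memory
 * below 4GB from the DMA reserve, then release it.  Sizes are in
 * bytes; PAGE_SIZE alignment and a zero boundary imply no special
 * placement constraints.
 */
#if 0
	vm_page_t m;

	m = vm_page_alloc_contig(0, 0xFFFFFFFFUL, PAGE_SIZE, 0,
				 64 * 1024, VM_MEMATTR_DEFAULT);
	if (m) {
		/* m[0] through m[15] are wired and not busied */
		vm_page_free_contig(m, 64 * 1024);
	}
#endif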
2852
2853 /*
2854 * Free contiguously allocated pages. The pages will be wired but not busy.
2855 * When freeing to the alist we leave them wired and not busy.
2856 */
2857 void
2858 vm_page_free_contig(vm_page_t m, unsigned long size)
2859 {
2860 vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
2861 vm_pindex_t start = pa >> PAGE_SHIFT;
2862 vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
2863
2864 if (vm_contig_verbose) {
2865 kprintf("vm_page_free_contig: %016jx/%ldk\n",
2866 (intmax_t)pa, size / 1024);
2867 }
2868 if (pa < vm_low_phys_reserved) {
2869 /*
2870 * Just assert-check the first page for convenience.
2871 */
2872 KKASSERT(m->wire_count == 1);
2873 KKASSERT(m->flags & PG_FICTITIOUS);
2874 KKASSERT(pa + size <= vm_low_phys_reserved);
2875 spin_lock(&vm_contig_spin);
2876 alist_free(&vm_contig_alist, start, pages);
2877 spin_unlock(&vm_contig_spin);
2878 } else {
2879 while (pages) {
2880 /* XXX FUTURE, maybe (pair with vm_pg_contig_alloc()) */
2881 /*vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);*/
2882 vm_page_busy_wait(m, FALSE, "cpgfr");
2883 vm_page_unwire(m, 0);
2884 vm_page_free(m);
2885 --pages;
2886 ++m;
2887 }
2888
2889 }
2890 }
2891
2892
2893 /*
2894 * Wait for sufficient free memory for nominal heavy-memory-use kernel
2895 * operations.
2896 *
2897 * WARNING! Be sure never to call this in any vm_pageout code path, which
2898 * will trivially deadlock the system.
2899 */
2900 void
2901 vm_wait_nominal(void)
2902 {
2903 while (vm_paging_min())
2904 vm_wait(0);
2905 }
2906
2907 /*
2908 * Test if vm_wait_nominal() would block.
2909 */
2910 int
2911 vm_test_nominal(void)
2912 {
2913 if (vm_paging_min())
2914 return(1);
2915 return(0);
2916 }
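
/*
 * Illustrative sketch (hypothetical helpers): throttle a long-running,
 * page-hungry kernel loop using vm_wait_nominal():
 */
#if 0
	while (have_more_work()) {	/* hypothetical condition */
		vm_wait_nominal();	/* stall while memory is tight */
		do_one_chunk();		/* hypothetical worker */
	}
#endif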
2917
2918 /*
2919 * Block until free pages are available for allocation. Called in various
2920 * places before memory allocations, before the minimum is reached.
2921 * Typically in the I/O path.
2922 *
2923 * The caller may loop if vm_paging_min() is TRUE (free pages below minimum),
2924 * so we cannot be more generous than that.
2925 */
2926 void
2927 vm_wait(int timo)
2928 {
2929 /*
2930 * never wait forever
2931 */
2932 if (timo == 0)
2933 timo = hz;
2934 lwkt_gettoken(&vm_token);
2935
2936 if (curthread == pagethread ||
2937 curthread == emergpager) {
2938 /*
2939 * The pageout daemon itself needs pages; this is bad.
2940 */
2941 if (vm_paging_min()) {
2942 vm_pageout_pages_needed = 1;
2943 tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
2944 }
2945 } else {
2946 /*
2947 * Wakeup the pageout daemon if necessary and wait.
2948 *
2949 * Do not wait indefinitely for the target to be reached,
2950 * as load might prevent it from being reached any time soon.
2951 * But wait a little to try to slow down page allocations
2952 * and to give more important threads (the pagedaemon)
2953 * allocation priority.
2954 *
2955 * The vm_paging_min() test is a safety.
2956 *
2957 * I/O waits are given a slightly lower priority (higher nice)
2958 * than VM waits.
2959 */
2960 int nice;
2961
2962 nice = curthread->td_proc ? curthread->td_proc->p_nice : 0;
2963 /*if (vm_paging_wait() || vm_paging_min())*/
2964 if (vm_paging_min_nice(nice + 1))
2965 {
2966 if (vm_pages_needed <= 1) {
2967 ++vm_pages_needed;
2968 wakeup(&vm_pages_needed);
2969 }
2970 ++vm_pages_waiting; /* SMP race ok */
2971 tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
2972 }
2973 }
2974 lwkt_reltoken(&vm_token);
2975 }
2976
2977 /*
2978 * Block until free pages are available for allocation, called in the
2979 * page-fault code. We must stall indefinitely (except for certain
2980 * conditions) when the free page count becomes severe.
2981 *
2982 * Called only from vm_fault so that processes page faulting can be
2983 * easily tracked.
2984 *
2985 * The process nice value determines the trip point. This way niced
2986 * processes which are heavy memory users do not completely mess the
2987 * machine up for normal processes.
2988 */
2989 void
2990 vm_wait_pfault(void)
2991 {
2992 int nice;
2993
2994 /*
2995 * Wakeup the pageout daemon if necessary and wait.
2996 *
2997 * Allow VM faults down to the minimum free page count, but only
2998 * stall once paging becomes severe.
2999 *
3000 * Do not wait indefinitely for the target to be reached,
3001 * as load might prevent it from being reached any time soon.
3002 * But wait a little to try to slow down page allocations
3003 * and to give more important threads (the pagedaemon)
3004 * allocation priority.
3005 */
3006 nice = curthread->td_proc ? curthread->td_proc->p_nice : 0;
3007
3008 if (vm_paging_min_nice(nice)) {
3009 lwkt_gettoken(&vm_token);
3010 do {
3011 thread_t td;
3012
3013 if (vm_pages_needed <= 1) {
3014 ++vm_pages_needed;
3015 wakeup(&vm_pages_needed);
3016 }
3017 ++vm_pages_waiting; /* SMP race ok */
3018 tsleep(&vmstats.v_free_count, 0, "pfault",
3019 hz / 10 + 1);
3020
3021 /*
3022 * Do not stay stuck in the loop if the system
3023 * is trying to kill the process.
3024 */
3025 td = curthread;
3026 if (td->td_proc &&
3027 (td->td_proc->p_flags & P_LOWMEMKILL))
3028 {
3029 break;
3030 }
3031 } while (vm_paging_severe());
3032 lwkt_reltoken(&vm_token);
3033 }
3034 }
3035
3036 /*
3037 * Put the specified page on the active list (if appropriate). Ensure
3038 * that act_count is at least ACT_INIT but do not otherwise mess with it.
3039 *
3040 * The caller should be holding the page busied ? XXX
3041 * This routine may not block.
3042 *
3043 * It is ok if the page is wired (so buffer cache operations don't have
3044 * to mess with the page queues).
3045 */
3046 void
3047 vm_page_activate(vm_page_t m)
3048 {
3049 u_short oqueue;
3050
3051 /*
3052 * If already active or inappropriate, just set act_count and
3053 * return. We don't have to spin-lock the page.
3054 */
3055 if (m->queue - m->pc == PQ_ACTIVE ||
3056 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3057 if (m->act_count < ACT_INIT)
3058 m->act_count = ACT_INIT;
3059 return;
3060 }
3061
3062 vm_page_spin_lock(m);
3063 if (m->queue - m->pc != PQ_ACTIVE &&
3064 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3065 _vm_page_queue_spin_lock(m);
3066 oqueue = _vm_page_rem_queue_spinlocked(m);
3067 /* page is left spinlocked, queue is unlocked */
3068
3069 if (oqueue == PQ_CACHE)
3070 mycpu->gd_cnt.v_reactivated++;
3071 if (m->act_count < ACT_INIT)
3072 m->act_count = ACT_INIT;
3073 _vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
3074 _vm_page_and_queue_spin_unlock(m);
3075 if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
3076 pagedaemon_wakeup();
3077 } else {
3078 if (m->act_count < ACT_INIT)
3079 m->act_count = ACT_INIT;
3080 vm_page_spin_unlock(m);
3081 }
3082 }
3083
3084 void
3085 vm_page_soft_activate(vm_page_t m)
3086 {
3087 if (m->queue - m->pc == PQ_ACTIVE ||
3088 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3089 if (m->act_count < ACT_INIT)
3090 m->act_count = ACT_INIT;
3091 } else {
3092 vm_page_activate(m);
3093 }
3094 }
3095
3096 /*
3097 * Helper routine for vm_page_free_toq() and vm_page_cache(). This
3098 * routine is called when a page has been added to the cache or free
3099 * queues.
3100 *
3101 * This routine may not block.
3102 */
3103 static __inline void
3104 vm_page_free_wakeup(void)
3105 {
3106 globaldata_t gd = mycpu;
3107
3108 /*
3109 * If the pageout daemon itself needs pages, then tell it that
3110 * there are some free.
3111 */
3112 if (vm_pageout_pages_needed &&
3113 gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
3114 gd->gd_vmstats.v_pageout_free_min
3115 ) {
3116 vm_pageout_pages_needed = 0;
3117 wakeup(&vm_pageout_pages_needed);
3118 }
3119
3120 /*
3121 * Wakeup processes that are waiting on memory.
3122 *
3123 * Generally speaking we want to wakeup stuck processes as soon as
3124 * possible. !vm_page_count_min(0) is the absolute minimum point
3125 * where we can do this. Wait a bit longer to reduce degenerate
3126 * re-blocking (vm_page_free_hysteresis).
3127 *
3128 * The target check is a safety to make sure the min-check
3129 * w/hysteresis does not exceed the normal target1.
3130 */
3131 if (vm_pages_waiting) {
3132 if (!vm_paging_min_dnc(vm_page_free_hysteresis) ||
3133 !vm_paging_target1())
3134 {
3135 vm_pages_waiting = 0;
3136 wakeup(&vmstats.v_free_count);
3137 ++mycpu->gd_cnt.v_ppwakeups;
3138 }
3139 }
3140 }
3141
3142 /*
3143 * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
3144 * it from its VM object.
3145 *
3146 * The vm_page must be BUSY on entry. BUSY will be released on
3147 * return (the page will have been freed).
3148 */
3149 void
3150 vm_page_free_toq(vm_page_t m)
3151 {
3152 /*
3153 * The page must not be mapped when freed, but we may have to call
3154 * pmap_mapped_sync() to validate this.
3155 */
3156 mycpu->gd_cnt.v_tfree++;
3157 if (m->flags & (PG_MAPPED | PG_WRITEABLE))
3158 pmap_mapped_sync(m);
3159 KKASSERT((m->flags & PG_MAPPED) == 0);
3160 KKASSERT(m->busy_count & PBUSY_LOCKED);
3161
3162 if ((m->busy_count & PBUSY_MASK) || ((m->queue - m->pc) == PQ_FREE)) {
3163 kprintf("vm_page_free: pindex(%lu), busy %08x, "
3164 "hold(%d)\n",
3165 (u_long)m->pindex, m->busy_count, m->hold_count);
3166 if ((m->queue - m->pc) == PQ_FREE)
3167 panic("vm_page_free: freeing free page");
3168 else
3169 panic("vm_page_free: freeing busy page");
3170 }
3171
3172 /*
3173 * Remove from object, spinlock the page and its queues and
3174 * remove from any queue. No queue spinlock will be held
3175 * after this section (because the page was removed from any
3176 * queue).
3177 */
3178 vm_page_remove(m);
3179
3180 /*
3181 * No further management of fictitious pages occurs beyond object
3182 * and queue removal.
3183 */
3184 if ((m->flags & PG_FICTITIOUS) != 0) {
3185 KKASSERT(m->queue == PQ_NONE);
3186 vm_page_wakeup(m);
3187 return;
3188 }
3189 vm_page_and_queue_spin_lock(m);
3190 _vm_page_rem_queue_spinlocked(m);
3191
3192 m->valid = 0;
3193 vm_page_undirty(m);
3194
3195 if (m->wire_count != 0) {
3196 if (m->wire_count > 1) {
3197 panic(
3198 "vm_page_free: invalid wire count (%d), pindex: 0x%lx",
3199 m->wire_count, (long)m->pindex);
3200 }
3201 panic("vm_page_free: freeing wired page");
3202 }
3203
3204 if (!MD_PAGE_FREEABLE(m))
3205 panic("vm_page_free: page %p is still mapped!", m);
3206
3207 /*
3208 * Clear the PG_NEED_COMMIT and the PG_UNQUEUED flags. The
3209 * page returns to normal operation and will be placed in
3210 * the PQ_HOLD or PQ_FREE queue.
3211 */
3212 vm_page_flag_clear(m, PG_NEED_COMMIT | PG_UNQUEUED);
3213
3214 if (m->hold_count != 0) {
3215 _vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
3216 } else {
3217 _vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
3218 }
3219
3220 /*
3221 * This sequence allows us to clear BUSY while still holding
3222 * its spin lock, which reduces contention vs allocators. We
3223 * must not leave the queue locked or _vm_page_wakeup() may
3224 * deadlock.
3225 */
3226 _vm_page_queue_spin_unlock(m);
3227 if (_vm_page_wakeup(m)) {
3228 vm_page_spin_unlock(m);
3229 wakeup(m);
3230 } else {
3231 vm_page_spin_unlock(m);
3232 }
3233 vm_page_free_wakeup();
3234 }
3235
3236 /*
3237 * Mark this page as wired down by yet another map. We do not adjust the
3238 * queue the page is on, it will be checked for wiring as-needed.
3239 *
3240 * This function has no effect on fictitious pages.
3241 *
3242 * Caller must be holding the page busy.
3243 */
3244 void
3245 vm_page_wire(vm_page_t m)
3246 {
3247 KKASSERT(m->busy_count & PBUSY_LOCKED);
3248 if ((m->flags & PG_FICTITIOUS) == 0) {
3249 if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
3250 atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count, 1);
3251 }
3252 KASSERT(m->wire_count != 0,
3253 ("vm_page_wire: wire_count overflow m=%p", m));
3254 }
3255 }
3256
3257 /*
3258 * Release one wiring of this page, potentially enabling it to be paged again.
3259 *
3260 * Note that wired pages are no longer unconditionally removed from the
3261 * paging queues, so the page may already be on a queue. Move the page
3262 * to the desired queue if necessary.
3263 *
3264 * Many pages placed on the inactive queue should actually go
3265 * into the cache, but it is difficult to figure out which. What
3266 * we do instead, if the inactive target is well met, is to put
3267 * clean pages at the head of the inactive queue instead of the tail.
3268 * This will cause them to be moved to the cache more quickly and
3269 * if not actively re-referenced, freed more quickly. If we just
3270 * stick these pages at the end of the inactive queue, heavy filesystem
3271 * meta-data accesses can cause an unnecessary paging load on memory bound
3272 * processes. This optimization causes one-time-use metadata to be
3273 * reused more quickly.
3274 *
3275 * Pages marked PG_NEED_COMMIT are always activated and never placed on
3276 * the inactive queue. This helps the pageout daemon determine memory
3277 * pressure and act on out-of-memory situations more quickly.
3278 *
3279 * BUT, if we are in a low-memory situation we have no choice but to
3280 * put clean pages on the cache queue.
3281 *
3282 * A number of routines use vm_page_unwire() to guarantee that the page
3283 * will go into either the inactive or active queues, and will NEVER
3284 * be placed in the cache - for example, just after dirtying a page.
3285 * Dirty pages in the cache are not allowed.
3286 *
3287 * PG_FICTITIOUS or PG_UNQUEUED pages are never moved to any queue, and
3288 * the wire_count will not be adjusted in any way for a PG_FICTITIOUS
3289 * page.
3290 *
3291 * This routine may not block.
3292 */
3293 void
3294 vm_page_unwire(vm_page_t m, int activate)
3295 {
3296 KKASSERT(m->busy_count & PBUSY_LOCKED);
3297 if (m->flags & PG_FICTITIOUS) {
3298 /* do nothing */
3299 } else if ((int)m->wire_count <= 0) {
3300 panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
3301 } else {
3302 if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
3303 atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count,-1);
3304 if (m->flags & PG_UNQUEUED) {
3305 ;
3306 } else if (activate || (m->flags & PG_NEED_COMMIT)) {
3307 vm_page_activate(m);
3308 } else {
3309 vm_page_deactivate(m);
3310 }
3311 }
3312 }
3313 }
3314
3315 /*
3316 * Move the specified page to the inactive queue.
3317 *
3318 * Normally athead is 0 resulting in LRU operation. athead is set
3319 * to 1 if we want this page to be 'as if it were placed in the cache',
3320 * except without unmapping it from the process address space.
3321 *
3322 * vm_page's spinlock must be held on entry and will remain held on return.
3323 * This routine may not block. The caller does not have to hold the page
3324 * busied but should have some sort of interlock on its validity.
3325 *
3326 * It is ok if the page is wired (so buffer cache operations don't have
3327 * to mess with the page queues).
3328 */
3329 static void
3330 _vm_page_deactivate_locked(vm_page_t m, int athead)
3331 {
3332 u_short oqueue;
3333
3334 /*
3335 * Ignore if already inactive.
3336 */
3337 if (m->queue - m->pc == PQ_INACTIVE ||
3338 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3339 return;
3340 }
3341
3342 _vm_page_queue_spin_lock(m);
3343 oqueue = _vm_page_rem_queue_spinlocked(m);
3344
3345 if ((m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3346 if (oqueue == PQ_CACHE)
3347 mycpu->gd_cnt.v_reactivated++;
3348 vm_page_flag_clear(m, PG_WINATCFLS);
3349 _vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
3350 if (athead == 0) {
3351 atomic_add_long(
3352 &vm_page_queues[PQ_INACTIVE + m->pc].adds, 1);
3353 }
3354 }
3355 /* NOTE: PQ_NONE if condition not taken */
3356 _vm_page_queue_spin_unlock(m);
3357 /* leaves vm_page spinlocked */
3358 }
3359
3360 /*
3361 * Attempt to deactivate a page.
3362 *
3363 * No requirements. We can pre-filter before getting the spinlock.
3364 *
3365 * It is ok if the page is wired (so buffer cache operations don't have
3366 * to mess with the page queues).
3367 */
3368 void
3369 vm_page_deactivate(vm_page_t m)
3370 {
3371 if (m->queue - m->pc != PQ_INACTIVE &&
3372 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3373 vm_page_spin_lock(m);
3374 _vm_page_deactivate_locked(m, 0);
3375 vm_page_spin_unlock(m);
3376 }
3377 }
3378
3379 void
3380 vm_page_deactivate_locked(vm_page_t m)
3381 {
3382 _vm_page_deactivate_locked(m, 0);
3383 }
3384
3385 /*
3386 * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy it.
3387 *
3388 * This function returns non-zero if it successfully moved the page to
3389 * PQ_CACHE.
3390 *
3391 * This function unconditionally unbusies the page on return.
3392 */
3393 int
3394 vm_page_try_to_cache(vm_page_t m)
3395 {
3396 /*
3397 * Shortcut if we obviously cannot move the page, or if the
3398 * page is already on the cache queue, or it is fictitious.
3399 *
3400 * Never allow a wired page into the cache.
3401 */
3402 if (m->dirty || m->hold_count || m->wire_count ||
3403 m->queue - m->pc == PQ_CACHE ||
3404 (m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS))) {
3405 vm_page_wakeup(m);
3406 return(0);
3407 }
3408
3409 /*
3410 * Page busied by us and no longer spinlocked. Dirty pages cannot
3411 * be moved to the cache, but can be deactivated. However, users
3412 * of this function want to move pages closer to the cache so we
3413 * only deactivate it if it is in PQ_ACTIVE. We do not re-deactivate.
3414 */
3415 vm_page_test_dirty(m);
3416 if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3417 if (m->queue - m->pc == PQ_ACTIVE)
3418 vm_page_deactivate(m);
3419 vm_page_wakeup(m);
3420 return(0);
3421 }
3422 vm_page_cache(m);
3423 return(1);
3424 }
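
/*
 * Illustrative caller sketch: the page must be handed in busied and is
 * always unbusied on return, whether or not it could be cached:
 */
#if 0
	if (vm_page_busy_try(m, TRUE) == 0) {
		if (vm_page_try_to_cache(m) == 0) {
			/* could not be cached (dirty, held, wired, ...) */
		}
		/* m is no longer busied in either case */
	}
#endif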
3425
3426 /*
3427 * Attempt to free the page. If we cannot free it, we do nothing.
3428 * 1 is returned on success, 0 on failure.
3429 *
3430 * The page can be in any state, including already being on the free
3431 * queue. Check to see if it really can be freed. Note that we disallow
3432 * this ad-hoc operation if the page is flagged PG_UNQUEUED.
3433 *
3434 * Caller provides an unlocked/non-busied page.
3435 * No requirements.
3436 */
3437 int
3438 vm_page_try_to_free(vm_page_t m)
3439 {
3440 if (vm_page_busy_try(m, TRUE))
3441 return(0);
3442
3443 if (m->dirty || /* can't free if it is dirty */
3444 m->hold_count || /* or held (XXX may be wrong) */
3445 m->wire_count || /* or wired */
3446 (m->flags & (PG_UNQUEUED | /* or unqueued */
3447 PG_NEED_COMMIT | /* or needs a commit */
3448 PG_FICTITIOUS)) || /* or is fictitious */
3449 m->queue - m->pc == PQ_FREE || /* already on PQ_FREE */
3450 m->queue - m->pc == PQ_HOLD) { /* already on PQ_HOLD */
3451 vm_page_wakeup(m);
3452 return(0);
3453 }
3454
3455 /*
3456 * We can probably free the page.
3457 *
3458 * Page busied by us and no longer spinlocked. Dirty pages will
3459 * not be freed by this function. We have to re-test the
3460 * dirty bit after cleaning out the pmaps.
3461 */
3462 vm_page_test_dirty(m);
3463 if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3464 vm_page_wakeup(m);
3465 return(0);
3466 }
3467 vm_page_protect(m, VM_PROT_NONE);
3468 if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3469 vm_page_wakeup(m);
3470 return(0);
3471 }
3472 vm_page_free(m);
3473 return(1);
3474 }
3475
3476 /*
3477 * vm_page_cache
3478 *
3479 * Put the specified page onto the page cache queue (if appropriate).
3480 *
3481 * The page must be busy, and this routine will release the busy and
3482 * possibly even free the page.
3483 */
3484 void
3485 vm_page_cache(vm_page_t m)
3486 {
3487 /*
3488 * Not suitable for the cache
3489 */
3490 if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS)) ||
3491 (m->busy_count & PBUSY_MASK) ||
3492 m->wire_count || m->hold_count) {
3493 vm_page_wakeup(m);
3494 return;
3495 }
3496
3497 /*
3498 * Already in the cache (and thus not mapped)
3499 */
3500 if ((m->queue - m->pc) == PQ_CACHE) {
3501 KKASSERT((m->flags & PG_MAPPED) == 0);
3502 vm_page_wakeup(m);
3503 return;
3504 }
3505
3506 #if 0
3507 /*
3508 * REMOVED - it is possible for dirty to get set at any time as
3509 * long as the page is still mapped and writeable.
3510 *
3511 * Caller is required to test m->dirty, but note that the act of
3512 * removing the page from its maps can cause it to become dirty
3513 * on an SMP system due to another cpu running in usermode.
3514 */
3515 if (m->dirty) {
3516 panic("vm_page_cache: caching a dirty page, pindex: %ld",
3517 (long)m->pindex);
3518 }
3519 #endif
3520
3521 /*
3522 * Remove all pmaps and indicate that the page is not
3523 * writeable or mapped. Our vm_page_protect() call may
3524 * have blocked (especially w/ VM_PROT_NONE), so recheck
3525 * everything.
3526 */
3527 if (m->flags & (PG_MAPPED | PG_WRITEABLE)) {
3528 vm_page_protect(m, VM_PROT_NONE);
3529 pmap_mapped_sync(m);
3530 }
3531 if ((m->flags & (PG_UNQUEUED | PG_MAPPED)) ||
3532 (m->busy_count & PBUSY_MASK) ||
3533 m->wire_count || m->hold_count) {
3534 vm_page_wakeup(m);
3535 } else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3536 vm_page_deactivate(m);
3537 vm_page_wakeup(m);
3538 } else {
3539 _vm_page_and_queue_spin_lock(m);
3540 _vm_page_rem_queue_spinlocked(m);
3541 _vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
3542 _vm_page_and_queue_spin_unlock(m);
3543 vm_page_wakeup(m);
3544 vm_page_free_wakeup();
3545 }
3546 }
3547
3548 /*
3549 * vm_page_dontneed()
3550 *
3551 * Cache, deactivate, or do nothing as appropriate. This routine
3552 * is typically used by madvise() MADV_DONTNEED.
3553 *
3554 * Generally speaking we want to move the page into the cache so
3555 * it gets reused quickly. However, this can result in a silly syndrome
3556 * due to the page recycling too quickly. Small objects will not be
3557 * fully cached. On the other hand, if we move the page to the inactive
3558 * queue we wind up with a problem whereby very large objects
3559 * unnecessarily blow away our inactive and cache queues.
3560 *
3561 * The solution is to move the pages based on a fixed weighting. We
3562 * either leave them alone, deactivate them, or move them to the cache,
3563 * where moving them to the cache has the highest weighting.
3564 * By forcing some pages into other queues we eventually force the
3565 * system to balance the queues, potentially recovering other unrelated
3566 * space from active. The idea is to not force this to happen too
3567 * often.
3568 *
3569 * The page must be busied.
3570 */
3571 void
3572 vm_page_dontneed(vm_page_t m)
3573 {
3574 static int dnweight;
3575 int dnw;
3576 int head;
3577
3578 dnw = ++dnweight;
3579
3580 /*
3581 * occassionally leave the page alone
3582 */
3583 if ((dnw & 0x01F0) == 0 ||
3584 m->queue - m->pc == PQ_INACTIVE ||
3585 m->queue - m->pc == PQ_CACHE
3586 ) {
3587 if (m->act_count >= ACT_INIT)
3588 --m->act_count;
3589 return;
3590 }
3591
3592 /*
3593 * If vm_page_dontneed() is inactivating a page, it must clear
3594 * the referenced flag; otherwise the pagedaemon will see references
3595 * on the page in the inactive queue and reactivate it. Until the
3596 * page can move to the cache queue, madvise's job is not done.
3597 */
3598 vm_page_flag_clear(m, PG_REFERENCED);
3599 pmap_clear_reference(m);
3600
3601 if (m->dirty == 0)
3602 vm_page_test_dirty(m);
3603
3604 if (m->dirty || (dnw & 0x0070) == 0) {
3605 /*
3606 * Deactivate the page 3 times out of 32.
3607 */
3608 head = 0;
3609 } else {
3610 /*
3611 * Cache the page 28 times out of every 32. Note that
3612 * the page is deactivated instead of cached, but placed
3613 * at the head of the queue instead of the tail.
3614 */
3615 head = 1;
3616 }
3617 vm_page_spin_lock(m);
3618 _vm_page_deactivate_locked(m, head);
3619 vm_page_spin_unlock(m);
3620 }
3621
3622 /*
3623 * These routines manipulate the 'soft busy' count for a page. A soft busy
3624 * is almost like a hard BUSY except that it allows certain compatible
3625 * operations to occur on the page while it is busy. For example, a page
3626 * undergoing a write can still be mapped read-only.
3627 *
3628 * We also use soft-busy to quickly pmap_enter shared read-only pages
3629 * without having to hold the page locked.
3630 *
3631 * The soft-busy count can be > 1 in situations where multiple threads
3632 * are pmap_enter()ing the same page simultaneously, or when two buffer
3633 * cache buffers overlap the same page.
3634 *
3635 * The caller must hold the page BUSY when making these two calls.
3636 */
3637 void
3638 vm_page_io_start(vm_page_t m)
3639 {
3640 uint32_t ocount;
3641
3642 ocount = atomic_fetchadd_int(&m->busy_count, 1);
3643 KKASSERT(ocount & PBUSY_LOCKED);
3644 }
3645
3646 void
3647 vm_page_io_finish(vm_page_t m)
3648 {
3649 uint32_t ocount;
3650
3651 ocount = atomic_fetchadd_int(&m->busy_count, -1);
3652 KKASSERT(ocount & PBUSY_MASK);
3653 #if 0
3654 if (((ocount - 1) & (PBUSY_LOCKED | PBUSY_MASK)) == 0)
3655 wakeup(m);
3656 #endif
3657 }
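
#if 0
/*
 * Illustrative sketch only, not compiled: a hypothetical write path
 * bracketing its I/O with the soft-busy count. Per the comment above,
 * the page is held hard-BUSY across both calls; the function name and
 * wmesg are assumptions for illustration.
 */
static void
example_io_window(vm_page_t m)
{
	vm_page_busy_wait(m, FALSE, "xiowin");	/* acquire hard busy */
	vm_page_io_start(m);			/* mark I/O in progress */
	/* ... issue and complete the write; the page may remain */
	/* mapped read-only while it is soft-busied ... */
	vm_page_io_finish(m);			/* drop the soft busy */
	vm_page_wakeup(m);			/* drop the hard busy */
}
#endif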

/*
 * Attempt to soft-busy a page. The page must not be PBUSY_LOCKED.
 *
 * We can't use fetchadd here because we might race a hard-busy and the
 * page freeing code asserts on a non-zero soft-busy count (even if only
 * temporary).
 *
 * Returns 0 on success, non-zero on failure.
 */
int
vm_page_sbusy_try(vm_page_t m)
{
	uint32_t ocount;

	for (;;) {
		ocount = m->busy_count;
		cpu_ccfence();
		if (ocount & PBUSY_LOCKED)
			return 1;
		if (atomic_cmpset_int(&m->busy_count, ocount, ocount + 1))
			break;
	}
	return 0;
#if 0
	if (m->busy_count & PBUSY_LOCKED)
		return 1;
	ocount = atomic_fetchadd_int(&m->busy_count, 1);
	if (ocount & PBUSY_LOCKED) {
		vm_page_sbusy_drop(m);
		return 1;
	}
	return 0;
#endif
}
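
#if 0
/*
 * Illustrative sketch only, not compiled: opportunistically soft-busy
 * a page for a shared read-only operation, falling back when the page
 * is hard-busied. vm_page_sbusy_drop() releases the count obtained
 * here; the function name and EBUSY return are assumptions.
 */
static int
example_sbusy(vm_page_t m)
{
	if (vm_page_sbusy_try(m))
		return EBUSY;	/* hard-busied, caller must fall back */
	/* ... shared read-only access, e.g. a pmap_enter ... */
	vm_page_sbusy_drop(m);
	return 0;
}
#endif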

/*
 * Indicate that a clean VM page requires a filesystem commit and cannot
 * be reused. Used by tmpfs.
 */
void
vm_page_need_commit(vm_page_t m)
{
	vm_page_flag_set(m, PG_NEED_COMMIT);
	vm_object_set_writeable_dirty(m->object);
}

void
vm_page_clear_commit(vm_page_t m)
{
	vm_page_flag_clear(m, PG_NEED_COMMIT);
}

/*
 * Allocate a page without an object. The returned page will be wired and
 * NOT busy. The function will block if no page is available, but only loop
 * if VM_ALLOC_RETRY is specified (else returns NULL after blocking).
 *
 * The pindex can be passed as zero, and is typically passed to help the
 * allocator 'color' the page returned. That is, select pages that are
 * cache-friendly if the caller is allocating multiple pages.
 *
 *	VM_ALLOC_QUICK		- Allocate from free queue only
 *	VM_ALLOC_NORMAL		- Allocate from free + cache
 *	VM_ALLOC_SYSTEM		- Allocation can use system page reserve
 *	VM_ALLOC_INTERRUPT	- Allocation can use emergency page reserve
 *
 *	VM_ALLOC_CPU(n)		- Allocate using specified cpu localization
 *
 *	VM_ALLOC_ZERO		- Zero and set page valid. If not specified,
 *				  m->valid will be 0 and the page will contain
 *				  prior garbage.
 *
 *	VM_ALLOC_FORCE_ZERO	- (same as VM_ALLOC_ZERO in this case)
 *
 *	VM_ALLOC_RETRY		- Retry until a page is available. If not
 *				  specified, NULL can be returned.
 *
 *	VM_ALLOC_NULL_OK	- Not applicable since there is no object.
 */
vm_page_t
vm_page_alloczwq(vm_pindex_t pindex, int flags)
{
	vm_page_t m;

	KKASSERT(flags & (VM_ALLOC_NORMAL | VM_ALLOC_QUICK |
			  VM_ALLOC_INTERRUPT | VM_ALLOC_SYSTEM));
	for (;;) {
		m = vm_page_alloc(NULL, pindex, flags & ~VM_ALLOC_RETRY);
		if (m)
			break;
		vm_wait(0);
		if ((flags & VM_ALLOC_RETRY) == 0)
			return NULL;
	}

	if (flags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
		pmap_zero_page(VM_PAGE_TO_PHYS(m));
		m->valid = VM_PAGE_BITS_ALL;
	}

	vm_page_wire(m);
	vm_page_wakeup(m);

	return(m);
}

/*
 * Free a page previously allocated via vm_page_alloczwq().
 *
 * Caller should not busy the page. This function will busy, unwire,
 * and free the page.
 */
void
vm_page_freezwq(vm_page_t m)
{
	vm_page_busy_wait(m, FALSE, "pgzwq");
	vm_page_unwire(m, 0);
	vm_page_free(m);
}
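
#if 0
/*
 * Illustrative sketch only, not compiled: allocate a wired, zeroed
 * scratch page that cannot fail (VM_ALLOC_RETRY loops until a page
 * becomes available), then release it. The function name is an
 * assumption for illustration.
 */
static void
example_zwq(void)
{
	vm_page_t m;

	m = vm_page_alloczwq(0, VM_ALLOC_NORMAL | VM_ALLOC_ZERO |
				VM_ALLOC_RETRY);
	/* ... use VM_PAGE_TO_PHYS(m); the page is wired, not busy ... */
	vm_page_freezwq(m);
}
#endif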

/*
 * Grab a page, blocking if it is busy and allocating a page if necessary.
 * A busy page is returned or NULL. The page may or may not be valid and
 * might not be on a queue (the caller is responsible for the disposition of
 * the page).
 *
 *	VM_ALLOC_QUICK		- Allocate from free queue only
 *	VM_ALLOC_NORMAL		- Allocate from free + cache
 *	VM_ALLOC_SYSTEM		- Allocation can use system page reserve
 *	VM_ALLOC_INTERRUPT	- Allocation can use emergency page reserve
 *
 *	VM_ALLOC_CPU(n)		- Allocate using specified cpu localization
 *
 *	VM_ALLOC_ZERO		- If the page does not exist and must be
 *				  allocated, it will be zeroed and set valid.
 *
 *	VM_ALLOC_FORCE_ZERO	- The page will be zeroed and set valid whether
 *				  it previously existed or had to be allocated.
 *
 *	VM_ALLOC_RETRY		- Routine waits and loops until it can obtain
 *				  the page, never returning NULL. Also note
 *				  that VM_ALLOC_NORMAL must also be specified
 *				  if you use VM_ALLOC_RETRY.
 *
 *				  Also, VM_ALLOC_NULL_OK is implied when
 *				  VM_ALLOC_RETRY is specified, but will simply
 *				  cause a retry loop and never return NULL.
 *
 *	VM_ALLOC_NULL_OK	- Prevent panic on insertion collision. This
 *				  flag is implied and need not be set if
 *				  VM_ALLOC_RETRY is specified.
 *
 *				  If VM_ALLOC_RETRY is not specified, the page
 *				  can still be pre-existing and will be
 *				  returned if so, but concurrent creation of
 *				  the same 'new' page can cause one or more
 *				  grabs to return NULL.
 *
 * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
 * always returned if we had blocked.
 *
 * This routine may not be called from an interrupt.
 *
 * No other requirements.
 */
vm_page_t
vm_page_grab(vm_object_t object, vm_pindex_t pindex, int flags)
{
	vm_page_t m;
	int error;
	int shared = 1;

	KKASSERT(flags & (VM_ALLOC_NORMAL | VM_ALLOC_QUICK |
			  VM_ALLOC_INTERRUPT | VM_ALLOC_SYSTEM));
	vm_object_hold_shared(object);
	for (;;) {
		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
		if (error) {
			vm_page_sleep_busy(m, TRUE, "pgrbwt");
			if ((flags & VM_ALLOC_RETRY) == 0) {
				m = NULL;
				break;
			}
			/* retry */
		} else if (m == NULL) {
			if (shared) {
				vm_object_upgrade(object);
				shared = 0;
			}
			if (flags & VM_ALLOC_RETRY)
				flags |= VM_ALLOC_NULL_OK;
			m = vm_page_alloc(object, pindex,
					  flags & ~VM_ALLOC_RETRY);
			if (m)
				break;
			vm_wait(0);
			if ((flags & VM_ALLOC_RETRY) == 0)
				goto failed;
		} else {
			/* m found */
			break;
		}
	}

	/*
	 * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
	 *
	 * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
	 * valid even if already valid.
	 *
	 * NOTE! We have removed all of the PG_ZERO optimizations and also
	 *	 removed the idle zeroing code. These optimizations actually
	 *	 slow things down on modern cpus because the zeroed area is
	 *	 likely uncached, placing a memory-access burden on the
	 *	 accessors taking the fault.
	 *
	 *	 By always zeroing the page in-line with the fault, no
	 *	 dynamic ram reads are needed and the caches are hot, ready
	 *	 for userland to access the memory.
	 */
	if (m->valid == 0) {
		if (flags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
			pmap_zero_page(VM_PAGE_TO_PHYS(m));
			m->valid = VM_PAGE_BITS_ALL;
		}
	} else if (flags & VM_ALLOC_FORCE_ZERO) {
		pmap_zero_page(VM_PAGE_TO_PHYS(m));
		m->valid = VM_PAGE_BITS_ALL;
	}
failed:
	vm_object_drop(object);
	return(m);
}
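
#if 0
/*
 * Illustrative sketch only, not compiled: grab a page that is
 * guaranteed to come back busy and zeroed/valid. VM_ALLOC_RETRY
 * requires VM_ALLOC_NORMAL and ensures a non-NULL return; the
 * function name is an assumption for illustration.
 */
static void
example_grab(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_page_grab(object, pindex,
			 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
	/* ... operate on the hard-busied page ... */
	vm_page_wakeup(m);
}
#endif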

/*
 * Mapping function for valid bits or for dirty bits in
 * a page. May not block.
 *
 * Inputs are required to range within a page.
 *
 * No requirements.
 * Non blocking.
 */
int
vm_page_bits(int base, int size)
{
	int first_bit;
	int last_bit;

	KASSERT(
	    base + size <= PAGE_SIZE,
	    ("vm_page_bits: illegal base/size %d/%d", base, size)
	);

	if (size == 0)		/* handle degenerate case */
		return(0);

	first_bit = base >> DEV_BSHIFT;
	last_bit = (base + size - 1) >> DEV_BSHIFT;

	return ((2 << last_bit) - (1 << first_bit));
}
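
/*
 * Worked example: with PAGE_SIZE 4096 and DEV_BSIZE 512 (DEV_BSHIFT 9),
 * vm_page_bits(512, 1024) covers device blocks 1 and 2: first_bit = 1,
 * last_bit = (512 + 1024 - 1) >> 9 = 2, so the function returns
 * (2 << 2) - (1 << 1) = 8 - 2 = 0x06 (bits 1 and 2 set).
 */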

/*
 * Sets portions of a page valid and clean. The arguments are expected
 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 * of any partial chunks touched by the range. The invalid portion of
 * such chunks will be zero'd.
 *
 * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
 *	 align base to DEV_BSIZE so as not to mark clean a partially
 *	 truncated device block. Otherwise the dirty page status might be
 *	 lost.
 *
 * This routine may not block.
 *
 * (base + size) must be less than or equal to PAGE_SIZE.
 */
static void
_vm_page_zero_valid(vm_page_t m, int base, int size)
{
	int frag;
	int endoff;

	if (size == 0)	/* handle degenerate case */
		return;

	/*
	 * If the base is not DEV_BSIZE aligned and the valid
	 * bit is clear, we have to zero out a portion of the
	 * first block.
	 */

	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
	) {
		pmap_zero_page_area(
		    VM_PAGE_TO_PHYS(m),
		    frag,
		    base - frag
		);
	}

	/*
	 * If the ending offset is not DEV_BSIZE aligned and the
	 * valid bit is clear, we have to zero out a portion of
	 * the last block.
	 */

	endoff = base + size;

	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
	) {
		pmap_zero_page_area(
		    VM_PAGE_TO_PHYS(m),
		    endoff,
		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
		);
	}
}

/*
 * Set valid, clear dirty bits. If validating the entire
 * page we can safely clear the pmap modify bit. We also
 * use this opportunity to clear the PG_NOSYNC flag. If a process
 * takes a write fault on a MAP_NOSYNC memory area the flag will
 * be set again.
 *
 * We set valid bits inclusive of any overlap, but we can only
 * clear dirty bits for DEV_BSIZE chunks that are fully within
 * the range.
 *
 * Page must be busied?
 * No other requirements.
 */
void
vm_page_set_valid(vm_page_t m, int base, int size)
{
	_vm_page_zero_valid(m, base, size);
	m->valid |= vm_page_bits(base, size);
}

/*
 * Set valid bits and clear dirty bits.
 *
 * Page must be busied by caller.
 *
 * NOTE: This function does not clear the pmap modified bit.
 *	 Also note that e.g. NFS may use a byte-granular base
 *	 and size.
 *
 * No other requirements.
 */
void
vm_page_set_validclean(vm_page_t m, int base, int size)
{
	int pagebits;

	_vm_page_zero_valid(m, base, size);
	pagebits = vm_page_bits(base, size);
	m->valid |= pagebits;
	m->dirty &= ~pagebits;
	if (base == 0 && size == PAGE_SIZE) {
		/*pmap_clear_modify(m);*/
		vm_page_flag_clear(m, PG_NOSYNC);
	}
}
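
/*
 * Worked example: on a 4096 byte page with DEV_BSIZE 512 there are
 * eight chunks. vm_page_set_validclean(m, 0, 512) sets bit 0 of
 * m->valid and clears bit 0 of m->dirty, leaving the other seven
 * chunks untouched; only a full-page call (base 0, size PAGE_SIZE)
 * also clears PG_NOSYNC.
 */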

/*
 * Set valid & dirty. Used by buwrite().
 *
 * Page must be busied by caller.
 */
void
vm_page_set_validdirty(vm_page_t m, int base, int size)
{
	int pagebits;

	pagebits = vm_page_bits(base, size);
	m->valid |= pagebits;
	m->dirty |= pagebits;
	if (m->object)
		vm_object_set_writeable_dirty(m->object);
}

/*
 * Clear dirty bits.
 *
 * NOTE: This function does not clear the pmap modified bit.
 *	 Also note that e.g. NFS may use a byte-granular base
 *	 and size.
 *
 * Page must be busied?
 * No other requirements.
 */
void
vm_page_clear_dirty(vm_page_t m, int base, int size)
{
	m->dirty &= ~vm_page_bits(base, size);
	if (base == 0 && size == PAGE_SIZE) {
		/*pmap_clear_modify(m);*/
		vm_page_flag_clear(m, PG_NOSYNC);
	}
}

/*
 * Make the page all-dirty.
 *
 * Also make sure the related object and vnode reflect the fact that the
 * object may now contain a dirty page.
 *
 * Page must be busied?
 * No other requirements.
 */
void
vm_page_dirty(vm_page_t m)
{
#ifdef INVARIANTS
	int pqtype = m->queue - m->pc;
#endif
	KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
		("vm_page_dirty: page in free/cache queue!"));
	if (m->dirty != VM_PAGE_BITS_ALL) {
		m->dirty = VM_PAGE_BITS_ALL;
		if (m->object)
			vm_object_set_writeable_dirty(m->object);
	}
}

/*
 * Invalidates DEV_BSIZE'd chunks within a page. Both the
 * valid and dirty bits for the affected areas are cleared.
 *
 * Page must be busied?
 * Does not block.
 * No other requirements.
 */
void
vm_page_set_invalid(vm_page_t m, int base, int size)
{
	int bits;

	bits = vm_page_bits(base, size);
	m->valid &= ~bits;
	m->dirty &= ~bits;
	atomic_add_int(&m->object->generation, 1);
}

/*
 * The kernel assumes that the invalid portions of a page contain
 * garbage, but such pages can be mapped into memory by user code.
 * When this occurs, we must zero out the non-valid portions of the
 * page so user code sees what it expects.
 *
 * Pages are most often semi-valid when the end of a file is mapped
 * into memory and the file's size is not page aligned.
 *
 * Page must be busied?
 * No other requirements.
 */
void
vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
{
	int b;
	int i;

	/*
	 * Scan the valid bits looking for invalid sections that
	 * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the
	 * valid bit may be set ) have already been zeroed by
	 * vm_page_set_validclean().
	 */
	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
		if (i == (PAGE_SIZE / DEV_BSIZE) ||
		    (m->valid & (1 << i))
		) {
			if (i > b) {
				pmap_zero_page_area(
				    VM_PAGE_TO_PHYS(m),
				    b << DEV_BSHIFT,
				    (i - b) << DEV_BSHIFT
				);
			}
			b = i + 1;
		}
	}

	/*
	 * setvalid is TRUE when we can safely set the zero'd areas
	 * as being valid. We can do this if there are no cache consistency
	 * issues. e.g. it is ok to do with UFS, but not ok to do with NFS.
	 */
	if (setvalid)
		m->valid = VM_PAGE_BITS_ALL;
}
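
/*
 * Worked example: with eight DEV_BSIZE chunks per page and
 * m->valid == 0x0F, the scan above finds the invalid run covering
 * chunks 4-7 and zeroes bytes 2048-4095. If setvalid is TRUE the
 * entire page is then marked valid.
 */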

/*
 * Is a (partial) page valid? Note that the case where size == 0
 * will return FALSE in the degenerate case where the page is entirely
 * invalid, and TRUE otherwise.
 *
 * Does not block.
 * No other requirements.
 */
int
vm_page_is_valid(vm_page_t m, int base, int size)
{
	int bits = vm_page_bits(base, size);

	if (m->valid && ((m->valid & bits) == bits))
		return 1;
	else
		return 0;
}

/*
 * Update dirty bits from pmap/mmu. May not block.
 *
 * Caller must hold the page busy.
 *
 * WARNING! Unless the page has been unmapped, this function only
 *	    provides a likely dirty status.
 */
void
vm_page_test_dirty(vm_page_t m)
{
	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) {
		vm_page_dirty(m);
	}
}

#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>

DB_SHOW_COMMAND(page, vm_page_print_page_info)
{
	db_printf("vmstats.v_free_count: %ld\n", vmstats.v_free_count);
	db_printf("vmstats.v_cache_count: %ld\n", vmstats.v_cache_count);
	db_printf("vmstats.v_inactive_count: %ld\n", vmstats.v_inactive_count);
	db_printf("vmstats.v_active_count: %ld\n", vmstats.v_active_count);
	db_printf("vmstats.v_wire_count: %ld\n", vmstats.v_wire_count);
	db_printf("vmstats.v_free_reserved: %ld\n", vmstats.v_free_reserved);
	db_printf("vmstats.v_free_min: %ld\n", vmstats.v_free_min);
	db_printf("vmstats.v_free_target: %ld\n", vmstats.v_free_target);
	db_printf("vmstats.v_inactive_target: %ld\n",
		  vmstats.v_inactive_target);
	db_printf("vmstats.v_paging_wait: %ld\n", vmstats.v_paging_wait);
	db_printf("vmstats.v_paging_start: %ld\n", vmstats.v_paging_start);
	db_printf("vmstats.v_paging_target1: %ld\n", vmstats.v_paging_target1);
	db_printf("vmstats.v_paging_target2: %ld\n", vmstats.v_paging_target2);
}

DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{
	int i;

	db_printf("PQ_FREE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %ld", vm_page_queues[PQ_FREE + i].lcnt);
	}
	db_printf("\n");

	db_printf("PQ_CACHE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %ld", vm_page_queues[PQ_CACHE + i].lcnt);
	}
	db_printf("\n");

	db_printf("PQ_ACTIVE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %ld", vm_page_queues[PQ_ACTIVE + i].lcnt);
	}
	db_printf("\n");

	db_printf("PQ_INACTIVE:");
	for (i = 0; i < PQ_L2_SIZE; i++) {
		db_printf(" %ld", vm_page_queues[PQ_INACTIVE + i].lcnt);
	}
	db_printf("\n");
}
#endif /* DDB */