xref: /freebsd/sys/i386/i386/pmap.c (revision aa0a1e58)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9  * All rights reserved.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * the Systems Programming Group of the University of Utah Computer
13  * Science Department and William Jolitz of UUNET Technologies Inc.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 3. All advertising materials mentioning features or use of this software
24  *    must display the following acknowledgement:
25  *	This product includes software developed by the University of
26  *	California, Berkeley and its contributors.
27  * 4. Neither the name of the University nor the names of its contributors
28  *    may be used to endorse or promote products derived from this software
29  *    without specific prior written permission.
30  *
31  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41  * SUCH DAMAGE.
42  *
43  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44  */
45 /*-
46  * Copyright (c) 2003 Networks Associates Technology, Inc.
47  * All rights reserved.
48  *
49  * This software was developed for the FreeBSD Project by Jake Burkholder,
50  * Safeport Network Services, and Network Associates Laboratories, the
51  * Security Research Division of Network Associates, Inc. under
52  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53  * CHATS research program.
54  *
55  * Redistribution and use in source and binary forms, with or without
56  * modification, are permitted provided that the following conditions
57  * are met:
58  * 1. Redistributions of source code must retain the above copyright
59  *    notice, this list of conditions and the following disclaimer.
60  * 2. Redistributions in binary form must reproduce the above copyright
61  *    notice, this list of conditions and the following disclaimer in the
62  *    documentation and/or other materials provided with the distribution.
63  *
64  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74  * SUCH DAMAGE.
75  */
76 
77 #include <sys/cdefs.h>
78 __FBSDID("$FreeBSD$");
79 
80 /*
81  *	Manages physical address maps.
82  *
83  *	In addition to hardware address maps, this
84  *	module is called upon to provide software-use-only
85  *	maps which may or may not be stored in the same
86  *	form as hardware maps.  These pseudo-maps are
87  *	used to store intermediate results from copy
88  *	operations to and from address spaces.
89  *
90  *	Since the information managed by this module is
91  *	also stored by the logical address mapping module,
92  *	this module may throw away valid virtual-to-physical
93  *	mappings at almost any time.  However, invalidations
94  *	of virtual-to-physical mappings must be done as
95  *	requested.
96  *
97  *	In order to cope with hardware architectures which
98  *	make virtual-to-physical map invalidations expensive,
99  *	this module may delay invalidation or reduced-protection
100  *	operations until such time as they are actually
101  *	necessary.  This module is given full information as
102  *	to which processors are currently using which maps,
103  *	and to when physical maps must be made correct.
104  */
105 
106 #include "opt_cpu.h"
107 #include "opt_pmap.h"
108 #include "opt_smp.h"
109 #include "opt_xbox.h"
110 
111 #include <sys/param.h>
112 #include <sys/systm.h>
113 #include <sys/kernel.h>
114 #include <sys/ktr.h>
115 #include <sys/lock.h>
116 #include <sys/malloc.h>
117 #include <sys/mman.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/proc.h>
121 #include <sys/sf_buf.h>
122 #include <sys/sx.h>
123 #include <sys/vmmeter.h>
124 #include <sys/sched.h>
125 #include <sys/sysctl.h>
126 #ifdef SMP
127 #include <sys/smp.h>
128 #endif
129 
130 #include <vm/vm.h>
131 #include <vm/vm_param.h>
132 #include <vm/vm_kern.h>
133 #include <vm/vm_page.h>
134 #include <vm/vm_map.h>
135 #include <vm/vm_object.h>
136 #include <vm/vm_extern.h>
137 #include <vm/vm_pageout.h>
138 #include <vm/vm_pager.h>
139 #include <vm/vm_reserv.h>
140 #include <vm/uma.h>
141 
142 #include <machine/cpu.h>
143 #include <machine/cputypes.h>
144 #include <machine/md_var.h>
145 #include <machine/pcb.h>
146 #include <machine/specialreg.h>
147 #ifdef SMP
148 #include <machine/smp.h>
149 #endif
150 
151 #ifdef XBOX
152 #include <machine/xbox.h>
153 #endif
154 
155 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
156 #define CPU_ENABLE_SSE
157 #endif
158 
159 #ifndef PMAP_SHPGPERPROC
160 #define PMAP_SHPGPERPROC 200
161 #endif
162 
163 #if !defined(DIAGNOSTIC)
164 #ifdef __GNUC_GNU_INLINE__
165 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
166 #else
167 #define PMAP_INLINE	extern inline
168 #endif
169 #else
170 #define PMAP_INLINE
171 #endif
172 
173 #define PV_STATS
174 #ifdef PV_STATS
175 #define PV_STAT(x)	do { x ; } while (0)
176 #else
177 #define PV_STAT(x)	do { } while (0)
178 #endif
179 
180 #define	pa_index(pa)	((pa) >> PDRSHIFT)
181 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
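/*
 * Editor's note (not in the original source): pa_index()/pa_to_pvh() locate
 * the pv list head for the superpage containing a physical address.  For
 * example, with 4 MB superpages (PDRSHIFT == 22), pa_index(0x00C12345) == 3,
 * so pa_to_pvh(0x00C12345) returns &pv_table[3].
 */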
182 
183 /*
184  * Get PDEs and PTEs for user/kernel address space
185  */
186 #define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
187 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
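/*
 * Editor's note (not in the original source): an illustrative sketch of how
 * pmap_pde() indexes the page directory, assuming the non-PAE layout where
 * PDRSHIFT is 22 and each PDE covers 4 MB:
 *
 *	va = 0xC0412345
 *	va >> PDRSHIFT == 0x301
 *	pmap_pde(kernel_pmap, va) == &kernel_pmap->pm_pdir[0x301]
 *
 * Under PAE, PDRSHIFT is 21 and each PDE covers 2 MB instead.
 */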
188 
189 #define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
190 #define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
191 #define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
192 #define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
193 #define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
194 
195 #define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
196     atomic_clear_int((u_int *)(pte), PG_W))
197 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
198 
199 struct pmap kernel_pmap_store;
200 LIST_HEAD(pmaplist, pmap);
201 static struct pmaplist allpmaps;
202 static struct mtx allpmaps_lock;
203 
204 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
205 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
206 int pgeflag = 0;		/* PG_G or-in */
207 int pseflag = 0;		/* PG_PS or-in */
208 
209 static int nkpt = NKPT;
210 vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
211 extern u_int32_t KERNend;
212 extern u_int32_t KPTphys;
213 
214 #ifdef PAE
215 pt_entry_t pg_nx;
216 static uma_zone_t pdptzone;
217 #endif
218 
219 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
220 
221 static int pat_works = 1;
222 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
223     "Is page attribute table fully functional?");
224 
225 static int pg_ps_enabled = 1;
226 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
227     "Are large page mappings enabled?");
228 
229 #define	PAT_INDEX_SIZE	8
230 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
231 
232 /*
233  * Data for the pv entry allocation mechanism
234  */
235 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
236 static struct md_page *pv_table;
237 static int shpgperproc = PMAP_SHPGPERPROC;
238 
239 struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
240 int pv_maxchunks;			/* How many chunks we have KVA for */
241 vm_offset_t pv_vafree;			/* freelist stored in the PTE */
242 
243 /*
244  * All those kernel PT submaps that BSD is so fond of
245  */
246 struct sysmaps {
247 	struct	mtx lock;
248 	pt_entry_t *CMAP1;
249 	pt_entry_t *CMAP2;
250 	caddr_t	CADDR1;
251 	caddr_t	CADDR2;
252 };
253 static struct sysmaps sysmaps_pcpu[MAXCPU];
254 pt_entry_t *CMAP1 = 0;
255 static pt_entry_t *CMAP3;
256 static pd_entry_t *KPTD;
257 caddr_t CADDR1 = 0, ptvmmap = 0;
258 static caddr_t CADDR3;
259 struct msgbuf *msgbufp = 0;
260 
261 /*
262  * Crashdump maps.
263  */
264 static caddr_t crashdumpmap;
265 
266 static pt_entry_t *PMAP1 = 0, *PMAP2;
267 static pt_entry_t *PADDR1 = 0, *PADDR2;
268 #ifdef SMP
269 static int PMAP1cpu;
270 static int PMAP1changedcpu;
271 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
272 	   &PMAP1changedcpu, 0,
273 	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
274 #endif
275 static int PMAP1changed;
276 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
277 	   &PMAP1changed, 0,
278 	   "Number of times pmap_pte_quick changed PMAP1");
279 static int PMAP1unchanged;
280 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
281 	   &PMAP1unchanged, 0,
282 	   "Number of times pmap_pte_quick didn't change PMAP1");
283 static struct mtx PMAP2mutex;
284 
285 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
286 static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
287 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
288 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
289 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
290 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
291 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
292 		    vm_offset_t va);
293 static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
294 
295 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
296 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
297     vm_prot_t prot);
298 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
299     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
300 static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
301 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
302 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
303 static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
304 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
305 static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
306 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
307 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
308 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
309 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
310     vm_prot_t prot);
311 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
312 static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
313     vm_page_t *free);
314 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
315     vm_page_t *free);
316 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
317 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
318     vm_page_t *free);
319 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
320 					vm_offset_t va);
321 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
322 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
323     vm_page_t m);
324 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
325     pd_entry_t newpde);
326 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
327 
328 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
329 
330 static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
331 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free);
332 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
333 static void pmap_pte_release(pt_entry_t *pte);
334 static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
335 #ifdef PAE
336 static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
337 #endif
338 static void pmap_set_pg(void);
339 
340 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
341 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
342 
343 /*
344  * If you get an error here, then you set KVA_PAGES wrong! See the
345  * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
346  * a multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
347  */
348 CTASSERT(KERNBASE % (1 << 24) == 0);
349 
350 /*
351  *	Bootstrap the system enough to run with virtual memory.
352  *
353  *	On the i386 this is called after mapping has already been enabled
354  *	and just syncs the pmap module with what has already been done.
355  *	[We can't call it easily with mapping off since the kernel is not
356  *	mapped with PA == VA, hence we would have to relocate every address
357  *	from the linked base (virtual) address "KERNBASE" to the actual
358  *	(physical) address starting relative to 0]
359  */
360 void
361 pmap_bootstrap(vm_paddr_t firstaddr)
362 {
363 	vm_offset_t va;
364 	pt_entry_t *pte, *unused;
365 	struct sysmaps *sysmaps;
366 	int i;
367 
368 	/*
369 	 * Initialize the first available kernel virtual address.  However,
370 	 * using "firstaddr" may waste a few pages of the kernel virtual
371 	 * address space, because locore may not have mapped every physical
372 	 * page that it allocated.  Preferably, locore would provide a first
373 	 * unused virtual address in addition to "firstaddr".
374 	 */
375 	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
376 
377 	virtual_end = VM_MAX_KERNEL_ADDRESS;
378 
379 	/*
380 	 * Initialize the kernel pmap (which is statically allocated).
381 	 */
382 	PMAP_LOCK_INIT(kernel_pmap);
383 	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
384 #ifdef PAE
385 	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
386 #endif
387 	kernel_pmap->pm_root = NULL;
388 	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
389 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
390 	LIST_INIT(&allpmaps);
391 
392 	/*
393 	 * Request a spin mutex so that changes to allpmaps cannot be
394 	 * preempted by smp_rendezvous_cpus().  Otherwise,
395 	 * pmap_update_pde_kernel() could access allpmaps while it is
396 	 * being changed.
397 	 */
398 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
399 	mtx_lock_spin(&allpmaps_lock);
400 	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
401 	mtx_unlock_spin(&allpmaps_lock);
402 
403 	/*
404 	 * Reserve some special page table entries/VA space for temporary
405 	 * mapping of pages.
406 	 */
407 #define	SYSMAP(c, p, v, n)	\
408 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
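/*
 * Editor's note (not in the original source): a sketch of what one SYSMAP()
 * invocation below expands to.  For example, SYSMAP(caddr_t, CMAP1, CADDR1, 1)
 * becomes roughly:
 *
 *	CADDR1 = (caddr_t)va;	va += 1 * PAGE_SIZE;
 *	CMAP1 = pte;		pte += 1;
 *
 * i.e., it reserves "n" pages of KVA starting at "va" and records both the
 * virtual address and the kernel PTE that maps it, without creating a mapping.
 */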
409 
410 	va = virtual_avail;
411 	pte = vtopte(va);
412 
413 	/*
414 	 * CMAP1/CMAP2 are used for zeroing and copying pages.
415 	 * CMAP3 is used for the idle process page zeroing.
416 	 */
417 	for (i = 0; i < MAXCPU; i++) {
418 		sysmaps = &sysmaps_pcpu[i];
419 		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
420 		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
421 		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
422 	}
423 	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
424 	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
425 
426 	/*
427 	 * Crashdump maps.
428 	 */
429 	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
430 
431 	/*
432 	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
433 	 */
434 	SYSMAP(caddr_t, unused, ptvmmap, 1)
435 
436 	/*
437 	 * msgbufp is used to map the system message buffer.
438 	 */
439 	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
440 
441 	/*
442 	 * KPTmap is used by pmap_kextract().
443 	 *
444 	 * KPTmap is first initialized by locore.  However, that initial
445 	 * KPTmap can only support NKPT page table pages.  Here, a larger
446 	 * KPTmap is created that can support KVA_PAGES page table pages.
447 	 */
448 	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
449 
450 	for (i = 0; i < NKPT; i++)
451 		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
452 
453 	/*
454 	 * Adjust the start of the KPTD and KPTmap so that the implementation
455 	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
456 	 */
457 	KPTD -= KPTDI;
458 	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
459 
460 	/*
461 	 * PMAP1/PADDR1 and PMAP2/PADDR2 are used by pmap_pte_quick() and pmap_pte().
462 	 */
463 	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
464 	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
465 
466 	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
467 
468 	virtual_avail = va;
469 
470 	/*
471 	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
472 	 * physical memory region that is used by the ACPI wakeup code.  This
473 	 * mapping must not have PG_G set.
474 	 */
475 #ifdef XBOX
476 	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
477 	 * an early stage, we cannot yet neatly map video memory ... :-(
478 	 * Better fixes are very welcome! */
479 	if (!arch_i386_is_xbox)
480 #endif
481 	for (i = 1; i < NKPT; i++)
482 		PTD[i] = 0;
483 
484 	/* Initialize the PAT MSR if present. */
485 	pmap_init_pat();
486 
487 	/* Turn on PG_G on kernel page(s) */
488 	pmap_set_pg();
489 }
490 
491 /*
492  * Setup the PAT MSR.
493  */
494 void
495 pmap_init_pat(void)
496 {
497 	int pat_table[PAT_INDEX_SIZE];
498 	uint64_t pat_msr;
499 	u_long cr0, cr4;
500 	int i;
501 
502 	/* Set default PAT index table. */
503 	for (i = 0; i < PAT_INDEX_SIZE; i++)
504 		pat_table[i] = -1;
505 	pat_table[PAT_WRITE_BACK] = 0;
506 	pat_table[PAT_WRITE_THROUGH] = 1;
507 	pat_table[PAT_UNCACHEABLE] = 3;
508 	pat_table[PAT_WRITE_COMBINING] = 3;
509 	pat_table[PAT_WRITE_PROTECTED] = 3;
510 	pat_table[PAT_UNCACHED] = 3;
511 
512 	/* Bail if this CPU doesn't implement PAT. */
513 	if ((cpu_feature & CPUID_PAT) == 0) {
514 		for (i = 0; i < PAT_INDEX_SIZE; i++)
515 			pat_index[i] = pat_table[i];
516 		pat_works = 0;
517 		return;
518 	}
519 
520 	/*
521 	 * Due to some Intel errata, we can only safely use the lower 4
522 	 * PAT entries.
523 	 *
524 	 *   Intel Pentium III Processor Specification Update
525 	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
526 	 * or Mode C Paging)
527 	 *
528 	 *   Intel Pentium IV  Processor Specification Update
529 	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
530 	 */
531 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
532 	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
533 		pat_works = 0;
534 
535 	/* Initialize default PAT entries. */
536 	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
537 	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
538 	    PAT_VALUE(2, PAT_UNCACHED) |
539 	    PAT_VALUE(3, PAT_UNCACHEABLE) |
540 	    PAT_VALUE(4, PAT_WRITE_BACK) |
541 	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
542 	    PAT_VALUE(6, PAT_UNCACHED) |
543 	    PAT_VALUE(7, PAT_UNCACHEABLE);
544 
545 	if (pat_works) {
546 		/*
547 		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
548 		 * Program 5 and 6 as WP and WC.
549 		 * Leave 4 and 7 as WB and UC.
550 		 */
551 		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
552 		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
553 		    PAT_VALUE(6, PAT_WRITE_COMBINING);
554 		pat_table[PAT_UNCACHED] = 2;
555 		pat_table[PAT_WRITE_PROTECTED] = 5;
556 		pat_table[PAT_WRITE_COMBINING] = 6;
557 	} else {
558 		/*
559 		 * Just replace PAT Index 2 with WC instead of UC-.
560 		 */
561 		pat_msr &= ~PAT_MASK(2);
562 		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
563 		pat_table[PAT_WRITE_COMBINING] = 2;
564 	}
565 
566 	/* Disable PGE. */
567 	cr4 = rcr4();
568 	load_cr4(cr4 & ~CR4_PGE);
569 
570 	/* Disable caches (CD = 1, NW = 0). */
571 	cr0 = rcr0();
572 	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
573 
574 	/* Flushes caches and TLBs. */
575 	wbinvd();
576 	invltlb();
577 
578 	/* Update PAT and index table. */
579 	wrmsr(MSR_PAT, pat_msr);
580 	for (i = 0; i < PAT_INDEX_SIZE; i++)
581 		pat_index[i] = pat_table[i];
582 
583 	/* Flush caches and TLBs again. */
584 	wbinvd();
585 	invltlb();
586 
587 	/* Restore caches and PGE. */
588 	load_cr0(cr0);
589 	load_cr4(cr4);
590 }
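/*
 * Editor's note (not in the original source): the resulting PAT layout when
 * pat_works is set, for reference:
 *
 *	index:	0	1	2	3	4	5	6	7
 *	type:	WB	WT	UC-	UC	WB	WP	WC	UC
 *
 * When pat_works is clear, only index 2 is changed (UC- becomes WC) and the
 * upper four entries keep their default values.
 */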
591 
592 /*
593  * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
594  */
595 static void
596 pmap_set_pg(void)
597 {
598 	pt_entry_t *pte;
599 	vm_offset_t va, endva;
600 
601 	if (pgeflag == 0)
602 		return;
603 
604 	endva = KERNBASE + KERNend;
605 
606 	if (pseflag) {
607 		va = KERNBASE + KERNLOAD;
608 		while (va  < endva) {
609 			pdir_pde(PTD, va) |= pgeflag;
610 			invltlb();	/* Play it safe, invltlb() every time */
611 			va += NBPDR;
612 		}
613 	} else {
614 		va = (vm_offset_t)btext;
615 		while (va < endva) {
616 			pte = vtopte(va);
617 			if (*pte)
618 				*pte |= pgeflag;
619 			invltlb();	/* Play it safe, invltlb() every time */
620 			va += PAGE_SIZE;
621 		}
622 	}
623 }
624 
625 /*
626  * Initialize a vm_page's machine-dependent fields.
627  */
628 void
629 pmap_page_init(vm_page_t m)
630 {
631 
632 	TAILQ_INIT(&m->md.pv_list);
633 	m->md.pat_mode = PAT_WRITE_BACK;
634 }
635 
636 #ifdef PAE
637 static void *
638 pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
639 {
640 
641 	/* Inform UMA that this allocator uses kernel_map/object. */
642 	*flags = UMA_SLAB_KERNEL;
643 	return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL,
644 	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
645 }
646 #endif
647 
648 /*
649  * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
650  * Requirements:
651  *  - Must deal with pages in order to ensure that none of the PG_* bits
652  *    are ever set, PG_V in particular.
653  *  - Assumes we can write to ptes without pte_store() atomic ops, even
654  *    on PAE systems.  This should be ok.
655  *  - Assumes nothing will ever test these addresses for 0 to indicate
656  *    no mapping instead of correctly checking PG_V.
657  *  - Assumes a vm_offset_t will fit in a pte (true for i386).
658  * Because PG_V is never set, there can be no mappings to invalidate.
659  */
660 static vm_offset_t
661 pmap_ptelist_alloc(vm_offset_t *head)
662 {
663 	pt_entry_t *pte;
664 	vm_offset_t va;
665 
666 	va = *head;
667 	if (va == 0)
668 		return (va);	/* Out of memory */
669 	pte = vtopte(va);
670 	*head = *pte;
671 	if (*head & PG_V)
672 		panic("pmap_ptelist_alloc: va with PG_V set!");
673 	*pte = 0;
674 	return (va);
675 }
676 
677 static void
678 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
679 {
680 	pt_entry_t *pte;
681 
682 	if (va & PG_V)
683 		panic("pmap_ptelist_free: freeing va with PG_V set!");
684 	pte = vtopte(va);
685 	*pte = *head;		/* virtual! PG_V is 0 though */
686 	*head = va;
687 }
688 
689 static void
690 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
691 {
692 	int i;
693 	vm_offset_t va;
694 
695 	*head = 0;
696 	for (i = npages - 1; i >= 0; i--) {
697 		va = (vm_offset_t)base + i * PAGE_SIZE;
698 		pmap_ptelist_free(head, va);
699 	}
700 }
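/*
 * Editor's note (not in the original source): a sketch of the resulting
 * structure.  After pmap_ptelist_init(&pv_vafree, base, 3), the unmapped KVA
 * pages are chained through their (invalid) PTEs, newest free page first:
 *
 *	pv_vafree -> base + 0*PAGE_SIZE
 *	    pte(base + 0*PAGE_SIZE) holds base + 1*PAGE_SIZE
 *	    pte(base + 1*PAGE_SIZE) holds base + 2*PAGE_SIZE
 *	    pte(base + 2*PAGE_SIZE) holds 0 (end of list)
 *
 * Because the stored values are page-aligned virtual addresses, PG_V can
 * never appear set, which is what pmap_ptelist_alloc() and _free() assert.
 */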
701 
702 
703 /*
704  *	Initialize the pmap module.
705  *	Called by vm_init to initialize any structures that the pmap
706  *	system needs to map virtual memory.
707  */
708 void
709 pmap_init(void)
710 {
711 	vm_page_t mpte;
712 	vm_size_t s;
713 	int i, pv_npg;
714 
715 	/*
716 	 * Initialize the vm page array entries for the kernel pmap's
717 	 * page table pages.
718 	 */
719 	for (i = 0; i < NKPT; i++) {
720 		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
721 		KASSERT(mpte >= vm_page_array &&
722 		    mpte < &vm_page_array[vm_page_array_size],
723 		    ("pmap_init: page table page is out of range"));
724 		mpte->pindex = i + KPTDI;
725 		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
726 	}
727 
728 	/*
729 	 * Initialize the address space (zone) for the pv entries.  Set a
730 	 * high water mark so that the system can recover from excessive
731 	 * numbers of pv entries.
732 	 */
733 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
734 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
735 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
736 	pv_entry_max = roundup(pv_entry_max, _NPCPV);
737 	pv_entry_high_water = 9 * (pv_entry_max / 10);
738 
739 	/*
740 	 * If the kernel is running in a virtual machine on an AMD Family 10h
741 	 * processor, then it must assume that MCA is enabled by the virtual
742 	 * machine monitor.
743 	 */
744 	if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
745 	    CPUID_TO_FAMILY(cpu_id) == 0x10)
746 		workaround_erratum383 = 1;
747 
748 	/*
749 	 * Are large page mappings supported and enabled?
750 	 */
751 	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
752 	if (pseflag == 0)
753 		pg_ps_enabled = 0;
754 	else if (pg_ps_enabled) {
755 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
756 		    ("pmap_init: can't assign to pagesizes[1]"));
757 		pagesizes[1] = NBPDR;
758 	}
759 
760 	/*
761 	 * Calculate the size of the pv head table for superpages.
762 	 */
763 	for (i = 0; phys_avail[i + 1]; i += 2);
764 	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;
765 
766 	/*
767 	 * Allocate memory for the pv head table for superpages.
768 	 */
769 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
770 	s = round_page(s);
771 	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
772 	for (i = 0; i < pv_npg; i++)
773 		TAILQ_INIT(&pv_table[i].pv_list);
774 
775 	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
776 	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
777 	    PAGE_SIZE * pv_maxchunks);
778 	if (pv_chunkbase == NULL)
779 		panic("pmap_init: not enough kvm for pv chunks");
780 	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
781 #ifdef PAE
782 	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
783 	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
784 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
785 	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
786 #endif
787 }
788 
789 
790 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
791 	"Max number of PV entries");
792 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
793 	"Page share factor per proc");
794 
795 SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
796     "2/4MB page mapping counters");
797 
798 static u_long pmap_pde_demotions;
799 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
800     &pmap_pde_demotions, 0, "2/4MB page demotions");
801 
802 static u_long pmap_pde_mappings;
803 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
804     &pmap_pde_mappings, 0, "2/4MB page mappings");
805 
806 static u_long pmap_pde_p_failures;
807 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
808     &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
809 
810 static u_long pmap_pde_promotions;
811 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
812     &pmap_pde_promotions, 0, "2/4MB page promotions");
813 
814 /***************************************************
815  * Low level helper routines.....
816  ***************************************************/
817 
818 /*
819  * Determine the appropriate bits to set in a PTE or PDE for a specified
820  * caching mode.
821  */
822 int
823 pmap_cache_bits(int mode, boolean_t is_pde)
824 {
825 	int cache_bits, pat_flag, pat_idx;
826 
827 	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
828 		panic("Unknown caching mode %d\n", mode);
829 
830 	/* The PAT bit is different for PTE's and PDE's. */
831 	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
832 
833 	/* Map the caching mode to a PAT index. */
834 	pat_idx = pat_index[mode];
835 
836 	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
837 	cache_bits = 0;
838 	if (pat_idx & 0x4)
839 		cache_bits |= pat_flag;
840 	if (pat_idx & 0x2)
841 		cache_bits |= PG_NC_PCD;
842 	if (pat_idx & 0x1)
843 		cache_bits |= PG_NC_PWT;
844 	return (cache_bits);
845 }
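/*
 * Editor's note (not in the original source): an illustrative example of the
 * mapping above, assuming the "pat_works" layout programmed by pmap_init_pat()
 * (write-combining at PAT index 6):
 *
 *	pmap_cache_bits(PAT_WRITE_COMBINING, 0)
 *	    -> pat_idx = 6 = 0b110
 *	    -> returns PG_PTE_PAT | PG_NC_PCD
 *
 * For a PDE (is_pde != 0) the same index would instead set PG_PDE_PAT, since
 * the PAT bit occupies a different position in 2/4MB page directory entries.
 */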
846 
847 /*
848  * The caller is responsible for maintaining TLB consistency.
849  */
850 static void
851 pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
852 {
853 	pd_entry_t *pde;
854 	pmap_t pmap;
855 	boolean_t PTD_updated;
856 
857 	PTD_updated = FALSE;
858 	mtx_lock_spin(&allpmaps_lock);
859 	LIST_FOREACH(pmap, &allpmaps, pm_list) {
860 		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
861 		    PG_FRAME))
862 			PTD_updated = TRUE;
863 		pde = pmap_pde(pmap, va);
864 		pde_store(pde, newpde);
865 	}
866 	mtx_unlock_spin(&allpmaps_lock);
867 	KASSERT(PTD_updated,
868 	    ("pmap_kenter_pde: current page table is not in allpmaps"));
869 }
870 
871 /*
872  * After changing the page size for the specified virtual address in the page
873  * table, flush the corresponding entries from the processor's TLB.  Only the
874  * calling processor's TLB is affected.
875  *
876  * The calling thread must be pinned to a processor.
877  */
878 static void
879 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
880 {
881 	u_long cr4;
882 
883 	if ((newpde & PG_PS) == 0)
884 		/* Demotion: flush a specific 2MB page mapping. */
885 		invlpg(va);
886 	else if ((newpde & PG_G) == 0)
887 		/*
888 		 * Promotion: flush every 4KB page mapping from the TLB
889 		 * because there are too many to flush individually.
890 		 */
891 		invltlb();
892 	else {
893 		/*
894 		 * Promotion: flush every 4KB page mapping from the TLB,
895 		 * including any global (PG_G) mappings.
896 		 */
897 		cr4 = rcr4();
898 		load_cr4(cr4 & ~CR4_PGE);
899 		/*
900 		 * Although preemption at this point could be detrimental to
901 		 * performance, it would not lead to an error.  PG_G is simply
902 		 * ignored if CR4.PGE is clear.  Moreover, in case this block
903 		 * is re-entered, the load_cr4() either above or below will
904 		 * modify CR4.PGE flushing the TLB.
905 		 */
906 		load_cr4(cr4 | CR4_PGE);
907 	}
908 }
909 #ifdef SMP
910 /*
911  * For SMP, these functions have to use the IPI mechanism for coherence.
912  *
913  * N.B.: Before calling any of the following TLB invalidation functions,
914  * the calling processor must ensure that all stores updating a non-
915  * kernel page table are globally performed.  Otherwise, another
916  * processor could cache an old, pre-update entry without being
917  * invalidated.  This can happen one of two ways: (1) The pmap becomes
918  * active on another processor after its pm_active field is checked by
919  * one of the following functions but before a store updating the page
920  * table is globally performed. (2) The pmap becomes active on another
921  * processor before its pm_active field is checked but due to
922  * speculative loads one of the following functions still reads the
923  * pmap as inactive on the other processor.
924  *
925  * The kernel page table is exempt because its pm_active field is
926  * immutable.  The kernel page table is always active on every
927  * processor.
928  */
929 void
930 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
931 {
932 	cpumask_t cpumask, other_cpus;
933 
934 	sched_pin();
935 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
936 		invlpg(va);
937 		smp_invlpg(va);
938 	} else {
939 		cpumask = PCPU_GET(cpumask);
940 		other_cpus = PCPU_GET(other_cpus);
941 		if (pmap->pm_active & cpumask)
942 			invlpg(va);
943 		if (pmap->pm_active & other_cpus)
944 			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
945 	}
946 	sched_unpin();
947 }
948 
949 void
950 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
951 {
952 	cpumask_t cpumask, other_cpus;
953 	vm_offset_t addr;
954 
955 	sched_pin();
956 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
957 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
958 			invlpg(addr);
959 		smp_invlpg_range(sva, eva);
960 	} else {
961 		cpumask = PCPU_GET(cpumask);
962 		other_cpus = PCPU_GET(other_cpus);
963 		if (pmap->pm_active & cpumask)
964 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
965 				invlpg(addr);
966 		if (pmap->pm_active & other_cpus)
967 			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
968 			    sva, eva);
969 	}
970 	sched_unpin();
971 }
972 
973 void
974 pmap_invalidate_all(pmap_t pmap)
975 {
976 	cpumask_t cpumask, other_cpus;
977 
978 	sched_pin();
979 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
980 		invltlb();
981 		smp_invltlb();
982 	} else {
983 		cpumask = PCPU_GET(cpumask);
984 		other_cpus = PCPU_GET(other_cpus);
985 		if (pmap->pm_active & cpumask)
986 			invltlb();
987 		if (pmap->pm_active & other_cpus)
988 			smp_masked_invltlb(pmap->pm_active & other_cpus);
989 	}
990 	sched_unpin();
991 }
992 
993 void
994 pmap_invalidate_cache(void)
995 {
996 
997 	sched_pin();
998 	wbinvd();
999 	smp_cache_flush();
1000 	sched_unpin();
1001 }
1002 
1003 struct pde_action {
1004 	cpumask_t store;	/* processor that updates the PDE */
1005 	cpumask_t invalidate;	/* processors that invalidate their TLB */
1006 	vm_offset_t va;
1007 	pd_entry_t *pde;
1008 	pd_entry_t newpde;
1009 };
1010 
1011 static void
1012 pmap_update_pde_kernel(void *arg)
1013 {
1014 	struct pde_action *act = arg;
1015 	pd_entry_t *pde;
1016 	pmap_t pmap;
1017 
1018 	if (act->store == PCPU_GET(cpumask))
1019 		/*
1020 		 * Elsewhere, this operation requires allpmaps_lock for
1021 		 * synchronization.  Here, it does not because it is being
1022 		 * performed in the context of an all_cpus rendezvous.
1023 		 */
1024 		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1025 			pde = pmap_pde(pmap, act->va);
1026 			pde_store(pde, act->newpde);
1027 		}
1028 }
1029 
1030 static void
1031 pmap_update_pde_user(void *arg)
1032 {
1033 	struct pde_action *act = arg;
1034 
1035 	if (act->store == PCPU_GET(cpumask))
1036 		pde_store(act->pde, act->newpde);
1037 }
1038 
1039 static void
1040 pmap_update_pde_teardown(void *arg)
1041 {
1042 	struct pde_action *act = arg;
1043 
1044 	if ((act->invalidate & PCPU_GET(cpumask)) != 0)
1045 		pmap_update_pde_invalidate(act->va, act->newpde);
1046 }
1047 
1048 /*
1049  * Change the page size for the specified virtual address in a way that
1050  * prevents any possibility of the TLB ever having two entries that map the
1051  * same virtual address using different page sizes.  This is the recommended
1052  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1053  * machine check exception for a TLB state that is improperly diagnosed as a
1054  * hardware error.
1055  */
1056 static void
1057 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1058 {
1059 	struct pde_action act;
1060 	cpumask_t active, cpumask;
1061 
1062 	sched_pin();
1063 	cpumask = PCPU_GET(cpumask);
1064 	if (pmap == kernel_pmap)
1065 		active = all_cpus;
1066 	else
1067 		active = pmap->pm_active;
1068 	if ((active & PCPU_GET(other_cpus)) != 0) {
1069 		act.store = cpumask;
1070 		act.invalidate = active;
1071 		act.va = va;
1072 		act.pde = pde;
1073 		act.newpde = newpde;
1074 		smp_rendezvous_cpus(cpumask | active,
1075 		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1076 		    pmap_update_pde_kernel : pmap_update_pde_user,
1077 		    pmap_update_pde_teardown, &act);
1078 	} else {
1079 		if (pmap == kernel_pmap)
1080 			pmap_kenter_pde(va, newpde);
1081 		else
1082 			pde_store(pde, newpde);
1083 		if ((active & cpumask) != 0)
1084 			pmap_update_pde_invalidate(va, newpde);
1085 	}
1086 	sched_unpin();
1087 }
1088 #else /* !SMP */
1089 /*
1090  * Normal, non-SMP, 486+ invalidation functions.
1091  * We inline these within pmap.c for speed.
1092  */
1093 PMAP_INLINE void
1094 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1095 {
1096 
1097 	if (pmap == kernel_pmap || pmap->pm_active)
1098 		invlpg(va);
1099 }
1100 
1101 PMAP_INLINE void
1102 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1103 {
1104 	vm_offset_t addr;
1105 
1106 	if (pmap == kernel_pmap || pmap->pm_active)
1107 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1108 			invlpg(addr);
1109 }
1110 
1111 PMAP_INLINE void
1112 pmap_invalidate_all(pmap_t pmap)
1113 {
1114 
1115 	if (pmap == kernel_pmap || pmap->pm_active)
1116 		invltlb();
1117 }
1118 
1119 PMAP_INLINE void
1120 pmap_invalidate_cache(void)
1121 {
1122 
1123 	wbinvd();
1124 }
1125 
1126 static void
1127 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1128 {
1129 
1130 	if (pmap == kernel_pmap)
1131 		pmap_kenter_pde(va, newpde);
1132 	else
1133 		pde_store(pde, newpde);
1134 	if (pmap == kernel_pmap || pmap->pm_active)
1135 		pmap_update_pde_invalidate(va, newpde);
1136 }
1137 #endif /* !SMP */
1138 
1139 void
1140 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1141 {
1142 
1143 	KASSERT((sva & PAGE_MASK) == 0,
1144 	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1145 	KASSERT((eva & PAGE_MASK) == 0,
1146 	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1147 
1148 	if (cpu_feature & CPUID_SS)
1149 		; /* If "Self Snoop" is supported, do nothing. */
1150 	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1151 		 eva - sva < 2 * 1024 * 1024) {
1152 
1153 		/*
1154 		 * Otherwise, do per-cache line flush.  Use the mfence
1155 		 * instruction to ensure that previous stores are
1156 		 * included in the write-back.  The processor
1157 		 * propagates flush to other processors in the cache
1158 		 * coherence domain.
1159 		 */
1160 		mfence();
1161 		for (; sva < eva; sva += cpu_clflush_line_size)
1162 			clflush(sva);
1163 		mfence();
1164 	} else {
1165 
1166 		/*
1167 		 * No targeted cache flush methods are supported by the CPU,
1168 		 * or the supplied range is bigger than 2MB.
1169 		 * Globally invalidate cache.
1170 		 */
1171 		pmap_invalidate_cache();
1172 	}
1173 }
1174 
1175 /*
1176  * Are we current address space or kernel?  N.B. We return FALSE when
1177  * a pmap's page table is in use because a kernel thread is borrowing
1178  * it.  The borrowed page table can change spontaneously, making any
1179  * dependence on its continued use subject to a race condition.
1180  */
1181 static __inline int
1182 pmap_is_current(pmap_t pmap)
1183 {
1184 
1185 	return (pmap == kernel_pmap ||
1186 		(pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
1187 	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
1188 }
1189 
1190 /*
1191  * If the given pmap is not the current or kernel pmap, the returned pte must
1192  * be released by passing it to pmap_pte_release().
1193  */
1194 pt_entry_t *
1195 pmap_pte(pmap_t pmap, vm_offset_t va)
1196 {
1197 	pd_entry_t newpf;
1198 	pd_entry_t *pde;
1199 
1200 	pde = pmap_pde(pmap, va);
1201 	if (*pde & PG_PS)
1202 		return (pde);
1203 	if (*pde != 0) {
1204 		/* are we current address space or kernel? */
1205 		if (pmap_is_current(pmap))
1206 			return (vtopte(va));
1207 		mtx_lock(&PMAP2mutex);
1208 		newpf = *pde & PG_FRAME;
1209 		if ((*PMAP2 & PG_FRAME) != newpf) {
1210 			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1211 			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1212 		}
1213 		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1214 	}
1215 	return (NULL);
1216 }
1217 
1218 /*
1219  * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1220  * being NULL.
1221  */
1222 static __inline void
1223 pmap_pte_release(pt_entry_t *pte)
1224 {
1225 
1226 	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1227 		mtx_unlock(&PMAP2mutex);
1228 }
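/*
 * Editor's note (not in the original source): a hedged usage sketch of the
 * pmap_pte()/pmap_pte_release() pair on a non-current pmap; the names below
 * are purely illustrative:
 *
 *	pt_entry_t *pte;
 *
 *	pte = pmap_pte(some_pmap, some_va);
 *	if (pte != NULL) {
 *		... examine or modify *pte ...
 *		pmap_pte_release(pte);	// drops PMAP2mutex if PADDR2 was used
 *	}
 *
 * pmap_pte_release() is a no-op when the PTE came from the recursive mapping
 * (current or kernel pmap), so it is always safe to call after pmap_pte().
 */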
1229 
1230 static __inline void
1231 invlcaddr(void *caddr)
1232 {
1233 
1234 	invlpg((u_int)caddr);
1235 }
1236 
1237 /*
1238  * Super fast pmap_pte routine best used when scanning
1239  * the pv lists.  This eliminates many coarse-grained
1240  * invltlb calls.  Note that many of the pv list
1241  * scans are across different pmaps.  It is very wasteful
1242  * to do an entire invltlb for checking a single mapping.
1243  *
1244  * If the given pmap is not the current pmap, vm_page_queue_mtx
1245  * must be held and curthread pinned to a CPU.
1246  */
1247 static pt_entry_t *
1248 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1249 {
1250 	pd_entry_t newpf;
1251 	pd_entry_t *pde;
1252 
1253 	pde = pmap_pde(pmap, va);
1254 	if (*pde & PG_PS)
1255 		return (pde);
1256 	if (*pde != 0) {
1257 		/* are we current address space or kernel? */
1258 		if (pmap_is_current(pmap))
1259 			return (vtopte(va));
1260 		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1261 		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1262 		newpf = *pde & PG_FRAME;
1263 		if ((*PMAP1 & PG_FRAME) != newpf) {
1264 			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1265 #ifdef SMP
1266 			PMAP1cpu = PCPU_GET(cpuid);
1267 #endif
1268 			invlcaddr(PADDR1);
1269 			PMAP1changed++;
1270 		} else
1271 #ifdef SMP
1272 		if (PMAP1cpu != PCPU_GET(cpuid)) {
1273 			PMAP1cpu = PCPU_GET(cpuid);
1274 			invlcaddr(PADDR1);
1275 			PMAP1changedcpu++;
1276 		} else
1277 #endif
1278 			PMAP1unchanged++;
1279 		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1280 	}
1281 	return (0);
1282 }
1283 
1284 /*
1285  *	Routine:	pmap_extract
1286  *	Function:
1287  *		Extract the physical page address associated
1288  *		with the given map/virtual_address pair.
1289  */
1290 vm_paddr_t
1291 pmap_extract(pmap_t pmap, vm_offset_t va)
1292 {
1293 	vm_paddr_t rtval;
1294 	pt_entry_t *pte;
1295 	pd_entry_t pde;
1296 
1297 	rtval = 0;
1298 	PMAP_LOCK(pmap);
1299 	pde = pmap->pm_pdir[va >> PDRSHIFT];
1300 	if (pde != 0) {
1301 		if ((pde & PG_PS) != 0)
1302 			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1303 		else {
1304 			pte = pmap_pte(pmap, va);
1305 			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1306 			pmap_pte_release(pte);
1307 		}
1308 	}
1309 	PMAP_UNLOCK(pmap);
1310 	return (rtval);
1311 }
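/*
 * Editor's note (not in the original source): worked examples of the address
 * composition above, assuming the non-PAE constants (PDRSHIFT 22, so PDRMASK
 * is 0x3fffff):
 *
 *	4 KB mapping: pa = (*pte & PG_FRAME) | (va & PAGE_MASK)
 *	    e.g. frame 0x01234000, offset 0x345 -> pa 0x01234345
 *	4 MB mapping: pa = (pde & PG_PS_FRAME) | (va & PDRMASK)
 *	    e.g. frame 0x01400000, offset 0x12345 -> pa 0x01412345
 */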
1312 
1313 /*
1314  *	Routine:	pmap_extract_and_hold
1315  *	Function:
1316  *		Atomically extract and hold the physical page
1317  *		with the given pmap and virtual address pair
1318  *		if that mapping permits the given protection.
1319  */
1320 vm_page_t
1321 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1322 {
1323 	pd_entry_t pde;
1324 	pt_entry_t pte, *ptep;
1325 	vm_page_t m;
1326 	vm_paddr_t pa;
1327 
1328 	pa = 0;
1329 	m = NULL;
1330 	PMAP_LOCK(pmap);
1331 retry:
1332 	pde = *pmap_pde(pmap, va);
1333 	if (pde != 0) {
1334 		if (pde & PG_PS) {
1335 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1336 				if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) |
1337 				       (va & PDRMASK), &pa))
1338 					goto retry;
1339 				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1340 				    (va & PDRMASK));
1341 				vm_page_hold(m);
1342 			}
1343 		} else {
1344 			ptep = pmap_pte(pmap, va);
1345 			pte = *ptep;
1346 			pmap_pte_release(ptep);
1347 			if (pte != 0 &&
1348 			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1349 				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa))
1350 					goto retry;
1351 				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1352 				vm_page_hold(m);
1353 			}
1354 		}
1355 	}
1356 	PA_UNLOCK_COND(pa);
1357 	PMAP_UNLOCK(pmap);
1358 	return (m);
1359 }
1360 
1361 /***************************************************
1362  * Low level mapping routines.....
1363  ***************************************************/
1364 
1365 /*
1366  * Add a wired page to the kva.
1367  * Note: not SMP coherent.
1368  *
1369  * This function may be used before pmap_bootstrap() is called.
1370  */
1371 PMAP_INLINE void
1372 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1373 {
1374 	pt_entry_t *pte;
1375 
1376 	pte = vtopte(va);
1377 	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1378 }
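/*
 * Editor's note (not in the original source): because pmap_kenter() is not
 * SMP coherent, callers that may be replacing an existing mapping are
 * expected to follow it with an explicit invalidation, e.g. (illustrative
 * only):
 *
 *	pmap_kenter(va, pa);
 *	pmap_invalidate_page(kernel_pmap, va);
 *
 * pmap_qenter()/pmap_qremove() below wrap this pattern for ranges and perform
 * the ranged shootdown themselves.
 */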
1379 
1380 static __inline void
1381 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1382 {
1383 	pt_entry_t *pte;
1384 
1385 	pte = vtopte(va);
1386 	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1387 }
1388 
1389 /*
1390  * Remove a page from the kernel pagetables.
1391  * Note: not SMP coherent.
1392  *
1393  * This function may be used before pmap_bootstrap() is called.
1394  */
1395 PMAP_INLINE void
1396 pmap_kremove(vm_offset_t va)
1397 {
1398 	pt_entry_t *pte;
1399 
1400 	pte = vtopte(va);
1401 	pte_clear(pte);
1402 }
1403 
1404 /*
1405  *	Used to map a range of physical addresses into kernel
1406  *	virtual address space.
1407  *
1408  *	The value passed in '*virt' is a suggested virtual address for
1409  *	the mapping. Architectures which can support a direct-mapped
1410  *	physical to virtual region can return the appropriate address
1411  *	within that region, leaving '*virt' unchanged. Other
1412  *	architectures should map the pages starting at '*virt' and
1413  *	update '*virt' with the first usable address after the mapped
1414  *	region.
1415  */
1416 vm_offset_t
1417 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1418 {
1419 	vm_offset_t va, sva;
1420 
1421 	va = sva = *virt;
1422 	while (start < end) {
1423 		pmap_kenter(va, start);
1424 		va += PAGE_SIZE;
1425 		start += PAGE_SIZE;
1426 	}
1427 	pmap_invalidate_range(kernel_pmap, sva, va);
1428 	*virt = va;
1429 	return (sva);
1430 }
1431 
1432 
1433 /*
1434  * Add a list of wired pages to the kva.
1435  * This routine is only used for temporary
1436  * kernel mappings that do not need to have
1437  * page modification or references recorded.
1438  * Note that old mappings are simply written
1439  * over.  The page *must* be wired.
1440  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1441  */
1442 void
1443 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1444 {
1445 	pt_entry_t *endpte, oldpte, pa, *pte;
1446 	vm_page_t m;
1447 
1448 	oldpte = 0;
1449 	pte = vtopte(sva);
1450 	endpte = pte + count;
1451 	while (pte < endpte) {
1452 		m = *ma++;
1453 		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1454 		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1455 			oldpte |= *pte;
1456 			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1457 		}
1458 		pte++;
1459 	}
1460 	if (__predict_false((oldpte & PG_V) != 0))
1461 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1462 		    PAGE_SIZE);
1463 }
1464 
1465 /*
1466  * This routine tears out page mappings from the
1467  * kernel -- it is meant only for temporary mappings.
1468  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1469  */
1470 void
1471 pmap_qremove(vm_offset_t sva, int count)
1472 {
1473 	vm_offset_t va;
1474 
1475 	va = sva;
1476 	while (count-- > 0) {
1477 		pmap_kremove(va);
1478 		va += PAGE_SIZE;
1479 	}
1480 	pmap_invalidate_range(kernel_pmap, sva, va);
1481 }
1482 
1483 /***************************************************
1484  * Page table page management routines.....
1485  ***************************************************/
1486 static __inline void
1487 pmap_free_zero_pages(vm_page_t free)
1488 {
1489 	vm_page_t m;
1490 
1491 	while (free != NULL) {
1492 		m = free;
1493 		free = m->right;
1494 		/* Preserve the page's PG_ZERO setting. */
1495 		vm_page_free_toq(m);
1496 	}
1497 }
1498 
1499 /*
1500  * Schedule the specified unused page table page to be freed.  Specifically,
1501  * add the page to the specified list of pages that will be released to the
1502  * physical memory manager after the TLB has been updated.
1503  */
1504 static __inline void
1505 pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1506 {
1507 
1508 	if (set_PG_ZERO)
1509 		m->flags |= PG_ZERO;
1510 	else
1511 		m->flags &= ~PG_ZERO;
1512 	m->right = *free;
1513 	*free = m;
1514 }
1515 
1516 /*
1517  * Inserts the specified page table page into the specified pmap's collection
1518  * of idle page table pages.  Each of a pmap's page table pages is responsible
1519  * for mapping a distinct range of virtual addresses.  The pmap's collection is
1520  * ordered by this virtual address range.
1521  */
1522 static void
1523 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1524 {
1525 	vm_page_t root;
1526 
1527 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1528 	root = pmap->pm_root;
1529 	if (root == NULL) {
1530 		mpte->left = NULL;
1531 		mpte->right = NULL;
1532 	} else {
1533 		root = vm_page_splay(mpte->pindex, root);
1534 		if (mpte->pindex < root->pindex) {
1535 			mpte->left = root->left;
1536 			mpte->right = root;
1537 			root->left = NULL;
1538 		} else if (mpte->pindex == root->pindex)
1539 			panic("pmap_insert_pt_page: pindex already inserted");
1540 		else {
1541 			mpte->right = root->right;
1542 			mpte->left = root;
1543 			root->right = NULL;
1544 		}
1545 	}
1546 	pmap->pm_root = mpte;
1547 }
1548 
1549 /*
1550  * Looks for a page table page mapping the specified virtual address in the
1551  * specified pmap's collection of idle page table pages.  Returns NULL if there
1552  * is no page table page corresponding to the specified virtual address.
1553  */
1554 static vm_page_t
1555 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1556 {
1557 	vm_page_t mpte;
1558 	vm_pindex_t pindex = va >> PDRSHIFT;
1559 
1560 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1561 	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1562 		mpte = vm_page_splay(pindex, mpte);
1563 		if ((pmap->pm_root = mpte)->pindex != pindex)
1564 			mpte = NULL;
1565 	}
1566 	return (mpte);
1567 }
1568 
1569 /*
1570  * Removes the specified page table page from the specified pmap's collection
1571  * of idle page table pages.  The specified page table page must be a member of
1572  * the pmap's collection.
1573  */
1574 static void
1575 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1576 {
1577 	vm_page_t root;
1578 
1579 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1580 	if (mpte != pmap->pm_root)
1581 		vm_page_splay(mpte->pindex, pmap->pm_root);
1582 	if (mpte->left == NULL)
1583 		root = mpte->right;
1584 	else {
1585 		root = vm_page_splay(mpte->pindex, mpte->left);
1586 		root->right = mpte->right;
1587 	}
1588 	pmap->pm_root = root;
1589 }
1590 
1591 /*
1592  * This routine decrements a page table page's wire count.  When the
1593  * count drops to zero, the page is unmapped and queued to be freed.
1594  */
1595 static __inline int
1596 pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1597 {
1598 
1599 	--m->wire_count;
1600 	if (m->wire_count == 0)
1601 		return (_pmap_unwire_pte_hold(pmap, m, free));
1602 	else
1603 		return (0);
1604 }
1605 
1606 static int
1607 _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1608 {
1609 	vm_offset_t pteva;
1610 
1611 	/*
1612 	 * unmap the page table page
1613 	 */
1614 	pmap->pm_pdir[m->pindex] = 0;
1615 	--pmap->pm_stats.resident_count;
1616 
1617 	/*
1618 	 * This is a release store so that the ordinary store unmapping
1619 	 * the page table page is globally performed before TLB shoot-
1620 	 * down is begun.
1621 	 */
1622 	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1623 
1624 	/*
1625 	 * Invalidate the TLB entry for the page table page's recursive
1626 	 * mapping so that its removal takes effect immediately.
1627 	 */
1628 	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1629 	pmap_invalidate_page(pmap, pteva);
1630 
1631 	/*
1632 	 * Put page on a list so that it is released after
1633 	 * *ALL* TLB shootdown is done
1634 	 */
1635 	pmap_add_delayed_free_list(m, free, TRUE);
1636 
1637 	return (1);
1638 }
1639 
1640 /*
1641  * After removing a page table entry, this routine is used to
1642  * conditionally free the page, and manage the hold/wire counts.
1643  */
1644 static int
1645 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1646 {
1647 	pd_entry_t ptepde;
1648 	vm_page_t mpte;
1649 
1650 	if (va >= VM_MAXUSER_ADDRESS)
1651 		return (0);
1652 	ptepde = *pmap_pde(pmap, va);
1653 	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1654 	return (pmap_unwire_pte_hold(pmap, mpte, free));
1655 }
1656 
1657 /*
1658  * Initialize the pmap for the swapper process.
1659  */
1660 void
1661 pmap_pinit0(pmap_t pmap)
1662 {
1663 
1664 	PMAP_LOCK_INIT(pmap);
1665 	/*
1666 	 * Since the page table directory is shared with the kernel pmap,
1667 	 * which is already included in the list "allpmaps", this pmap does
1668 	 * not need to be inserted into that list.
1669 	 */
1670 	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1671 #ifdef PAE
1672 	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1673 #endif
1674 	pmap->pm_root = NULL;
1675 	pmap->pm_active = 0;
1676 	PCPU_SET(curpmap, pmap);
1677 	TAILQ_INIT(&pmap->pm_pvchunk);
1678 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1679 }
1680 
1681 /*
1682  * Initialize a preallocated and zeroed pmap structure,
1683  * such as one in a vmspace structure.
1684  */
1685 int
1686 pmap_pinit(pmap_t pmap)
1687 {
1688 	vm_page_t m, ptdpg[NPGPTD];
1689 	vm_paddr_t pa;
1690 	static int color;
1691 	int i;
1692 
1693 	PMAP_LOCK_INIT(pmap);
1694 
1695 	/*
1696 	 * No need to allocate page table space yet but we do need a valid
1697 	 * page directory table.
1698 	 */
1699 	if (pmap->pm_pdir == NULL) {
1700 		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1701 		    NBPTD);
1702 
1703 		if (pmap->pm_pdir == NULL) {
1704 			PMAP_LOCK_DESTROY(pmap);
1705 			return (0);
1706 		}
1707 #ifdef PAE
1708 		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1709 		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1710 		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1711 		    ("pmap_pinit: pdpt misaligned"));
1712 		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1713 		    ("pmap_pinit: pdpt above 4g"));
1714 #endif
1715 		pmap->pm_root = NULL;
1716 	}
1717 	KASSERT(pmap->pm_root == NULL,
1718 	    ("pmap_pinit: pmap has reserved page table page(s)"));
1719 
1720 	/*
1721 	 * allocate the page directory page(s)
1722 	 */
1723 	for (i = 0; i < NPGPTD;) {
1724 		m = vm_page_alloc(NULL, color++,
1725 		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1726 		    VM_ALLOC_ZERO);
1727 		if (m == NULL)
1728 			VM_WAIT;
1729 		else {
1730 			ptdpg[i++] = m;
1731 		}
1732 	}
1733 
1734 	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1735 
1736 	for (i = 0; i < NPGPTD; i++) {
1737 		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1738 			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1739 	}
1740 
1741 	mtx_lock_spin(&allpmaps_lock);
1742 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1743 	/* Copy the kernel page table directory entries. */
1744 	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1745 	mtx_unlock_spin(&allpmaps_lock);
1746 
1747 	/* install self-referential address mapping entry(s) */
1748 	for (i = 0; i < NPGPTD; i++) {
1749 		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1750 		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1751 #ifdef PAE
1752 		pmap->pm_pdpt[i] = pa | PG_V;
1753 #endif
1754 	}
1755 
1756 	pmap->pm_active = 0;
1757 	TAILQ_INIT(&pmap->pm_pvchunk);
1758 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1759 
1760 	return (1);
1761 }
1762 
1763 /*
1764  * this routine is called if the page table page is not
1765  * mapped correctly.
1766  */
1767 static vm_page_t
1768 _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1769 {
1770 	vm_paddr_t ptepa;
1771 	vm_page_t m;
1772 
1773 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1774 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1775 	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1776 
1777 	/*
1778 	 * Allocate a page table page.
1779 	 */
1780 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1781 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1782 		if (flags & M_WAITOK) {
1783 			PMAP_UNLOCK(pmap);
1784 			vm_page_unlock_queues();
1785 			VM_WAIT;
1786 			vm_page_lock_queues();
1787 			PMAP_LOCK(pmap);
1788 		}
1789 
1790 		/*
1791 		 * Indicate the need to retry.  While waiting, the page table
1792 		 * page may have been allocated.
1793 		 */
1794 		return (NULL);
1795 	}
1796 	if ((m->flags & PG_ZERO) == 0)
1797 		pmap_zero_page(m);
1798 
1799 	/*
1800 	 * Map the pagetable page into the process address space, if
1801 	 * it isn't already there.
1802 	 */
1803 
1804 	pmap->pm_stats.resident_count++;
1805 
1806 	ptepa = VM_PAGE_TO_PHYS(m);
1807 	pmap->pm_pdir[ptepindex] =
1808 		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1809 
1810 	return (m);
1811 }
1812 
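/*
 * Returns the page table page that maps "va", allocating it via
 * _pmap_allocpte() if necessary.  Any existing 2- or 4MB mapping covering
 * "va" is first demoted.  The page's wire count serves as its reference
 * count and is incremented on behalf of the caller.  With M_WAITOK the
 * allocation is retried after sleeping; with M_NOWAIT, NULL may be
 * returned.
 */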
1813 static vm_page_t
1814 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1815 {
1816 	unsigned ptepindex;
1817 	pd_entry_t ptepa;
1818 	vm_page_t m;
1819 
1820 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1821 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1822 	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1823 
1824 	/*
1825 	 * Calculate pagetable page index
1826 	 */
1827 	ptepindex = va >> PDRSHIFT;
1828 retry:
1829 	/*
1830 	 * Get the page directory entry
1831 	 */
1832 	ptepa = pmap->pm_pdir[ptepindex];
1833 
1834 	/*
1835 	 * This supports switching from a 4MB page to a
1836 	 * normal 4K page.
1837 	 */
1838 	if (ptepa & PG_PS) {
1839 		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1840 		ptepa = pmap->pm_pdir[ptepindex];
1841 	}
1842 
1843 	/*
1844 	 * If the page table page is mapped, we just increment the
1845 	 * hold count, and activate it.
1846 	 */
1847 	if (ptepa) {
1848 		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1849 		m->wire_count++;
1850 	} else {
1851 		/*
1852 		 * Here if the pte page isn't mapped, or if it has
1853 		 * been deallocated.
1854 		 */
1855 		m = _pmap_allocpte(pmap, ptepindex, flags);
1856 		if (m == NULL && (flags & M_WAITOK))
1857 			goto retry;
1858 	}
1859 	return (m);
1860 }
1861 
1862 
1863 /***************************************************
1864  * Pmap allocation/deallocation routines.
1865  ***************************************************/
1866 
1867 #ifdef SMP
1868 /*
1869  * Deal with an SMP shootdown of other users of the pmap that we are
1870  * trying to dispose of.  This can be a bit hairy.
1871  */
1872 static cpumask_t *lazymask;
1873 static u_int lazyptd;
1874 static volatile u_int lazywait;
1875 
1876 void pmap_lazyfix_action(void);
1877 
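/*
 * IPI handler for the lazy pmap teardown in pmap_lazyfix().  If this CPU
 * is still using the page tables of the pmap being released, it switches
 * to the current thread's page tables, then removes itself from the
 * pmap's active mask and signals the initiating CPU via "lazywait".
 */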
1878 void
1879 pmap_lazyfix_action(void)
1880 {
1881 	cpumask_t mymask = PCPU_GET(cpumask);
1882 
1883 #ifdef COUNT_IPIS
1884 	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1885 #endif
1886 	if (rcr3() == lazyptd)
1887 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1888 	atomic_clear_int(lazymask, mymask);
1889 	atomic_store_rel_int(&lazywait, 1);
1890 }
1891 
1892 static void
1893 pmap_lazyfix_self(cpumask_t mymask)
1894 {
1895 
1896 	if (rcr3() == lazyptd)
1897 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1898 	atomic_clear_int(lazymask, mymask);
1899 }
1900 
1901 
1902 static void
1903 pmap_lazyfix(pmap_t pmap)
1904 {
1905 	cpumask_t mymask, mask;
1906 	u_int spins;
1907 
1908 	while ((mask = pmap->pm_active) != 0) {
1909 		spins = 50000000;
1910 		mask = mask & -mask;	/* Find least significant set bit */
1911 		mtx_lock_spin(&smp_ipi_mtx);
1912 #ifdef PAE
1913 		lazyptd = vtophys(pmap->pm_pdpt);
1914 #else
1915 		lazyptd = vtophys(pmap->pm_pdir);
1916 #endif
1917 		mymask = PCPU_GET(cpumask);
1918 		if (mask == mymask) {
1919 			lazymask = &pmap->pm_active;
1920 			pmap_lazyfix_self(mymask);
1921 		} else {
1922 			atomic_store_rel_int((u_int *)&lazymask,
1923 			    (u_int)&pmap->pm_active);
1924 			atomic_store_rel_int(&lazywait, 0);
1925 			ipi_selected(mask, IPI_LAZYPMAP);
1926 			while (lazywait == 0) {
1927 				ia32_pause();
1928 				if (--spins == 0)
1929 					break;
1930 			}
1931 		}
1932 		mtx_unlock_spin(&smp_ipi_mtx);
1933 		if (spins == 0)
1934 			printf("pmap_lazyfix: spun for 50000000\n");
1935 	}
1936 }
1937 
1938 #else	/* SMP */
1939 
1940 /*
1941  * Cleaning up on a uniprocessor is easy.  We are unlikely to even
1942  * execute this code, chiefly because the cleanup is deferred until
1943  * the parent does a wait(2), by which time another userland process
1944  * has run.
1945  */
1946 static void
1947 pmap_lazyfix(pmap_t pmap)
1948 {
1949 	u_int cr3;
1950 
1951 	cr3 = vtophys(pmap->pm_pdir);
1952 	if (cr3 == rcr3()) {
1953 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1954 		pmap->pm_active &= ~(PCPU_GET(cpumask));
1955 	}
1956 }
1957 #endif	/* SMP */
1958 
1959 /*
1960  * Release any resources held by the given physical map.
1961  * Called when a pmap initialized by pmap_pinit is being released.
1962  * Should only be called if the map contains no valid mappings.
1963  */
1964 void
1965 pmap_release(pmap_t pmap)
1966 {
1967 	vm_page_t m, ptdpg[NPGPTD];
1968 	int i;
1969 
1970 	KASSERT(pmap->pm_stats.resident_count == 0,
1971 	    ("pmap_release: pmap resident count %ld != 0",
1972 	    pmap->pm_stats.resident_count));
1973 	KASSERT(pmap->pm_root == NULL,
1974 	    ("pmap_release: pmap has reserved page table page(s)"));
1975 
1976 	pmap_lazyfix(pmap);
1977 	mtx_lock_spin(&allpmaps_lock);
1978 	LIST_REMOVE(pmap, pm_list);
1979 	mtx_unlock_spin(&allpmaps_lock);
1980 
1981 	for (i = 0; i < NPGPTD; i++)
1982 		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
1983 		    PG_FRAME);
1984 
1985 	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1986 	    sizeof(*pmap->pm_pdir));
1987 
1988 	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1989 
1990 	for (i = 0; i < NPGPTD; i++) {
1991 		m = ptdpg[i];
1992 #ifdef PAE
1993 		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1994 		    ("pmap_release: got wrong ptd page"));
1995 #endif
1996 		m->wire_count--;
1997 		atomic_subtract_int(&cnt.v_wire_count, 1);
1998 		vm_page_free_zero(m);
1999 	}
2000 	PMAP_LOCK_DESTROY(pmap);
2001 }
2002 
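/*
 * The sysctl handlers below report the total size of the kernel virtual
 * address space and the portion beyond kernel_vm_end that has not yet
 * been provided with kernel page table pages.
 */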
2003 static int
2004 kvm_size(SYSCTL_HANDLER_ARGS)
2005 {
2006 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2007 
2008 	return (sysctl_handle_long(oidp, &ksize, 0, req));
2009 }
2010 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2011     0, 0, kvm_size, "IU", "Size of KVM");
2012 
2013 static int
2014 kvm_free(SYSCTL_HANDLER_ARGS)
2015 {
2016 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2017 
2018 	return (sysctl_handle_long(oidp, &kfree, 0, req));
2019 }
2020 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2021     0, 0, kvm_free, "IU", "Amount of KVM free");
2022 
2023 /*
2024  * grow the number of kernel page table entries, if needed
2025  */
2026 void
2027 pmap_growkernel(vm_offset_t addr)
2028 {
2029 	vm_paddr_t ptppaddr;
2030 	vm_page_t nkpg;
2031 	pd_entry_t newpdir;
2032 
2033 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2034 	addr = roundup2(addr, NBPDR);
2035 	if (addr - 1 >= kernel_map->max_offset)
2036 		addr = kernel_map->max_offset;
2037 	while (kernel_vm_end < addr) {
2038 		if (pdir_pde(PTD, kernel_vm_end)) {
2039 			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2040 			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2041 				kernel_vm_end = kernel_map->max_offset;
2042 				break;
2043 			}
2044 			continue;
2045 		}
2046 
2047 		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2048 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2049 		    VM_ALLOC_ZERO);
2050 		if (nkpg == NULL)
2051 			panic("pmap_growkernel: no memory to grow kernel");
2052 
2053 		nkpt++;
2054 
2055 		if ((nkpg->flags & PG_ZERO) == 0)
2056 			pmap_zero_page(nkpg);
2057 		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2058 		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2059 		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2060 
2061 		pmap_kenter_pde(kernel_vm_end, newpdir);
2062 		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2063 		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2064 			kernel_vm_end = kernel_map->max_offset;
2065 			break;
2066 		}
2067 	}
2068 }
2069 
2070 
2071 /***************************************************
2072  * page management routines.
2073  ***************************************************/
2074 
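/*
 * PV entries are allocated in page-sized chunks.  The assertions below
 * check that a chunk exactly fills a page and that its free bitmap
 * consists of _NPCM (11) 32-bit words, matching pc_freemask[] below.
 */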
2075 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2076 CTASSERT(_NPCM == 11);
2077 
2078 static __inline struct pv_chunk *
2079 pv_to_chunk(pv_entry_t pv)
2080 {
2081 
2082 	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2083 }
2084 
2085 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2086 
2087 #define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2088 #define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2089 
2090 static uint32_t pc_freemask[11] = {
2091 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2092 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2093 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2094 	PC_FREE0_9, PC_FREE10
2095 };
2096 
2097 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2098 	"Current number of pv entries");
2099 
2100 #ifdef PV_STATS
2101 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2102 
2103 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2104 	"Current number of pv entry chunks");
2105 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2106 	"Current number of pv entry chunks allocated");
2107 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2108 	"Current number of pv entry chunks freed");
2109 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2110 	"Number of failed attempts to allocate a chunk page.");
2111 
2112 static long pv_entry_frees, pv_entry_allocs;
2113 static int pv_entry_spare;
2114 
2115 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2116 	"Current number of pv entry frees");
2117 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2118 	"Current number of pv entry allocs");
2119 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2120 	"Current number of spare pv entries");
2121 
2122 static int pmap_collect_inactive, pmap_collect_active;
2123 
2124 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
2125 	"Current number of times pmap_collect called on the inactive queue");
2126 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
2127 	"Current number of times pmap_collect called on the active queue");
2128 #endif
2129 
2130 /*
2131  * We are in a serious low memory condition.  Resort to
2132  * drastic measures to free some pages so we can allocate
2133  * another pv entry chunk.  This is normally called to
2134  * unmap inactive pages, and if necessary, active pages.
2135  */
2136 static void
2137 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
2138 {
2139 	pd_entry_t *pde;
2140 	pmap_t pmap;
2141 	pt_entry_t *pte, tpte;
2142 	pv_entry_t next_pv, pv;
2143 	vm_offset_t va;
2144 	vm_page_t m, free;
2145 
2146 	sched_pin();
2147 	TAILQ_FOREACH(m, &vpq->pl, pageq) {
2148 		if (m->hold_count || m->busy)
2149 			continue;
2150 		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
2151 			va = pv->pv_va;
2152 			pmap = PV_PMAP(pv);
2153 			/* Avoid deadlock and lock recursion. */
2154 			if (pmap > locked_pmap)
2155 				PMAP_LOCK(pmap);
2156 			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
2157 				continue;
2158 			pmap->pm_stats.resident_count--;
2159 			pde = pmap_pde(pmap, va);
2160 			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
2161 			    " a 4mpage in page %p's pv list", m));
2162 			pte = pmap_pte_quick(pmap, va);
2163 			tpte = pte_load_clear(pte);
2164 			KASSERT((tpte & PG_W) == 0,
2165 			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
2166 			if (tpte & PG_A)
2167 				vm_page_flag_set(m, PG_REFERENCED);
2168 			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2169 				vm_page_dirty(m);
2170 			free = NULL;
2171 			pmap_unuse_pt(pmap, va, &free);
2172 			pmap_invalidate_page(pmap, va);
2173 			pmap_free_zero_pages(free);
2174 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2175 			free_pv_entry(pmap, pv);
2176 			if (pmap != locked_pmap)
2177 				PMAP_UNLOCK(pmap);
2178 		}
2179 		if (TAILQ_EMPTY(&m->md.pv_list) &&
2180 		    TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list))
2181 			vm_page_flag_clear(m, PG_WRITEABLE);
2182 	}
2183 	sched_unpin();
2184 }
2185 
2186 
2187 /*
2188  * Free a pv entry back to its pv chunk; a fully free chunk is released.
2189  */
2190 static void
2191 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2192 {
2193 	vm_page_t m;
2194 	struct pv_chunk *pc;
2195 	int idx, field, bit;
2196 
2197 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2198 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2199 	PV_STAT(pv_entry_frees++);
2200 	PV_STAT(pv_entry_spare++);
2201 	pv_entry_count--;
2202 	pc = pv_to_chunk(pv);
2203 	idx = pv - &pc->pc_pventry[0];
2204 	field = idx / 32;
2205 	bit = idx % 32;
2206 	pc->pc_map[field] |= 1ul << bit;
2207 	/* move to head of list */
2208 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2209 	for (idx = 0; idx < _NPCM; idx++)
2210 		if (pc->pc_map[idx] != pc_freemask[idx]) {
2211 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2212 			return;
2213 		}
2214 	PV_STAT(pv_entry_spare -= _NPCPV);
2215 	PV_STAT(pc_chunk_count--);
2216 	PV_STAT(pc_chunk_frees++);
2217 	/* entire chunk is free, return it */
2218 	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2219 	pmap_qremove((vm_offset_t)pc, 1);
2220 	vm_page_unwire(m, 0);
2221 	vm_page_free(m);
2222 	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2223 }
2224 
2225 /*
2226  * Get a new pv_entry, allocating a new pv chunk page from the system
2227  * when needed.
2228  */
2229 static pv_entry_t
2230 get_pv_entry(pmap_t pmap, int try)
2231 {
2232 	static const struct timeval printinterval = { 60, 0 };
2233 	static struct timeval lastprint;
2234 	static vm_pindex_t colour;
2235 	struct vpgqueues *pq;
2236 	int bit, field;
2237 	pv_entry_t pv;
2238 	struct pv_chunk *pc;
2239 	vm_page_t m;
2240 
2241 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2242 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2243 	PV_STAT(pv_entry_allocs++);
2244 	pv_entry_count++;
2245 	if (pv_entry_count > pv_entry_high_water)
2246 		if (ratecheck(&lastprint, &printinterval))
2247 			printf("Approaching the limit on PV entries, consider "
2248 			    "increasing either the vm.pmap.shpgperproc or the "
2249 			    "vm.pmap.pv_entry_max tunable.\n");
2250 	pq = NULL;
2251 retry:
2252 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2253 	if (pc != NULL) {
2254 		for (field = 0; field < _NPCM; field++) {
2255 			if (pc->pc_map[field]) {
2256 				bit = bsfl(pc->pc_map[field]);
2257 				break;
2258 			}
2259 		}
2260 		if (field < _NPCM) {
2261 			pv = &pc->pc_pventry[field * 32 + bit];
2262 			pc->pc_map[field] &= ~(1ul << bit);
2263 			/* If this was the last item, move it to tail */
2264 			for (field = 0; field < _NPCM; field++)
2265 				if (pc->pc_map[field] != 0) {
2266 					PV_STAT(pv_entry_spare--);
2267 					return (pv);	/* not full, return */
2268 				}
2269 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2270 			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2271 			PV_STAT(pv_entry_spare--);
2272 			return (pv);
2273 		}
2274 	}
2275 	/*
2276 	 * Access to the ptelist "pv_vafree" is synchronized by the page
2277 	 * queues lock.  If "pv_vafree" is currently non-empty, it will
2278 	 * remain non-empty until pmap_ptelist_alloc() completes.
2279 	 */
2280 	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, colour, (pq ==
2281 	    &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) |
2282 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2283 		if (try) {
2284 			pv_entry_count--;
2285 			PV_STAT(pc_chunk_tryfail++);
2286 			return (NULL);
2287 		}
2288 		/*
2289 		 * Reclaim pv entries: At first, destroy mappings to
2290 		 * inactive pages.  After that, if a pv chunk entry
2291 		 * is still needed, destroy mappings to active pages.
2292 		 */
2293 		if (pq == NULL) {
2294 			PV_STAT(pmap_collect_inactive++);
2295 			pq = &vm_page_queues[PQ_INACTIVE];
2296 		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
2297 			PV_STAT(pmap_collect_active++);
2298 			pq = &vm_page_queues[PQ_ACTIVE];
2299 		} else
2300 			panic("get_pv_entry: increase vm.pmap.shpgperproc");
2301 		pmap_collect(pmap, pq);
2302 		goto retry;
2303 	}
2304 	PV_STAT(pc_chunk_count++);
2305 	PV_STAT(pc_chunk_allocs++);
2306 	colour++;
2307 	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2308 	pmap_qenter((vm_offset_t)pc, &m, 1);
2309 	pc->pc_pmap = pmap;
2310 	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2311 	for (field = 1; field < _NPCM; field++)
2312 		pc->pc_map[field] = pc_freemask[field];
2313 	pv = &pc->pc_pventry[0];
2314 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2315 	PV_STAT(pv_entry_spare += _NPCPV - 1);
2316 	return (pv);
2317 }
2318 
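/*
 * Removes the pv entry for the (pmap, va) mapping from the specified pv
 * list and returns it.  Returns NULL if no matching entry is found.
 */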
2319 static __inline pv_entry_t
2320 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2321 {
2322 	pv_entry_t pv;
2323 
2324 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2325 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2326 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2327 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2328 			break;
2329 		}
2330 	}
2331 	return (pv);
2332 }
2333 
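/*
 * Demotes the pv entry for a 2- or 4MB page mapping: the superpage's pv
 * entry becomes the first 4KB page's pv entry, and pv entries are created
 * for the remaining 4KB pages within the superpage.
 */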
2334 static void
2335 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2336 {
2337 	struct md_page *pvh;
2338 	pv_entry_t pv;
2339 	vm_offset_t va_last;
2340 	vm_page_t m;
2341 
2342 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2343 	KASSERT((pa & PDRMASK) == 0,
2344 	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2345 
2346 	/*
2347 	 * Transfer the 4mpage's pv entry for this mapping to the first
2348 	 * page's pv list.
2349 	 */
2350 	pvh = pa_to_pvh(pa);
2351 	va = trunc_4mpage(va);
2352 	pv = pmap_pvh_remove(pvh, pmap, va);
2353 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2354 	m = PHYS_TO_VM_PAGE(pa);
2355 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2356 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2357 	va_last = va + NBPDR - PAGE_SIZE;
2358 	do {
2359 		m++;
2360 		KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
2361 		    ("pmap_pv_demote_pde: page %p is not managed", m));
2362 		va += PAGE_SIZE;
2363 		pmap_insert_entry(pmap, va, m);
2364 	} while (va < va_last);
2365 }
2366 
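/*
 * Promotes the pv entries for the 4KB page mappings within a superpage:
 * the first 4KB page's pv entry is transferred to the superpage's pv list
 * and the pv entries for the remaining 4KB pages are freed.
 */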
2367 static void
2368 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2369 {
2370 	struct md_page *pvh;
2371 	pv_entry_t pv;
2372 	vm_offset_t va_last;
2373 	vm_page_t m;
2374 
2375 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2376 	KASSERT((pa & PDRMASK) == 0,
2377 	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2378 
2379 	/*
2380 	 * Transfer the first page's pv entry for this mapping to the
2381 	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2382 	 * to get_pv_entry(), a transfer avoids the possibility that
2383 	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2384 	 * removes one of the mappings that is being promoted.
2385 	 */
2386 	m = PHYS_TO_VM_PAGE(pa);
2387 	va = trunc_4mpage(va);
2388 	pv = pmap_pvh_remove(&m->md, pmap, va);
2389 	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2390 	pvh = pa_to_pvh(pa);
2391 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2392 	/* Free the remaining NPTEPG - 1 pv entries. */
2393 	va_last = va + NBPDR - PAGE_SIZE;
2394 	do {
2395 		m++;
2396 		va += PAGE_SIZE;
2397 		pmap_pvh_free(&m->md, pmap, va);
2398 	} while (va < va_last);
2399 }
2400 
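/*
 * Removes the pv entry for the (pmap, va) mapping from the specified pv
 * list and frees it.  The pv entry must exist.
 */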
2401 static void
2402 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2403 {
2404 	pv_entry_t pv;
2405 
2406 	pv = pmap_pvh_remove(pvh, pmap, va);
2407 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2408 	free_pv_entry(pmap, pv);
2409 }
2410 
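/*
 * Removes the pv entry for the 4KB page mapping (pmap, va) from page "m"'s
 * pv list.  If, as a result, neither "m" nor its containing superpage has
 * any remaining pv entries, PG_WRITEABLE is cleared on "m".
 */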
2411 static void
2412 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2413 {
2414 	struct md_page *pvh;
2415 
2416 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2417 	pmap_pvh_free(&m->md, pmap, va);
2418 	if (TAILQ_EMPTY(&m->md.pv_list)) {
2419 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2420 		if (TAILQ_EMPTY(&pvh->pv_list))
2421 			vm_page_flag_clear(m, PG_WRITEABLE);
2422 	}
2423 }
2424 
2425 /*
2426  * Create a pv entry for the given page "m" at the
2427  * mapping (pmap, va).
2428  */
2429 static void
2430 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2431 {
2432 	pv_entry_t pv;
2433 
2434 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2435 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2436 	pv = get_pv_entry(pmap, FALSE);
2437 	pv->pv_va = va;
2438 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2439 }
2440 
2441 /*
2442  * Conditionally create a pv entry.
2443  */
2444 static boolean_t
2445 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2446 {
2447 	pv_entry_t pv;
2448 
2449 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2450 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2451 	if (pv_entry_count < pv_entry_high_water &&
2452 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2453 		pv->pv_va = va;
2454 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2455 		return (TRUE);
2456 	} else
2457 		return (FALSE);
2458 }
2459 
2460 /*
2461  * Conditionally create a pv entry for a 2- or 4MB page mapping.
2462  */
2463 static boolean_t
2464 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2465 {
2466 	struct md_page *pvh;
2467 	pv_entry_t pv;
2468 
2469 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2470 	if (pv_entry_count < pv_entry_high_water &&
2471 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2472 		pv->pv_va = va;
2473 		pvh = pa_to_pvh(pa);
2474 		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2475 		return (TRUE);
2476 	} else
2477 		return (FALSE);
2478 }
2479 
2480 /*
2481  * Fills a page table page with mappings to consecutive physical pages.
2482  */
2483 static void
2484 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2485 {
2486 	pt_entry_t *pte;
2487 
2488 	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2489 		*pte = newpte;
2490 		newpte += PAGE_SIZE;
2491 	}
2492 }
2493 
2494 /*
2495  * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2496  * 2- or 4MB page mapping is invalidated.
2497  */
2498 static boolean_t
2499 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2500 {
2501 	pd_entry_t newpde, oldpde;
2502 	pt_entry_t *firstpte, newpte;
2503 	vm_paddr_t mptepa;
2504 	vm_page_t free, mpte;
2505 
2506 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2507 	oldpde = *pde;
2508 	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2509 	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2510 	mpte = pmap_lookup_pt_page(pmap, va);
2511 	if (mpte != NULL)
2512 		pmap_remove_pt_page(pmap, mpte);
2513 	else {
2514 		KASSERT((oldpde & PG_W) == 0,
2515 		    ("pmap_demote_pde: page table page for a wired mapping"
2516 		    " is missing"));
2517 
2518 		/*
2519 		 * Invalidate the 2- or 4MB page mapping and return
2520 		 * "failure" if the mapping was never accessed or the
2521 		 * allocation of the new page table page fails.
2522 		 */
2523 		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2524 		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2525 		    VM_ALLOC_WIRED)) == NULL) {
2526 			free = NULL;
2527 			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2528 			pmap_invalidate_page(pmap, trunc_4mpage(va));
2529 			pmap_free_zero_pages(free);
2530 			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2531 			    " in pmap %p", va, pmap);
2532 			return (FALSE);
2533 		}
2534 		if (va < VM_MAXUSER_ADDRESS)
2535 			pmap->pm_stats.resident_count++;
2536 	}
2537 	mptepa = VM_PAGE_TO_PHYS(mpte);
2538 
2539 	/*
2540 	 * If the page mapping is in the kernel's address space, then the
2541 	 * KPTmap can provide access to the page table page.  Otherwise,
2542 	 * temporarily map the page table page (mpte) into the kernel's
2543 	 * address space at either PADDR1 or PADDR2.
2544 	 */
2545 	if (va >= KERNBASE)
2546 		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2547 	else if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) {
2548 		if ((*PMAP1 & PG_FRAME) != mptepa) {
2549 			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2550 #ifdef SMP
2551 			PMAP1cpu = PCPU_GET(cpuid);
2552 #endif
2553 			invlcaddr(PADDR1);
2554 			PMAP1changed++;
2555 		} else
2556 #ifdef SMP
2557 		if (PMAP1cpu != PCPU_GET(cpuid)) {
2558 			PMAP1cpu = PCPU_GET(cpuid);
2559 			invlcaddr(PADDR1);
2560 			PMAP1changedcpu++;
2561 		} else
2562 #endif
2563 			PMAP1unchanged++;
2564 		firstpte = PADDR1;
2565 	} else {
2566 		mtx_lock(&PMAP2mutex);
2567 		if ((*PMAP2 & PG_FRAME) != mptepa) {
2568 			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2569 			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2570 		}
2571 		firstpte = PADDR2;
2572 	}
2573 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2574 	KASSERT((oldpde & PG_A) != 0,
2575 	    ("pmap_demote_pde: oldpde is missing PG_A"));
2576 	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2577 	    ("pmap_demote_pde: oldpde is missing PG_M"));
2578 	newpte = oldpde & ~PG_PS;
2579 	if ((newpte & PG_PDE_PAT) != 0)
2580 		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2581 
2582 	/*
2583 	 * If the page table page is new, initialize it.
2584 	 */
2585 	if (mpte->wire_count == 1) {
2586 		mpte->wire_count = NPTEPG;
2587 		pmap_fill_ptp(firstpte, newpte);
2588 	}
2589 	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2590 	    ("pmap_demote_pde: firstpte and newpte map different physical"
2591 	    " addresses"));
2592 
2593 	/*
2594 	 * If the mapping has changed attributes, update the page table
2595 	 * entries.
2596 	 */
2597 	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2598 		pmap_fill_ptp(firstpte, newpte);
2599 
2600 	/*
2601 	 * Demote the mapping.  This pmap is locked.  The old PDE has
2602 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2603 	 * set.  Thus, there is no danger of a race with another
2604 	 * processor changing the setting of PG_A and/or PG_M between
2605 	 * the read above and the store below.
2606 	 */
2607 	if (workaround_erratum383)
2608 		pmap_update_pde(pmap, va, pde, newpde);
2609 	else if (pmap == kernel_pmap)
2610 		pmap_kenter_pde(va, newpde);
2611 	else
2612 		pde_store(pde, newpde);
2613 	if (firstpte == PADDR2)
2614 		mtx_unlock(&PMAP2mutex);
2615 
2616 	/*
2617 	 * Invalidate the recursive mapping of the page table page.
2618 	 */
2619 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2620 
2621 	/*
2622 	 * Demote the pv entry.  This depends on the earlier demotion
2623 	 * of the mapping.  Specifically, the (re)creation of a per-
2624 	 * page pv entry might trigger the execution of pmap_collect(),
2625 	 * which might reclaim a newly (re)created per-page pv entry
2626 	 * and destroy the associated mapping.  In order to destroy
2627 	 * the mapping, the PDE must have already changed from mapping
2628  * the 2- or 4MB page to referencing the page table page.
2629 	 */
2630 	if ((oldpde & PG_MANAGED) != 0)
2631 		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2632 
2633 	pmap_pde_demotions++;
2634 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2635 	    " in pmap %p", va, pmap);
2636 	return (TRUE);
2637 }
2638 
2639 /*
2640  * pmap_remove_pde: unmap a superpage from a process's address space
2641  */
2642 static void
2643 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2644     vm_page_t *free)
2645 {
2646 	struct md_page *pvh;
2647 	pd_entry_t oldpde;
2648 	vm_offset_t eva, va;
2649 	vm_page_t m, mpte;
2650 
2651 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2652 	KASSERT((sva & PDRMASK) == 0,
2653 	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2654 	oldpde = pte_load_clear(pdq);
2655 	if (oldpde & PG_W)
2656 		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2657 
2658 	/*
2659 	 * Machines that don't support invlpg also don't support
2660 	 * PG_G.
2661 	 */
2662 	if (oldpde & PG_G)
2663 		pmap_invalidate_page(kernel_pmap, sva);
2664 	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2665 	if (oldpde & PG_MANAGED) {
2666 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2667 		pmap_pvh_free(pvh, pmap, sva);
2668 		eva = sva + NBPDR;
2669 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2670 		    va < eva; va += PAGE_SIZE, m++) {
2671 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2672 				vm_page_dirty(m);
2673 			if (oldpde & PG_A)
2674 				vm_page_flag_set(m, PG_REFERENCED);
2675 			if (TAILQ_EMPTY(&m->md.pv_list) &&
2676 			    TAILQ_EMPTY(&pvh->pv_list))
2677 				vm_page_flag_clear(m, PG_WRITEABLE);
2678 		}
2679 	}
2680 	if (pmap == kernel_pmap) {
2681 		if (!pmap_demote_pde(pmap, pdq, sva))
2682 			panic("pmap_remove_pde: failed demotion");
2683 	} else {
2684 		mpte = pmap_lookup_pt_page(pmap, sva);
2685 		if (mpte != NULL) {
2686 			pmap_remove_pt_page(pmap, mpte);
2687 			pmap->pm_stats.resident_count--;
2688 			KASSERT(mpte->wire_count == NPTEPG,
2689 			    ("pmap_remove_pde: pte page wire count error"));
2690 			mpte->wire_count = 0;
2691 			pmap_add_delayed_free_list(mpte, free, FALSE);
2692 			atomic_subtract_int(&cnt.v_wire_count, 1);
2693 		}
2694 	}
2695 }
2696 
2697 /*
2698  * pmap_remove_pte: unmap a single page from a process's address space
2699  */
2700 static int
2701 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2702 {
2703 	pt_entry_t oldpte;
2704 	vm_page_t m;
2705 
2706 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2707 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2708 	oldpte = pte_load_clear(ptq);
2709 	if (oldpte & PG_W)
2710 		pmap->pm_stats.wired_count -= 1;
2711 	/*
2712 	 * Machines that don't support invlpg also don't support
2713 	 * PG_G.
2714 	 */
2715 	if (oldpte & PG_G)
2716 		pmap_invalidate_page(kernel_pmap, va);
2717 	pmap->pm_stats.resident_count -= 1;
2718 	if (oldpte & PG_MANAGED) {
2719 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2720 		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2721 			vm_page_dirty(m);
2722 		if (oldpte & PG_A)
2723 			vm_page_flag_set(m, PG_REFERENCED);
2724 		pmap_remove_entry(pmap, m, va);
2725 	}
2726 	return (pmap_unuse_pt(pmap, va, free));
2727 }
2728 
2729 /*
2730  * Remove a single page from a process's address space.
2731  */
2732 static void
2733 pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2734 {
2735 	pt_entry_t *pte;
2736 
2737 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2738 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2739 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2740 	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2741 		return;
2742 	pmap_remove_pte(pmap, pte, va, free);
2743 	pmap_invalidate_page(pmap, va);
2744 }
2745 
2746 /*
2747  *	Remove the given range of addresses from the specified map.
2748  *
2749  *	It is assumed that the start and end are properly
2750  *	rounded to the page size.
2751  */
2752 void
2753 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2754 {
2755 	vm_offset_t pdnxt;
2756 	pd_entry_t ptpaddr;
2757 	pt_entry_t *pte;
2758 	vm_page_t free = NULL;
2759 	int anyvalid;
2760 
2761 	/*
2762 	 * Perform an unsynchronized read.  This is, however, safe.
2763 	 */
2764 	if (pmap->pm_stats.resident_count == 0)
2765 		return;
2766 
2767 	anyvalid = 0;
2768 
2769 	vm_page_lock_queues();
2770 	sched_pin();
2771 	PMAP_LOCK(pmap);
2772 
2773 	/*
2774 	 * Special handling for removing a single page.  This is a very
2775 	 * common operation, so it is worth short-circuiting the general
2776 	 * code below.
2777 	 */
2778 	if ((sva + PAGE_SIZE == eva) &&
2779 	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2780 		pmap_remove_page(pmap, sva, &free);
2781 		goto out;
2782 	}
2783 
2784 	for (; sva < eva; sva = pdnxt) {
2785 		unsigned pdirindex;
2786 
2787 		/*
2788 		 * Calculate index for next page table.
2789 		 */
2790 		pdnxt = (sva + NBPDR) & ~PDRMASK;
2791 		if (pdnxt < sva)
2792 			pdnxt = eva;
2793 		if (pmap->pm_stats.resident_count == 0)
2794 			break;
2795 
2796 		pdirindex = sva >> PDRSHIFT;
2797 		ptpaddr = pmap->pm_pdir[pdirindex];
2798 
2799 		/*
2800 		 * Weed out invalid mappings. Note: we assume that the page
2801 		 * directory table is always allocated, and in kernel virtual.
2802 		 */
2803 		if (ptpaddr == 0)
2804 			continue;
2805 
2806 		/*
2807 		 * Check for large page.
2808 		 */
2809 		if ((ptpaddr & PG_PS) != 0) {
2810 			/*
2811 			 * Are we removing the entire large page?  If not,
2812 			 * demote the mapping and fall through.
2813 			 */
2814 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2815 				/*
2816 				 * The TLB entry for a PG_G mapping is
2817 				 * invalidated by pmap_remove_pde().
2818 				 */
2819 				if ((ptpaddr & PG_G) == 0)
2820 					anyvalid = 1;
2821 				pmap_remove_pde(pmap,
2822 				    &pmap->pm_pdir[pdirindex], sva, &free);
2823 				continue;
2824 			} else if (!pmap_demote_pde(pmap,
2825 			    &pmap->pm_pdir[pdirindex], sva)) {
2826 				/* The large page mapping was destroyed. */
2827 				continue;
2828 			}
2829 		}
2830 
2831 		/*
2832 		 * Limit our scan to either the end of the va represented
2833 		 * by the current page table page, or to the end of the
2834 		 * range being removed.
2835 		 */
2836 		if (pdnxt > eva)
2837 			pdnxt = eva;
2838 
2839 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2840 		    sva += PAGE_SIZE) {
2841 			if (*pte == 0)
2842 				continue;
2843 
2844 			/*
2845 			 * The TLB entry for a PG_G mapping is invalidated
2846 			 * by pmap_remove_pte().
2847 			 */
2848 			if ((*pte & PG_G) == 0)
2849 				anyvalid = 1;
2850 			if (pmap_remove_pte(pmap, pte, sva, &free))
2851 				break;
2852 		}
2853 	}
2854 out:
2855 	sched_unpin();
2856 	if (anyvalid)
2857 		pmap_invalidate_all(pmap);
2858 	vm_page_unlock_queues();
2859 	PMAP_UNLOCK(pmap);
2860 	pmap_free_zero_pages(free);
2861 }
2862 
2863 /*
2864  *	Routine:	pmap_remove_all
2865  *	Function:
2866  *		Removes this physical page from
2867  *		all physical maps in which it resides.
2868  *		Reflects back modify bits to the pager.
2869  *
2870  *	Notes:
2871  *		Original versions of this routine were very
2872  *		inefficient because they iteratively called
2873  *		pmap_remove (slow...)
2874  */
2875 
2876 void
2877 pmap_remove_all(vm_page_t m)
2878 {
2879 	struct md_page *pvh;
2880 	pv_entry_t pv;
2881 	pmap_t pmap;
2882 	pt_entry_t *pte, tpte;
2883 	pd_entry_t *pde;
2884 	vm_offset_t va;
2885 	vm_page_t free;
2886 
2887 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
2888 	    ("pmap_remove_all: page %p is fictitious", m));
2889 	free = NULL;
2890 	vm_page_lock_queues();
2891 	sched_pin();
2892 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2893 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2894 		va = pv->pv_va;
2895 		pmap = PV_PMAP(pv);
2896 		PMAP_LOCK(pmap);
2897 		pde = pmap_pde(pmap, va);
2898 		(void)pmap_demote_pde(pmap, pde, va);
2899 		PMAP_UNLOCK(pmap);
2900 	}
2901 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2902 		pmap = PV_PMAP(pv);
2903 		PMAP_LOCK(pmap);
2904 		pmap->pm_stats.resident_count--;
2905 		pde = pmap_pde(pmap, pv->pv_va);
2906 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
2907 		    " a 4mpage in page %p's pv list", m));
2908 		pte = pmap_pte_quick(pmap, pv->pv_va);
2909 		tpte = pte_load_clear(pte);
2910 		if (tpte & PG_W)
2911 			pmap->pm_stats.wired_count--;
2912 		if (tpte & PG_A)
2913 			vm_page_flag_set(m, PG_REFERENCED);
2914 
2915 		/*
2916 		 * Update the vm_page_t clean and reference bits.
2917 		 */
2918 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2919 			vm_page_dirty(m);
2920 		pmap_unuse_pt(pmap, pv->pv_va, &free);
2921 		pmap_invalidate_page(pmap, pv->pv_va);
2922 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2923 		free_pv_entry(pmap, pv);
2924 		PMAP_UNLOCK(pmap);
2925 	}
2926 	vm_page_flag_clear(m, PG_WRITEABLE);
2927 	sched_unpin();
2928 	vm_page_unlock_queues();
2929 	pmap_free_zero_pages(free);
2930 }
2931 
2932 /*
2933  * pmap_protect_pde: apply the given protection to a 4mpage in a process
2934  */
2935 static boolean_t
2936 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
2937 {
2938 	pd_entry_t newpde, oldpde;
2939 	vm_offset_t eva, va;
2940 	vm_page_t m;
2941 	boolean_t anychanged;
2942 
2943 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2944 	KASSERT((sva & PDRMASK) == 0,
2945 	    ("pmap_protect_pde: sva is not 4mpage aligned"));
2946 	anychanged = FALSE;
2947 retry:
2948 	oldpde = newpde = *pde;
2949 	if (oldpde & PG_MANAGED) {
2950 		eva = sva + NBPDR;
2951 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2952 		    va < eva; va += PAGE_SIZE, m++)
2953 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2954 				vm_page_dirty(m);
2955 	}
2956 	if ((prot & VM_PROT_WRITE) == 0)
2957 		newpde &= ~(PG_RW | PG_M);
2958 #ifdef PAE
2959 	if ((prot & VM_PROT_EXECUTE) == 0)
2960 		newpde |= pg_nx;
2961 #endif
2962 	if (newpde != oldpde) {
2963 		if (!pde_cmpset(pde, oldpde, newpde))
2964 			goto retry;
2965 		if (oldpde & PG_G)
2966 			pmap_invalidate_page(pmap, sva);
2967 		else
2968 			anychanged = TRUE;
2969 	}
2970 	return (anychanged);
2971 }
2972 
2973 /*
2974  *	Set the physical protection on the
2975  *	specified range of this map as requested.
2976  */
2977 void
2978 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2979 {
2980 	vm_offset_t pdnxt;
2981 	pd_entry_t ptpaddr;
2982 	pt_entry_t *pte;
2983 	int anychanged;
2984 
2985 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2986 		pmap_remove(pmap, sva, eva);
2987 		return;
2988 	}
2989 
2990 #ifdef PAE
2991 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
2992 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
2993 		return;
2994 #else
2995 	if (prot & VM_PROT_WRITE)
2996 		return;
2997 #endif
2998 
2999 	anychanged = 0;
3000 
3001 	vm_page_lock_queues();
3002 	sched_pin();
3003 	PMAP_LOCK(pmap);
3004 	for (; sva < eva; sva = pdnxt) {
3005 		pt_entry_t obits, pbits;
3006 		unsigned pdirindex;
3007 
3008 		pdnxt = (sva + NBPDR) & ~PDRMASK;
3009 		if (pdnxt < sva)
3010 			pdnxt = eva;
3011 
3012 		pdirindex = sva >> PDRSHIFT;
3013 		ptpaddr = pmap->pm_pdir[pdirindex];
3014 
3015 		/*
3016 		 * Weed out invalid mappings. Note: we assume that the page
3017 		 * directory table is always allocated, and in kernel virtual.
3018 		 */
3019 		if (ptpaddr == 0)
3020 			continue;
3021 
3022 		/*
3023 		 * Check for large page.
3024 		 */
3025 		if ((ptpaddr & PG_PS) != 0) {
3026 			/*
3027 			 * Are we protecting the entire large page?  If not,
3028 			 * demote the mapping and fall through.
3029 			 */
3030 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3031 				/*
3032 				 * The TLB entry for a PG_G mapping is
3033 				 * invalidated by pmap_protect_pde().
3034 				 */
3035 				if (pmap_protect_pde(pmap,
3036 				    &pmap->pm_pdir[pdirindex], sva, prot))
3037 					anychanged = 1;
3038 				continue;
3039 			} else if (!pmap_demote_pde(pmap,
3040 			    &pmap->pm_pdir[pdirindex], sva)) {
3041 				/* The large page mapping was destroyed. */
3042 				continue;
3043 			}
3044 		}
3045 
3046 		if (pdnxt > eva)
3047 			pdnxt = eva;
3048 
3049 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3050 		    sva += PAGE_SIZE) {
3051 			vm_page_t m;
3052 
3053 retry:
3054 			/*
3055 			 * Regardless of whether a pte is 32 or 64 bits in
3056 			 * size, PG_RW, PG_A, and PG_M are among the least
3057 			 * significant 32 bits.
3058 			 */
3059 			obits = pbits = *pte;
3060 			if ((pbits & PG_V) == 0)
3061 				continue;
3062 
3063 			if ((prot & VM_PROT_WRITE) == 0) {
3064 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3065 				    (PG_MANAGED | PG_M | PG_RW)) {
3066 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3067 					vm_page_dirty(m);
3068 				}
3069 				pbits &= ~(PG_RW | PG_M);
3070 			}
3071 #ifdef PAE
3072 			if ((prot & VM_PROT_EXECUTE) == 0)
3073 				pbits |= pg_nx;
3074 #endif
3075 
3076 			if (pbits != obits) {
3077 #ifdef PAE
3078 				if (!atomic_cmpset_64(pte, obits, pbits))
3079 					goto retry;
3080 #else
3081 				if (!atomic_cmpset_int((u_int *)pte, obits,
3082 				    pbits))
3083 					goto retry;
3084 #endif
3085 				if (obits & PG_G)
3086 					pmap_invalidate_page(pmap, sva);
3087 				else
3088 					anychanged = 1;
3089 			}
3090 		}
3091 	}
3092 	sched_unpin();
3093 	if (anychanged)
3094 		pmap_invalidate_all(pmap);
3095 	vm_page_unlock_queues();
3096 	PMAP_UNLOCK(pmap);
3097 }
3098 
3099 /*
3100  * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
3101  * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3102  * For promotion to occur, two conditions must be met: (1) the 4KB page
3103  * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3104  * mappings must have identical characteristics.
3105  *
3106  * Managed (PG_MANAGED) mappings within the kernel address space are not
3107  * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3108  * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3109  * pmap.
3110  */
3111 static void
3112 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3113 {
3114 	pd_entry_t newpde;
3115 	pt_entry_t *firstpte, oldpte, pa, *pte;
3116 	vm_offset_t oldpteva;
3117 	vm_page_t mpte;
3118 
3119 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3120 
3121 	/*
3122 	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3123 	 * either invalid, unused, or does not map the first 4KB physical page
3124 	 * within a 2- or 4MB page.
3125 	 */
3126 	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3127 setpde:
3128 	newpde = *firstpte;
3129 	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3130 		pmap_pde_p_failures++;
3131 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3132 		    " in pmap %p", va, pmap);
3133 		return;
3134 	}
3135 	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3136 		pmap_pde_p_failures++;
3137 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3138 		    " in pmap %p", va, pmap);
3139 		return;
3140 	}
3141 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3142 		/*
3143 		 * When PG_M is already clear, PG_RW can be cleared without
3144 		 * a TLB invalidation.
3145 		 */
3146 		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3147 		    ~PG_RW))
3148 			goto setpde;
3149 		newpde &= ~PG_RW;
3150 	}
3151 
3152 	/*
3153 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3154 	 * PTE maps an unexpected 4KB physical page or does not have identical
3155 	 * characteristics to the first PTE.
3156 	 */
3157 	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3158 	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3159 setpte:
3160 		oldpte = *pte;
3161 		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3162 			pmap_pde_p_failures++;
3163 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3164 			    " in pmap %p", va, pmap);
3165 			return;
3166 		}
3167 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3168 			/*
3169 			 * When PG_M is already clear, PG_RW can be cleared
3170 			 * without a TLB invalidation.
3171 			 */
3172 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3173 			    oldpte & ~PG_RW))
3174 				goto setpte;
3175 			oldpte &= ~PG_RW;
3176 			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3177 			    (va & ~PDRMASK);
3178 			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3179 			    " in pmap %p", oldpteva, pmap);
3180 		}
3181 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3182 			pmap_pde_p_failures++;
3183 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3184 			    " in pmap %p", va, pmap);
3185 			return;
3186 		}
3187 		pa -= PAGE_SIZE;
3188 	}
3189 
3190 	/*
3191 	 * Save the page table page in its current state until the PDE
3192 	 * mapping the superpage is demoted by pmap_demote_pde() or
3193 	 * destroyed by pmap_remove_pde().
3194 	 */
3195 	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3196 	KASSERT(mpte >= vm_page_array &&
3197 	    mpte < &vm_page_array[vm_page_array_size],
3198 	    ("pmap_promote_pde: page table page is out of range"));
3199 	KASSERT(mpte->pindex == va >> PDRSHIFT,
3200 	    ("pmap_promote_pde: page table page's pindex is wrong"));
3201 	pmap_insert_pt_page(pmap, mpte);
3202 
3203 	/*
3204 	 * Promote the pv entries.
3205 	 */
3206 	if ((newpde & PG_MANAGED) != 0)
3207 		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3208 
3209 	/*
3210 	 * Propagate the PAT index to its proper position.
3211 	 */
3212 	if ((newpde & PG_PTE_PAT) != 0)
3213 		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3214 
3215 	/*
3216 	 * Map the superpage.
3217 	 */
3218 	if (workaround_erratum383)
3219 		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3220 	else if (pmap == kernel_pmap)
3221 		pmap_kenter_pde(va, PG_PS | newpde);
3222 	else
3223 		pde_store(pde, PG_PS | newpde);
3224 
3225 	pmap_pde_promotions++;
3226 	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3227 	    " in pmap %p", va, pmap);
3228 }
3229 
3230 /*
3231  *	Insert the given physical page (m) at
3232  *	the specified virtual address (va) in the
3233  *	target physical map with the protection requested.
3234  *
3235  *	If specified, the page will be wired down, meaning
3236  *	that the related pte cannot be reclaimed.
3237  *
3238  *	NB:  This is the only routine which MAY NOT lazy-evaluate
3239  *	or lose information.  That is, this routine must actually
3240  *	insert this page into the given map NOW.
3241  */
3242 void
3243 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3244     vm_prot_t prot, boolean_t wired)
3245 {
3246 	pd_entry_t *pde;
3247 	pt_entry_t *pte;
3248 	pt_entry_t newpte, origpte;
3249 	pv_entry_t pv;
3250 	vm_paddr_t opa, pa;
3251 	vm_page_t mpte, om;
3252 	boolean_t invlva;
3253 
3254 	va = trunc_page(va);
3255 	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3256 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3257 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3258 	    va));
3259 	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0 ||
3260 	    (m->oflags & VPO_BUSY) != 0,
3261 	    ("pmap_enter: page %p is not busy", m));
3262 
3263 	mpte = NULL;
3264 
3265 	vm_page_lock_queues();
3266 	PMAP_LOCK(pmap);
3267 	sched_pin();
3268 
3269 	/*
3270 	 * In the case that a page table page is not
3271 	 * resident, we are creating it here.
3272 	 */
3273 	if (va < VM_MAXUSER_ADDRESS) {
3274 		mpte = pmap_allocpte(pmap, va, M_WAITOK);
3275 	}
3276 
3277 	pde = pmap_pde(pmap, va);
3278 	if ((*pde & PG_PS) != 0)
3279 		panic("pmap_enter: attempted pmap_enter on 4MB page");
3280 	pte = pmap_pte_quick(pmap, va);
3281 
3282 	/*
3283 	 * Page directory entry not valid; we need a new PT page.
3284 	 */
3285 	if (pte == NULL) {
3286 		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3287 			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3288 	}
3289 
3290 	pa = VM_PAGE_TO_PHYS(m);
3291 	om = NULL;
3292 	origpte = *pte;
3293 	opa = origpte & PG_FRAME;
3294 
3295 	/*
3296 	 * Mapping has not changed, must be protection or wiring change.
3297 	 */
3298 	if (origpte && (opa == pa)) {
3299 		/*
3300 		 * Wiring change, just update stats. We don't worry about
3301 		 * wiring PT pages as they remain resident as long as there
3302 		 * are valid mappings in them. Hence, if a user page is wired,
3303 		 * the PT page will be also.
3304 		 */
3305 		if (wired && ((origpte & PG_W) == 0))
3306 			pmap->pm_stats.wired_count++;
3307 		else if (!wired && (origpte & PG_W))
3308 			pmap->pm_stats.wired_count--;
3309 
3310 		/*
3311 		 * Remove extra pte reference
3312 		 */
3313 		if (mpte)
3314 			mpte->wire_count--;
3315 
3316 		if (origpte & PG_MANAGED) {
3317 			om = m;
3318 			pa |= PG_MANAGED;
3319 		}
3320 		goto validate;
3321 	}
3322 
3323 	pv = NULL;
3324 
3325 	/*
3326 	 * Mapping has changed, invalidate old range and fall through to
3327 	 * handle validating new mapping.
3328 	 */
3329 	if (opa) {
3330 		if (origpte & PG_W)
3331 			pmap->pm_stats.wired_count--;
3332 		if (origpte & PG_MANAGED) {
3333 			om = PHYS_TO_VM_PAGE(opa);
3334 			pv = pmap_pvh_remove(&om->md, pmap, va);
3335 		}
3336 		if (mpte != NULL) {
3337 			mpte->wire_count--;
3338 			KASSERT(mpte->wire_count > 0,
3339 			    ("pmap_enter: missing reference to page table page,"
3340 			     " va: 0x%x", va));
3341 		}
3342 	} else
3343 		pmap->pm_stats.resident_count++;
3344 
3345 	/*
3346 	 * Enter on the PV list if part of our managed memory.
3347 	 */
3348 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3349 		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3350 		    ("pmap_enter: managed mapping within the clean submap"));
3351 		if (pv == NULL)
3352 			pv = get_pv_entry(pmap, FALSE);
3353 		pv->pv_va = va;
3354 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3355 		pa |= PG_MANAGED;
3356 	} else if (pv != NULL)
3357 		free_pv_entry(pmap, pv);
3358 
3359 	/*
3360 	 * Increment counters
3361 	 */
3362 	if (wired)
3363 		pmap->pm_stats.wired_count++;
3364 
3365 validate:
3366 	/*
3367 	 * Now validate mapping with desired protection/wiring.
3368 	 */
3369 	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3370 	if ((prot & VM_PROT_WRITE) != 0) {
3371 		newpte |= PG_RW;
3372 		if ((newpte & PG_MANAGED) != 0)
3373 			vm_page_flag_set(m, PG_WRITEABLE);
3374 	}
3375 #ifdef PAE
3376 	if ((prot & VM_PROT_EXECUTE) == 0)
3377 		newpte |= pg_nx;
3378 #endif
3379 	if (wired)
3380 		newpte |= PG_W;
3381 	if (va < VM_MAXUSER_ADDRESS)
3382 		newpte |= PG_U;
3383 	if (pmap == kernel_pmap)
3384 		newpte |= pgeflag;
3385 
3386 	/*
3387 	 * if the mapping or permission bits are different, we need
3388 	 * to update the pte.
3389 	 */
3390 	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3391 		newpte |= PG_A;
3392 		if ((access & VM_PROT_WRITE) != 0)
3393 			newpte |= PG_M;
3394 		if (origpte & PG_V) {
3395 			invlva = FALSE;
3396 			origpte = pte_load_store(pte, newpte);
3397 			if (origpte & PG_A) {
3398 				if (origpte & PG_MANAGED)
3399 					vm_page_flag_set(om, PG_REFERENCED);
3400 				if (opa != VM_PAGE_TO_PHYS(m))
3401 					invlva = TRUE;
3402 #ifdef PAE
3403 				if ((origpte & PG_NX) == 0 &&
3404 				    (newpte & PG_NX) != 0)
3405 					invlva = TRUE;
3406 #endif
3407 			}
3408 			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3409 				if ((origpte & PG_MANAGED) != 0)
3410 					vm_page_dirty(om);
3411 				if ((prot & VM_PROT_WRITE) == 0)
3412 					invlva = TRUE;
3413 			}
3414 			if ((origpte & PG_MANAGED) != 0 &&
3415 			    TAILQ_EMPTY(&om->md.pv_list) &&
3416 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))
3417 				vm_page_flag_clear(om, PG_WRITEABLE);
3418 			if (invlva)
3419 				pmap_invalidate_page(pmap, va);
3420 		} else
3421 			pte_store(pte, newpte);
3422 	}
3423 
3424 	/*
3425 	 * If both the page table page and the reservation are fully
3426 	 * populated, then attempt promotion.
3427 	 */
3428 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3429 	    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
3430 		pmap_promote_pde(pmap, pde, va);
3431 
3432 	sched_unpin();
3433 	vm_page_unlock_queues();
3434 	PMAP_UNLOCK(pmap);
3435 }
3436 
3437 /*
3438  * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3439  * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3440  * blocking, (2) a mapping already exists at the specified virtual address, or
3441  * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3442  */
3443 static boolean_t
3444 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3445 {
3446 	pd_entry_t *pde, newpde;
3447 
3448 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3449 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3450 	pde = pmap_pde(pmap, va);
3451 	if (*pde != 0) {
3452 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3453 		    " in pmap %p", va, pmap);
3454 		return (FALSE);
3455 	}
3456 	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3457 	    PG_PS | PG_V;
3458 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3459 		newpde |= PG_MANAGED;
3460 
3461 		/*
3462 		 * Abort this mapping if its PV entry could not be created.
3463 		 */
3464 		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3465 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3466 			    " in pmap %p", va, pmap);
3467 			return (FALSE);
3468 		}
3469 	}
3470 #ifdef PAE
3471 	if ((prot & VM_PROT_EXECUTE) == 0)
3472 		newpde |= pg_nx;
3473 #endif
3474 	if (va < VM_MAXUSER_ADDRESS)
3475 		newpde |= PG_U;
3476 
3477 	/*
3478 	 * Increment counters.
3479 	 */
3480 	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3481 
3482 	/*
3483 	 * Map the superpage.
3484 	 */
3485 	pde_store(pde, newpde);
3486 
3487 	pmap_pde_mappings++;
3488 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3489 	    " in pmap %p", va, pmap);
3490 	return (TRUE);
3491 }
3492 
3493 /*
3494  * Maps a sequence of resident pages belonging to the same object.
3495  * The sequence begins with the given page m_start.  This page is
3496  * mapped at the given virtual address start.  Each subsequent page is
3497  * mapped at a virtual address that is offset from start by the same
3498  * amount as the page is offset from m_start within the object.  The
3499  * last page in the sequence is the page with the largest offset from
3500  * m_start that can be mapped at a virtual address less than the given
3501  * virtual address end.  Not every virtual page between start and end
3502  * is mapped; only those for which a resident page exists with the
3503  * corresponding offset from m_start are mapped.
3504  */
3505 void
3506 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3507     vm_page_t m_start, vm_prot_t prot)
3508 {
3509 	vm_offset_t va;
3510 	vm_page_t m, mpte;
3511 	vm_pindex_t diff, psize;
3512 
3513 	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3514 	psize = atop(end - start);
3515 	mpte = NULL;
3516 	m = m_start;
3517 	vm_page_lock_queues();
3518 	PMAP_LOCK(pmap);
3519 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3520 		va = start + ptoa(diff);
3521 		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3522 		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3523 		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3524 		    pmap_enter_pde(pmap, va, m, prot))
3525 			m = &m[NBPDR / PAGE_SIZE - 1];
3526 		else
3527 			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3528 			    mpte);
3529 		m = TAILQ_NEXT(m, listq);
3530 	}
3531 	vm_page_unlock_queues();
3532  	PMAP_UNLOCK(pmap);
3533 }
3534 
3535 /*
3536  * This code makes some *MAJOR* assumptions:
3537  * 1. The current pmap and the target pmap exist.
3538  * 2. Not wired.
3539  * 3. Read access.
3540  * 4. No page table pages.
3541  * but it is *MUCH* faster than pmap_enter...
3542  */
3543 
3544 void
3545 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3546 {
3547 
3548 	vm_page_lock_queues();
3549 	PMAP_LOCK(pmap);
3550 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3551 	vm_page_unlock_queues();
3552 	PMAP_UNLOCK(pmap);
3553 }
3554 
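/*
 * Creates an unwired, read-only 4KB page mapping for "m" at "va", unless a
 * mapping already exists there or a pv entry cannot be allocated without
 * reclamation.  "mpte" is the page table page from the previous call, if
 * any; the page table page used here, if any, is returned so that it can
 * be reused on the next call.  The page queues lock and the pmap lock
 * must be held.
 */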
3555 static vm_page_t
3556 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3557     vm_prot_t prot, vm_page_t mpte)
3558 {
3559 	pt_entry_t *pte;
3560 	vm_paddr_t pa;
3561 	vm_page_t free;
3562 
3563 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3564 	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
3565 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3566 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3567 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3568 
3569 	/*
3570 	 * In the case that a page table page is not
3571 	 * resident, we are creating it here.
3572 	 */
3573 	if (va < VM_MAXUSER_ADDRESS) {
3574 		unsigned ptepindex;
3575 		pd_entry_t ptepa;
3576 
3577 		/*
3578 		 * Calculate pagetable page index
3579 		 */
3580 		ptepindex = va >> PDRSHIFT;
3581 		if (mpte && (mpte->pindex == ptepindex)) {
3582 			mpte->wire_count++;
3583 		} else {
3584 			/*
3585 			 * Get the page directory entry
3586 			 */
3587 			ptepa = pmap->pm_pdir[ptepindex];
3588 
3589 			/*
3590 			 * If the page table page is mapped, we just increment
3591 			 * the hold count, and activate it.
3592 			 */
3593 			if (ptepa) {
3594 				if (ptepa & PG_PS)
3595 					return (NULL);
3596 				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3597 				mpte->wire_count++;
3598 			} else {
3599 				mpte = _pmap_allocpte(pmap, ptepindex,
3600 				    M_NOWAIT);
3601 				if (mpte == NULL)
3602 					return (mpte);
3603 			}
3604 		}
3605 	} else {
3606 		mpte = NULL;
3607 	}
3608 
3609 	/*
3610 	 * This call to vtopte makes the assumption that we are
3611 	 * entering the page into the current pmap.  In order to support
3612 	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3613 	 * But that isn't as quick as vtopte.
3614 	 */
3615 	pte = vtopte(va);
3616 	if (*pte) {
3617 		if (mpte != NULL) {
3618 			mpte->wire_count--;
3619 			mpte = NULL;
3620 		}
3621 		return (mpte);
3622 	}
3623 
3624 	/*
3625 	 * Enter on the PV list if part of our managed memory.
3626 	 */
3627 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
3628 	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3629 		if (mpte != NULL) {
3630 			free = NULL;
3631 			if (pmap_unwire_pte_hold(pmap, mpte, &free)) {
3632 				pmap_invalidate_page(pmap, va);
3633 				pmap_free_zero_pages(free);
3634 			}
3635 
3636 			mpte = NULL;
3637 		}
3638 		return (mpte);
3639 	}
3640 
3641 	/*
3642 	 * Increment counters
3643 	 */
3644 	pmap->pm_stats.resident_count++;
3645 
3646 	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3647 #ifdef PAE
3648 	if ((prot & VM_PROT_EXECUTE) == 0)
3649 		pa |= pg_nx;
3650 #endif
3651 
3652 	/*
3653 	 * Now validate mapping with RO protection
3654 	 */
3655 	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
3656 		pte_store(pte, pa | PG_V | PG_U);
3657 	else
3658 		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3659 	return (mpte);
3660 }
3661 
3662 /*
3663  * Make a temporary mapping for a physical address.  This is only intended
3664  * to be used for panic dumps.
3665  */
3666 void *
3667 pmap_kenter_temporary(vm_paddr_t pa, int i)
3668 {
3669 	vm_offset_t va;
3670 
3671 	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3672 	pmap_kenter(va, pa);
3673 	invlpg(va);
3674 	return ((void *)crashdumpmap);
3675 }
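
/*
 * Illustrative sketch only: copying one physical page through the
 * crashdump window during a panic dump.  The helper name and "buf"
 * are hypothetical; slot 0 of crashdumpmap is used here so the
 * returned address is the mapped page itself.
 */
#if 0
static void
example_dump_page(vm_paddr_t pa, void *buf)
{
	void *va;

	va = pmap_kenter_temporary(pa, 0);
	bcopy(va, buf, PAGE_SIZE);
}
#endif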
3676 
3677 /*
3678  * This code maps large physical mmap regions into the
3679  * processor address space.  Note that some shortcuts
3680  * are taken, but the code works.
3681  */
3682 void
3683 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3684     vm_pindex_t pindex, vm_size_t size)
3685 {
3686 	pd_entry_t *pde;
3687 	vm_paddr_t pa, ptepa;
3688 	vm_page_t p;
3689 	int pat_mode;
3690 
3691 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3692 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3693 	    ("pmap_object_init_pt: non-device object"));
3694 	if (pseflag &&
3695 	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3696 		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3697 			return;
3698 		p = vm_page_lookup(object, pindex);
3699 		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3700 		    ("pmap_object_init_pt: invalid page %p", p));
3701 		pat_mode = p->md.pat_mode;
3702 
3703 		/*
3704 		 * Abort the mapping if the first page is not physically
3705 		 * aligned to a 2/4MB page boundary.
3706 		 */
3707 		ptepa = VM_PAGE_TO_PHYS(p);
3708 		if (ptepa & (NBPDR - 1))
3709 			return;
3710 
3711 		/*
3712 		 * Skip the first page.  Abort the mapping if the rest of
3713 		 * the pages are not physically contiguous or have differing
3714 		 * memory attributes.
3715 		 */
3716 		p = TAILQ_NEXT(p, listq);
3717 		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3718 		    pa += PAGE_SIZE) {
3719 			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3720 			    ("pmap_object_init_pt: invalid page %p", p));
3721 			if (pa != VM_PAGE_TO_PHYS(p) ||
3722 			    pat_mode != p->md.pat_mode)
3723 				return;
3724 			p = TAILQ_NEXT(p, listq);
3725 		}
3726 
3727 		/*
3728 		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3729 		 * "size" is a multiple of 2/4M, adding the PAT setting to
3730 		 * "pa" will not affect the termination of this loop.
3731 		 */
3732 		PMAP_LOCK(pmap);
3733 		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3734 		    size; pa += NBPDR) {
3735 			pde = pmap_pde(pmap, addr);
3736 			if (*pde == 0) {
3737 				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3738 				    PG_U | PG_RW | PG_V);
3739 				pmap->pm_stats.resident_count += NBPDR /
3740 				    PAGE_SIZE;
3741 				pmap_pde_mappings++;
3742 			}
3743 			/* Else continue on if the PDE is already valid. */
3744 			addr += NBPDR;
3745 		}
3746 		PMAP_UNLOCK(pmap);
3747 	}
3748 }
3749 
3750 /*
3751  *	Routine:	pmap_change_wiring
3752  *	Function:	Change the wiring attribute for a map/virtual-address
3753  *			pair.
3754  *	In/out conditions:
3755  *			The mapping must already exist in the pmap.
3756  */
3757 void
3758 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3759 {
3760 	pd_entry_t *pde;
3761 	pt_entry_t *pte;
3762 	boolean_t are_queues_locked;
3763 
3764 	are_queues_locked = FALSE;
3765 retry:
3766 	PMAP_LOCK(pmap);
3767 	pde = pmap_pde(pmap, va);
3768 	if ((*pde & PG_PS) != 0) {
3769 		if (!wired != ((*pde & PG_W) == 0)) {
3770 			if (!are_queues_locked) {
3771 				are_queues_locked = TRUE;
3772 				if (!mtx_trylock(&vm_page_queue_mtx)) {
3773 					PMAP_UNLOCK(pmap);
3774 					vm_page_lock_queues();
3775 					goto retry;
3776 				}
3777 			}
3778 			if (!pmap_demote_pde(pmap, pde, va))
3779 				panic("pmap_change_wiring: demotion failed");
3780 		} else
3781 			goto out;
3782 	}
3783 	pte = pmap_pte(pmap, va);
3784 
3785 	if (wired && !pmap_pte_w(pte))
3786 		pmap->pm_stats.wired_count++;
3787 	else if (!wired && pmap_pte_w(pte))
3788 		pmap->pm_stats.wired_count--;
3789 
3790 	/*
3791 	 * Wiring is not a hardware characteristic so there is no need to
3792 	 * invalidate TLB.
3793 	 */
3794 	pmap_pte_set_w(pte, wired);
3795 	pmap_pte_release(pte);
3796 out:
3797 	if (are_queues_locked)
3798 		vm_page_unlock_queues();
3799 	PMAP_UNLOCK(pmap);
3800 }
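
/*
 * Illustrative sketch only: wiring and later unwiring a single
 * already-mapped virtual page.  The helper name is hypothetical;
 * as noted above, the mapping at "va" must already exist in "pmap".
 */
#if 0
static void
example_wire_then_unwire(pmap_t pmap, vm_offset_t va)
{

	pmap_change_wiring(pmap, va, TRUE);
	/* ... access the page knowing it will not be paged out ... */
	pmap_change_wiring(pmap, va, FALSE);
}
#endif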
3801 
3802 
3803 
3804 /*
3805  *	Copy the range specified by src_addr/len
3806  *	from the source map to the range dst_addr/len
3807  *	in the destination map.
3808  *
3809  *	This routine is only advisory and need not do anything.
3810  */
3811 
3812 void
3813 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3814     vm_offset_t src_addr)
3815 {
3816 	vm_page_t   free;
3817 	vm_offset_t addr;
3818 	vm_offset_t end_addr = src_addr + len;
3819 	vm_offset_t pdnxt;
3820 
3821 	if (dst_addr != src_addr)
3822 		return;
3823 
3824 	if (!pmap_is_current(src_pmap))
3825 		return;
3826 
3827 	vm_page_lock_queues();
3828 	if (dst_pmap < src_pmap) {
3829 		PMAP_LOCK(dst_pmap);
3830 		PMAP_LOCK(src_pmap);
3831 	} else {
3832 		PMAP_LOCK(src_pmap);
3833 		PMAP_LOCK(dst_pmap);
3834 	}
3835 	sched_pin();
3836 	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
3837 		pt_entry_t *src_pte, *dst_pte;
3838 		vm_page_t dstmpte, srcmpte;
3839 		pd_entry_t srcptepaddr;
3840 		unsigned ptepindex;
3841 
3842 		KASSERT(addr < UPT_MIN_ADDRESS,
3843 		    ("pmap_copy: invalid to pmap_copy page tables"));
3844 
3845 		pdnxt = (addr + NBPDR) & ~PDRMASK;
3846 		if (pdnxt < addr)
3847 			pdnxt = end_addr;
3848 		ptepindex = addr >> PDRSHIFT;
3849 
3850 		srcptepaddr = src_pmap->pm_pdir[ptepindex];
3851 		if (srcptepaddr == 0)
3852 			continue;
3853 
3854 		if (srcptepaddr & PG_PS) {
3855 			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
3856 			    ((srcptepaddr & PG_MANAGED) == 0 ||
3857 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
3858 			    PG_PS_FRAME))) {
3859 				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
3860 				    ~PG_W;
3861 				dst_pmap->pm_stats.resident_count +=
3862 				    NBPDR / PAGE_SIZE;
3863 			}
3864 			continue;
3865 		}
3866 
3867 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
3868 		KASSERT(srcmpte->wire_count > 0,
3869 		    ("pmap_copy: source page table page is unused"));
3870 
3871 		if (pdnxt > end_addr)
3872 			pdnxt = end_addr;
3873 
3874 		src_pte = vtopte(addr);
3875 		while (addr < pdnxt) {
3876 			pt_entry_t ptetemp;
3877 			ptetemp = *src_pte;
3878 			/*
3879 			 * We only virtually copy managed pages.
3880 			 */
3881 			if ((ptetemp & PG_MANAGED) != 0) {
3882 				dstmpte = pmap_allocpte(dst_pmap, addr,
3883 				    M_NOWAIT);
3884 				if (dstmpte == NULL)
3885 					goto out;
3886 				dst_pte = pmap_pte_quick(dst_pmap, addr);
3887 				if (*dst_pte == 0 &&
3888 				    pmap_try_insert_pv_entry(dst_pmap, addr,
3889 				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
3890 					/*
3891 					 * Clear the wired, modified, and
3892 					 * accessed (referenced) bits
3893 					 * during the copy.
3894 					 */
3895 					*dst_pte = ptetemp & ~(PG_W | PG_M |
3896 					    PG_A);
3897 					dst_pmap->pm_stats.resident_count++;
3898 				} else {
3899 					free = NULL;
3900 					if (pmap_unwire_pte_hold(dst_pmap,
3901 					    dstmpte, &free)) {
3902 						pmap_invalidate_page(dst_pmap,
3903 						    addr);
3904 						pmap_free_zero_pages(free);
3905 					}
3906 					goto out;
3907 				}
3908 				if (dstmpte->wire_count >= srcmpte->wire_count)
3909 					break;
3910 			}
3911 			addr += PAGE_SIZE;
3912 			src_pte++;
3913 		}
3914 	}
3915 out:
3916 	sched_unpin();
3917 	vm_page_unlock_queues();
3918 	PMAP_UNLOCK(src_pmap);
3919 	PMAP_UNLOCK(dst_pmap);
3920 }
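
/*
 * Illustrative sketch only: pmap_copy() is advisory and only acts when
 * the source and destination virtual ranges coincide, as they do when a
 * parent's mappings are optionally pre-copied into a child at fork time.
 * The helper and its arguments are hypothetical.
 */
#if 0
static void
example_fork_copy(pmap_t parent_pmap, pmap_t child_pmap, vm_offset_t addr,
    vm_size_t len)
{

	/* Identical source and destination addresses; otherwise a no-op. */
	pmap_copy(child_pmap, parent_pmap, addr, len, addr);
}
#endif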
3921 
3922 static __inline void
3923 pagezero(void *page)
3924 {
3925 #if defined(I686_CPU)
3926 	if (cpu_class == CPUCLASS_686) {
3927 #if defined(CPU_ENABLE_SSE)
3928 		if (cpu_feature & CPUID_SSE2)
3929 			sse2_pagezero(page);
3930 		else
3931 #endif
3932 			i686_pagezero(page);
3933 	} else
3934 #endif
3935 		bzero(page, PAGE_SIZE);
3936 }
3937 
3938 /*
3939  *	pmap_zero_page zeros the specified hardware page by mapping
3940  *	the page into KVM and using bzero to clear its contents.
3941  */
3942 void
3943 pmap_zero_page(vm_page_t m)
3944 {
3945 	struct sysmaps *sysmaps;
3946 
3947 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3948 	mtx_lock(&sysmaps->lock);
3949 	if (*sysmaps->CMAP2)
3950 		panic("pmap_zero_page: CMAP2 busy");
3951 	sched_pin();
3952 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
3953 	    pmap_cache_bits(m->md.pat_mode, 0);
3954 	invlcaddr(sysmaps->CADDR2);
3955 	pagezero(sysmaps->CADDR2);
3956 	*sysmaps->CMAP2 = 0;
3957 	sched_unpin();
3958 	mtx_unlock(&sysmaps->lock);
3959 }
3960 
3961 /*
3962  *	pmap_zero_page_area zeros the specified area of the hardware page
3963  *	by mapping the page into KVM and using bzero to clear its contents.
3964  *
3965  *	off and size may not cover an area beyond a single hardware page.
3966  */
3967 void
3968 pmap_zero_page_area(vm_page_t m, int off, int size)
3969 {
3970 	struct sysmaps *sysmaps;
3971 
3972 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3973 	mtx_lock(&sysmaps->lock);
3974 	if (*sysmaps->CMAP2)
3975 		panic("pmap_zero_page_area: CMAP2 busy");
3976 	sched_pin();
3977 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
3978 	    pmap_cache_bits(m->md.pat_mode, 0);
3979 	invlcaddr(sysmaps->CADDR2);
3980 	if (off == 0 && size == PAGE_SIZE)
3981 		pagezero(sysmaps->CADDR2);
3982 	else
3983 		bzero((char *)sysmaps->CADDR2 + off, size);
3984 	*sysmaps->CMAP2 = 0;
3985 	sched_unpin();
3986 	mtx_unlock(&sysmaps->lock);
3987 }
3988 
3989 /*
3990  *	pmap_zero_page_idle zeros the specified hardware page by mapping
3991  *	the page into KVM and using bzero to clear its contents.  This
3992  *	is intended to be called from the vm_pagezero process only and
3993  *	outside of Giant.
3994  */
3995 void
3996 pmap_zero_page_idle(vm_page_t m)
3997 {
3998 
3999 	if (*CMAP3)
4000 		panic("pmap_zero_page_idle: CMAP3 busy");
4001 	sched_pin();
4002 	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4003 	    pmap_cache_bits(m->md.pat_mode, 0);
4004 	invlcaddr(CADDR3);
4005 	pagezero(CADDR3);
4006 	*CMAP3 = 0;
4007 	sched_unpin();
4008 }
4009 
4010 /*
4011  *	pmap_copy_page copies the specified (machine independent)
4012  *	page by mapping the page into virtual memory and using
4013  *	bcopy to copy the page, one machine dependent page at a
4014  *	time.
4015  */
4016 void
4017 pmap_copy_page(vm_page_t src, vm_page_t dst)
4018 {
4019 	struct sysmaps *sysmaps;
4020 
4021 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4022 	mtx_lock(&sysmaps->lock);
4023 	if (*sysmaps->CMAP1)
4024 		panic("pmap_copy_page: CMAP1 busy");
4025 	if (*sysmaps->CMAP2)
4026 		panic("pmap_copy_page: CMAP2 busy");
4027 	sched_pin();
4028 	invlpg((u_int)sysmaps->CADDR1);
4029 	invlpg((u_int)sysmaps->CADDR2);
4030 	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4031 	    pmap_cache_bits(src->md.pat_mode, 0);
4032 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4033 	    pmap_cache_bits(dst->md.pat_mode, 0);
4034 	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4035 	*sysmaps->CMAP1 = 0;
4036 	*sysmaps->CMAP2 = 0;
4037 	sched_unpin();
4038 	mtx_unlock(&sysmaps->lock);
4039 }
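
/*
 * Illustrative sketch only: the per-CPU CMAP windows above let the MI
 * layer zero or copy pages that it holds only as vm_page_t's, without
 * any permanent KVA mapping.  The helper name is hypothetical.
 */
#if 0
static void
example_clone_or_clear(vm_page_t src, vm_page_t dst)
{

	if (src != NULL)
		pmap_copy_page(src, dst);	/* duplicate src into dst */
	else
		pmap_zero_page(dst);		/* or just clear dst */
}
#endif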
4040 
4041 /*
4042  * Returns true if the pmap's pv is one of the first
4043  * 16 pvs linked to from this page.  This count may
4044  * be changed upwards or downwards in the future; it
4045  * is only necessary that true be returned for a small
4046  * subset of pmaps for proper page aging.
4047  */
4048 boolean_t
4049 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4050 {
4051 	struct md_page *pvh;
4052 	pv_entry_t pv;
4053 	int loops = 0;
4054 	boolean_t rv;
4055 
4056 	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
4057 	    ("pmap_page_exists_quick: page %p is not managed", m));
4058 	rv = FALSE;
4059 	vm_page_lock_queues();
4060 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4061 		if (PV_PMAP(pv) == pmap) {
4062 			rv = TRUE;
4063 			break;
4064 		}
4065 		loops++;
4066 		if (loops >= 16)
4067 			break;
4068 	}
4069 	if (!rv && loops < 16) {
4070 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4071 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4072 			if (PV_PMAP(pv) == pmap) {
4073 				rv = TRUE;
4074 				break;
4075 			}
4076 			loops++;
4077 			if (loops >= 16)
4078 				break;
4079 		}
4080 	}
4081 	vm_page_unlock_queues();
4082 	return (rv);
4083 }
4084 
4085 /*
4086  *	pmap_page_wired_mappings:
4087  *
4088  *	Return the number of managed mappings to the given physical page
4089  *	that are wired.
4090  */
4091 int
4092 pmap_page_wired_mappings(vm_page_t m)
4093 {
4094 	int count;
4095 
4096 	count = 0;
4097 	if ((m->flags & PG_FICTITIOUS) != 0)
4098 		return (count);
4099 	vm_page_lock_queues();
4100 	count = pmap_pvh_wired_mappings(&m->md, count);
4101 	count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count);
4102 	vm_page_unlock_queues();
4103 	return (count);
4104 }
4105 
4106 /*
4107  *	pmap_pvh_wired_mappings:
4108  *
4109  *	Return the updated number "count" of managed mappings that are wired.
4110  */
4111 static int
4112 pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4113 {
4114 	pmap_t pmap;
4115 	pt_entry_t *pte;
4116 	pv_entry_t pv;
4117 
4118 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4119 	sched_pin();
4120 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4121 		pmap = PV_PMAP(pv);
4122 		PMAP_LOCK(pmap);
4123 		pte = pmap_pte_quick(pmap, pv->pv_va);
4124 		if ((*pte & PG_W) != 0)
4125 			count++;
4126 		PMAP_UNLOCK(pmap);
4127 	}
4128 	sched_unpin();
4129 	return (count);
4130 }
4131 
4132 /*
4133  * Returns TRUE if the given page is mapped individually or as part of
4134  * a 4mpage.  Otherwise, returns FALSE.
4135  */
4136 boolean_t
4137 pmap_page_is_mapped(vm_page_t m)
4138 {
4139 	boolean_t rv;
4140 
4141 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
4142 		return (FALSE);
4143 	vm_page_lock_queues();
4144 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4145 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list);
4146 	vm_page_unlock_queues();
4147 	return (rv);
4148 }
4149 
4150 /*
4151  * Remove all pages from the specified address space;
4152  * this aids process exit speeds.  Also, this code is
4153  * special cased for the current process only, but it
4154  * can have the more generic (and slightly slower)
4155  * mode enabled.  This is much faster than pmap_remove
4156  * in the case of running down an entire address space.
4157  */
4158 void
4159 pmap_remove_pages(pmap_t pmap)
4160 {
4161 	pt_entry_t *pte, tpte;
4162 	vm_page_t free = NULL;
4163 	vm_page_t m, mpte, mt;
4164 	pv_entry_t pv;
4165 	struct md_page *pvh;
4166 	struct pv_chunk *pc, *npc;
4167 	int field, idx;
4168 	int32_t bit;
4169 	uint32_t inuse, bitmask;
4170 	int allfree;
4171 
4172 	if (pmap != PCPU_GET(curpmap)) {
4173 		printf("warning: pmap_remove_pages called with non-current pmap\n");
4174 		return;
4175 	}
4176 	vm_page_lock_queues();
4177 	PMAP_LOCK(pmap);
4178 	sched_pin();
4179 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4180 		allfree = 1;
4181 		for (field = 0; field < _NPCM; field++) {
4182 			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
4183 			while (inuse != 0) {
4184 				bit = bsfl(inuse);
4185 				bitmask = 1UL << bit;
4186 				idx = field * 32 + bit;
4187 				pv = &pc->pc_pventry[idx];
4188 				inuse &= ~bitmask;
4189 
4190 				pte = pmap_pde(pmap, pv->pv_va);
4191 				tpte = *pte;
4192 				if ((tpte & PG_PS) == 0) {
4193 					pte = vtopte(pv->pv_va);
4194 					tpte = *pte & ~PG_PTE_PAT;
4195 				}
4196 
4197 				if (tpte == 0) {
4198 					printf(
4199 					    "TPTE at %p  IS ZERO @ VA %08x\n",
4200 					    pte, pv->pv_va);
4201 					panic("bad pte");
4202 				}
4203 
4204 /*
4205  * We cannot remove wired pages from a process' mapping at this time
4206  */
4207 				if (tpte & PG_W) {
4208 					allfree = 0;
4209 					continue;
4210 				}
4211 
4212 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4213 				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4214 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4215 				    m, (uintmax_t)m->phys_addr,
4216 				    (uintmax_t)tpte));
4217 
4218 				KASSERT(m < &vm_page_array[vm_page_array_size],
4219 					("pmap_remove_pages: bad tpte %#jx",
4220 					(uintmax_t)tpte));
4221 
4222 				pte_clear(pte);
4223 
4224 				/*
4225 				 * Update the vm_page_t clean/reference bits.
4226 				 */
4227 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4228 					if ((tpte & PG_PS) != 0) {
4229 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4230 							vm_page_dirty(mt);
4231 					} else
4232 						vm_page_dirty(m);
4233 				}
4234 
4235 				/* Mark free */
4236 				PV_STAT(pv_entry_frees++);
4237 				PV_STAT(pv_entry_spare++);
4238 				pv_entry_count--;
4239 				pc->pc_map[field] |= bitmask;
4240 				if ((tpte & PG_PS) != 0) {
4241 					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4242 					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4243 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
4244 					if (TAILQ_EMPTY(&pvh->pv_list)) {
4245 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4246 							if (TAILQ_EMPTY(&mt->md.pv_list))
4247 								vm_page_flag_clear(mt, PG_WRITEABLE);
4248 					}
4249 					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4250 					if (mpte != NULL) {
4251 						pmap_remove_pt_page(pmap, mpte);
4252 						pmap->pm_stats.resident_count--;
4253 						KASSERT(mpte->wire_count == NPTEPG,
4254 						    ("pmap_remove_pages: pte page wire count error"));
4255 						mpte->wire_count = 0;
4256 						pmap_add_delayed_free_list(mpte, &free, FALSE);
4257 						atomic_subtract_int(&cnt.v_wire_count, 1);
4258 					}
4259 				} else {
4260 					pmap->pm_stats.resident_count--;
4261 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4262 					if (TAILQ_EMPTY(&m->md.pv_list)) {
4263 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4264 						if (TAILQ_EMPTY(&pvh->pv_list))
4265 							vm_page_flag_clear(m, PG_WRITEABLE);
4266 					}
4267 					pmap_unuse_pt(pmap, pv->pv_va, &free);
4268 				}
4269 			}
4270 		}
4271 		if (allfree) {
4272 			PV_STAT(pv_entry_spare -= _NPCPV);
4273 			PV_STAT(pc_chunk_count--);
4274 			PV_STAT(pc_chunk_frees++);
4275 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4276 			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
4277 			pmap_qremove((vm_offset_t)pc, 1);
4278 			vm_page_unwire(m, 0);
4279 			vm_page_free(m);
4280 			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
4281 		}
4282 	}
4283 	sched_unpin();
4284 	pmap_invalidate_all(pmap);
4285 	vm_page_unlock_queues();
4286 	PMAP_UNLOCK(pmap);
4287 	pmap_free_zero_pages(free);
4288 }
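
/*
 * Illustrative sketch only: pmap_remove_pages() must be applied to the
 * current pmap, typically while tearing down an exiting process's
 * address space.  The helper name is hypothetical.
 */
#if 0
static void
example_exit_teardown(struct thread *td)
{
	pmap_t pmap;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	if (pmap == PCPU_GET(curpmap))
		pmap_remove_pages(pmap);
}
#endif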
4289 
4290 /*
4291  *	pmap_is_modified:
4292  *
4293  *	Return whether or not the specified physical page was modified
4294  *	in any physical maps.
4295  */
4296 boolean_t
4297 pmap_is_modified(vm_page_t m)
4298 {
4299 	boolean_t rv;
4300 
4301 	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
4302 	    ("pmap_is_modified: page %p is not managed", m));
4303 
4304 	/*
4305 	 * If the page is not VPO_BUSY, then PG_WRITEABLE cannot be
4306 	 * concurrently set while the object is locked.  Thus, if PG_WRITEABLE
4307 	 * is clear, no PTEs can have PG_M set.
4308 	 */
4309 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4310 	if ((m->oflags & VPO_BUSY) == 0 &&
4311 	    (m->flags & PG_WRITEABLE) == 0)
4312 		return (FALSE);
4313 	vm_page_lock_queues();
4314 	rv = pmap_is_modified_pvh(&m->md) ||
4315 	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)));
4316 	vm_page_unlock_queues();
4317 	return (rv);
4318 }
4319 
4320 /*
4321  * Returns TRUE if any of the given mappings were used to modify
4322  * physical memory.  Otherwise, returns FALSE.  Both page and 2/4MB page
4323  * mappings are supported.
4324  */
4325 static boolean_t
4326 pmap_is_modified_pvh(struct md_page *pvh)
4327 {
4328 	pv_entry_t pv;
4329 	pt_entry_t *pte;
4330 	pmap_t pmap;
4331 	boolean_t rv;
4332 
4333 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4334 	rv = FALSE;
4335 	sched_pin();
4336 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4337 		pmap = PV_PMAP(pv);
4338 		PMAP_LOCK(pmap);
4339 		pte = pmap_pte_quick(pmap, pv->pv_va);
4340 		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4341 		PMAP_UNLOCK(pmap);
4342 		if (rv)
4343 			break;
4344 	}
4345 	sched_unpin();
4346 	return (rv);
4347 }
4348 
4349 /*
4350  *	pmap_is_prefaultable:
4351  *
4352  *	Return whether or not the specified virtual address is eligible
4353  *	for prefault.
4354  */
4355 boolean_t
4356 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4357 {
4358 	pd_entry_t *pde;
4359 	pt_entry_t *pte;
4360 	boolean_t rv;
4361 
4362 	rv = FALSE;
4363 	PMAP_LOCK(pmap);
4364 	pde = pmap_pde(pmap, addr);
4365 	if (*pde != 0 && (*pde & PG_PS) == 0) {
4366 		pte = vtopte(addr);
4367 		rv = *pte == 0;
4368 	}
4369 	PMAP_UNLOCK(pmap);
4370 	return (rv);
4371 }
4372 
4373 /*
4374  *	pmap_is_referenced:
4375  *
4376  *	Return whether or not the specified physical page was referenced
4377  *	in any physical maps.
4378  */
4379 boolean_t
4380 pmap_is_referenced(vm_page_t m)
4381 {
4382 	boolean_t rv;
4383 
4384 	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
4385 	    ("pmap_is_referenced: page %p is not managed", m));
4386 	vm_page_lock_queues();
4387 	rv = pmap_is_referenced_pvh(&m->md) ||
4388 	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)));
4389 	vm_page_unlock_queues();
4390 	return (rv);
4391 }
4392 
4393 /*
4394  * Returns TRUE if any of the given mappings were referenced and FALSE
4395  * otherwise.  Both page and 4mpage mappings are supported.
4396  */
4397 static boolean_t
4398 pmap_is_referenced_pvh(struct md_page *pvh)
4399 {
4400 	pv_entry_t pv;
4401 	pt_entry_t *pte;
4402 	pmap_t pmap;
4403 	boolean_t rv;
4404 
4405 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4406 	rv = FALSE;
4407 	sched_pin();
4408 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4409 		pmap = PV_PMAP(pv);
4410 		PMAP_LOCK(pmap);
4411 		pte = pmap_pte_quick(pmap, pv->pv_va);
4412 		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4413 		PMAP_UNLOCK(pmap);
4414 		if (rv)
4415 			break;
4416 	}
4417 	sched_unpin();
4418 	return (rv);
4419 }
4420 
4421 /*
4422  * Clear the write and modified bits in each of the given page's mappings.
4423  */
4424 void
4425 pmap_remove_write(vm_page_t m)
4426 {
4427 	struct md_page *pvh;
4428 	pv_entry_t next_pv, pv;
4429 	pmap_t pmap;
4430 	pd_entry_t *pde;
4431 	pt_entry_t oldpte, *pte;
4432 	vm_offset_t va;
4433 
4434 	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
4435 	    ("pmap_remove_write: page %p is not managed", m));
4436 
4437 	/*
4438 	 * If the page is not VPO_BUSY, then PG_WRITEABLE cannot be set by
4439 	 * another thread while the object is locked.  Thus, if PG_WRITEABLE
4440 	 * is clear, no page table entries need updating.
4441 	 */
4442 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4443 	if ((m->oflags & VPO_BUSY) == 0 &&
4444 	    (m->flags & PG_WRITEABLE) == 0)
4445 		return;
4446 	vm_page_lock_queues();
4447 	sched_pin();
4448 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4449 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4450 		va = pv->pv_va;
4451 		pmap = PV_PMAP(pv);
4452 		PMAP_LOCK(pmap);
4453 		pde = pmap_pde(pmap, va);
4454 		if ((*pde & PG_RW) != 0)
4455 			(void)pmap_demote_pde(pmap, pde, va);
4456 		PMAP_UNLOCK(pmap);
4457 	}
4458 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4459 		pmap = PV_PMAP(pv);
4460 		PMAP_LOCK(pmap);
4461 		pde = pmap_pde(pmap, pv->pv_va);
4462 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4463 		    " a 4mpage in page %p's pv list", m));
4464 		pte = pmap_pte_quick(pmap, pv->pv_va);
4465 retry:
4466 		oldpte = *pte;
4467 		if ((oldpte & PG_RW) != 0) {
4468 			/*
4469 			 * Regardless of whether a pte is 32 or 64 bits
4470 			 * in size, PG_RW and PG_M are among the least
4471 			 * significant 32 bits.
4472 			 */
4473 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4474 			    oldpte & ~(PG_RW | PG_M)))
4475 				goto retry;
4476 			if ((oldpte & PG_M) != 0)
4477 				vm_page_dirty(m);
4478 			pmap_invalidate_page(pmap, pv->pv_va);
4479 		}
4480 		PMAP_UNLOCK(pmap);
4481 	}
4482 	vm_page_flag_clear(m, PG_WRITEABLE);
4483 	sched_unpin();
4484 	vm_page_unlock_queues();
4485 }
4486 
4487 /*
4488  *	pmap_ts_referenced:
4489  *
4490  *	Return a count of reference bits for a page, clearing those bits.
4491  *	It is not necessary for every reference bit to be cleared, but it
4492  *	is necessary that 0 only be returned when there are truly no
4493  *	reference bits set.
4494  *
4495  *	XXX: The exact number of bits to check and clear is a matter that
4496  *	should be tested and standardized at some point in the future for
4497  *	optimal aging of shared pages.
4498  */
4499 int
4500 pmap_ts_referenced(vm_page_t m)
4501 {
4502 	struct md_page *pvh;
4503 	pv_entry_t pv, pvf, pvn;
4504 	pmap_t pmap;
4505 	pd_entry_t oldpde, *pde;
4506 	pt_entry_t *pte;
4507 	vm_offset_t va;
4508 	int rtval = 0;
4509 
4510 	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
4511 	    ("pmap_ts_referenced: page %p is not managed", m));
4512 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4513 	vm_page_lock_queues();
4514 	sched_pin();
4515 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4516 		va = pv->pv_va;
4517 		pmap = PV_PMAP(pv);
4518 		PMAP_LOCK(pmap);
4519 		pde = pmap_pde(pmap, va);
4520 		oldpde = *pde;
4521 		if ((oldpde & PG_A) != 0) {
4522 			if (pmap_demote_pde(pmap, pde, va)) {
4523 				if ((oldpde & PG_W) == 0) {
4524 					/*
4525 					 * Remove the mapping to a single page
4526 					 * so that a subsequent access may
4527 					 * repromote.  Since the underlying
4528 					 * page table page is fully populated,
4529 					 * this removal never frees a page
4530 					 * table page.
4531 					 */
4532 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4533 					    PG_PS_FRAME);
4534 					pmap_remove_page(pmap, va, NULL);
4535 					rtval++;
4536 					if (rtval > 4) {
4537 						PMAP_UNLOCK(pmap);
4538 						goto out;
4539 					}
4540 				}
4541 			}
4542 		}
4543 		PMAP_UNLOCK(pmap);
4544 	}
4545 	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4546 		pvf = pv;
4547 		do {
4548 			pvn = TAILQ_NEXT(pv, pv_list);
4549 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4550 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4551 			pmap = PV_PMAP(pv);
4552 			PMAP_LOCK(pmap);
4553 			pde = pmap_pde(pmap, pv->pv_va);
4554 			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4555 			    " found a 4mpage in page %p's pv list", m));
4556 			pte = pmap_pte_quick(pmap, pv->pv_va);
4557 			if ((*pte & PG_A) != 0) {
4558 				atomic_clear_int((u_int *)pte, PG_A);
4559 				pmap_invalidate_page(pmap, pv->pv_va);
4560 				rtval++;
4561 				if (rtval > 4)
4562 					pvn = NULL;
4563 			}
4564 			PMAP_UNLOCK(pmap);
4565 		} while ((pv = pvn) != NULL && pv != pvf);
4566 	}
4567 out:
4568 	sched_unpin();
4569 	vm_page_unlock_queues();
4570 	return (rtval);
4571 }
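
/*
 * Illustrative sketch only: a page-aging style use of
 * pmap_ts_referenced(), crediting a hypothetical activity counter with
 * the number of reference bits found (and cleared).  The helper and
 * "act" are examples, not the pagedaemon's actual bookkeeping.
 */
#if 0
static int
example_age_page(vm_page_t m, int act)
{
	int refs;

	refs = pmap_ts_referenced(m);
	if (refs > 0)
		act += refs;		/* recently referenced: keep active */
	else if (act > 0)
		act--;			/* idle: decay toward inactive */
	return (act);
}
#endif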
4572 
4573 /*
4574  *	Clear the modify bits on the specified physical page.
4575  */
4576 void
4577 pmap_clear_modify(vm_page_t m)
4578 {
4579 	struct md_page *pvh;
4580 	pv_entry_t next_pv, pv;
4581 	pmap_t pmap;
4582 	pd_entry_t oldpde, *pde;
4583 	pt_entry_t oldpte, *pte;
4584 	vm_offset_t va;
4585 
4586 	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
4587 	    ("pmap_clear_modify: page %p is not managed", m));
4588 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4589 	KASSERT((m->oflags & VPO_BUSY) == 0,
4590 	    ("pmap_clear_modify: page %p is busy", m));
4591 
4592 	/*
4593 	 * If the page is not PG_WRITEABLE, then no PTEs can have PG_M set.
4594 	 * If the object containing the page is locked and the page is not
4595 	 * VPO_BUSY, then PG_WRITEABLE cannot be concurrently set.
4596 	 */
4597 	if ((m->flags & PG_WRITEABLE) == 0)
4598 		return;
4599 	vm_page_lock_queues();
4600 	sched_pin();
4601 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4602 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4603 		va = pv->pv_va;
4604 		pmap = PV_PMAP(pv);
4605 		PMAP_LOCK(pmap);
4606 		pde = pmap_pde(pmap, va);
4607 		oldpde = *pde;
4608 		if ((oldpde & PG_RW) != 0) {
4609 			if (pmap_demote_pde(pmap, pde, va)) {
4610 				if ((oldpde & PG_W) == 0) {
4611 					/*
4612 					 * Write protect the mapping to a
4613 					 * single page so that a subsequent
4614 					 * write access may repromote.
4615 					 */
4616 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4617 					    PG_PS_FRAME);
4618 					pte = pmap_pte_quick(pmap, va);
4619 					oldpte = *pte;
4620 					if ((oldpte & PG_V) != 0) {
4621 						/*
4622 						 * Regardless of whether a pte is 32 or 64 bits
4623 						 * in size, PG_RW and PG_M are among the least
4624 						 * significant 32 bits.
4625 						 */
4626 						while (!atomic_cmpset_int((u_int *)pte,
4627 						    oldpte,
4628 						    oldpte & ~(PG_M | PG_RW)))
4629 							oldpte = *pte;
4630 						vm_page_dirty(m);
4631 						pmap_invalidate_page(pmap, va);
4632 					}
4633 				}
4634 			}
4635 		}
4636 		PMAP_UNLOCK(pmap);
4637 	}
4638 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4639 		pmap = PV_PMAP(pv);
4640 		PMAP_LOCK(pmap);
4641 		pde = pmap_pde(pmap, pv->pv_va);
4642 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4643 		    " a 4mpage in page %p's pv list", m));
4644 		pte = pmap_pte_quick(pmap, pv->pv_va);
4645 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4646 			/*
4647 			 * Regardless of whether a pte is 32 or 64 bits
4648 			 * in size, PG_M is among the least significant
4649 			 * 32 bits.
4650 			 */
4651 			atomic_clear_int((u_int *)pte, PG_M);
4652 			pmap_invalidate_page(pmap, pv->pv_va);
4653 		}
4654 		PMAP_UNLOCK(pmap);
4655 	}
4656 	sched_unpin();
4657 	vm_page_unlock_queues();
4658 }
4659 
4660 /*
4661  *	pmap_clear_reference:
4662  *
4663  *	Clear the reference bit on the specified physical page.
4664  */
4665 void
4666 pmap_clear_reference(vm_page_t m)
4667 {
4668 	struct md_page *pvh;
4669 	pv_entry_t next_pv, pv;
4670 	pmap_t pmap;
4671 	pd_entry_t oldpde, *pde;
4672 	pt_entry_t *pte;
4673 	vm_offset_t va;
4674 
4675 	KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
4676 	    ("pmap_clear_reference: page %p is not managed", m));
4677 	vm_page_lock_queues();
4678 	sched_pin();
4679 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4680 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4681 		va = pv->pv_va;
4682 		pmap = PV_PMAP(pv);
4683 		PMAP_LOCK(pmap);
4684 		pde = pmap_pde(pmap, va);
4685 		oldpde = *pde;
4686 		if ((oldpde & PG_A) != 0) {
4687 			if (pmap_demote_pde(pmap, pde, va)) {
4688 				/*
4689 				 * Remove the mapping to a single page so
4690 				 * that a subsequent access may repromote.
4691 				 * Since the underlying page table page is
4692 				 * fully populated, this removal never frees
4693 				 * a page table page.
4694 				 */
4695 				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4696 				    PG_PS_FRAME);
4697 				pmap_remove_page(pmap, va, NULL);
4698 			}
4699 		}
4700 		PMAP_UNLOCK(pmap);
4701 	}
4702 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4703 		pmap = PV_PMAP(pv);
4704 		PMAP_LOCK(pmap);
4705 		pde = pmap_pde(pmap, pv->pv_va);
4706 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4707 		    " a 4mpage in page %p's pv list", m));
4708 		pte = pmap_pte_quick(pmap, pv->pv_va);
4709 		if ((*pte & PG_A) != 0) {
4710 			/*
4711 			 * Regardless of whether a pte is 32 or 64 bits
4712 			 * in size, PG_A is among the least significant
4713 			 * 32 bits.
4714 			 */
4715 			atomic_clear_int((u_int *)pte, PG_A);
4716 			pmap_invalidate_page(pmap, pv->pv_va);
4717 		}
4718 		PMAP_UNLOCK(pmap);
4719 	}
4720 	sched_unpin();
4721 	vm_page_unlock_queues();
4722 }
4723 
4724 /*
4725  * Miscellaneous support routines follow
4726  */
4727 
4728 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
4729 static __inline void
4730 pmap_pte_attr(pt_entry_t *pte, int cache_bits)
4731 {
4732 	u_int opte, npte;
4733 
4734 	/*
4735 	 * The cache mode bits are all in the low 32-bits of the
4736 	 * PTE, so we can just spin on updating the low 32-bits.
4737 	 */
4738 	do {
4739 		opte = *(u_int *)pte;
4740 		npte = opte & ~PG_PTE_CACHE;
4741 		npte |= cache_bits;
4742 	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
4743 }
4744 
4745 /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
4746 static __inline void
4747 pmap_pde_attr(pd_entry_t *pde, int cache_bits)
4748 {
4749 	u_int opde, npde;
4750 
4751 	/*
4752 	 * The cache mode bits are all in the low 32-bits of the
4753 	 * PDE, so we can just spin on updating the low 32-bits.
4754 	 */
4755 	do {
4756 		opde = *(u_int *)pde;
4757 		npde = opde & ~PG_PDE_CACHE;
4758 		npde |= cache_bits;
4759 	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
4760 }
4761 
4762 /*
4763  * Map a set of physical memory pages into the kernel virtual
4764  * address space. Return a pointer to where it is mapped. This
4765  * routine is intended to be used for mapping device memory,
4766  * NOT real memory.
4767  */
4768 void *
4769 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4770 {
4771 	vm_offset_t va, offset;
4772 	vm_size_t tmpsize;
4773 
4774 	offset = pa & PAGE_MASK;
4775 	size = roundup(offset + size, PAGE_SIZE);
4776 	pa = pa & PG_FRAME;
4777 
4778 	if (pa < KERNLOAD && pa + size <= KERNLOAD)
4779 		va = KERNBASE + pa;
4780 	else
4781 		va = kmem_alloc_nofault(kernel_map, size);
4782 	if (!va)
4783 		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4784 
4785 	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4786 		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
4787 	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
4788 	pmap_invalidate_cache_range(va, va + size);
4789 	return ((void *)(va + offset));
4790 }
4791 
4792 void *
4793 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4794 {
4795 
4796 	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4797 }
4798 
4799 void *
4800 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4801 {
4802 
4803 	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4804 }
4805 
4806 void
4807 pmap_unmapdev(vm_offset_t va, vm_size_t size)
4808 {
4809 	vm_offset_t base, offset, tmpva;
4810 
4811 	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
4812 		return;
4813 	base = trunc_page(va);
4814 	offset = va & PAGE_MASK;
4815 	size = roundup(offset + size, PAGE_SIZE);
4816 	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
4817 		pmap_kremove(tmpva);
4818 	pmap_invalidate_range(kernel_pmap, va, tmpva);
4819 	kmem_free(kernel_map, base, size);
4820 }
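
/*
 * Illustrative sketch only: mapping a device's registers uncacheable,
 * using them, and releasing the KVA afterwards.  "example_map_regs",
 * "regs", and the register read are hypothetical.
 */
#if 0
static void
example_map_regs(vm_paddr_t regs_pa, vm_size_t regs_size)
{
	volatile uint32_t *regs;

	regs = pmap_mapdev(regs_pa, regs_size);
	(void)regs[0];			/* read a (hypothetical) register */
	pmap_unmapdev((vm_offset_t)regs, regs_size);
}
#endif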
4821 
4822 /*
4823  * Sets the memory attribute for the specified page.
4824  */
4825 void
4826 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4827 {
4828 	struct sysmaps *sysmaps;
4829 	vm_offset_t sva, eva;
4830 
4831 	m->md.pat_mode = ma;
4832 	if ((m->flags & PG_FICTITIOUS) != 0)
4833 		return;
4834 
4835 	/*
4836 	 * If "m" is a normal page, flush it from the cache.
4837 	 * See pmap_invalidate_cache_range().
4838 	 *
4839 	 * First, try to find an existing mapping of the page by sf
4840 	 * buffer. sf_buf_invalidate_cache() modifies mapping and
4841 	 * flushes the cache.
4842 	 */
4843 	if (sf_buf_invalidate_cache(m))
4844 		return;
4845 
4846 	/*
4847 	 * If page is not mapped by sf buffer, but CPU does not
4848 	 * support self snoop, map the page transient and do
4849 	 * invalidation. In the worst case, whole cache is flushed by
4850 	 * pmap_invalidate_cache_range().
4851 	 */
4852 	if ((cpu_feature & (CPUID_SS|CPUID_CLFSH)) == CPUID_CLFSH) {
4853 		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4854 		mtx_lock(&sysmaps->lock);
4855 		if (*sysmaps->CMAP2)
4856 			panic("pmap_page_set_memattr: CMAP2 busy");
4857 		sched_pin();
4858 		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
4859 		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
4860 		invlcaddr(sysmaps->CADDR2);
4861 		sva = (vm_offset_t)sysmaps->CADDR2;
4862 		eva = sva + PAGE_SIZE;
4863 	} else
4864 		sva = eva = 0; /* gcc */
4865 	pmap_invalidate_cache_range(sva, eva);
4866 	if (sva != 0) {
4867 		*sysmaps->CMAP2 = 0;
4868 		sched_unpin();
4869 		mtx_unlock(&sysmaps->lock);
4870 	}
4871 }
4872 
4873 /*
4874  * Changes the specified virtual address range's memory type to that given by
4875  * the parameter "mode".  The specified virtual address range must be
4876  * completely contained within the kernel map.
4877  *
4878  * Returns zero if the change completed successfully, and either EINVAL or
4879  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4880  * of the virtual address range was not mapped, and ENOMEM is returned if
4881  * there was insufficient memory available to complete the change.
4882  */
4883 int
4884 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4885 {
4886 	vm_offset_t base, offset, tmpva;
4887 	pd_entry_t *pde;
4888 	pt_entry_t *pte;
4889 	int cache_bits_pte, cache_bits_pde;
4890 	boolean_t changed;
4891 
4892 	base = trunc_page(va);
4893 	offset = va & PAGE_MASK;
4894 	size = roundup(offset + size, PAGE_SIZE);
4895 
4896 	/*
4897 	 * Only supported on kernel virtual addresses above the recursive map.
4898 	 */
4899 	if (base < VM_MIN_KERNEL_ADDRESS)
4900 		return (EINVAL);
4901 
4902 	cache_bits_pde = pmap_cache_bits(mode, 1);
4903 	cache_bits_pte = pmap_cache_bits(mode, 0);
4904 	changed = FALSE;
4905 
4906 	/*
4907 	 * Pages that aren't mapped aren't supported.  Also break down
4908 	 * 2/4MB pages into 4KB pages if required.
4909 	 */
4910 	PMAP_LOCK(kernel_pmap);
4911 	for (tmpva = base; tmpva < base + size; ) {
4912 		pde = pmap_pde(kernel_pmap, tmpva);
4913 		if (*pde == 0) {
4914 			PMAP_UNLOCK(kernel_pmap);
4915 			return (EINVAL);
4916 		}
4917 		if (*pde & PG_PS) {
4918 			/*
4919 			 * If the current 2/4MB page already has
4920 			 * the required memory type, then we need not
4921 			 * demote this page.  Just increment tmpva to
4922 			 * the next 2/4MB page frame.
4923 			 */
4924 			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
4925 				tmpva = trunc_4mpage(tmpva) + NBPDR;
4926 				continue;
4927 			}
4928 
4929 			/*
4930 			 * If the current offset aligns with a 2/4MB
4931 			 * page frame and there is at least 2/4MB left
4932 			 * within the range, then we need not break
4933 			 * down this page into 4KB pages.
4934 			 */
4935 			if ((tmpva & PDRMASK) == 0 &&
4936 			    tmpva + PDRMASK < base + size) {
4937 				tmpva += NBPDR;
4938 				continue;
4939 			}
4940 			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
4941 				PMAP_UNLOCK(kernel_pmap);
4942 				return (ENOMEM);
4943 			}
4944 		}
4945 		pte = vtopte(tmpva);
4946 		if (*pte == 0) {
4947 			PMAP_UNLOCK(kernel_pmap);
4948 			return (EINVAL);
4949 		}
4950 		tmpva += PAGE_SIZE;
4951 	}
4952 	PMAP_UNLOCK(kernel_pmap);
4953 
4954 	/*
4955 	 * Ok, all the pages exist, so run through them updating their
4956 	 * cache mode if required.
4957 	 */
4958 	for (tmpva = base; tmpva < base + size; ) {
4959 		pde = pmap_pde(kernel_pmap, tmpva);
4960 		if (*pde & PG_PS) {
4961 			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
4962 				pmap_pde_attr(pde, cache_bits_pde);
4963 				changed = TRUE;
4964 			}
4965 			tmpva = trunc_4mpage(tmpva) + NBPDR;
4966 		} else {
4967 			pte = vtopte(tmpva);
4968 			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
4969 				pmap_pte_attr(pte, cache_bits_pte);
4970 				changed = TRUE;
4971 			}
4972 			tmpva += PAGE_SIZE;
4973 		}
4974 	}
4975 
4976 	/*
4977 	 * Flush CPU caches to make sure any data isn't cached that
4978 	 * shouldn't be, etc.
4979 	 */
4980 	if (changed) {
4981 		pmap_invalidate_range(kernel_pmap, base, tmpva);
4982 		pmap_invalidate_cache_range(base, tmpva);
4983 	}
4984 	return (0);
4985 }
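
/*
 * Illustrative sketch only: switching an already-mapped kernel buffer
 * (for example, a hypothetical framebuffer mapped with pmap_mapdev())
 * to write-combining and checking the return values documented above.
 */
#if 0
static int
example_make_write_combining(vm_offset_t va, vm_size_t size)
{
	int error;

	error = pmap_change_attr(va, size, PAT_WRITE_COMBINING);
	if (error == EINVAL)
		printf("example: range not fully mapped\n");
	else if (error == ENOMEM)
		printf("example: could not demote a 2/4MB page\n");
	return (error);
}
#endif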
4986 
4987 /*
4988  * Perform the pmap work for mincore().
4989  */
4990 int
4991 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
4992 {
4993 	pd_entry_t *pdep;
4994 	pt_entry_t *ptep, pte;
4995 	vm_paddr_t pa;
4996 	int val;
4997 
4998 	PMAP_LOCK(pmap);
4999 retry:
5000 	pdep = pmap_pde(pmap, addr);
5001 	if (*pdep != 0) {
5002 		if (*pdep & PG_PS) {
5003 			pte = *pdep;
5004 			/* Compute the physical address of the 4KB page. */
5005 			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5006 			    PG_FRAME;
5007 			val = MINCORE_SUPER;
5008 		} else {
5009 			ptep = pmap_pte(pmap, addr);
5010 			pte = *ptep;
5011 			pmap_pte_release(ptep);
5012 			pa = pte & PG_FRAME;
5013 			val = 0;
5014 		}
5015 	} else {
5016 		pte = 0;
5017 		pa = 0;
5018 		val = 0;
5019 	}
5020 	if ((pte & PG_V) != 0) {
5021 		val |= MINCORE_INCORE;
5022 		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5023 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5024 		if ((pte & PG_A) != 0)
5025 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5026 	}
5027 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5028 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5029 	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5030 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5031 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5032 			goto retry;
5033 	} else
5034 		PA_UNLOCK_COND(*locked_pa);
5035 	PMAP_UNLOCK(pmap);
5036 	return (val);
5037 }
5038 
5039 void
5040 pmap_activate(struct thread *td)
5041 {
5042 	pmap_t	pmap, oldpmap;
5043 	u_int32_t  cr3;
5044 
5045 	critical_enter();
5046 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5047 	oldpmap = PCPU_GET(curpmap);
5048 #if defined(SMP)
5049 	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
5050 	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
5051 #else
5052 	oldpmap->pm_active &= ~1;
5053 	pmap->pm_active |= 1;
5054 #endif
5055 #ifdef PAE
5056 	cr3 = vtophys(pmap->pm_pdpt);
5057 #else
5058 	cr3 = vtophys(pmap->pm_pdir);
5059 #endif
5060 	/*
5061 	 * pmap_activate is for the current thread on the current cpu
5062 	 */
5063 	td->td_pcb->pcb_cr3 = cr3;
5064 	load_cr3(cr3);
5065 	PCPU_SET(curpmap, pmap);
5066 	critical_exit();
5067 }
5068 
5069 void
5070 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5071 {
5072 }
5073 
5074 /*
5075  *	Increase the starting virtual address of the given mapping if a
5076  *	different alignment might result in more superpage mappings.
5077  */
5078 void
5079 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5080     vm_offset_t *addr, vm_size_t size)
5081 {
5082 	vm_offset_t superpage_offset;
5083 
5084 	if (size < NBPDR)
5085 		return;
5086 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5087 		offset += ptoa(object->pg_color);
5088 	superpage_offset = offset & PDRMASK;
5089 	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5090 	    (*addr & PDRMASK) == superpage_offset)
5091 		return;
5092 	if ((*addr & PDRMASK) < superpage_offset)
5093 		*addr = (*addr & ~PDRMASK) + superpage_offset;
5094 	else
5095 		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5096 }
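
/*
 * Illustrative sketch only: asking for a superpage-friendly start
 * address before creating a multi-superpage mapping of "object" at byte
 * offset "foff".  The helper and its names are hypothetical; the call
 * may round "addr" up so that its offset within a 2/4MB page matches
 * that of the object offset.
 */
#if 0
static vm_offset_t
example_pick_start(vm_object_t object, vm_ooffset_t foff, vm_offset_t hint)
{
	vm_offset_t addr;

	addr = hint;
	pmap_align_superpage(object, foff, &addr, 4 * NBPDR);
	return (addr);
}
#endif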
5097 
5098 
5099 #if defined(PMAP_DEBUG)
5100 int
5100 pmap_pid_dump(int pid)
5101 {
5102 	pmap_t pmap;
5103 	struct proc *p;
5104 	int npte = 0;
5105 	int index;
5106 
5107 	sx_slock(&allproc_lock);
5108 	FOREACH_PROC_IN_SYSTEM(p) {
5109 		if (p->p_pid != pid)
5110 			continue;
5111 
5112 		if (p->p_vmspace) {
5113 			int i,j;
5114 			index = 0;
5115 			pmap = vmspace_pmap(p->p_vmspace);
5116 			for (i = 0; i < NPDEPTD; i++) {
5117 				pd_entry_t *pde;
5118 				pt_entry_t *pte;
5119 				vm_offset_t base = i << PDRSHIFT;
5120 
5121 				pde = &pmap->pm_pdir[i];
5122 				if (pde && pmap_pde_v(pde)) {
5123 					for (j = 0; j < NPTEPG; j++) {
5124 						vm_offset_t va = base + (j << PAGE_SHIFT);
5125 						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5126 							if (index) {
5127 								index = 0;
5128 								printf("\n");
5129 							}
5130 							sx_sunlock(&allproc_lock);
5131 							return (npte);
5132 						}
5133 						pte = pmap_pte(pmap, va);
5134 						if (pte && pmap_pte_v(pte)) {
5135 							pt_entry_t pa;
5136 							vm_page_t m;
5137 							pa = *pte;
5138 							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5139 							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5140 								va, pa, m->hold_count, m->wire_count, m->flags);
5141 							npte++;
5142 							index++;
5143 							if (index >= 2) {
5144 								index = 0;
5145 								printf("\n");
5146 							} else {
5147 								printf(" ");
5148 							}
5149 						}
5150 					}
5151 				}
5152 			}
5153 		}
5154 	}
5155 	sx_sunlock(&allproc_lock);
5156 	return (npte);
5157 }
5158 #endif
5159 
5160 #if defined(DEBUG)
5161 
5162 static void	pads(pmap_t pm);
5163 void		pmap_pvdump(vm_offset_t pa);
5164 
5165 /* Print the address space of the pmap. */
5166 static void
5167 pads(pmap_t pm)
5168 {
5169 	int i, j;
5170 	vm_offset_t va;
5171 	pt_entry_t *ptep;
5172 
5173 	if (pm == kernel_pmap)
5174 		return;
5175 	for (i = 0; i < NPDEPTD; i++)
5176 		if (pm->pm_pdir[i])
5177 			for (j = 0; j < NPTEPG; j++) {
5178 				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
5179 				if (pm == kernel_pmap && va < KERNBASE)
5180 					continue;
5181 				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
5182 					continue;
5183 				ptep = pmap_pte(pm, va);
5184 				if (pmap_pte_v(ptep))
5185 					printf("%x:%x ", va, *ptep);
5186 			}
5187 
5188 }
5189 
5190 void
5191 pmap_pvdump(vm_paddr_t pa)
5192 {
5193 	pv_entry_t pv;
5194 	pmap_t pmap;
5195 	vm_page_t m;
5196 
5197 	printf("pa %x", pa);
5198 	m = PHYS_TO_VM_PAGE(pa);
5199 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
5200 		pmap = PV_PMAP(pv);
5201 		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
5202 		pads(pmap);
5203 	}
5204 	printf(" ");
5205 }
5206 #endif
5207