xref: /freebsd/sys/i386/i386/pmap.c (revision 325151a3)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9  * All rights reserved.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * the Systems Programming Group of the University of Utah Computer
13  * Science Department and William Jolitz of UUNET Technologies Inc.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 3. All advertising materials mentioning features or use of this software
24  *    must display the following acknowledgement:
25  *	This product includes software developed by the University of
26  *	California, Berkeley and its contributors.
27  * 4. Neither the name of the University nor the names of its contributors
28  *    may be used to endorse or promote products derived from this software
29  *    without specific prior written permission.
30  *
31  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41  * SUCH DAMAGE.
42  *
43  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44  */
45 /*-
46  * Copyright (c) 2003 Networks Associates Technology, Inc.
47  * All rights reserved.
48  *
49  * This software was developed for the FreeBSD Project by Jake Burkholder,
50  * Safeport Network Services, and Network Associates Laboratories, the
51  * Security Research Division of Network Associates, Inc. under
52  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53  * CHATS research program.
54  *
55  * Redistribution and use in source and binary forms, with or without
56  * modification, are permitted provided that the following conditions
57  * are met:
58  * 1. Redistributions of source code must retain the above copyright
59  *    notice, this list of conditions and the following disclaimer.
60  * 2. Redistributions in binary form must reproduce the above copyright
61  *    notice, this list of conditions and the following disclaimer in the
62  *    documentation and/or other materials provided with the distribution.
63  *
64  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74  * SUCH DAMAGE.
75  */
76 
77 #include <sys/cdefs.h>
78 __FBSDID("$FreeBSD$");
79 
80 /*
81  *	Manages physical address maps.
82  *
83  *	Since the information managed by this module is
84  *	also stored by the logical address mapping module,
85  *	this module may throw away valid virtual-to-physical
86  *	mappings at almost any time.  However, invalidations
87  *	of virtual-to-physical mappings must be done as
88  *	requested.
89  *
90  *	In order to cope with hardware architectures which
91  *	make virtual-to-physical map invalidates expensive,
92  *	this module may delay invalidation or reduced-protection
93  *	operations until such time as they are actually
94  *	necessary.  This module is given full information as
95  *	to which processors are currently using which maps,
96  *	and to when physical maps must be made correct.
97  */
98 
99 #include "opt_apic.h"
100 #include "opt_cpu.h"
101 #include "opt_pmap.h"
102 #include "opt_smp.h"
103 #include "opt_xbox.h"
104 
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/kernel.h>
108 #include <sys/ktr.h>
109 #include <sys/lock.h>
110 #include <sys/malloc.h>
111 #include <sys/mman.h>
112 #include <sys/msgbuf.h>
113 #include <sys/mutex.h>
114 #include <sys/proc.h>
115 #include <sys/rwlock.h>
116 #include <sys/sf_buf.h>
117 #include <sys/sx.h>
118 #include <sys/vmmeter.h>
119 #include <sys/sched.h>
120 #include <sys/sysctl.h>
121 #include <sys/smp.h>
122 
123 #include <vm/vm.h>
124 #include <vm/vm_param.h>
125 #include <vm/vm_kern.h>
126 #include <vm/vm_page.h>
127 #include <vm/vm_map.h>
128 #include <vm/vm_object.h>
129 #include <vm/vm_extern.h>
130 #include <vm/vm_pageout.h>
131 #include <vm/vm_pager.h>
132 #include <vm/vm_phys.h>
133 #include <vm/vm_radix.h>
134 #include <vm/vm_reserv.h>
135 #include <vm/uma.h>
136 
137 #ifdef DEV_APIC
138 #include <sys/bus.h>
139 #include <machine/intr_machdep.h>
140 #include <x86/apicvar.h>
141 #endif
142 #include <machine/cpu.h>
143 #include <machine/cputypes.h>
144 #include <machine/md_var.h>
145 #include <machine/pcb.h>
146 #include <machine/specialreg.h>
147 #ifdef SMP
148 #include <machine/smp.h>
149 #endif
150 
151 #ifdef XBOX
152 #include <machine/xbox.h>
153 #endif
154 
155 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
156 #define CPU_ENABLE_SSE
157 #endif
158 
159 #ifndef PMAP_SHPGPERPROC
160 #define PMAP_SHPGPERPROC 200
161 #endif
162 
163 #if !defined(DIAGNOSTIC)
164 #ifdef __GNUC_GNU_INLINE__
165 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
166 #else
167 #define PMAP_INLINE	extern inline
168 #endif
169 #else
170 #define PMAP_INLINE
171 #endif
172 
173 #ifdef PV_STATS
174 #define PV_STAT(x)	do { x ; } while (0)
175 #else
176 #define PV_STAT(x)	do { } while (0)
177 #endif
178 
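/*
 * pv_table (declared below) is indexed at superpage granularity: pa_index()
 * maps a physical address to the md_page that tracks its 2/4MB frame.
 */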
179 #define	pa_index(pa)	((pa) >> PDRSHIFT)
180 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
181 
182 /*
183  * Get PDEs and PTEs for user/kernel address space
184  */
185 #define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
186 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
187 
188 #define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
189 #define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
190 #define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
191 #define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
192 #define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
193 
194 #define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
195     atomic_clear_int((u_int *)(pte), PG_W))
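/*
 * Note: unlike pmap_pte_set_w() above, pmap_pte_set_prot() performs a plain,
 * non-atomic read-modify-write of the PTE.
 */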
196 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
197 
198 struct pmap kernel_pmap_store;
199 LIST_HEAD(pmaplist, pmap);
200 static struct pmaplist allpmaps;
201 static struct mtx allpmaps_lock;
202 
203 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
204 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
205 int pgeflag = 0;		/* PG_G or-in */
206 int pseflag = 0;		/* PG_PS or-in */
207 
208 static int nkpt = NKPT;
209 vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
210 extern u_int32_t KERNend;
211 extern u_int32_t KPTphys;
212 
213 #if defined(PAE) || defined(PAE_TABLES)
214 pt_entry_t pg_nx;
215 static uma_zone_t pdptzone;
216 #endif
217 
218 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
219 
220 static int pat_works = 1;
221 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
222     "Is page attribute table fully functional?");
223 
224 static int pg_ps_enabled = 1;
225 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
226     &pg_ps_enabled, 0, "Are large page mappings enabled?");
227 
228 #define	PAT_INDEX_SIZE	8
229 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
230 
231 /*
232  * pmap_mapdev() support prior to pmap initialization (e.g., for the console)
233  */
234 #define	PMAP_PREINIT_MAPPING_COUNT	8
235 static struct pmap_preinit_mapping {
236 	vm_paddr_t	pa;
237 	vm_offset_t	va;
238 	vm_size_t	sz;
239 	int		mode;
240 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
241 static int pmap_initialized;
242 
243 static struct rwlock_padalign pvh_global_lock;
244 
245 /*
246  * Data for the pv entry allocation mechanism
247  */
248 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
249 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
250 static struct md_page *pv_table;
251 static int shpgperproc = PMAP_SHPGPERPROC;
252 
253 struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
254 int pv_maxchunks;			/* How many chunks we have KVA for */
255 vm_offset_t pv_vafree;			/* freelist stored in the PTE */
256 
257 /*
258  * All those kernel PT submaps that BSD is so fond of
259  */
260 struct sysmaps {
261 	struct	mtx lock;
262 	pt_entry_t *CMAP1;
263 	pt_entry_t *CMAP2;
264 	caddr_t	CADDR1;
265 	caddr_t	CADDR2;
266 };
267 static struct sysmaps sysmaps_pcpu[MAXCPU];
268 pt_entry_t *CMAP3;
269 static pd_entry_t *KPTD;
270 caddr_t ptvmmap = 0;
271 caddr_t CADDR3;
272 struct msgbuf *msgbufp = 0;
273 
274 /*
275  * Crashdump maps.
276  */
277 static caddr_t crashdumpmap;
278 
279 static pt_entry_t *PMAP1 = 0, *PMAP2;
280 static pt_entry_t *PADDR1 = 0, *PADDR2;
281 #ifdef SMP
282 static int PMAP1cpu;
283 static int PMAP1changedcpu;
284 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
285 	   &PMAP1changedcpu, 0,
286 	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
287 #endif
288 static int PMAP1changed;
289 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
290 	   &PMAP1changed, 0,
291 	   "Number of times pmap_pte_quick changed PMAP1");
292 static int PMAP1unchanged;
293 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
294 	   &PMAP1unchanged, 0,
295 	   "Number of times pmap_pte_quick didn't change PMAP1");
296 static struct mtx PMAP2mutex;
297 
298 static void	free_pv_chunk(struct pv_chunk *pc);
299 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
300 static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
301 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
302 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
303 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
304 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
305 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
306 		    vm_offset_t va);
307 static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
308 
309 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
310 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
311     vm_prot_t prot);
312 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
313     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
314 static void pmap_flush_page(vm_page_t m);
315 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
316 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
317 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
318 static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
319 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
320 static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
321 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
322 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
323 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
324 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
325     vm_prot_t prot);
326 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
327 static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
328     struct spglist *free);
329 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
330     struct spglist *free);
331 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
332 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
333     struct spglist *free);
334 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
335 					vm_offset_t va);
336 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
337 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
338     vm_page_t m);
339 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
340     pd_entry_t newpde);
341 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
342 
343 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
344 
345 static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
346 static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
347 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
348 static void pmap_pte_release(pt_entry_t *pte);
349 static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
350 #if defined(PAE) || defined(PAE_TABLES)
351 static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
352     int wait);
353 #endif
354 static void pmap_set_pg(void);
355 
356 static __inline void pagezero(void *page);
357 
358 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
359 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
360 
361 /*
362  * If you get an error here, then you set KVA_PAGES wrong! See the
363  * description of KVA_PAGES in sys/i386/include/pmap.h. It must be a
364  * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
365  */
366 CTASSERT(KERNBASE % (1 << 24) == 0);
367 
368 /*
369  *	Bootstrap the system enough to run with virtual memory.
370  *
371  *	On the i386 this is called after mapping has already been enabled
372  *	and just syncs the pmap module with what has already been done.
373  *	[We can't call it easily with mapping off since the kernel is not
374  *	mapped with PA == VA, hence we would have to relocate every address
375  *	from the linked base (virtual) address "KERNBASE" to the actual
376  *	(physical) address starting relative to 0]
377  */
378 void
379 pmap_bootstrap(vm_paddr_t firstaddr)
380 {
381 	vm_offset_t va;
382 	pt_entry_t *pte, *unused;
383 	struct sysmaps *sysmaps;
384 	int i;
385 
386 	/*
387 	 * Add a physical memory segment (vm_phys_seg) corresponding to the
388 	 * preallocated kernel page table pages so that vm_page structures
389 	 * representing these pages will be created.  The vm_page structures
390 	 * are required for promotion of the corresponding kernel virtual
391 	 * addresses to superpage mappings.
392 	 */
393 	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
394 
395 	/*
396 	 * Initialize the first available kernel virtual address.  However,
397 	 * using "firstaddr" may waste a few pages of the kernel virtual
398 	 * address space, because locore may not have mapped every physical
399 	 * page that it allocated.  Preferably, locore would provide a first
400 	 * unused virtual address in addition to "firstaddr".
401 	 */
402 	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
403 
404 	virtual_end = VM_MAX_KERNEL_ADDRESS;
405 
406 	/*
407 	 * Initialize the kernel pmap (which is statically allocated).
408 	 */
409 	PMAP_LOCK_INIT(kernel_pmap);
410 	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
411 #if defined(PAE) || defined(PAE_TABLES)
412 	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
413 #endif
414 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
415 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
416 
417  	/*
418 	 * Initialize the global pv list lock.
419 	 */
420 	rw_init(&pvh_global_lock, "pmap pv global");
421 
422 	LIST_INIT(&allpmaps);
423 
424 	/*
425 	 * Request a spin mutex so that changes to allpmaps cannot be
426 	 * preempted by smp_rendezvous_cpus().  Otherwise,
427 	 * pmap_update_pde_kernel() could access allpmaps while it is
428 	 * being changed.
429 	 */
430 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
431 	mtx_lock_spin(&allpmaps_lock);
432 	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
433 	mtx_unlock_spin(&allpmaps_lock);
434 
435 	/*
436 	 * Reserve some special page table entries/VA space for temporary
437 	 * mapping of pages.
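	 *
	 * SYSMAP(c, p, v, n) hands out "n" pages of this reserved space: it
	 * stores the current VA (cast to type "c") in "v" and the first
	 * corresponding PTE pointer in "p", then advances the "va" and "pte"
	 * cursors past those "n" pages.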
438 	 */
439 #define	SYSMAP(c, p, v, n)	\
440 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
441 
442 	va = virtual_avail;
443 	pte = vtopte(va);
444 
445 	/*
446 	 * CMAP1/CMAP2 are used for zeroing and copying pages.
447 	 * CMAP3 is used for the idle process page zeroing.
448 	 */
449 	for (i = 0; i < MAXCPU; i++) {
450 		sysmaps = &sysmaps_pcpu[i];
451 		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
452 		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
453 		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
454 	}
455 	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
456 
457 	/*
458 	 * Crashdump maps.
459 	 */
460 	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
461 
462 	/*
463 	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
464 	 */
465 	SYSMAP(caddr_t, unused, ptvmmap, 1)
466 
467 	/*
468 	 * msgbufp is used to map the system message buffer.
469 	 */
470 	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
471 
472 	/*
473 	 * KPTmap is used by pmap_kextract().
474 	 *
475 	 * KPTmap is first initialized by locore.  However, that initial
476 	 * KPTmap can only support NKPT page table pages.  Here, a larger
477 	 * KPTmap is created that can support KVA_PAGES page table pages.
478 	 */
479 	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
480 
481 	for (i = 0; i < NKPT; i++)
482 		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
483 
484 	/*
485 	 * Adjust the start of the KPTD and KPTmap so that the implementation
486 	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
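	 *
	 * After this adjustment, KPTD[] can be indexed directly by a kernel
	 * VA's page directory index (va >> PDRSHIFT) and KPTmap[] by
	 * i386_btop(va), with no KPTDI bias to subtract.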
487 	 */
488 	KPTD -= KPTDI;
489 	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
490 
491 	/*
492 	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
493 	 * respectively.
494 	 */
495 	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
496 	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
497 
498 	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
499 
500 	virtual_avail = va;
501 
502 	/*
503 	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
504 	 * physical memory region that is used by the ACPI wakeup code.  This
505 	 * mapping must not have PG_G set.
506 	 */
507 #ifdef XBOX
508 	/* FIXME: This is gross, but needed for the XBOX. Since we are at such
509 	 * an early stage, we cannot yet neatly map video memory ... :-(
510 	 * Better fixes are very welcome! */
511 	if (!arch_i386_is_xbox)
512 #endif
513 	for (i = 1; i < NKPT; i++)
514 		PTD[i] = 0;
515 
516 	/* Initialize the PAT MSR if present. */
517 	pmap_init_pat();
518 
519 	/* Turn on PG_G on kernel page(s) */
520 	pmap_set_pg();
521 }
522 
523 static void
524 pmap_init_qpages(void)
525 {
526 	struct pcpu *pc;
527 	int i;
528 
529 	CPU_FOREACH(i) {
530 		pc = pcpu_find(i);
531 		pc->pc_qmap_addr = kva_alloc(PAGE_SIZE);
532 		if (pc->pc_qmap_addr == 0)
533 			panic("pmap_init_qpages: unable to allocate KVA");
534 	}
535 }
536 
537 SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_qpages, NULL);
538 
539 /*
540  * Setup the PAT MSR.
541  */
542 void
543 pmap_init_pat(void)
544 {
545 	int pat_table[PAT_INDEX_SIZE];
546 	uint64_t pat_msr;
547 	u_long cr0, cr4;
548 	int i;
549 
550 	/* Set default PAT index table. */
551 	for (i = 0; i < PAT_INDEX_SIZE; i++)
552 		pat_table[i] = -1;
553 	pat_table[PAT_WRITE_BACK] = 0;
554 	pat_table[PAT_WRITE_THROUGH] = 1;
555 	pat_table[PAT_UNCACHEABLE] = 3;
556 	pat_table[PAT_WRITE_COMBINING] = 3;
557 	pat_table[PAT_WRITE_PROTECTED] = 3;
558 	pat_table[PAT_UNCACHED] = 3;
559 
560 	/* Bail if this CPU doesn't implement PAT. */
561 	if ((cpu_feature & CPUID_PAT) == 0) {
562 		for (i = 0; i < PAT_INDEX_SIZE; i++)
563 			pat_index[i] = pat_table[i];
564 		pat_works = 0;
565 		return;
566 	}
567 
568 	/*
569 	 * Due to some Intel errata, we can only safely use the lower 4
570 	 * PAT entries.
571 	 *
572 	 *   Intel Pentium III Processor Specification Update
573 	 * Erratum E.27 (Upper Four PAT Entries Not Usable With Mode B
574 	 * or Mode C Paging)
575 	 *
576 	 *   Intel Pentium 4 Processor Specification Update
577 	 * Erratum N46 (PAT Index MSB May Be Calculated Incorrectly)
578 	 */
579 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
580 	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
581 		pat_works = 0;
582 
583 	/* Initialize default PAT entries. */
584 	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
585 	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
586 	    PAT_VALUE(2, PAT_UNCACHED) |
587 	    PAT_VALUE(3, PAT_UNCACHEABLE) |
588 	    PAT_VALUE(4, PAT_WRITE_BACK) |
589 	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
590 	    PAT_VALUE(6, PAT_UNCACHED) |
591 	    PAT_VALUE(7, PAT_UNCACHEABLE);
592 
593 	if (pat_works) {
594 		/*
595 		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
596 		 * Program 5 and 6 as WP and WC.
597 		 * Leave 4 and 7 as WB and UC.
598 		 */
599 		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
600 		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
601 		    PAT_VALUE(6, PAT_WRITE_COMBINING);
602 		pat_table[PAT_UNCACHED] = 2;
603 		pat_table[PAT_WRITE_PROTECTED] = 5;
604 		pat_table[PAT_WRITE_COMBINING] = 6;
605 	} else {
606 		/*
607 		 * Just replace PAT Index 2 with WC instead of UC-.
608 		 */
609 		pat_msr &= ~PAT_MASK(2);
610 		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
611 		pat_table[PAT_WRITE_COMBINING] = 2;
612 	}
613 
614 	/* Disable PGE. */
615 	cr4 = rcr4();
616 	load_cr4(cr4 & ~CR4_PGE);
617 
618 	/* Disable caches (CD = 1, NW = 0). */
619 	cr0 = rcr0();
620 	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
621 
622 	/* Flushes caches and TLBs. */
623 	wbinvd();
624 	invltlb();
625 
626 	/* Update PAT and index table. */
627 	wrmsr(MSR_PAT, pat_msr);
628 	for (i = 0; i < PAT_INDEX_SIZE; i++)
629 		pat_index[i] = pat_table[i];
630 
631 	/* Flush caches and TLBs again. */
632 	wbinvd();
633 	invltlb();
634 
635 	/* Restore caches and PGE. */
636 	load_cr0(cr0);
637 	load_cr4(cr4);
638 }
639 
640 /*
641  * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
642  */
643 static void
644 pmap_set_pg(void)
645 {
646 	pt_entry_t *pte;
647 	vm_offset_t va, endva;
648 
649 	if (pgeflag == 0)
650 		return;
651 
652 	endva = KERNBASE + KERNend;
653 
654 	if (pseflag) {
655 		va = KERNBASE + KERNLOAD;
656 		while (va  < endva) {
657 			pdir_pde(PTD, va) |= pgeflag;
658 			invltlb();	/* Play it safe, invltlb() every time */
659 			va += NBPDR;
660 		}
661 	} else {
662 		va = (vm_offset_t)btext;
663 		while (va < endva) {
664 			pte = vtopte(va);
665 			if (*pte)
666 				*pte |= pgeflag;
667 			invltlb();	/* Play it safe, invltlb() every time */
668 			va += PAGE_SIZE;
669 		}
670 	}
671 }
672 
673 /*
674  * Initialize a vm_page's machine-dependent fields.
675  */
676 void
677 pmap_page_init(vm_page_t m)
678 {
679 
680 	TAILQ_INIT(&m->md.pv_list);
681 	m->md.pat_mode = PAT_WRITE_BACK;
682 }
683 
684 #if defined(PAE) || defined(PAE_TABLES)
685 static void *
686 pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
687 {
688 
689 	/* Inform UMA that this allocator uses kernel_map/object. */
690 	*flags = UMA_SLAB_KERNEL;
691 	return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
692 	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
693 }
694 #endif
695 
696 /*
697  * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
698  * Requirements:
699  *  - Must deal with pages in order to ensure that none of the PG_* bits
700  *    are ever set, PG_V in particular.
701  *  - Assumes we can write to ptes without pte_store() atomic ops, even
702  *    on PAE systems.  This should be ok.
703  *  - Assumes nothing will ever test these addresses for 0 to indicate
704  *    no mapping instead of correctly checking PG_V.
705  *  - Assumes a vm_offset_t will fit in a pte (true for i386).
706  * Because PG_V is never set, there can be no mappings to invalidate.
707  */
708 static vm_offset_t
709 pmap_ptelist_alloc(vm_offset_t *head)
710 {
711 	pt_entry_t *pte;
712 	vm_offset_t va;
713 
714 	va = *head;
715 	if (va == 0)
716 		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
717 	pte = vtopte(va);
718 	*head = *pte;
719 	if (*head & PG_V)
720 		panic("pmap_ptelist_alloc: va with PG_V set!");
721 	*pte = 0;
722 	return (va);
723 }
724 
725 static void
726 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
727 {
728 	pt_entry_t *pte;
729 
730 	if (va & PG_V)
731 		panic("pmap_ptelist_free: freeing va with PG_V set!");
732 	pte = vtopte(va);
733 	*pte = *head;		/* virtual! PG_V is 0 though */
734 	*head = va;
735 }
736 
737 static void
738 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
739 {
740 	int i;
741 	vm_offset_t va;
742 
743 	*head = 0;
744 	for (i = npages - 1; i >= 0; i--) {
745 		va = (vm_offset_t)base + i * PAGE_SIZE;
746 		pmap_ptelist_free(head, va);
747 	}
748 }
749 
750 
751 /*
752  *	Initialize the pmap module.
753  *	Called by vm_init, to initialize any structures that the pmap
754  *	system needs to map virtual memory.
755  */
756 void
757 pmap_init(void)
758 {
759 	struct pmap_preinit_mapping *ppim;
760 	vm_page_t mpte;
761 	vm_size_t s;
762 	int i, pv_npg;
763 
764 	/*
765 	 * Initialize the vm page array entries for the kernel pmap's
766 	 * page table pages.
767 	 */
768 	for (i = 0; i < NKPT; i++) {
769 		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
770 		KASSERT(mpte >= vm_page_array &&
771 		    mpte < &vm_page_array[vm_page_array_size],
772 		    ("pmap_init: page table page is out of range"));
773 		mpte->pindex = i + KPTDI;
774 		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
775 	}
776 
777 	/*
778 	 * Initialize the address space (zone) for the pv entries.  Set a
779 	 * high water mark so that the system can recover from excessive
780 	 * numbers of pv entries.
781 	 */
782 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
783 	pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
784 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
785 	pv_entry_max = roundup(pv_entry_max, _NPCPV);
786 	pv_entry_high_water = 9 * (pv_entry_max / 10);
787 
788 	/*
789 	 * If the kernel is running on a virtual machine, then it must assume
790 	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
791 	 * be prepared for the hypervisor changing the vendor and family that
792 	 * are reported by CPUID.  Consequently, the workaround for AMD Family
793 	 * 10h Erratum 383 is enabled if the processor's feature set does not
794 	 * include at least one feature that is only supported by older Intel
795 	 * or newer AMD processors.
796 	 */
797 	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
798 	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
799 	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
800 	    AMDID2_FMA4)) == 0)
801 		workaround_erratum383 = 1;
802 
803 	/*
804 	 * Are large page mappings supported and enabled?
805 	 */
806 	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
807 	if (pseflag == 0)
808 		pg_ps_enabled = 0;
809 	else if (pg_ps_enabled) {
810 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
811 		    ("pmap_init: can't assign to pagesizes[1]"));
812 		pagesizes[1] = NBPDR;
813 	}
814 
815 	/*
816 	 * Calculate the size of the pv head table for superpages.
817 	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
818 	 */
819 	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
820 	    PAGE_SIZE) / NBPDR + 1;
821 
822 	/*
823 	 * Allocate memory for the pv head table for superpages.
824 	 */
825 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
826 	s = round_page(s);
827 	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
828 	    M_WAITOK | M_ZERO);
829 	for (i = 0; i < pv_npg; i++)
830 		TAILQ_INIT(&pv_table[i].pv_list);
831 
832 	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
833 	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
834 	if (pv_chunkbase == NULL)
835 		panic("pmap_init: not enough kvm for pv chunks");
836 	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
837 #if defined(PAE) || defined(PAE_TABLES)
838 	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
839 	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
840 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
841 	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
842 #endif
843 
844 	pmap_initialized = 1;
845 	if (!bootverbose)
846 		return;
847 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
848 		ppim = pmap_preinit_mapping + i;
849 		if (ppim->va == 0)
850 			continue;
851 		printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
852 		    (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
853 	}
854 }
855 
856 
857 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
858 	"Max number of PV entries");
859 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
860 	"Page share factor per proc");
861 
862 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
863     "2/4MB page mapping counters");
864 
865 static u_long pmap_pde_demotions;
866 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
867     &pmap_pde_demotions, 0, "2/4MB page demotions");
868 
869 static u_long pmap_pde_mappings;
870 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
871     &pmap_pde_mappings, 0, "2/4MB page mappings");
872 
873 static u_long pmap_pde_p_failures;
874 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
875     &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
876 
877 static u_long pmap_pde_promotions;
878 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
879     &pmap_pde_promotions, 0, "2/4MB page promotions");
880 
881 /***************************************************
882  * Low level helper routines.....
883  ***************************************************/
884 
885 /*
886  * Determine the appropriate bits to set in a PTE or PDE for a specified
887  * caching mode.
888  */
889 int
890 pmap_cache_bits(int mode, boolean_t is_pde)
891 {
892 	int cache_bits, pat_flag, pat_idx;
893 
894 	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
895 		panic("Unknown caching mode %d\n", mode);
896 
897 	/* The PAT bit is different for PTEs and PDEs. */
898 	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
899 
900 	/* Map the caching mode to a PAT index. */
901 	pat_idx = pat_index[mode];
902 
903 	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
904 	cache_bits = 0;
905 	if (pat_idx & 0x4)
906 		cache_bits |= pat_flag;
907 	if (pat_idx & 0x2)
908 		cache_bits |= PG_NC_PCD;
909 	if (pat_idx & 0x1)
910 		cache_bits |= PG_NC_PWT;
911 	return (cache_bits);
912 }
913 
914 /*
915  * The caller is responsible for maintaining TLB consistency.
916  */
917 static void
918 pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
919 {
920 	pd_entry_t *pde;
921 	pmap_t pmap;
922 	boolean_t PTD_updated;
923 
924 	PTD_updated = FALSE;
925 	mtx_lock_spin(&allpmaps_lock);
926 	LIST_FOREACH(pmap, &allpmaps, pm_list) {
927 		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
928 		    PG_FRAME))
929 			PTD_updated = TRUE;
930 		pde = pmap_pde(pmap, va);
931 		pde_store(pde, newpde);
932 	}
933 	mtx_unlock_spin(&allpmaps_lock);
934 	KASSERT(PTD_updated,
935 	    ("pmap_kenter_pde: current page table is not in allpmaps"));
936 }
937 
938 /*
939  * After changing the page size for the specified virtual address in the page
940  * table, flush the corresponding entries from the processor's TLB.  Only the
941  * calling processor's TLB is affected.
942  *
943  * The calling thread must be pinned to a processor.
944  */
945 static void
946 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
947 {
948 	u_long cr4;
949 
950 	if ((newpde & PG_PS) == 0)
951 		/* Demotion: flush a specific 2/4MB page mapping. */
952 		invlpg(va);
953 	else if ((newpde & PG_G) == 0)
954 		/*
955 		 * Promotion: flush every 4KB page mapping from the TLB
956 		 * because there are too many to flush individually.
957 		 */
958 		invltlb();
959 	else {
960 		/*
961 		 * Promotion: flush every 4KB page mapping from the TLB,
962 		 * including any global (PG_G) mappings.
963 		 */
964 		cr4 = rcr4();
965 		load_cr4(cr4 & ~CR4_PGE);
966 		/*
967 		 * Although preemption at this point could be detrimental to
968 		 * performance, it would not lead to an error.  PG_G is simply
969 		 * ignored if CR4.PGE is clear.  Moreover, in case this block
970 		 * is re-entered, the load_cr4() either above or below will
971 		 * modify CR4.PGE flushing the TLB.
972 		 */
973 		load_cr4(cr4 | CR4_PGE);
974 	}
975 }
976 #ifdef SMP
977 /*
978  * For SMP, these functions have to use the IPI mechanism for coherence.
979  *
980  * N.B.: Before calling any of the following TLB invalidation functions,
981  * the calling processor must ensure that all stores updating a non-
982  * kernel page table are globally performed.  Otherwise, another
983  * processor could cache an old, pre-update entry without being
984  * invalidated.  This can happen one of two ways: (1) The pmap becomes
985  * active on another processor after its pm_active field is checked by
986  * one of the following functions but before a store updating the page
987  * table is globally performed. (2) The pmap becomes active on another
988  * processor before its pm_active field is checked but due to
989  * speculative loads one of the following functions still reads the
990  * pmap as inactive on the other processor.
991  *
992  * The kernel page table is exempt because its pm_active field is
993  * immutable.  The kernel page table is always active on every
994  * processor.
995  */
996 void
997 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
998 {
999 	cpuset_t other_cpus;
1000 	u_int cpuid;
1001 
1002 	sched_pin();
1003 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1004 		invlpg(va);
1005 		smp_invlpg(va);
1006 	} else {
1007 		cpuid = PCPU_GET(cpuid);
1008 		other_cpus = all_cpus;
1009 		CPU_CLR(cpuid, &other_cpus);
1010 		if (CPU_ISSET(cpuid, &pmap->pm_active))
1011 			invlpg(va);
1012 		CPU_AND(&other_cpus, &pmap->pm_active);
1013 		if (!CPU_EMPTY(&other_cpus))
1014 			smp_masked_invlpg(other_cpus, va);
1015 	}
1016 	sched_unpin();
1017 }
1018 
1019 void
1020 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1021 {
1022 	cpuset_t other_cpus;
1023 	vm_offset_t addr;
1024 	u_int cpuid;
1025 
1026 	sched_pin();
1027 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1028 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1029 			invlpg(addr);
1030 		smp_invlpg_range(sva, eva);
1031 	} else {
1032 		cpuid = PCPU_GET(cpuid);
1033 		other_cpus = all_cpus;
1034 		CPU_CLR(cpuid, &other_cpus);
1035 		if (CPU_ISSET(cpuid, &pmap->pm_active))
1036 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1037 				invlpg(addr);
1038 		CPU_AND(&other_cpus, &pmap->pm_active);
1039 		if (!CPU_EMPTY(&other_cpus))
1040 			smp_masked_invlpg_range(other_cpus, sva, eva);
1041 	}
1042 	sched_unpin();
1043 }
1044 
1045 void
1046 pmap_invalidate_all(pmap_t pmap)
1047 {
1048 	cpuset_t other_cpus;
1049 	u_int cpuid;
1050 
1051 	sched_pin();
1052 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1053 		invltlb();
1054 		smp_invltlb();
1055 	} else {
1056 		cpuid = PCPU_GET(cpuid);
1057 		other_cpus = all_cpus;
1058 		CPU_CLR(cpuid, &other_cpus);
1059 		if (CPU_ISSET(cpuid, &pmap->pm_active))
1060 			invltlb();
1061 		CPU_AND(&other_cpus, &pmap->pm_active);
1062 		if (!CPU_EMPTY(&other_cpus))
1063 			smp_masked_invltlb(other_cpus);
1064 	}
1065 	sched_unpin();
1066 }
1067 
1068 void
1069 pmap_invalidate_cache(void)
1070 {
1071 
1072 	sched_pin();
1073 	wbinvd();
1074 	smp_cache_flush();
1075 	sched_unpin();
1076 }
1077 
1078 struct pde_action {
1079 	cpuset_t invalidate;	/* processors that invalidate their TLB */
1080 	vm_offset_t va;
1081 	pd_entry_t *pde;
1082 	pd_entry_t newpde;
1083 	u_int store;		/* processor that updates the PDE */
1084 };
1085 
1086 static void
1087 pmap_update_pde_kernel(void *arg)
1088 {
1089 	struct pde_action *act = arg;
1090 	pd_entry_t *pde;
1091 	pmap_t pmap;
1092 
1093 	if (act->store == PCPU_GET(cpuid)) {
1094 
1095 		/*
1096 		 * Elsewhere, this operation requires allpmaps_lock for
1097 		 * synchronization.  Here, it does not because it is being
1098 		 * performed in the context of an all_cpus rendezvous.
1099 		 */
1100 		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1101 			pde = pmap_pde(pmap, act->va);
1102 			pde_store(pde, act->newpde);
1103 		}
1104 	}
1105 }
1106 
1107 static void
1108 pmap_update_pde_user(void *arg)
1109 {
1110 	struct pde_action *act = arg;
1111 
1112 	if (act->store == PCPU_GET(cpuid))
1113 		pde_store(act->pde, act->newpde);
1114 }
1115 
1116 static void
1117 pmap_update_pde_teardown(void *arg)
1118 {
1119 	struct pde_action *act = arg;
1120 
1121 	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1122 		pmap_update_pde_invalidate(act->va, act->newpde);
1123 }
1124 
1125 /*
1126  * Change the page size for the specified virtual address in a way that
1127  * prevents any possibility of the TLB ever having two entries that map the
1128  * same virtual address using different page sizes.  This is the recommended
1129  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1130  * machine check exception for a TLB state that is improperly diagnosed as a
1131  * hardware error.
1132  */
1133 static void
1134 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1135 {
1136 	struct pde_action act;
1137 	cpuset_t active, other_cpus;
1138 	u_int cpuid;
1139 
1140 	sched_pin();
1141 	cpuid = PCPU_GET(cpuid);
1142 	other_cpus = all_cpus;
1143 	CPU_CLR(cpuid, &other_cpus);
1144 	if (pmap == kernel_pmap)
1145 		active = all_cpus;
1146 	else
1147 		active = pmap->pm_active;
1148 	if (CPU_OVERLAP(&active, &other_cpus)) {
1149 		act.store = cpuid;
1150 		act.invalidate = active;
1151 		act.va = va;
1152 		act.pde = pde;
1153 		act.newpde = newpde;
1154 		CPU_SET(cpuid, &active);
1155 		smp_rendezvous_cpus(active,
1156 		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
1157 		    pmap_update_pde_kernel : pmap_update_pde_user,
1158 		    pmap_update_pde_teardown, &act);
1159 	} else {
1160 		if (pmap == kernel_pmap)
1161 			pmap_kenter_pde(va, newpde);
1162 		else
1163 			pde_store(pde, newpde);
1164 		if (CPU_ISSET(cpuid, &active))
1165 			pmap_update_pde_invalidate(va, newpde);
1166 	}
1167 	sched_unpin();
1168 }
1169 #else /* !SMP */
1170 /*
1171  * Normal, non-SMP, 486+ invalidation functions.
1172  * We inline these within pmap.c for speed.
1173  */
1174 PMAP_INLINE void
1175 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1176 {
1177 
1178 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1179 		invlpg(va);
1180 }
1181 
1182 PMAP_INLINE void
1183 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1184 {
1185 	vm_offset_t addr;
1186 
1187 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1188 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1189 			invlpg(addr);
1190 }
1191 
1192 PMAP_INLINE void
1193 pmap_invalidate_all(pmap_t pmap)
1194 {
1195 
1196 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1197 		invltlb();
1198 }
1199 
1200 PMAP_INLINE void
1201 pmap_invalidate_cache(void)
1202 {
1203 
1204 	wbinvd();
1205 }
1206 
1207 static void
1208 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1209 {
1210 
1211 	if (pmap == kernel_pmap)
1212 		pmap_kenter_pde(va, newpde);
1213 	else
1214 		pde_store(pde, newpde);
1215 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1216 		pmap_update_pde_invalidate(va, newpde);
1217 }
1218 #endif /* !SMP */
1219 
1220 #define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
1221 
1222 void
1223 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1224 {
1225 
1226 	if (force) {
1227 		sva &= ~(vm_offset_t)cpu_clflush_line_size;
1228 	} else {
1229 		KASSERT((sva & PAGE_MASK) == 0,
1230 		    ("pmap_invalidate_cache_range: sva not page-aligned"));
1231 		KASSERT((eva & PAGE_MASK) == 0,
1232 		    ("pmap_invalidate_cache_range: eva not page-aligned"));
1233 	}
1234 
1235 	if ((cpu_feature & CPUID_SS) != 0 && !force)
1236 		; /* If "Self Snoop" is supported and allowed, do nothing. */
1237 	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1238 	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1239 
1240 #ifdef DEV_APIC
1241 		/*
1242 		 * XXX: Some CPUs fault, hang, or trash the local APIC
1243 		 * registers if we use CLFLUSH on the local APIC
1244 		 * range.  The local APIC is always uncached, so we
1245 		 * don't need to flush for that range anyway.
1246 		 */
1247 		if (pmap_kextract(sva) == lapic_paddr)
1248 			return;
1249 #endif
1250 		/*
1251 		 * Otherwise, do a per-cache-line flush.  Use the mfence
1252 		 * instruction to ensure that previous stores are
1253 		 * included in the write-back.  The processor
1254 		 * propagates the flush to other processors in the cache
1255 		 * coherence domain.
1256 		 */
1257 		mfence();
1258 		for (; sva < eva; sva += cpu_clflush_line_size)
1259 			clflush(sva);
1260 		mfence();
1261 	} else {
1262 
1263 		/*
1264 		 * No targeted cache flush methods are supported by the CPU,
1265 		 * or the supplied range is bigger than 2MB.
1266 		 * Globally invalidate the cache.
1267 		 */
1268 		pmap_invalidate_cache();
1269 	}
1270 }
1271 
1272 void
1273 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1274 {
1275 	int i;
1276 
1277 	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1278 	    (cpu_feature & CPUID_CLFSH) == 0) {
1279 		pmap_invalidate_cache();
1280 	} else {
1281 		for (i = 0; i < count; i++)
1282 			pmap_flush_page(pages[i]);
1283 	}
1284 }
1285 
1286 /*
1287  * Are we current address space or kernel?
1288  */
1289 static __inline int
1290 pmap_is_current(pmap_t pmap)
1291 {
1292 
1293 	return (pmap == kernel_pmap || pmap ==
1294 	    vmspace_pmap(curthread->td_proc->p_vmspace));
1295 }
1296 
1297 /*
1298  * If the given pmap is not the current or kernel pmap, the returned pte must
1299  * be released by passing it to pmap_pte_release().
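 *
 * For a pmap that is neither current nor the kernel pmap, the page table page
 * is reached through the PMAP2/PADDR2 window: PMAP2 is a reserved kernel PTE
 * that is pointed at the target page table page and PADDR2 is the KVA it
 * maps.  PMAP2mutex serializes use of the window, which is why the pte must
 * be released with pmap_pte_release().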
1300  */
1301 pt_entry_t *
1302 pmap_pte(pmap_t pmap, vm_offset_t va)
1303 {
1304 	pd_entry_t newpf;
1305 	pd_entry_t *pde;
1306 
1307 	pde = pmap_pde(pmap, va);
1308 	if (*pde & PG_PS)
1309 		return (pde);
1310 	if (*pde != 0) {
1311 		/* are we current address space or kernel? */
1312 		if (pmap_is_current(pmap))
1313 			return (vtopte(va));
1314 		mtx_lock(&PMAP2mutex);
1315 		newpf = *pde & PG_FRAME;
1316 		if ((*PMAP2 & PG_FRAME) != newpf) {
1317 			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1318 			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1319 		}
1320 		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1321 	}
1322 	return (NULL);
1323 }
1324 
1325 /*
1326  * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1327  * being NULL.
1328  */
1329 static __inline void
1330 pmap_pte_release(pt_entry_t *pte)
1331 {
1332 
1333 	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1334 		mtx_unlock(&PMAP2mutex);
1335 }
1336 
1337 /*
1338  * NB:  The sequence of updating a page table followed by accesses to the
1339  * corresponding pages is subject to the situation described in the "AMD64
1340  * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
1341  * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
1342  * right after modifying the PTE bits is crucial.
1343  */
1344 static __inline void
1345 invlcaddr(void *caddr)
1346 {
1347 
1348 	invlpg((u_int)caddr);
1349 }
1350 
1351 /*
1352  * Super fast pmap_pte routine best used when scanning
1353  * the pv lists.  This eliminates many coarse-grained
1354  * invltlb calls.  Note that many of the pv list
1355  * scans are across different pmaps.  It is very wasteful
1356  * to do an entire invltlb for checking a single mapping.
1357  *
1358  * If the given pmap is not the current pmap, pvh_global_lock
1359  * must be held and curthread pinned to a CPU.
1360  */
1361 static pt_entry_t *
1362 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1363 {
1364 	pd_entry_t newpf;
1365 	pd_entry_t *pde;
1366 
1367 	pde = pmap_pde(pmap, va);
1368 	if (*pde & PG_PS)
1369 		return (pde);
1370 	if (*pde != 0) {
1371 		/* are we current address space or kernel? */
1372 		if (pmap_is_current(pmap))
1373 			return (vtopte(va));
1374 		rw_assert(&pvh_global_lock, RA_WLOCKED);
1375 		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1376 		newpf = *pde & PG_FRAME;
1377 		if ((*PMAP1 & PG_FRAME) != newpf) {
1378 			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1379 #ifdef SMP
1380 			PMAP1cpu = PCPU_GET(cpuid);
1381 #endif
1382 			invlcaddr(PADDR1);
1383 			PMAP1changed++;
1384 		} else
1385 #ifdef SMP
1386 		if (PMAP1cpu != PCPU_GET(cpuid)) {
1387 			PMAP1cpu = PCPU_GET(cpuid);
1388 			invlcaddr(PADDR1);
1389 			PMAP1changedcpu++;
1390 		} else
1391 #endif
1392 			PMAP1unchanged++;
1393 		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1394 	}
1395 	return (0);
1396 }
1397 
1398 /*
1399  *	Routine:	pmap_extract
1400  *	Function:
1401  *		Extract the physical page address associated
1402  *		with the given map/virtual_address pair.
1403  */
1404 vm_paddr_t
1405 pmap_extract(pmap_t pmap, vm_offset_t va)
1406 {
1407 	vm_paddr_t rtval;
1408 	pt_entry_t *pte;
1409 	pd_entry_t pde;
1410 
1411 	rtval = 0;
1412 	PMAP_LOCK(pmap);
1413 	pde = pmap->pm_pdir[va >> PDRSHIFT];
1414 	if (pde != 0) {
1415 		if ((pde & PG_PS) != 0)
1416 			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1417 		else {
1418 			pte = pmap_pte(pmap, va);
1419 			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1420 			pmap_pte_release(pte);
1421 		}
1422 	}
1423 	PMAP_UNLOCK(pmap);
1424 	return (rtval);
1425 }
1426 
1427 /*
1428  *	Routine:	pmap_extract_and_hold
1429  *	Function:
1430  *		Atomically extract and hold the physical page
1431  *		with the given pmap and virtual address pair
1432  *		if that mapping permits the given protection.
1433  */
1434 vm_page_t
1435 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1436 {
1437 	pd_entry_t pde;
1438 	pt_entry_t pte, *ptep;
1439 	vm_page_t m;
1440 	vm_paddr_t pa;
1441 
1442 	pa = 0;
1443 	m = NULL;
1444 	PMAP_LOCK(pmap);
1445 retry:
1446 	pde = *pmap_pde(pmap, va);
1447 	if (pde != 0) {
1448 		if (pde & PG_PS) {
1449 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1450 				if (vm_page_pa_tryrelock(pmap, (pde &
1451 				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1452 					goto retry;
1453 				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1454 				    (va & PDRMASK));
1455 				vm_page_hold(m);
1456 			}
1457 		} else {
1458 			ptep = pmap_pte(pmap, va);
1459 			pte = *ptep;
1460 			pmap_pte_release(ptep);
1461 			if (pte != 0 &&
1462 			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1463 				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1464 				    &pa))
1465 					goto retry;
1466 				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1467 				vm_page_hold(m);
1468 			}
1469 		}
1470 	}
1471 	PA_UNLOCK_COND(pa);
1472 	PMAP_UNLOCK(pmap);
1473 	return (m);
1474 }
1475 
1476 /***************************************************
1477  * Low level mapping routines.....
1478  ***************************************************/
1479 
1480 /*
1481  * Add a wired page to the kva.
1482  * Note: not SMP coherent.
1483  *
1484  * This function may be used before pmap_bootstrap() is called.
1485  */
1486 PMAP_INLINE void
1487 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1488 {
1489 	pt_entry_t *pte;
1490 
1491 	pte = vtopte(va);
1492 	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1493 }
1494 
1495 static __inline void
1496 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1497 {
1498 	pt_entry_t *pte;
1499 
1500 	pte = vtopte(va);
1501 	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1502 }
1503 
1504 /*
1505  * Remove a page from the kernel pagetables.
1506  * Note: not SMP coherent.
1507  *
1508  * This function may be used before pmap_bootstrap() is called.
1509  */
1510 PMAP_INLINE void
1511 pmap_kremove(vm_offset_t va)
1512 {
1513 	pt_entry_t *pte;
1514 
1515 	pte = vtopte(va);
1516 	pte_clear(pte);
1517 }
1518 
1519 /*
1520  *	Used to map a range of physical addresses into kernel
1521  *	virtual address space.
1522  *
1523  *	The value passed in '*virt' is a suggested virtual address for
1524  *	the mapping. Architectures which can support a direct-mapped
1525  *	physical to virtual region can return the appropriate address
1526  *	within that region, leaving '*virt' unchanged. Other
1527  *	architectures should map the pages starting at '*virt' and
1528  *	update '*virt' with the first usable address after the mapped
1529  *	region.
1530  */
1531 vm_offset_t
1532 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1533 {
1534 	vm_offset_t va, sva;
1535 	vm_paddr_t superpage_offset;
1536 	pd_entry_t newpde;
1537 
1538 	va = *virt;
1539 	/*
1540 	 * Does the physical address range's size and alignment permit at
1541 	 * least one superpage mapping to be created?
1542 	 */
1543 	superpage_offset = start & PDRMASK;
1544 	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1545 		/*
1546 		 * Increase the starting virtual address so that its alignment
1547 		 * does not preclude the use of superpage mappings.
1548 		 */
1549 		if ((va & PDRMASK) < superpage_offset)
1550 			va = (va & ~PDRMASK) + superpage_offset;
1551 		else if ((va & PDRMASK) > superpage_offset)
1552 			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1553 	}
1554 	sva = va;
1555 	while (start < end) {
1556 		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1557 		    pseflag) {
1558 			KASSERT((va & PDRMASK) == 0,
1559 			    ("pmap_map: misaligned va %#x", va));
1560 			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1561 			pmap_kenter_pde(va, newpde);
1562 			va += NBPDR;
1563 			start += NBPDR;
1564 		} else {
1565 			pmap_kenter(va, start);
1566 			va += PAGE_SIZE;
1567 			start += PAGE_SIZE;
1568 		}
1569 	}
1570 	pmap_invalidate_range(kernel_pmap, sva, va);
1571 	*virt = va;
1572 	return (sva);
1573 }
1574 
1575 
1576 /*
1577  * Add a list of wired pages to the kva.
1578  * This routine is only used for temporary
1579  * kernel mappings that do not need to have
1580  * page modification or references recorded.
1581  * Note that old mappings are simply written
1582  * over.  The page *must* be wired.
1583  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1584  */
1585 void
1586 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1587 {
1588 	pt_entry_t *endpte, oldpte, pa, *pte;
1589 	vm_page_t m;
1590 
1591 	oldpte = 0;
1592 	pte = vtopte(sva);
1593 	endpte = pte + count;
1594 	while (pte < endpte) {
1595 		m = *ma++;
1596 		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1597 		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1598 			oldpte |= *pte;
1599 			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1600 		}
1601 		pte++;
1602 	}
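	/* Shoot down the TLB only if at least one replaced PTE was valid. */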
1603 	if (__predict_false((oldpte & PG_V) != 0))
1604 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1605 		    PAGE_SIZE);
1606 }
1607 
1608 /*
1609  * This routine tears out page mappings from the
1610  * kernel -- it is meant only for temporary mappings.
1611  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1612  */
1613 void
1614 pmap_qremove(vm_offset_t sva, int count)
1615 {
1616 	vm_offset_t va;
1617 
1618 	va = sva;
1619 	while (count-- > 0) {
1620 		pmap_kremove(va);
1621 		va += PAGE_SIZE;
1622 	}
1623 	pmap_invalidate_range(kernel_pmap, sva, va);
1624 }
1625 
1626 /***************************************************
1627  * Page table page management routines.....
1628  ***************************************************/
1629 static __inline void
1630 pmap_free_zero_pages(struct spglist *free)
1631 {
1632 	vm_page_t m;
1633 
1634 	while ((m = SLIST_FIRST(free)) != NULL) {
1635 		SLIST_REMOVE_HEAD(free, plinks.s.ss);
1636 		/* Preserve the page's PG_ZERO setting. */
1637 		vm_page_free_toq(m);
1638 	}
1639 }
1640 
1641 /*
1642  * Schedule the specified unused page table page to be freed.  Specifically,
1643  * add the page to the specified list of pages that will be released to the
1644  * physical memory manager after the TLB has been updated.
1645  */
1646 static __inline void
1647 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1648     boolean_t set_PG_ZERO)
1649 {
1650 
1651 	if (set_PG_ZERO)
1652 		m->flags |= PG_ZERO;
1653 	else
1654 		m->flags &= ~PG_ZERO;
1655 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1656 }
1657 
1658 /*
1659  * Inserts the specified page table page into the specified pmap's collection
1660  * of idle page table pages.  Each of a pmap's page table pages is responsible
1661  * for mapping a distinct range of virtual addresses.  The pmap's collection is
1662  * ordered by this virtual address range.
1663  */
1664 static __inline int
1665 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1666 {
1667 
1668 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1669 	return (vm_radix_insert(&pmap->pm_root, mpte));
1670 }
1671 
1672 /*
1673  * Looks for a page table page mapping the specified virtual address in the
1674  * specified pmap's collection of idle page table pages.  Returns NULL if there
1675  * is no page table page corresponding to the specified virtual address.
1676  */
1677 static __inline vm_page_t
1678 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1679 {
1680 
1681 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1682 	return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT));
1683 }
1684 
1685 /*
1686  * Removes the specified page table page from the specified pmap's collection
1687  * of idle page table pages.  The specified page table page must be a member of
1688  * the pmap's collection.
1689  */
1690 static __inline void
1691 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1692 {
1693 
1694 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1695 	vm_radix_remove(&pmap->pm_root, mpte->pindex);
1696 }
1697 
1698 /*
1699  * Decrements a page table page's wire count, which is used to record the
1700  * number of valid page table entries within the page.  If the wire count
1701  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1702  * page table page was unmapped and FALSE otherwise.
1703  */
1704 static inline boolean_t
1705 pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1706 {
1707 
1708 	--m->wire_count;
1709 	if (m->wire_count == 0) {
1710 		_pmap_unwire_ptp(pmap, m, free);
1711 		return (TRUE);
1712 	} else
1713 		return (FALSE);
1714 }
1715 
1716 static void
1717 _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1718 {
1719 	vm_offset_t pteva;
1720 
1721 	/*
1722 	 * unmap the page table page
1723 	 */
1724 	pmap->pm_pdir[m->pindex] = 0;
1725 	--pmap->pm_stats.resident_count;
1726 
1727 	/*
1728 	 * This is a release store so that the ordinary store unmapping
1729 	 * the page table page is globally performed before TLB shoot-
1730 	 * down is begun.
1731 	 */
1732 	atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
1733 
1734 	/*
1735 	 * Do an invltlb to make the invalidated mapping
1736 	 * take effect immediately.
1737 	 */
1738 	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1739 	pmap_invalidate_page(pmap, pteva);
1740 
1741 	/*
1742 	 * Put the page on a list so that it is released only after
1743 	 * *ALL* TLB shootdown is done.
1744 	 */
1745 	pmap_add_delayed_free_list(m, free, TRUE);
1746 }
1747 
1748 /*
1749  * After removing a page table entry, this routine is used to
1750  * conditionally free the page, and manage the hold/wire counts.
1751  */
1752 static int
1753 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
1754 {
1755 	pd_entry_t ptepde;
1756 	vm_page_t mpte;
1757 
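	/*
	 * Page table pages for the kernel address space are never freed
	 * here, so there is nothing to do for kernel virtual addresses.
	 */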
1758 	if (va >= VM_MAXUSER_ADDRESS)
1759 		return (0);
1760 	ptepde = *pmap_pde(pmap, va);
1761 	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1762 	return (pmap_unwire_ptp(pmap, mpte, free));
1763 }
1764 
1765 /*
1766  * Initialize the pmap for the swapper process.
1767  */
1768 void
1769 pmap_pinit0(pmap_t pmap)
1770 {
1771 
1772 	PMAP_LOCK_INIT(pmap);
1773 	/*
1774 	 * Since the page table directory is shared with the kernel pmap,
1775 	 * which is already included in the list "allpmaps", this pmap does
1776 	 * not need to be inserted into that list.
1777 	 */
1778 	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1779 #if defined(PAE) || defined(PAE_TABLES)
1780 	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1781 #endif
1782 	pmap->pm_root.rt_root = 0;
1783 	CPU_ZERO(&pmap->pm_active);
1784 	PCPU_SET(curpmap, pmap);
1785 	TAILQ_INIT(&pmap->pm_pvchunk);
1786 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1787 }
1788 
1789 /*
1790  * Initialize a preallocated and zeroed pmap structure,
1791  * such as one in a vmspace structure.
1792  */
1793 int
1794 pmap_pinit(pmap_t pmap)
1795 {
1796 	vm_page_t m, ptdpg[NPGPTD];
1797 	vm_paddr_t pa;
1798 	int i;
1799 
1800 	/*
1801 	 * No need to allocate page table space yet, but we do need a valid
1802 	 * page directory table.
1803 	 */
1804 	if (pmap->pm_pdir == NULL) {
1805 		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
1806 		if (pmap->pm_pdir == NULL)
1807 			return (0);
1808 #if defined(PAE) || defined(PAE_TABLES)
1809 		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1810 		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1811 		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1812 		    ("pmap_pinit: pdpt misaligned"));
1813 		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1814 		    ("pmap_pinit: pdpt above 4g"));
1815 #endif
1816 		pmap->pm_root.rt_root = 0;
1817 	}
1818 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
1819 	    ("pmap_pinit: pmap has reserved page table page(s)"));
1820 
1821 	/*
1822 	 * allocate the page directory page(s)
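	 * If an allocation fails, VM_WAIT sleeps until free pages are
	 * available; the loop index is only advanced on success, so the
	 * same slot is simply retried.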
1823 	 */
1824 	for (i = 0; i < NPGPTD;) {
1825 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1826 		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1827 		if (m == NULL)
1828 			VM_WAIT;
1829 		else {
1830 			ptdpg[i++] = m;
1831 		}
1832 	}
1833 
1834 	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1835 
1836 	for (i = 0; i < NPGPTD; i++)
1837 		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1838 			pagezero(pmap->pm_pdir + (i * NPDEPG));
1839 
1840 	mtx_lock_spin(&allpmaps_lock);
1841 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1842 	/* Copy the kernel page table directory entries. */
1843 	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1844 	mtx_unlock_spin(&allpmaps_lock);
1845 
1846 	/* install self-referential address mapping entry(s) */
1847 	for (i = 0; i < NPGPTD; i++) {
1848 		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1849 		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1850 #if defined(PAE) || defined(PAE_TABLES)
1851 		pmap->pm_pdpt[i] = pa | PG_V;
1852 #endif
1853 	}
1854 
1855 	CPU_ZERO(&pmap->pm_active);
1856 	TAILQ_INIT(&pmap->pm_pvchunk);
1857 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1858 
1859 	return (1);
1860 }
1861 
1862 /*
1863  * This routine is called when the required page table page is not
1864  * mapped; it allocates and installs a new page table page.
1865  */
1866 static vm_page_t
1867 _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
1868 {
1869 	vm_paddr_t ptepa;
1870 	vm_page_t m;
1871 
1872 	/*
1873 	 * Allocate a page table page.
1874 	 */
1875 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1876 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1877 		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
1878 			PMAP_UNLOCK(pmap);
1879 			rw_wunlock(&pvh_global_lock);
1880 			VM_WAIT;
1881 			rw_wlock(&pvh_global_lock);
1882 			PMAP_LOCK(pmap);
1883 		}
1884 
1885 		/*
1886 		 * Indicate the need to retry.  While waiting, the page table
1887 		 * page may have been allocated.
1888 		 */
1889 		return (NULL);
1890 	}
1891 	if ((m->flags & PG_ZERO) == 0)
1892 		pmap_zero_page(m);
1893 
1894 	/*
1895 	 * Map the pagetable page into the process address space, if
1896 	 * it isn't already there.
1897 	 */
1898 
1899 	pmap->pm_stats.resident_count++;
1900 
1901 	ptepa = VM_PAGE_TO_PHYS(m);
1902 	pmap->pm_pdir[ptepindex] =
1903 		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1904 
1905 	return (m);
1906 }
1907 
1908 static vm_page_t
1909 pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
1910 {
1911 	u_int ptepindex;
1912 	pd_entry_t ptepa;
1913 	vm_page_t m;
1914 
1915 	/*
1916 	 * Calculate pagetable page index
1917 	 */
1918 	ptepindex = va >> PDRSHIFT;
1919 retry:
1920 	/*
1921 	 * Get the page directory entry
1922 	 */
1923 	ptepa = pmap->pm_pdir[ptepindex];
1924 
1925 	/*
1926 	 * This supports switching from a 4MB page to a
1927 	 * normal 4K page.
1928 	 */
1929 	if (ptepa & PG_PS) {
1930 		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1931 		ptepa = pmap->pm_pdir[ptepindex];
1932 	}
1933 
1934 	/*
1935 	 * If the page table page is mapped, we just increment the
1936 	 * hold count, and activate it.
1937 	 */
1938 	if (ptepa) {
1939 		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1940 		m->wire_count++;
1941 	} else {
1942 		/*
1943 		 * Here if the pte page isn't mapped, or if it has
1944 		 * been deallocated.
1945 		 */
1946 		m = _pmap_allocpte(pmap, ptepindex, flags);
1947 		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
1948 			goto retry;
1949 	}
1950 	return (m);
1951 }
1952 
1953 
1954 /***************************************************
1955  * Pmap allocation/deallocation routines.
1956  ***************************************************/
1957 
1958 /*
1959  * Release any resources held by the given physical map.
1960  * Called when a pmap initialized by pmap_pinit is being released.
1961  * Should only be called if the map contains no valid mappings.
1962  */
1963 void
1964 pmap_release(pmap_t pmap)
1965 {
1966 	vm_page_t m, ptdpg[NPGPTD];
1967 	int i;
1968 
1969 	KASSERT(pmap->pm_stats.resident_count == 0,
1970 	    ("pmap_release: pmap resident count %ld != 0",
1971 	    pmap->pm_stats.resident_count));
1972 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
1973 	    ("pmap_release: pmap has reserved page table page(s)"));
1974 	KASSERT(CPU_EMPTY(&pmap->pm_active),
1975 	    ("releasing active pmap %p", pmap));
1976 
1977 	mtx_lock_spin(&allpmaps_lock);
1978 	LIST_REMOVE(pmap, pm_list);
1979 	mtx_unlock_spin(&allpmaps_lock);
1980 
1981 	for (i = 0; i < NPGPTD; i++)
1982 		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
1983 		    PG_FRAME);
1984 
1985 	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1986 	    sizeof(*pmap->pm_pdir));
1987 
1988 	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1989 
1990 	for (i = 0; i < NPGPTD; i++) {
1991 		m = ptdpg[i];
1992 #if defined(PAE) || defined(PAE_TABLES)
1993 		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1994 		    ("pmap_release: got wrong ptd page"));
1995 #endif
1996 		m->wire_count--;
1997 		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
1998 		vm_page_free_zero(m);
1999 	}
2000 }
2001 
2002 static int
2003 kvm_size(SYSCTL_HANDLER_ARGS)
2004 {
2005 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2006 
2007 	return (sysctl_handle_long(oidp, &ksize, 0, req));
2008 }
2009 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2010     0, 0, kvm_size, "IU", "Size of KVM");
2011 
2012 static int
2013 kvm_free(SYSCTL_HANDLER_ARGS)
2014 {
2015 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2016 
2017 	return (sysctl_handle_long(oidp, &kfree, 0, req));
2018 }
2019 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2020     0, 0, kvm_free, "IU", "Amount of KVM free");
2021 
2022 /*
2023  * grow the number of kernel page table entries, if needed
2024  */
2025 void
2026 pmap_growkernel(vm_offset_t addr)
2027 {
2028 	vm_paddr_t ptppaddr;
2029 	vm_page_t nkpg;
2030 	pd_entry_t newpdir;
2031 
2032 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2033 	addr = roundup2(addr, NBPDR);
2034 	if (addr - 1 >= kernel_map->max_offset)
2035 		addr = kernel_map->max_offset;
2036 	while (kernel_vm_end < addr) {
2037 		if (pdir_pde(PTD, kernel_vm_end)) {
2038 			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2039 			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2040 				kernel_vm_end = kernel_map->max_offset;
2041 				break;
2042 			}
2043 			continue;
2044 		}
2045 
2046 		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2047 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2048 		    VM_ALLOC_ZERO);
2049 		if (nkpg == NULL)
2050 			panic("pmap_growkernel: no memory to grow kernel");
2051 
2052 		nkpt++;
2053 
2054 		if ((nkpg->flags & PG_ZERO) == 0)
2055 			pmap_zero_page(nkpg);
2056 		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2057 		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2058 		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2059 
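		/*
		 * Kernel PDEs are replicated in every pmap's page directory,
		 * so pmap_kenter_pde() installs the new PDE in each pmap, not
		 * just in the kernel's page directory.
		 */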
2060 		pmap_kenter_pde(kernel_vm_end, newpdir);
2061 		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2062 		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
2063 			kernel_vm_end = kernel_map->max_offset;
2064 			break;
2065 		}
2066 	}
2067 }
2068 
2069 
2070 /***************************************************
2071  * page management routines.
2072  ***************************************************/
2073 
2074 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2075 CTASSERT(_NPCM == 11);
2076 CTASSERT(_NPCPV == 336);
2077 
2078 static __inline struct pv_chunk *
2079 pv_to_chunk(pv_entry_t pv)
2080 {
2081 
2082 	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2083 }
2084 
2085 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2086 
2087 #define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2088 #define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2089 
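/*
 * Each pv_chunk occupies exactly one page and holds _NPCPV (336) pv entries,
 * tracked by an _NPCM (11) word free bitmap: ten fully used 32-bit words plus
 * a final word with only its low 16 bits in use (10 * 32 + 16 == 336).
 */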
2090 static const uint32_t pc_freemask[_NPCM] = {
2091 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2092 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2093 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2094 	PC_FREE0_9, PC_FREE10
2095 };
2096 
2097 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2098 	"Current number of pv entries");
2099 
2100 #ifdef PV_STATS
2101 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2102 
2103 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2104 	"Current number of pv entry chunks");
2105 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2106 	"Current number of pv entry chunks allocated");
2107 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2108 	"Total number of pv entry chunk frees");
2109 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2110 	"Number of times tried to get a chunk page but failed.");
2111 
2112 static long pv_entry_frees, pv_entry_allocs;
2113 static int pv_entry_spare;
2114 
2115 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2116 	"Current number of pv entry frees");
2117 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2118 	"Current number of pv entry allocs");
2119 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2120 	"Current number of spare pv entries");
2121 #endif
2122 
2123 /*
2124  * We are in a serious low memory condition.  Resort to
2125  * drastic measures to free some pages so we can allocate
2126  * another pv entry chunk.
2127  */
2128 static vm_page_t
2129 pmap_pv_reclaim(pmap_t locked_pmap)
2130 {
2131 	struct pch newtail;
2132 	struct pv_chunk *pc;
2133 	struct md_page *pvh;
2134 	pd_entry_t *pde;
2135 	pmap_t pmap;
2136 	pt_entry_t *pte, tpte;
2137 	pv_entry_t pv;
2138 	vm_offset_t va;
2139 	vm_page_t m, m_pc;
2140 	struct spglist free;
2141 	uint32_t inuse;
2142 	int bit, field, freed;
2143 
2144 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2145 	pmap = NULL;
2146 	m_pc = NULL;
2147 	SLIST_INIT(&free);
2148 	TAILQ_INIT(&newtail);
2149 	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2150 	    SLIST_EMPTY(&free))) {
2151 		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2152 		if (pmap != pc->pc_pmap) {
2153 			if (pmap != NULL) {
2154 				pmap_invalidate_all(pmap);
2155 				if (pmap != locked_pmap)
2156 					PMAP_UNLOCK(pmap);
2157 			}
2158 			pmap = pc->pc_pmap;
2159 			/* Avoid deadlock and lock recursion. */
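			/*
			 * Pmap locks are acquired in order of increasing
			 * address: a pmap above locked_pmap may be locked
			 * unconditionally, while any other pmap is only
			 * try-locked and its chunk is deferred on failure.
			 */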
2160 			if (pmap > locked_pmap)
2161 				PMAP_LOCK(pmap);
2162 			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2163 				pmap = NULL;
2164 				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2165 				continue;
2166 			}
2167 		}
2168 
2169 		/*
2170 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2171 		 */
2172 		freed = 0;
2173 		for (field = 0; field < _NPCM; field++) {
2174 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2175 			    inuse != 0; inuse &= ~(1UL << bit)) {
2176 				bit = bsfl(inuse);
2177 				pv = &pc->pc_pventry[field * 32 + bit];
2178 				va = pv->pv_va;
2179 				pde = pmap_pde(pmap, va);
2180 				if ((*pde & PG_PS) != 0)
2181 					continue;
2182 				pte = pmap_pte(pmap, va);
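				/*
				 * A wired mapping is never reclaimed; only
				 * clear the PTE when PG_W is not set.
				 */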
2183 				tpte = *pte;
2184 				if ((tpte & PG_W) == 0)
2185 					tpte = pte_load_clear(pte);
2186 				pmap_pte_release(pte);
2187 				if ((tpte & PG_W) != 0)
2188 					continue;
2189 				KASSERT(tpte != 0,
2190 				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
2191 				    pmap, va));
2192 				if ((tpte & PG_G) != 0)
2193 					pmap_invalidate_page(pmap, va);
2194 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2195 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2196 					vm_page_dirty(m);
2197 				if ((tpte & PG_A) != 0)
2198 					vm_page_aflag_set(m, PGA_REFERENCED);
2199 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2200 				if (TAILQ_EMPTY(&m->md.pv_list) &&
2201 				    (m->flags & PG_FICTITIOUS) == 0) {
2202 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2203 					if (TAILQ_EMPTY(&pvh->pv_list)) {
2204 						vm_page_aflag_clear(m,
2205 						    PGA_WRITEABLE);
2206 					}
2207 				}
2208 				pc->pc_map[field] |= 1UL << bit;
2209 				pmap_unuse_pt(pmap, va, &free);
2210 				freed++;
2211 			}
2212 		}
2213 		if (freed == 0) {
2214 			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2215 			continue;
2216 		}
2217 		/* Every freed mapping is for a 4 KB page. */
2218 		pmap->pm_stats.resident_count -= freed;
2219 		PV_STAT(pv_entry_frees += freed);
2220 		PV_STAT(pv_entry_spare += freed);
2221 		pv_entry_count -= freed;
2222 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2223 		for (field = 0; field < _NPCM; field++)
2224 			if (pc->pc_map[field] != pc_freemask[field]) {
2225 				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2226 				    pc_list);
2227 				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2228 
2229 				/*
2230 				 * One freed pv entry in locked_pmap is
2231 				 * sufficient.
2232 				 */
2233 				if (pmap == locked_pmap)
2234 					goto out;
2235 				break;
2236 			}
2237 		if (field == _NPCM) {
2238 			PV_STAT(pv_entry_spare -= _NPCPV);
2239 			PV_STAT(pc_chunk_count--);
2240 			PV_STAT(pc_chunk_frees++);
2241 			/* Entire chunk is free; return it. */
2242 			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2243 			pmap_qremove((vm_offset_t)pc, 1);
2244 			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2245 			break;
2246 		}
2247 	}
2248 out:
2249 	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2250 	if (pmap != NULL) {
2251 		pmap_invalidate_all(pmap);
2252 		if (pmap != locked_pmap)
2253 			PMAP_UNLOCK(pmap);
2254 	}
2255 	if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) {
2256 		m_pc = SLIST_FIRST(&free);
2257 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2258 		/* Recycle a freed page table page. */
2259 		m_pc->wire_count = 1;
2260 		atomic_add_int(&vm_cnt.v_wire_count, 1);
2261 	}
2262 	pmap_free_zero_pages(&free);
2263 	return (m_pc);
2264 }
2265 
2266 /*
2267  * free the pv_entry back to the free list
2268  */
2269 static void
2270 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2271 {
2272 	struct pv_chunk *pc;
2273 	int idx, field, bit;
2274 
2275 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2276 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2277 	PV_STAT(pv_entry_frees++);
2278 	PV_STAT(pv_entry_spare++);
2279 	pv_entry_count--;
2280 	pc = pv_to_chunk(pv);
2281 	idx = pv - &pc->pc_pventry[0];
2282 	field = idx / 32;
2283 	bit = idx % 32;
2284 	pc->pc_map[field] |= 1ul << bit;
2285 	for (idx = 0; idx < _NPCM; idx++)
2286 		if (pc->pc_map[idx] != pc_freemask[idx]) {
2287 			/*
2288 			 * 98% of the time, pc is already at the head of the
2289 			 * list.  If it isn't already, move it to the head.
2290 			 */
2291 			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2292 			    pc)) {
2293 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2294 				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2295 				    pc_list);
2296 			}
2297 			return;
2298 		}
2299 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2300 	free_pv_chunk(pc);
2301 }
2302 
2303 static void
2304 free_pv_chunk(struct pv_chunk *pc)
2305 {
2306 	vm_page_t m;
2307 
2308 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2309 	PV_STAT(pv_entry_spare -= _NPCPV);
2310 	PV_STAT(pc_chunk_count--);
2311 	PV_STAT(pc_chunk_frees++);
2312 	/* entire chunk is free, return it */
2313 	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2314 	pmap_qremove((vm_offset_t)pc, 1);
2315 	vm_page_unwire(m, PQ_NONE);
2316 	vm_page_free(m);
2317 	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2318 }
2319 
2320 /*
2321  * get a new pv_entry, allocating a block from the system
2322  * when needed.
2323  */
2324 static pv_entry_t
2325 get_pv_entry(pmap_t pmap, boolean_t try)
2326 {
2327 	static const struct timeval printinterval = { 60, 0 };
2328 	static struct timeval lastprint;
2329 	int bit, field;
2330 	pv_entry_t pv;
2331 	struct pv_chunk *pc;
2332 	vm_page_t m;
2333 
2334 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2335 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2336 	PV_STAT(pv_entry_allocs++);
2337 	pv_entry_count++;
2338 	if (pv_entry_count > pv_entry_high_water)
2339 		if (ratecheck(&lastprint, &printinterval))
2340 			printf("Approaching the limit on PV entries, consider "
2341 			    "increasing either the vm.pmap.shpgperproc or the "
2342 			    "vm.pmap.pv_entry_max tunable.\n");
2343 retry:
2344 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2345 	if (pc != NULL) {
2346 		for (field = 0; field < _NPCM; field++) {
2347 			if (pc->pc_map[field]) {
2348 				bit = bsfl(pc->pc_map[field]);
2349 				break;
2350 			}
2351 		}
2352 		if (field < _NPCM) {
2353 			pv = &pc->pc_pventry[field * 32 + bit];
2354 			pc->pc_map[field] &= ~(1ul << bit);
2355 			/* If this was the last item, move it to tail */
2356 			for (field = 0; field < _NPCM; field++)
2357 				if (pc->pc_map[field] != 0) {
2358 					PV_STAT(pv_entry_spare--);
2359 					return (pv);	/* not full, return */
2360 				}
2361 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2362 			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2363 			PV_STAT(pv_entry_spare--);
2364 			return (pv);
2365 		}
2366 	}
2367 	/*
2368 	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
2369 	 * global lock.  If "pv_vafree" is currently non-empty, it will
2370 	 * remain non-empty until pmap_ptelist_alloc() completes.
2371 	 */
2372 	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2373 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2374 		if (try) {
2375 			pv_entry_count--;
2376 			PV_STAT(pc_chunk_tryfail++);
2377 			return (NULL);
2378 		}
2379 		m = pmap_pv_reclaim(pmap);
2380 		if (m == NULL)
2381 			goto retry;
2382 	}
2383 	PV_STAT(pc_chunk_count++);
2384 	PV_STAT(pc_chunk_allocs++);
2385 	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2386 	pmap_qenter((vm_offset_t)pc, &m, 1);
2387 	pc->pc_pmap = pmap;
2388 	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2389 	for (field = 1; field < _NPCM; field++)
2390 		pc->pc_map[field] = pc_freemask[field];
2391 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2392 	pv = &pc->pc_pventry[0];
2393 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2394 	PV_STAT(pv_entry_spare += _NPCPV - 1);
2395 	return (pv);
2396 }
2397 
2398 static __inline pv_entry_t
2399 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2400 {
2401 	pv_entry_t pv;
2402 
2403 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2404 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2405 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2406 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2407 			break;
2408 		}
2409 	}
2410 	return (pv);
2411 }
2412 
2413 static void
2414 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2415 {
2416 	struct md_page *pvh;
2417 	pv_entry_t pv;
2418 	vm_offset_t va_last;
2419 	vm_page_t m;
2420 
2421 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2422 	KASSERT((pa & PDRMASK) == 0,
2423 	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2424 
2425 	/*
2426 	 * Transfer the 4mpage's pv entry for this mapping to the first
2427 	 * page's pv list.
2428 	 */
2429 	pvh = pa_to_pvh(pa);
2430 	va = trunc_4mpage(va);
2431 	pv = pmap_pvh_remove(pvh, pmap, va);
2432 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2433 	m = PHYS_TO_VM_PAGE(pa);
2434 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2435 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2436 	va_last = va + NBPDR - PAGE_SIZE;
2437 	do {
2438 		m++;
2439 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2440 		    ("pmap_pv_demote_pde: page %p is not managed", m));
2441 		va += PAGE_SIZE;
2442 		pmap_insert_entry(pmap, va, m);
2443 	} while (va < va_last);
2444 }
2445 
2446 static void
2447 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2448 {
2449 	struct md_page *pvh;
2450 	pv_entry_t pv;
2451 	vm_offset_t va_last;
2452 	vm_page_t m;
2453 
2454 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2455 	KASSERT((pa & PDRMASK) == 0,
2456 	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2457 
2458 	/*
2459 	 * Transfer the first page's pv entry for this mapping to the
2460 	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2461 	 * to get_pv_entry(), a transfer avoids the possibility that
2462 	 * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim()
2463 	 * removes one of the mappings that is being promoted.
2464 	 */
2465 	m = PHYS_TO_VM_PAGE(pa);
2466 	va = trunc_4mpage(va);
2467 	pv = pmap_pvh_remove(&m->md, pmap, va);
2468 	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2469 	pvh = pa_to_pvh(pa);
2470 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2471 	/* Free the remaining NPTEPG - 1 pv entries. */
2472 	va_last = va + NBPDR - PAGE_SIZE;
2473 	do {
2474 		m++;
2475 		va += PAGE_SIZE;
2476 		pmap_pvh_free(&m->md, pmap, va);
2477 	} while (va < va_last);
2478 }
2479 
2480 static void
2481 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2482 {
2483 	pv_entry_t pv;
2484 
2485 	pv = pmap_pvh_remove(pvh, pmap, va);
2486 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2487 	free_pv_entry(pmap, pv);
2488 }
2489 
2490 static void
2491 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2492 {
2493 	struct md_page *pvh;
2494 
2495 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2496 	pmap_pvh_free(&m->md, pmap, va);
2497 	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2498 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2499 		if (TAILQ_EMPTY(&pvh->pv_list))
2500 			vm_page_aflag_clear(m, PGA_WRITEABLE);
2501 	}
2502 }
2503 
2504 /*
2505  * Create a pv entry for the page at pa for the
2506  * mapping (pmap, va).
2507  */
2508 static void
2509 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2510 {
2511 	pv_entry_t pv;
2512 
2513 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2514 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2515 	pv = get_pv_entry(pmap, FALSE);
2516 	pv->pv_va = va;
2517 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2518 }
2519 
2520 /*
2521  * Conditionally create a pv entry.
2522  */
2523 static boolean_t
2524 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2525 {
2526 	pv_entry_t pv;
2527 
2528 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2529 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2530 	if (pv_entry_count < pv_entry_high_water &&
2531 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2532 		pv->pv_va = va;
2533 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2534 		return (TRUE);
2535 	} else
2536 		return (FALSE);
2537 }
2538 
2539 /*
2540  * Create the pv entries for each of the pages within a superpage.
2541  */
2542 static boolean_t
2543 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2544 {
2545 	struct md_page *pvh;
2546 	pv_entry_t pv;
2547 
2548 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2549 	if (pv_entry_count < pv_entry_high_water &&
2550 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2551 		pv->pv_va = va;
2552 		pvh = pa_to_pvh(pa);
2553 		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2554 		return (TRUE);
2555 	} else
2556 		return (FALSE);
2557 }
2558 
2559 /*
2560  * Fills a page table page with mappings to consecutive physical pages.
2561  */
2562 static void
2563 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2564 {
2565 	pt_entry_t *pte;
2566 
2567 	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2568 		*pte = newpte;
2569 		newpte += PAGE_SIZE;
2570 	}
2571 }
2572 
2573 /*
2574  * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2575  * 2- or 4MB page mapping is invalidated.
2576  */
2577 static boolean_t
2578 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2579 {
2580 	pd_entry_t newpde, oldpde;
2581 	pt_entry_t *firstpte, newpte;
2582 	vm_paddr_t mptepa;
2583 	vm_page_t mpte;
2584 	struct spglist free;
2585 
2586 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2587 	oldpde = *pde;
2588 	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2589 	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2590 	if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
2591 	    NULL)
2592 		pmap_remove_pt_page(pmap, mpte);
2593 	else {
2594 		KASSERT((oldpde & PG_W) == 0,
2595 		    ("pmap_demote_pde: page table page for a wired mapping"
2596 		    " is missing"));
2597 
2598 		/*
2599 		 * Invalidate the 2- or 4MB page mapping and return
2600 		 * "failure" if the mapping was never accessed or the
2601 		 * allocation of the new page table page fails.
2602 		 */
2603 		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2604 		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2605 		    VM_ALLOC_WIRED)) == NULL) {
2606 			SLIST_INIT(&free);
2607 			pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2608 			pmap_invalidate_page(pmap, trunc_4mpage(va));
2609 			pmap_free_zero_pages(&free);
2610 			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2611 			    " in pmap %p", va, pmap);
2612 			return (FALSE);
2613 		}
2614 		if (va < VM_MAXUSER_ADDRESS)
2615 			pmap->pm_stats.resident_count++;
2616 	}
2617 	mptepa = VM_PAGE_TO_PHYS(mpte);
2618 
2619 	/*
2620 	 * If the page mapping is in the kernel's address space, then the
2621 	 * KPTmap can provide access to the page table page.  Otherwise,
2622 	 * temporarily map the page table page (mpte) into the kernel's
2623 	 * address space at either PADDR1 or PADDR2.
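	 * PADDR1 (through PMAP1) may be used only when the current thread is
	 * pinned and write-owns the pvh global lock; otherwise PADDR2 is used
	 * under PMAP2mutex.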
2624 	 */
2625 	if (va >= KERNBASE)
2626 		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2627 	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
2628 		if ((*PMAP1 & PG_FRAME) != mptepa) {
2629 			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2630 #ifdef SMP
2631 			PMAP1cpu = PCPU_GET(cpuid);
2632 #endif
2633 			invlcaddr(PADDR1);
2634 			PMAP1changed++;
2635 		} else
2636 #ifdef SMP
2637 		if (PMAP1cpu != PCPU_GET(cpuid)) {
2638 			PMAP1cpu = PCPU_GET(cpuid);
2639 			invlcaddr(PADDR1);
2640 			PMAP1changedcpu++;
2641 		} else
2642 #endif
2643 			PMAP1unchanged++;
2644 		firstpte = PADDR1;
2645 	} else {
2646 		mtx_lock(&PMAP2mutex);
2647 		if ((*PMAP2 & PG_FRAME) != mptepa) {
2648 			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2649 			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2650 		}
2651 		firstpte = PADDR2;
2652 	}
2653 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2654 	KASSERT((oldpde & PG_A) != 0,
2655 	    ("pmap_demote_pde: oldpde is missing PG_A"));
2656 	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2657 	    ("pmap_demote_pde: oldpde is missing PG_M"));
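	/*
	 * The PAT index bit occupies a different position in a 4KB PTE than
	 * in a PDE, so relocate it in the new PTE template.
	 */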
2658 	newpte = oldpde & ~PG_PS;
2659 	if ((newpte & PG_PDE_PAT) != 0)
2660 		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2661 
2662 	/*
2663 	 * If the page table page is new, initialize it.
2664 	 */
2665 	if (mpte->wire_count == 1) {
2666 		mpte->wire_count = NPTEPG;
2667 		pmap_fill_ptp(firstpte, newpte);
2668 	}
2669 	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2670 	    ("pmap_demote_pde: firstpte and newpte map different physical"
2671 	    " addresses"));
2672 
2673 	/*
2674 	 * If the mapping has changed attributes, update the page table
2675 	 * entries.
2676 	 */
2677 	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2678 		pmap_fill_ptp(firstpte, newpte);
2679 
2680 	/*
2681 	 * Demote the mapping.  This pmap is locked.  The old PDE has
2682 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2683 	 * set.  Thus, there is no danger of a race with another
2684 	 * processor changing the setting of PG_A and/or PG_M between
2685 	 * the read above and the store below.
2686 	 */
2687 	if (workaround_erratum383)
2688 		pmap_update_pde(pmap, va, pde, newpde);
2689 	else if (pmap == kernel_pmap)
2690 		pmap_kenter_pde(va, newpde);
2691 	else
2692 		pde_store(pde, newpde);
2693 	if (firstpte == PADDR2)
2694 		mtx_unlock(&PMAP2mutex);
2695 
2696 	/*
2697 	 * Invalidate the recursive mapping of the page table page.
2698 	 */
2699 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2700 
2701 	/*
2702 	 * Demote the pv entry.  This depends on the earlier demotion
2703 	 * of the mapping.  Specifically, the (re)creation of a per-
2704 	 * page pv entry might trigger the execution of pmap_pv_reclaim(),
2705 	 * which might reclaim a newly (re)created per-page pv entry
2706 	 * and destroy the associated mapping.  In order to destroy
2707 	 * the mapping, the PDE must have already changed from mapping
2708 	 * the 2- or 4MB page to referencing the page table page.
2709 	 */
2710 	if ((oldpde & PG_MANAGED) != 0)
2711 		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2712 
2713 	pmap_pde_demotions++;
2714 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2715 	    " in pmap %p", va, pmap);
2716 	return (TRUE);
2717 }
2718 
2719 /*
2720  * Removes a 2- or 4MB page mapping from the kernel pmap.
2721  */
2722 static void
2723 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2724 {
2725 	pd_entry_t newpde;
2726 	vm_paddr_t mptepa;
2727 	vm_page_t mpte;
2728 
2729 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2730 	mpte = pmap_lookup_pt_page(pmap, va);
2731 	if (mpte == NULL)
2732 		panic("pmap_remove_kernel_pde: Missing pt page.");
2733 
2734 	pmap_remove_pt_page(pmap, mpte);
2735 	mptepa = VM_PAGE_TO_PHYS(mpte);
2736 	newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;
2737 
2738 	/*
2739 	 * Initialize the page table page.
2740 	 */
2741 	pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);
2742 
2743 	/*
2744 	 * Remove the mapping.
2745 	 */
2746 	if (workaround_erratum383)
2747 		pmap_update_pde(pmap, va, pde, newpde);
2748 	else
2749 		pmap_kenter_pde(va, newpde);
2750 
2751 	/*
2752 	 * Invalidate the recursive mapping of the page table page.
2753 	 */
2754 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2755 }
2756 
2757 /*
2758  * pmap_remove_pde: unmap a 2- or 4MB superpage mapping from a pmap
2759  */
2760 static void
2761 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2762     struct spglist *free)
2763 {
2764 	struct md_page *pvh;
2765 	pd_entry_t oldpde;
2766 	vm_offset_t eva, va;
2767 	vm_page_t m, mpte;
2768 
2769 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2770 	KASSERT((sva & PDRMASK) == 0,
2771 	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2772 	oldpde = pte_load_clear(pdq);
2773 	if (oldpde & PG_W)
2774 		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2775 
2776 	/*
2777 	 * Machines that don't support invlpg also don't support
2778 	 * PG_G.
2779 	 */
2780 	if (oldpde & PG_G)
2781 		pmap_invalidate_page(kernel_pmap, sva);
2782 	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2783 	if (oldpde & PG_MANAGED) {
2784 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2785 		pmap_pvh_free(pvh, pmap, sva);
2786 		eva = sva + NBPDR;
2787 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2788 		    va < eva; va += PAGE_SIZE, m++) {
2789 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2790 				vm_page_dirty(m);
2791 			if (oldpde & PG_A)
2792 				vm_page_aflag_set(m, PGA_REFERENCED);
2793 			if (TAILQ_EMPTY(&m->md.pv_list) &&
2794 			    TAILQ_EMPTY(&pvh->pv_list))
2795 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2796 		}
2797 	}
2798 	if (pmap == kernel_pmap) {
2799 		pmap_remove_kernel_pde(pmap, pdq, sva);
2800 	} else {
2801 		mpte = pmap_lookup_pt_page(pmap, sva);
2802 		if (mpte != NULL) {
2803 			pmap_remove_pt_page(pmap, mpte);
2804 			pmap->pm_stats.resident_count--;
2805 			KASSERT(mpte->wire_count == NPTEPG,
2806 			    ("pmap_remove_pde: pte page wire count error"));
2807 			mpte->wire_count = 0;
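			/*
			 * The page table page still holds stale PTEs, so it
			 * is not marked PG_ZERO when scheduled for release.
			 */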
2808 			pmap_add_delayed_free_list(mpte, free, FALSE);
2809 			atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2810 		}
2811 	}
2812 }
2813 
2814 /*
2815  * pmap_remove_pte: unmap a single 4KB page mapping from a pmap
2816  */
2817 static int
2818 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2819     struct spglist *free)
2820 {
2821 	pt_entry_t oldpte;
2822 	vm_page_t m;
2823 
2824 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2825 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2826 	oldpte = pte_load_clear(ptq);
2827 	KASSERT(oldpte != 0,
2828 	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
2829 	if (oldpte & PG_W)
2830 		pmap->pm_stats.wired_count -= 1;
2831 	/*
2832 	 * Machines that don't support invlpg also don't support
2833 	 * PG_G.
2834 	 */
2835 	if (oldpte & PG_G)
2836 		pmap_invalidate_page(kernel_pmap, va);
2837 	pmap->pm_stats.resident_count -= 1;
2838 	if (oldpte & PG_MANAGED) {
2839 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2840 		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2841 			vm_page_dirty(m);
2842 		if (oldpte & PG_A)
2843 			vm_page_aflag_set(m, PGA_REFERENCED);
2844 		pmap_remove_entry(pmap, m, va);
2845 	}
2846 	return (pmap_unuse_pt(pmap, va, free));
2847 }
2848 
2849 /*
2850  * Remove a single page from a process address space
2851  */
2852 static void
2853 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
2854 {
2855 	pt_entry_t *pte;
2856 
2857 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2858 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2859 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2860 	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2861 		return;
2862 	pmap_remove_pte(pmap, pte, va, free);
2863 	pmap_invalidate_page(pmap, va);
2864 }
2865 
2866 /*
2867  *	Remove the given range of addresses from the specified map.
2868  *
2869  *	It is assumed that the start and end are properly
2870  *	rounded to the page size.
2871  */
2872 void
2873 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2874 {
2875 	vm_offset_t pdnxt;
2876 	pd_entry_t ptpaddr;
2877 	pt_entry_t *pte;
2878 	struct spglist free;
2879 	int anyvalid;
2880 
2881 	/*
2882 	 * Perform an unsynchronized read.  This is, however, safe.
2883 	 */
2884 	if (pmap->pm_stats.resident_count == 0)
2885 		return;
2886 
2887 	anyvalid = 0;
2888 	SLIST_INIT(&free);
2889 
2890 	rw_wlock(&pvh_global_lock);
2891 	sched_pin();
2892 	PMAP_LOCK(pmap);
2893 
2894 	/*
2895 	 * Special handling for removing a single page: this is a very
2896 	 * common operation, and it is easy to short-circuit some of the
2897 	 * code below.
2898 	 */
2899 	if ((sva + PAGE_SIZE == eva) &&
2900 	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2901 		pmap_remove_page(pmap, sva, &free);
2902 		goto out;
2903 	}
2904 
2905 	for (; sva < eva; sva = pdnxt) {
2906 		u_int pdirindex;
2907 
2908 		/*
2909 		 * Calculate index for next page table.
2910 		 */
2911 		pdnxt = (sva + NBPDR) & ~PDRMASK;
2912 		if (pdnxt < sva)
2913 			pdnxt = eva;
2914 		if (pmap->pm_stats.resident_count == 0)
2915 			break;
2916 
2917 		pdirindex = sva >> PDRSHIFT;
2918 		ptpaddr = pmap->pm_pdir[pdirindex];
2919 
2920 		/*
2921 		 * Weed out invalid mappings. Note: we assume that the page
2922 		 * directory table is always allocated and in kernel virtual memory.
2923 		 */
2924 		if (ptpaddr == 0)
2925 			continue;
2926 
2927 		/*
2928 		 * Check for large page.
2929 		 */
2930 		if ((ptpaddr & PG_PS) != 0) {
2931 			/*
2932 			 * Are we removing the entire large page?  If not,
2933 			 * demote the mapping and fall through.
2934 			 */
2935 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2936 				/*
2937 				 * The TLB entry for a PG_G mapping is
2938 				 * invalidated by pmap_remove_pde().
2939 				 */
2940 				if ((ptpaddr & PG_G) == 0)
2941 					anyvalid = 1;
2942 				pmap_remove_pde(pmap,
2943 				    &pmap->pm_pdir[pdirindex], sva, &free);
2944 				continue;
2945 			} else if (!pmap_demote_pde(pmap,
2946 			    &pmap->pm_pdir[pdirindex], sva)) {
2947 				/* The large page mapping was destroyed. */
2948 				continue;
2949 			}
2950 		}
2951 
2952 		/*
2953 		 * Limit our scan to either the end of the va represented
2954 		 * by the current page table page, or to the end of the
2955 		 * range being removed.
2956 		 */
2957 		if (pdnxt > eva)
2958 			pdnxt = eva;
2959 
2960 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2961 		    sva += PAGE_SIZE) {
2962 			if (*pte == 0)
2963 				continue;
2964 
2965 			/*
2966 			 * The TLB entry for a PG_G mapping is invalidated
2967 			 * by pmap_remove_pte().
2968 			 */
2969 			if ((*pte & PG_G) == 0)
2970 				anyvalid = 1;
2971 			if (pmap_remove_pte(pmap, pte, sva, &free))
2972 				break;
2973 		}
2974 	}
2975 out:
2976 	sched_unpin();
2977 	if (anyvalid)
2978 		pmap_invalidate_all(pmap);
2979 	rw_wunlock(&pvh_global_lock);
2980 	PMAP_UNLOCK(pmap);
2981 	pmap_free_zero_pages(&free);
2982 }
2983 
2984 /*
2985  *	Routine:	pmap_remove_all
2986  *	Function:
2987  *		Removes this physical page from
2988  *		all physical maps in which it resides.
2989  *		Reflects back modify bits to the pager.
2990  *
2991  *	Notes:
2992  *		Original versions of this routine were very
2993  *		inefficient because they iteratively called
2994  *		pmap_remove (slow...)
2995  */
2996 
2997 void
2998 pmap_remove_all(vm_page_t m)
2999 {
3000 	struct md_page *pvh;
3001 	pv_entry_t pv;
3002 	pmap_t pmap;
3003 	pt_entry_t *pte, tpte;
3004 	pd_entry_t *pde;
3005 	vm_offset_t va;
3006 	struct spglist free;
3007 
3008 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3009 	    ("pmap_remove_all: page %p is not managed", m));
3010 	SLIST_INIT(&free);
3011 	rw_wlock(&pvh_global_lock);
3012 	sched_pin();
3013 	if ((m->flags & PG_FICTITIOUS) != 0)
3014 		goto small_mappings;
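	/*
	 * First demote any 2- or 4MB mappings of the page so that all of
	 * its mappings appear in the page's 4KB pv list below.
	 */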
3015 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3016 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3017 		va = pv->pv_va;
3018 		pmap = PV_PMAP(pv);
3019 		PMAP_LOCK(pmap);
3020 		pde = pmap_pde(pmap, va);
3021 		(void)pmap_demote_pde(pmap, pde, va);
3022 		PMAP_UNLOCK(pmap);
3023 	}
3024 small_mappings:
3025 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3026 		pmap = PV_PMAP(pv);
3027 		PMAP_LOCK(pmap);
3028 		pmap->pm_stats.resident_count--;
3029 		pde = pmap_pde(pmap, pv->pv_va);
3030 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3031 		    " a 4mpage in page %p's pv list", m));
3032 		pte = pmap_pte_quick(pmap, pv->pv_va);
3033 		tpte = pte_load_clear(pte);
3034 		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
3035 		    pmap, pv->pv_va));
3036 		if (tpte & PG_W)
3037 			pmap->pm_stats.wired_count--;
3038 		if (tpte & PG_A)
3039 			vm_page_aflag_set(m, PGA_REFERENCED);
3040 
3041 		/*
3042 		 * Update the vm_page_t clean and reference bits.
3043 		 */
3044 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3045 			vm_page_dirty(m);
3046 		pmap_unuse_pt(pmap, pv->pv_va, &free);
3047 		pmap_invalidate_page(pmap, pv->pv_va);
3048 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3049 		free_pv_entry(pmap, pv);
3050 		PMAP_UNLOCK(pmap);
3051 	}
3052 	vm_page_aflag_clear(m, PGA_WRITEABLE);
3053 	sched_unpin();
3054 	rw_wunlock(&pvh_global_lock);
3055 	pmap_free_zero_pages(&free);
3056 }
3057 
3058 /*
3059  * pmap_protect_pde: apply the requested protection to a 4mpage mapping
3060  */
3061 static boolean_t
3062 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3063 {
3064 	pd_entry_t newpde, oldpde;
3065 	vm_offset_t eva, va;
3066 	vm_page_t m;
3067 	boolean_t anychanged;
3068 
3069 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3070 	KASSERT((sva & PDRMASK) == 0,
3071 	    ("pmap_protect_pde: sva is not 4mpage aligned"));
3072 	anychanged = FALSE;
3073 retry:
3074 	oldpde = newpde = *pde;
3075 	if (oldpde & PG_MANAGED) {
3076 		eva = sva + NBPDR;
3077 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3078 		    va < eva; va += PAGE_SIZE, m++)
3079 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
3080 				vm_page_dirty(m);
3081 	}
3082 	if ((prot & VM_PROT_WRITE) == 0)
3083 		newpde &= ~(PG_RW | PG_M);
3084 #if defined(PAE) || defined(PAE_TABLES)
3085 	if ((prot & VM_PROT_EXECUTE) == 0)
3086 		newpde |= pg_nx;
3087 #endif
3088 	if (newpde != oldpde) {
3089 		if (!pde_cmpset(pde, oldpde, newpde))
3090 			goto retry;
3091 		if (oldpde & PG_G)
3092 			pmap_invalidate_page(pmap, sva);
3093 		else
3094 			anychanged = TRUE;
3095 	}
3096 	return (anychanged);
3097 }
3098 
3099 /*
3100  *	Set the physical protection on the
3101  *	specified range of this map as requested.
3102  */
3103 void
3104 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3105 {
3106 	vm_offset_t pdnxt;
3107 	pd_entry_t ptpaddr;
3108 	pt_entry_t *pte;
3109 	boolean_t anychanged, pv_lists_locked;
3110 
3111 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3112 	if (prot == VM_PROT_NONE) {
3113 		pmap_remove(pmap, sva, eva);
3114 		return;
3115 	}
3116 
3117 #if defined(PAE) || defined(PAE_TABLES)
3118 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3119 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3120 		return;
3121 #else
3122 	if (prot & VM_PROT_WRITE)
3123 		return;
3124 #endif
3125 
3126 	if (pmap_is_current(pmap))
3127 		pv_lists_locked = FALSE;
3128 	else {
3129 		pv_lists_locked = TRUE;
3130 resume:
3131 		rw_wlock(&pvh_global_lock);
3132 		sched_pin();
3133 	}
3134 	anychanged = FALSE;
3135 
3136 	PMAP_LOCK(pmap);
3137 	for (; sva < eva; sva = pdnxt) {
3138 		pt_entry_t obits, pbits;
3139 		u_int pdirindex;
3140 
3141 		pdnxt = (sva + NBPDR) & ~PDRMASK;
3142 		if (pdnxt < sva)
3143 			pdnxt = eva;
3144 
3145 		pdirindex = sva >> PDRSHIFT;
3146 		ptpaddr = pmap->pm_pdir[pdirindex];
3147 
3148 		/*
3149 		 * Weed out invalid mappings. Note: we assume that the page
3150 		 * directory table is always allocated and in kernel virtual memory.
3151 		 */
3152 		if (ptpaddr == 0)
3153 			continue;
3154 
3155 		/*
3156 		 * Check for large page.
3157 		 */
3158 		if ((ptpaddr & PG_PS) != 0) {
3159 			/*
3160 			 * Are we protecting the entire large page?  If not,
3161 			 * demote the mapping and fall through.
3162 			 */
3163 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3164 				/*
3165 				 * The TLB entry for a PG_G mapping is
3166 				 * invalidated by pmap_protect_pde().
3167 				 */
3168 				if (pmap_protect_pde(pmap,
3169 				    &pmap->pm_pdir[pdirindex], sva, prot))
3170 					anychanged = TRUE;
3171 				continue;
3172 			} else {
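				/*
				 * Demoting the mapping may create per-page pv
				 * entries, which requires the pvh global
				 * lock; acquire it now, restarting from
				 * "resume" if the try-lock fails.
				 */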
3173 				if (!pv_lists_locked) {
3174 					pv_lists_locked = TRUE;
3175 					if (!rw_try_wlock(&pvh_global_lock)) {
3176 						if (anychanged)
3177 							pmap_invalidate_all(
3178 							    pmap);
3179 						PMAP_UNLOCK(pmap);
3180 						goto resume;
3181 					}
3182 					sched_pin();
3183 				}
3184 				if (!pmap_demote_pde(pmap,
3185 				    &pmap->pm_pdir[pdirindex], sva)) {
3186 					/*
3187 					 * The large page mapping was
3188 					 * destroyed.
3189 					 */
3190 					continue;
3191 				}
3192 			}
3193 		}
3194 
3195 		if (pdnxt > eva)
3196 			pdnxt = eva;
3197 
3198 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3199 		    sva += PAGE_SIZE) {
3200 			vm_page_t m;
3201 
3202 retry:
3203 			/*
3204 			 * Regardless of whether a pte is 32 or 64 bits in
3205 			 * size, PG_RW, PG_A, and PG_M are among the least
3206 			 * significant 32 bits.
3207 			 */
3208 			obits = pbits = *pte;
3209 			if ((pbits & PG_V) == 0)
3210 				continue;
3211 
3212 			if ((prot & VM_PROT_WRITE) == 0) {
3213 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3214 				    (PG_MANAGED | PG_M | PG_RW)) {
3215 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3216 					vm_page_dirty(m);
3217 				}
3218 				pbits &= ~(PG_RW | PG_M);
3219 			}
3220 #if defined(PAE) || defined(PAE_TABLES)
3221 			if ((prot & VM_PROT_EXECUTE) == 0)
3222 				pbits |= pg_nx;
3223 #endif
3224 
3225 			if (pbits != obits) {
3226 #if defined(PAE) || defined(PAE_TABLES)
3227 				if (!atomic_cmpset_64(pte, obits, pbits))
3228 					goto retry;
3229 #else
3230 				if (!atomic_cmpset_int((u_int *)pte, obits,
3231 				    pbits))
3232 					goto retry;
3233 #endif
3234 				if (obits & PG_G)
3235 					pmap_invalidate_page(pmap, sva);
3236 				else
3237 					anychanged = TRUE;
3238 			}
3239 		}
3240 	}
3241 	if (anychanged)
3242 		pmap_invalidate_all(pmap);
3243 	if (pv_lists_locked) {
3244 		sched_unpin();
3245 		rw_wunlock(&pvh_global_lock);
3246 	}
3247 	PMAP_UNLOCK(pmap);
3248 }
3249 
3250 /*
3251  * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
3252  * within a single page table page (PTP) to a single 2- or 4MB page mapping.
3253  * For promotion to occur, two conditions must be met: (1) the 4KB page
3254  * mappings must map aligned, contiguous physical memory and (2) the 4KB page
3255  * mappings must have identical characteristics.
3256  *
3257  * Managed (PG_MANAGED) mappings within the kernel address space are not
3258  * promoted.  The reason is that kernel PDEs are replicated in each pmap but
3259  * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
3260  * pmap.
3261  */
3262 static void
3263 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3264 {
3265 	pd_entry_t newpde;
3266 	pt_entry_t *firstpte, oldpte, pa, *pte;
3267 	vm_offset_t oldpteva;
3268 	vm_page_t mpte;
3269 
3270 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3271 
3272 	/*
3273 	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
3274 	 * either invalid, unused, or does not map the first 4KB physical page
3275 	 * within a 2- or 4MB page.
3276 	 */
3277 	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
3278 setpde:
3279 	newpde = *firstpte;
3280 	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
3281 		pmap_pde_p_failures++;
3282 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3283 		    " in pmap %p", va, pmap);
3284 		return;
3285 	}
3286 	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
3287 		pmap_pde_p_failures++;
3288 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3289 		    " in pmap %p", va, pmap);
3290 		return;
3291 	}
3292 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
3293 		/*
3294 		 * When PG_M is already clear, PG_RW can be cleared without
3295 		 * a TLB invalidation.
3296 		 */
3297 		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
3298 		    ~PG_RW))
3299 			goto setpde;
3300 		newpde &= ~PG_RW;
3301 	}
3302 
3303 	/*
3304 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3305 	 * PTE maps an unexpected 4KB physical page or does not have identical
3306 	 * characteristics to the first PTE.
3307 	 */
3308 	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
3309 	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
3310 setpte:
3311 		oldpte = *pte;
3312 		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
3313 			pmap_pde_p_failures++;
3314 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3315 			    " in pmap %p", va, pmap);
3316 			return;
3317 		}
3318 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
3319 			/*
3320 			 * When PG_M is already clear, PG_RW can be cleared
3321 			 * without a TLB invalidation.
3322 			 */
3323 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
3324 			    oldpte & ~PG_RW))
3325 				goto setpte;
3326 			oldpte &= ~PG_RW;
3327 			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
3328 			    (va & ~PDRMASK);
3329 			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
3330 			    " in pmap %p", oldpteva, pmap);
3331 		}
3332 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
3333 			pmap_pde_p_failures++;
3334 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
3335 			    " in pmap %p", va, pmap);
3336 			return;
3337 		}
3338 		pa -= PAGE_SIZE;
3339 	}
3340 
3341 	/*
3342 	 * Save the page table page in its current state until the PDE
3343 	 * mapping the superpage is demoted by pmap_demote_pde() or
3344 	 * destroyed by pmap_remove_pde().
3345 	 */
3346 	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
3347 	KASSERT(mpte >= vm_page_array &&
3348 	    mpte < &vm_page_array[vm_page_array_size],
3349 	    ("pmap_promote_pde: page table page is out of range"));
3350 	KASSERT(mpte->pindex == va >> PDRSHIFT,
3351 	    ("pmap_promote_pde: page table page's pindex is wrong"));
3352 	if (pmap_insert_pt_page(pmap, mpte)) {
3353 		pmap_pde_p_failures++;
3354 		CTR2(KTR_PMAP,
3355 		    "pmap_promote_pde: failure for va %#x in pmap %p", va,
3356 		    pmap);
3357 		return;
3358 	}
3359 
3360 	/*
3361 	 * Promote the pv entries.
3362 	 */
3363 	if ((newpde & PG_MANAGED) != 0)
3364 		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
3365 
3366 	/*
3367 	 * Propagate the PAT index to its proper position.
3368 	 */
3369 	if ((newpde & PG_PTE_PAT) != 0)
3370 		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
3371 
3372 	/*
3373 	 * Map the superpage.
3374 	 */
3375 	if (workaround_erratum383)
3376 		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
3377 	else if (pmap == kernel_pmap)
3378 		pmap_kenter_pde(va, PG_PS | newpde);
3379 	else
3380 		pde_store(pde, PG_PS | newpde);
3381 
3382 	pmap_pde_promotions++;
3383 	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3384 	    " in pmap %p", va, pmap);
3385 }
3386 
3387 /*
3388  *	Insert the given physical page (p) at
3389  *	the specified virtual address (v) in the
3390  *	target physical map with the protection requested.
3391  *
3392  *	If specified, the page will be wired down, meaning
3393  *	that the related pte can not be reclaimed.
3394  *
3395  *	NB:  This is the only routine which MAY NOT lazy-evaluate
3396  *	or lose information.  That is, this routine must actually
3397  *	insert this page into the given map NOW.
3398  */
3399 int
3400 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3401     u_int flags, int8_t psind)
3402 {
3403 	pd_entry_t *pde;
3404 	pt_entry_t *pte;
3405 	pt_entry_t newpte, origpte;
3406 	pv_entry_t pv;
3407 	vm_paddr_t opa, pa;
3408 	vm_page_t mpte, om;
3409 	boolean_t invlva, wired;
3410 
3411 	va = trunc_page(va);
3412 	mpte = NULL;
3413 	wired = (flags & PMAP_ENTER_WIRED) != 0;
3414 
3415 	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3416 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3417 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3418 	    va));
3419 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
3420 		VM_OBJECT_ASSERT_LOCKED(m->object);
3421 
3422 	rw_wlock(&pvh_global_lock);
3423 	PMAP_LOCK(pmap);
3424 	sched_pin();
3425 
3426 	/*
3427 	 * If the page table page is not resident, it is
3428 	 * created here.
3429 	 */
3430 	if (va < VM_MAXUSER_ADDRESS) {
3431 		mpte = pmap_allocpte(pmap, va, flags);
3432 		if (mpte == NULL) {
3433 			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
3434 			    ("pmap_allocpte failed with sleep allowed"));
3435 			sched_unpin();
3436 			rw_wunlock(&pvh_global_lock);
3437 			PMAP_UNLOCK(pmap);
3438 			return (KERN_RESOURCE_SHORTAGE);
3439 		}
3440 	}
3441 
3442 	pde = pmap_pde(pmap, va);
3443 	if ((*pde & PG_PS) != 0)
3444 		panic("pmap_enter: attempted pmap_enter on 4MB page");
3445 	pte = pmap_pte_quick(pmap, va);
3446 
3447 	/*
3448 	 * Page directory table entry is not valid; we need a new PT page.
3449 	 */
3450 	if (pte == NULL) {
3451 		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3452 			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3453 	}
3454 
3455 	pa = VM_PAGE_TO_PHYS(m);
3456 	om = NULL;
3457 	origpte = *pte;
3458 	opa = origpte & PG_FRAME;
3459 
3460 	/*
3461 	 * Mapping has not changed, must be protection or wiring change.
3462 	 */
3463 	if (origpte && (opa == pa)) {
3464 		/*
3465 		 * Wiring change, just update stats. We don't worry about
3466 		 * wiring PT pages as they remain resident as long as there
3467 		 * are valid mappings in them. Hence, if a user page is wired,
3468 		 * the PT page will be also.
3469 		 */
3470 		if (wired && ((origpte & PG_W) == 0))
3471 			pmap->pm_stats.wired_count++;
3472 		else if (!wired && (origpte & PG_W))
3473 			pmap->pm_stats.wired_count--;
3474 
3475 		/*
3476 		 * Remove extra pte reference
3477 		 */
3478 		if (mpte)
3479 			mpte->wire_count--;
3480 
3481 		if (origpte & PG_MANAGED) {
3482 			om = m;
3483 			pa |= PG_MANAGED;
3484 		}
3485 		goto validate;
3486 	}
3487 
3488 	pv = NULL;
3489 
3490 	/*
3491 	 * Mapping has changed, invalidate old range and fall through to
3492 	 * handle validating new mapping.
3493 	 */
3494 	if (opa) {
3495 		if (origpte & PG_W)
3496 			pmap->pm_stats.wired_count--;
3497 		if (origpte & PG_MANAGED) {
3498 			om = PHYS_TO_VM_PAGE(opa);
3499 			pv = pmap_pvh_remove(&om->md, pmap, va);
3500 		}
3501 		if (mpte != NULL) {
3502 			mpte->wire_count--;
3503 			KASSERT(mpte->wire_count > 0,
3504 			    ("pmap_enter: missing reference to page table page,"
3505 			     " va: 0x%x", va));
3506 		}
3507 	} else
3508 		pmap->pm_stats.resident_count++;
3509 
3510 	/*
3511 	 * Enter on the PV list if part of our managed memory.
3512 	 */
3513 	if ((m->oflags & VPO_UNMANAGED) == 0) {
3514 		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3515 		    ("pmap_enter: managed mapping within the clean submap"));
3516 		if (pv == NULL)
3517 			pv = get_pv_entry(pmap, FALSE);
3518 		pv->pv_va = va;
3519 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3520 		pa |= PG_MANAGED;
3521 	} else if (pv != NULL)
3522 		free_pv_entry(pmap, pv);
3523 
3524 	/*
3525 	 * Increment counters
3526 	 */
3527 	if (wired)
3528 		pmap->pm_stats.wired_count++;
3529 
3530 validate:
3531 	/*
3532 	 * Now validate mapping with desired protection/wiring.
3533 	 */
3534 	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3535 	if ((prot & VM_PROT_WRITE) != 0) {
3536 		newpte |= PG_RW;
3537 		if ((newpte & PG_MANAGED) != 0)
3538 			vm_page_aflag_set(m, PGA_WRITEABLE);
3539 	}
3540 #if defined(PAE) || defined(PAE_TABLES)
3541 	if ((prot & VM_PROT_EXECUTE) == 0)
3542 		newpte |= pg_nx;
3543 #endif
3544 	if (wired)
3545 		newpte |= PG_W;
3546 	if (va < VM_MAXUSER_ADDRESS)
3547 		newpte |= PG_U;
3548 	if (pmap == kernel_pmap)
3549 		newpte |= pgeflag;
3550 
3551 	/*
3552 	 * if the mapping or permission bits are different, we need
3553 	 * to update the pte.
3554 	 */
3555 	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3556 		newpte |= PG_A;
3557 		if ((flags & VM_PROT_WRITE) != 0)
3558 			newpte |= PG_M;
3559 		if (origpte & PG_V) {
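			/*
			 * The old mapping was valid and may still be cached
			 * in the TLB; determine whether an invalidation is
			 * required.
			 */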
3560 			invlva = FALSE;
3561 			origpte = pte_load_store(pte, newpte);
3562 			if (origpte & PG_A) {
3563 				if (origpte & PG_MANAGED)
3564 					vm_page_aflag_set(om, PGA_REFERENCED);
3565 				if (opa != VM_PAGE_TO_PHYS(m))
3566 					invlva = TRUE;
3567 #if defined(PAE) || defined(PAE_TABLES)
3568 				if ((origpte & PG_NX) == 0 &&
3569 				    (newpte & PG_NX) != 0)
3570 					invlva = TRUE;
3571 #endif
3572 			}
3573 			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3574 				if ((origpte & PG_MANAGED) != 0)
3575 					vm_page_dirty(om);
3576 				if ((prot & VM_PROT_WRITE) == 0)
3577 					invlva = TRUE;
3578 			}
3579 			if ((origpte & PG_MANAGED) != 0 &&
3580 			    TAILQ_EMPTY(&om->md.pv_list) &&
3581 			    ((om->flags & PG_FICTITIOUS) != 0 ||
3582 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3583 				vm_page_aflag_clear(om, PGA_WRITEABLE);
3584 			if (invlva)
3585 				pmap_invalidate_page(pmap, va);
3586 		} else
3587 			pte_store(pte, newpte);
3588 	}
3589 
3590 	/*
3591 	 * If both the page table page and the reservation are fully
3592 	 * populated, then attempt promotion.
3593 	 */
3594 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3595 	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3596 	    vm_reserv_level_iffullpop(m) == 0)
3597 		pmap_promote_pde(pmap, pde, va);
3598 
3599 	sched_unpin();
3600 	rw_wunlock(&pvh_global_lock);
3601 	PMAP_UNLOCK(pmap);
3602 	return (KERN_SUCCESS);
3603 }
3604 
3605 /*
3606  * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3607  * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3608  * blocking, (2) a mapping already exists at the specified virtual address, or
3609  * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3610  */
3611 static boolean_t
3612 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3613 {
3614 	pd_entry_t *pde, newpde;
3615 
3616 	rw_assert(&pvh_global_lock, RA_WLOCKED);
3617 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3618 	pde = pmap_pde(pmap, va);
3619 	if (*pde != 0) {
3620 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3621 		    " in pmap %p", va, pmap);
3622 		return (FALSE);
3623 	}
3624 	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3625 	    PG_PS | PG_V;
3626 	if ((m->oflags & VPO_UNMANAGED) == 0) {
3627 		newpde |= PG_MANAGED;
3628 
3629 		/*
3630 		 * Abort this mapping if its PV entry could not be created.
3631 		 */
3632 		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3633 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3634 			    " in pmap %p", va, pmap);
3635 			return (FALSE);
3636 		}
3637 	}
3638 #if defined(PAE) || defined(PAE_TABLES)
3639 	if ((prot & VM_PROT_EXECUTE) == 0)
3640 		newpde |= pg_nx;
3641 #endif
3642 	if (va < VM_MAXUSER_ADDRESS)
3643 		newpde |= PG_U;
3644 
3645 	/*
3646 	 * Increment counters.
3647 	 */
3648 	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3649 
3650 	/*
3651 	 * Map the superpage.
3652 	 */
3653 	pde_store(pde, newpde);
3654 
3655 	pmap_pde_mappings++;
3656 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3657 	    " in pmap %p", va, pmap);
3658 	return (TRUE);
3659 }
3660 
3661 /*
3662  * Maps a sequence of resident pages belonging to the same object.
3663  * The sequence begins with the given page m_start.  This page is
3664  * mapped at the given virtual address start.  Each subsequent page is
3665  * mapped at a virtual address that is offset from start by the same
3666  * amount as the page is offset from m_start within the object.  The
3667  * last page in the sequence is the page with the largest offset from
3668  * m_start that can be mapped at a virtual address less than the given
3669  * virtual address end.  Not every virtual page between start and end
3670  * is mapped; only those for which a resident page exists with the
3671  * corresponding offset from m_start are mapped.
3672  */
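/*
 * For example, if m_start has pindex 8 and is mapped at "start", a
 * resident page with pindex 11 is mapped at start + 3 * PAGE_SIZE,
 * provided that address is below "end".  A 2/4MB-aligned run backed by
 * a fully populated superpage reservation may be mapped with a single
 * PG_PS page directory entry by pmap_enter_pde() instead of individual
 * 4KB mappings, when superpages are enabled.
 */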
3673 void
3674 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3675     vm_page_t m_start, vm_prot_t prot)
3676 {
3677 	vm_offset_t va;
3678 	vm_page_t m, mpte;
3679 	vm_pindex_t diff, psize;
3680 
3681 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3682 
3683 	psize = atop(end - start);
3684 	mpte = NULL;
3685 	m = m_start;
3686 	rw_wlock(&pvh_global_lock);
3687 	PMAP_LOCK(pmap);
3688 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3689 		va = start + ptoa(diff);
3690 		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3691 		    m->psind == 1 && pg_ps_enabled &&
3692 		    pmap_enter_pde(pmap, va, m, prot))
3693 			m = &m[NBPDR / PAGE_SIZE - 1];
3694 		else
3695 			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3696 			    mpte);
3697 		m = TAILQ_NEXT(m, listq);
3698 	}
3699 	rw_wunlock(&pvh_global_lock);
3700 	PMAP_UNLOCK(pmap);
3701 }
3702 
3703 /*
3704  * this code makes some *MAJOR* assumptions:
3705  * 1. Current pmap & pmap exists.
3706  * 2. Not wired.
3707  * 3. Read access.
3708  * 4. No page table pages.
3709  * but is *MUCH* faster than pmap_enter...
3710  */
3711 
3712 void
3713 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3714 {
3715 
3716 	rw_wlock(&pvh_global_lock);
3717 	PMAP_LOCK(pmap);
3718 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3719 	rw_wunlock(&pvh_global_lock);
3720 	PMAP_UNLOCK(pmap);
3721 }
3722 
3723 static vm_page_t
3724 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3725     vm_prot_t prot, vm_page_t mpte)
3726 {
3727 	pt_entry_t *pte;
3728 	vm_paddr_t pa;
3729 	struct spglist free;
3730 
3731 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3732 	    (m->oflags & VPO_UNMANAGED) != 0,
3733 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3734 	rw_assert(&pvh_global_lock, RA_WLOCKED);
3735 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3736 
3737 	/*
3738 	 * In the case that a page table page is not
3739 	 * resident, we are creating it here.
3740 	 */
3741 	if (va < VM_MAXUSER_ADDRESS) {
3742 		u_int ptepindex;
3743 		pd_entry_t ptepa;
3744 
3745 		/*
3746 		 * Calculate pagetable page index
3747 		 * Calculate the page table page index
3748 		ptepindex = va >> PDRSHIFT;
3749 		if (mpte && (mpte->pindex == ptepindex)) {
3750 			mpte->wire_count++;
3751 		} else {
3752 			/*
3753 			 * Get the page directory entry
3754 			 */
3755 			ptepa = pmap->pm_pdir[ptepindex];
3756 
3757 			/*
3758 			 * If the page table page is mapped, we just increment
3759 			 * the wire count, and activate it.
3760 			 */
3761 			if (ptepa) {
3762 				if (ptepa & PG_PS)
3763 					return (NULL);
3764 				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3765 				mpte->wire_count++;
3766 			} else {
3767 				mpte = _pmap_allocpte(pmap, ptepindex,
3768 				    PMAP_ENTER_NOSLEEP);
3769 				if (mpte == NULL)
3770 					return (mpte);
3771 			}
3772 		}
3773 	} else {
3774 		mpte = NULL;
3775 	}
3776 
3777 	/*
3778 	 * This call to vtopte makes the assumption that we are
3779 	 * entering the page into the current pmap.  In order to support
3780 	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3781 	 * But that isn't as quick as vtopte.
3782 	 */
3783 	pte = vtopte(va);
3784 	if (*pte) {
3785 		if (mpte != NULL) {
3786 			mpte->wire_count--;
3787 			mpte = NULL;
3788 		}
3789 		return (mpte);
3790 	}
3791 
3792 	/*
3793 	 * Enter on the PV list if part of our managed memory.
3794 	 */
3795 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3796 	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3797 		if (mpte != NULL) {
3798 			SLIST_INIT(&free);
3799 			if (pmap_unwire_ptp(pmap, mpte, &free)) {
3800 				pmap_invalidate_page(pmap, va);
3801 				pmap_free_zero_pages(&free);
3802 			}
3803 
3804 			mpte = NULL;
3805 		}
3806 		return (mpte);
3807 	}
3808 
3809 	/*
3810 	 * Increment counters
3811 	 */
3812 	pmap->pm_stats.resident_count++;
3813 
3814 	pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
3815 #if defined(PAE) || defined(PAE_TABLES)
3816 	if ((prot & VM_PROT_EXECUTE) == 0)
3817 		pa |= pg_nx;
3818 #endif
3819 
3820 	/*
3821 	 * Now validate mapping with RO protection
3822 	 */
3823 	if ((m->oflags & VPO_UNMANAGED) != 0)
3824 		pte_store(pte, pa | PG_V | PG_U);
3825 	else
3826 		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3827 	return (mpte);
3828 }
3829 
3830 /*
3831  * Make a temporary mapping for a physical address.  This is only intended
3832  * to be used for panic dumps.
3833  */
3834 void *
3835 pmap_kenter_temporary(vm_paddr_t pa, int i)
3836 {
3837 	vm_offset_t va;
3838 
3839 	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3840 	pmap_kenter(va, pa);
3841 	invlpg(va);
3842 	return ((void *)crashdumpmap);
3843 }
3844 
3845 /*
3846  * This code maps large physical mmap regions of device memory into the
3847  * processor address space using 2/4MB page directory entries.  Shortcuts
3848  * are taken: misaligned or physically discontiguous regions are not mapped.
3849  */
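/*
 * For example, a 4MB-aligned, 8MB device region mapped here installs one
 * PG_PS page directory entry per NBPDR bytes and allocates no page table
 * pages; the region size is purely illustrative.
 */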
3850 void
3851 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3852     vm_pindex_t pindex, vm_size_t size)
3853 {
3854 	pd_entry_t *pde;
3855 	vm_paddr_t pa, ptepa;
3856 	vm_page_t p;
3857 	int pat_mode;
3858 
3859 	VM_OBJECT_ASSERT_WLOCKED(object);
3860 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3861 	    ("pmap_object_init_pt: non-device object"));
3862 	if (pseflag &&
3863 	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3864 		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3865 			return;
3866 		p = vm_page_lookup(object, pindex);
3867 		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3868 		    ("pmap_object_init_pt: invalid page %p", p));
3869 		pat_mode = p->md.pat_mode;
3870 
3871 		/*
3872 		 * Abort the mapping if the first page is not physically
3873 		 * aligned to a 2/4MB page boundary.
3874 		 */
3875 		ptepa = VM_PAGE_TO_PHYS(p);
3876 		if (ptepa & (NBPDR - 1))
3877 			return;
3878 
3879 		/*
3880 		 * Skip the first page.  Abort the mapping if the rest of
3881 		 * the pages are not physically contiguous or have differing
3882 		 * memory attributes.
3883 		 */
3884 		p = TAILQ_NEXT(p, listq);
3885 		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3886 		    pa += PAGE_SIZE) {
3887 			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3888 			    ("pmap_object_init_pt: invalid page %p", p));
3889 			if (pa != VM_PAGE_TO_PHYS(p) ||
3890 			    pat_mode != p->md.pat_mode)
3891 				return;
3892 			p = TAILQ_NEXT(p, listq);
3893 		}
3894 
3895 		/*
3896 		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
3897 		 * "size" is a multiple of 2/4M, adding the PAT setting to
3898 		 * "pa" will not affect the termination of this loop.
3899 		 */
3900 		PMAP_LOCK(pmap);
3901 		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
3902 		    size; pa += NBPDR) {
3903 			pde = pmap_pde(pmap, addr);
3904 			if (*pde == 0) {
3905 				pde_store(pde, pa | PG_PS | PG_M | PG_A |
3906 				    PG_U | PG_RW | PG_V);
3907 				pmap->pm_stats.resident_count += NBPDR /
3908 				    PAGE_SIZE;
3909 				pmap_pde_mappings++;
3910 			}
3911 			/* Else continue on if the PDE is already valid. */
3912 			addr += NBPDR;
3913 		}
3914 		PMAP_UNLOCK(pmap);
3915 	}
3916 }
3917 
3918 /*
3919  *	Clear the wired attribute from the mappings for the specified range of
3920  *	addresses in the given pmap.  Every valid mapping within that range
3921  *	must have the wired attribute set.  In contrast, invalid mappings
3922  *	cannot have the wired attribute set, so they are ignored.
3923  *
3924  *	The wired attribute of the page table entry is not a hardware feature,
3925  *	so there is no need to invalidate any TLB entries.
3926  */
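/*
 * When only part of a wired 2/4MB mapping falls within the range, the
 * mapping is first demoted to 4KB mappings and only the covered pages are
 * unwired; unwiring an entire superpage simply clears PG_W in its PDE.
 */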
3927 void
3928 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3929 {
3930 	vm_offset_t pdnxt;
3931 	pd_entry_t *pde;
3932 	pt_entry_t *pte;
3933 	boolean_t pv_lists_locked;
3934 
3935 	if (pmap_is_current(pmap))
3936 		pv_lists_locked = FALSE;
3937 	else {
3938 		pv_lists_locked = TRUE;
3939 resume:
3940 		rw_wlock(&pvh_global_lock);
3941 		sched_pin();
3942 	}
3943 	PMAP_LOCK(pmap);
3944 	for (; sva < eva; sva = pdnxt) {
3945 		pdnxt = (sva + NBPDR) & ~PDRMASK;
3946 		if (pdnxt < sva)
3947 			pdnxt = eva;
3948 		pde = pmap_pde(pmap, sva);
3949 		if ((*pde & PG_V) == 0)
3950 			continue;
3951 		if ((*pde & PG_PS) != 0) {
3952 			if ((*pde & PG_W) == 0)
3953 				panic("pmap_unwire: pde %#jx is missing PG_W",
3954 				    (uintmax_t)*pde);
3955 
3956 			/*
3957 			 * Are we unwiring the entire large page?  If not,
3958 			 * demote the mapping and fall through.
3959 			 */
3960 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3961 				/*
3962 				 * Regardless of whether a pde (or pte) is 32
3963 				 * or 64 bits in size, PG_W is among the least
3964 				 * significant 32 bits.
3965 				 */
3966 				atomic_clear_int((u_int *)pde, PG_W);
3967 				pmap->pm_stats.wired_count -= NBPDR /
3968 				    PAGE_SIZE;
3969 				continue;
3970 			} else {
3971 				if (!pv_lists_locked) {
3972 					pv_lists_locked = TRUE;
3973 					if (!rw_try_wlock(&pvh_global_lock)) {
3974 						PMAP_UNLOCK(pmap);
3975 						/* Repeat sva. */
3976 						goto resume;
3977 					}
3978 					sched_pin();
3979 				}
3980 				if (!pmap_demote_pde(pmap, pde, sva))
3981 					panic("pmap_unwire: demotion failed");
3982 			}
3983 		}
3984 		if (pdnxt > eva)
3985 			pdnxt = eva;
3986 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3987 		    sva += PAGE_SIZE) {
3988 			if ((*pte & PG_V) == 0)
3989 				continue;
3990 			if ((*pte & PG_W) == 0)
3991 				panic("pmap_unwire: pte %#jx is missing PG_W",
3992 				    (uintmax_t)*pte);
3993 
3994 			/*
3995 			 * PG_W must be cleared atomically.  Although the pmap
3996 			 * lock synchronizes access to PG_W, another processor
3997 			 * could be setting PG_M and/or PG_A concurrently.
3998 			 *
3999 			 * PG_W is among the least significant 32 bits.
4000 			 */
4001 			atomic_clear_int((u_int *)pte, PG_W);
4002 			pmap->pm_stats.wired_count--;
4003 		}
4004 	}
4005 	if (pv_lists_locked) {
4006 		sched_unpin();
4007 		rw_wunlock(&pvh_global_lock);
4008 	}
4009 	PMAP_UNLOCK(pmap);
4010 }
4011 
4012 
4013 /*
4014  *	Copy the range specified by src_addr/len
4015  *	from the source map to the range dst_addr/len
4016  *	in the destination map.
4017  *
4018  *	This routine is only advisory and need not do anything.
4019  */
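/*
 * This implementation copies mappings only when dst_addr == src_addr and
 * the source pmap is the current pmap; otherwise it silently does nothing,
 * which is permitted because the routine is advisory.
 */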
4020 
4021 void
4022 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4023     vm_offset_t src_addr)
4024 {
4025 	struct spglist free;
4026 	vm_offset_t addr;
4027 	vm_offset_t end_addr = src_addr + len;
4028 	vm_offset_t pdnxt;
4029 
4030 	if (dst_addr != src_addr)
4031 		return;
4032 
4033 	if (!pmap_is_current(src_pmap))
4034 		return;
4035 
4036 	rw_wlock(&pvh_global_lock);
4037 	if (dst_pmap < src_pmap) {
4038 		PMAP_LOCK(dst_pmap);
4039 		PMAP_LOCK(src_pmap);
4040 	} else {
4041 		PMAP_LOCK(src_pmap);
4042 		PMAP_LOCK(dst_pmap);
4043 	}
4044 	sched_pin();
4045 	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4046 		pt_entry_t *src_pte, *dst_pte;
4047 		vm_page_t dstmpte, srcmpte;
4048 		pd_entry_t srcptepaddr;
4049 		u_int ptepindex;
4050 
4051 		KASSERT(addr < UPT_MIN_ADDRESS,
4052 		    ("pmap_copy: invalid to pmap_copy page tables"));
4053 
4054 		pdnxt = (addr + NBPDR) & ~PDRMASK;
4055 		if (pdnxt < addr)
4056 			pdnxt = end_addr;
4057 		ptepindex = addr >> PDRSHIFT;
4058 
4059 		srcptepaddr = src_pmap->pm_pdir[ptepindex];
4060 		if (srcptepaddr == 0)
4061 			continue;
4062 
4063 		if (srcptepaddr & PG_PS) {
4064 			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4065 				continue;
4066 			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4067 			    ((srcptepaddr & PG_MANAGED) == 0 ||
4068 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4069 			    PG_PS_FRAME))) {
4070 				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4071 				    ~PG_W;
4072 				dst_pmap->pm_stats.resident_count +=
4073 				    NBPDR / PAGE_SIZE;
4074 				pmap_pde_mappings++;
4075 			}
4076 			continue;
4077 		}
4078 
4079 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4080 		KASSERT(srcmpte->wire_count > 0,
4081 		    ("pmap_copy: source page table page is unused"));
4082 
4083 		if (pdnxt > end_addr)
4084 			pdnxt = end_addr;
4085 
4086 		src_pte = vtopte(addr);
4087 		while (addr < pdnxt) {
4088 			pt_entry_t ptetemp;
4089 			ptetemp = *src_pte;
4090 			/*
4091 			 * We only copy the mappings of managed pages
4092 			 */
4093 			if ((ptetemp & PG_MANAGED) != 0) {
4094 				dstmpte = pmap_allocpte(dst_pmap, addr,
4095 				    PMAP_ENTER_NOSLEEP);
4096 				if (dstmpte == NULL)
4097 					goto out;
4098 				dst_pte = pmap_pte_quick(dst_pmap, addr);
4099 				if (*dst_pte == 0 &&
4100 				    pmap_try_insert_pv_entry(dst_pmap, addr,
4101 				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4102 					/*
4103 					 * Clear the wired, modified, and
4104 					 * accessed (referenced) bits
4105 					 * during the copy.
4106 					 */
4107 					*dst_pte = ptetemp & ~(PG_W | PG_M |
4108 					    PG_A);
4109 					dst_pmap->pm_stats.resident_count++;
4110 				} else {
4111 					SLIST_INIT(&free);
4112 					if (pmap_unwire_ptp(dst_pmap, dstmpte,
4113 					    &free)) {
4114 						pmap_invalidate_page(dst_pmap,
4115 						    addr);
4116 						pmap_free_zero_pages(&free);
4117 					}
4118 					goto out;
4119 				}
4120 				if (dstmpte->wire_count >= srcmpte->wire_count)
4121 					break;
4122 			}
4123 			addr += PAGE_SIZE;
4124 			src_pte++;
4125 		}
4126 	}
4127 out:
4128 	sched_unpin();
4129 	rw_wunlock(&pvh_global_lock);
4130 	PMAP_UNLOCK(src_pmap);
4131 	PMAP_UNLOCK(dst_pmap);
4132 }
4133 
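/*
 * Zero a single page of KVA, using an SSE2 or i686 optimized zeroing
 * routine when the CPU supports one, and falling back to bzero()
 * otherwise.
 */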
4134 static __inline void
4135 pagezero(void *page)
4136 {
4137 #if defined(I686_CPU)
4138 	if (cpu_class == CPUCLASS_686) {
4139 #if defined(CPU_ENABLE_SSE)
4140 		if (cpu_feature & CPUID_SSE2)
4141 			sse2_pagezero(page);
4142 		else
4143 #endif
4144 			i686_pagezero(page);
4145 	} else
4146 #endif
4147 		bzero(page, PAGE_SIZE);
4148 }
4149 
4150 /*
4151  *	pmap_zero_page zeros the specified hardware page by mapping
4152  *	the page into KVM and using bzero to clear its contents.
4153  */
4154 void
4155 pmap_zero_page(vm_page_t m)
4156 {
4157 	struct sysmaps *sysmaps;
4158 
4159 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4160 	mtx_lock(&sysmaps->lock);
4161 	if (*sysmaps->CMAP2)
4162 		panic("pmap_zero_page: CMAP2 busy");
4163 	sched_pin();
4164 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4165 	    pmap_cache_bits(m->md.pat_mode, 0);
4166 	invlcaddr(sysmaps->CADDR2);
4167 	pagezero(sysmaps->CADDR2);
4168 	*sysmaps->CMAP2 = 0;
4169 	sched_unpin();
4170 	mtx_unlock(&sysmaps->lock);
4171 }
4172 
4173 /*
4174  *	pmap_zero_page_area zeros the specified hardware page by mapping
4175  *	the page into KVM and using bzero to clear its contents.
4176  *
4177  *	off and size may not cover an area beyond a single hardware page.
4178  */
4179 void
4180 pmap_zero_page_area(vm_page_t m, int off, int size)
4181 {
4182 	struct sysmaps *sysmaps;
4183 
4184 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4185 	mtx_lock(&sysmaps->lock);
4186 	if (*sysmaps->CMAP2)
4187 		panic("pmap_zero_page_area: CMAP2 busy");
4188 	sched_pin();
4189 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4190 	    pmap_cache_bits(m->md.pat_mode, 0);
4191 	invlcaddr(sysmaps->CADDR2);
4192 	if (off == 0 && size == PAGE_SIZE)
4193 		pagezero(sysmaps->CADDR2);
4194 	else
4195 		bzero((char *)sysmaps->CADDR2 + off, size);
4196 	*sysmaps->CMAP2 = 0;
4197 	sched_unpin();
4198 	mtx_unlock(&sysmaps->lock);
4199 }
4200 
4201 /*
4202  *	pmap_zero_page_idle zeros the specified hardware page by mapping
4203  *	the page into KVM and using bzero to clear its contents.  This
4204  *	is intended to be called from the vm_pagezero process only and
4205  *	outside of Giant.
4206  */
4207 void
4208 pmap_zero_page_idle(vm_page_t m)
4209 {
4210 
4211 	if (*CMAP3)
4212 		panic("pmap_zero_page_idle: CMAP3 busy");
4213 	sched_pin();
4214 	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4215 	    pmap_cache_bits(m->md.pat_mode, 0);
4216 	invlcaddr(CADDR3);
4217 	pagezero(CADDR3);
4218 	*CMAP3 = 0;
4219 	sched_unpin();
4220 }
4221 
4222 /*
4223  *	pmap_copy_page copies the specified (machine independent)
4224  *	page by mapping the page into virtual memory and using
4225  *	bcopy to copy the page, one machine dependent page at a
4226  *	time.
4227  */
4228 void
4229 pmap_copy_page(vm_page_t src, vm_page_t dst)
4230 {
4231 	struct sysmaps *sysmaps;
4232 
4233 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4234 	mtx_lock(&sysmaps->lock);
4235 	if (*sysmaps->CMAP1)
4236 		panic("pmap_copy_page: CMAP1 busy");
4237 	if (*sysmaps->CMAP2)
4238 		panic("pmap_copy_page: CMAP2 busy");
4239 	sched_pin();
4240 	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4241 	    pmap_cache_bits(src->md.pat_mode, 0);
4242 	invlcaddr(sysmaps->CADDR1);
4243 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4244 	    pmap_cache_bits(dst->md.pat_mode, 0);
4245 	invlcaddr(sysmaps->CADDR2);
4246 	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
4247 	*sysmaps->CMAP1 = 0;
4248 	*sysmaps->CMAP2 = 0;
4249 	sched_unpin();
4250 	mtx_unlock(&sysmaps->lock);
4251 }
4252 
4253 int unmapped_buf_allowed = 1;
4254 
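/*
 * Copy "xfersize" bytes from the pages in "ma" starting at byte offset
 * "a_offset" to the pages in "mb" starting at byte offset "b_offset",
 * using the per-CPU CMAP1/CMAP2 mapping windows.  Neither offset needs
 * to be page aligned.
 */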
4255 void
4256 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4257     vm_offset_t b_offset, int xfersize)
4258 {
4259 	struct sysmaps *sysmaps;
4260 	vm_page_t a_pg, b_pg;
4261 	char *a_cp, *b_cp;
4262 	vm_offset_t a_pg_offset, b_pg_offset;
4263 	int cnt;
4264 
4265 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4266 	mtx_lock(&sysmaps->lock);
4267 	if (*sysmaps->CMAP1 != 0)
4268 		panic("pmap_copy_pages: CMAP1 busy");
4269 	if (*sysmaps->CMAP2 != 0)
4270 		panic("pmap_copy_pages: CMAP2 busy");
4271 	sched_pin();
4272 	while (xfersize > 0) {
4273 		a_pg = ma[a_offset >> PAGE_SHIFT];
4274 		a_pg_offset = a_offset & PAGE_MASK;
4275 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4276 		b_pg = mb[b_offset >> PAGE_SHIFT];
4277 		b_pg_offset = b_offset & PAGE_MASK;
4278 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4279 		*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
4280 		    pmap_cache_bits(a_pg->md.pat_mode, 0);
4281 		invlcaddr(sysmaps->CADDR1);
4282 		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
4283 		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
4284 		invlcaddr(sysmaps->CADDR2);
4285 		a_cp = sysmaps->CADDR1 + a_pg_offset;
4286 		b_cp = sysmaps->CADDR2 + b_pg_offset;
4287 		bcopy(a_cp, b_cp, cnt);
4288 		a_offset += cnt;
4289 		b_offset += cnt;
4290 		xfersize -= cnt;
4291 	}
4292 	*sysmaps->CMAP1 = 0;
4293 	*sysmaps->CMAP2 = 0;
4294 	sched_unpin();
4295 	mtx_unlock(&sysmaps->lock);
4296 }
4297 
4298 /*
4299  * Returns true if the pmap's pv is one of the first
4300  * 16 pvs linked to from this page.  This count may
4301  * be changed upwards or downwards in the future; it
4302  * is only necessary that true be returned for a small
4303  * subset of pmaps for proper page aging.
4304  */
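/*
 * Both the page's own pv list and, for non-fictitious pages, the pv list
 * of the containing 2/4MB page are consulted, with the 16-entry limit
 * applied across the two lists combined.
 */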
4305 boolean_t
4306 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4307 {
4308 	struct md_page *pvh;
4309 	pv_entry_t pv;
4310 	int loops = 0;
4311 	boolean_t rv;
4312 
4313 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4314 	    ("pmap_page_exists_quick: page %p is not managed", m));
4315 	rv = FALSE;
4316 	rw_wlock(&pvh_global_lock);
4317 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4318 		if (PV_PMAP(pv) == pmap) {
4319 			rv = TRUE;
4320 			break;
4321 		}
4322 		loops++;
4323 		if (loops >= 16)
4324 			break;
4325 	}
4326 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4327 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4328 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4329 			if (PV_PMAP(pv) == pmap) {
4330 				rv = TRUE;
4331 				break;
4332 			}
4333 			loops++;
4334 			if (loops >= 16)
4335 				break;
4336 		}
4337 	}
4338 	rw_wunlock(&pvh_global_lock);
4339 	return (rv);
4340 }
4341 
4342 /*
4343  *	pmap_page_wired_mappings:
4344  *
4345  *	Return the number of managed mappings to the given physical page
4346  *	that are wired.
4347  */
4348 int
4349 pmap_page_wired_mappings(vm_page_t m)
4350 {
4351 	int count;
4352 
4353 	count = 0;
4354 	if ((m->oflags & VPO_UNMANAGED) != 0)
4355 		return (count);
4356 	rw_wlock(&pvh_global_lock);
4357 	count = pmap_pvh_wired_mappings(&m->md, count);
4358 	if ((m->flags & PG_FICTITIOUS) == 0) {
4359 		count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4360 		    count);
4361 	}
4362 	rw_wunlock(&pvh_global_lock);
4363 	return (count);
4364 }
4365 
4366 /*
4367  *	pmap_pvh_wired_mappings:
4368  *
4369  *	Return the updated number "count" of managed mappings that are wired.
4370  */
4371 static int
4372 pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4373 {
4374 	pmap_t pmap;
4375 	pt_entry_t *pte;
4376 	pv_entry_t pv;
4377 
4378 	rw_assert(&pvh_global_lock, RA_WLOCKED);
4379 	sched_pin();
4380 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4381 		pmap = PV_PMAP(pv);
4382 		PMAP_LOCK(pmap);
4383 		pte = pmap_pte_quick(pmap, pv->pv_va);
4384 		if ((*pte & PG_W) != 0)
4385 			count++;
4386 		PMAP_UNLOCK(pmap);
4387 	}
4388 	sched_unpin();
4389 	return (count);
4390 }
4391 
4392 /*
4393  * Returns TRUE if the given page is mapped individually or as part of
4394  * a 4mpage.  Otherwise, returns FALSE.
4395  */
4396 boolean_t
4397 pmap_page_is_mapped(vm_page_t m)
4398 {
4399 	boolean_t rv;
4400 
4401 	if ((m->oflags & VPO_UNMANAGED) != 0)
4402 		return (FALSE);
4403 	rw_wlock(&pvh_global_lock);
4404 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4405 	    ((m->flags & PG_FICTITIOUS) == 0 &&
4406 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4407 	rw_wunlock(&pvh_global_lock);
4408 	return (rv);
4409 }
4410 
4411 /*
4412  * Remove all pages from the specified address space;
4413  * this aids process exit speeds.  Also, this code is
4414  * special-cased for the current process only, but can
4415  * have the more generic (and slightly slower) mode
4416  * enabled.  This is much faster than pmap_remove in
4417  * the case of running down an entire address space.
4418  */
4419 void
4420 pmap_remove_pages(pmap_t pmap)
4421 {
4422 	pt_entry_t *pte, tpte;
4423 	vm_page_t m, mpte, mt;
4424 	pv_entry_t pv;
4425 	struct md_page *pvh;
4426 	struct pv_chunk *pc, *npc;
4427 	struct spglist free;
4428 	int field, idx;
4429 	int32_t bit;
4430 	uint32_t inuse, bitmask;
4431 	int allfree;
4432 
4433 	if (pmap != PCPU_GET(curpmap)) {
4434 		printf("warning: pmap_remove_pages called with non-current pmap\n");
4435 		return;
4436 	}
4437 	SLIST_INIT(&free);
4438 	rw_wlock(&pvh_global_lock);
4439 	PMAP_LOCK(pmap);
4440 	sched_pin();
4441 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4442 		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
4443 		    pc->pc_pmap));
4444 		allfree = 1;
4445 		for (field = 0; field < _NPCM; field++) {
4446 			inuse = ~pc->pc_map[field] & pc_freemask[field];
4447 			while (inuse != 0) {
4448 				bit = bsfl(inuse);
4449 				bitmask = 1UL << bit;
4450 				idx = field * 32 + bit;
4451 				pv = &pc->pc_pventry[idx];
4452 				inuse &= ~bitmask;
4453 
4454 				pte = pmap_pde(pmap, pv->pv_va);
4455 				tpte = *pte;
4456 				if ((tpte & PG_PS) == 0) {
4457 					pte = vtopte(pv->pv_va);
4458 					tpte = *pte & ~PG_PTE_PAT;
4459 				}
4460 
4461 				if (tpte == 0) {
4462 					printf(
4463 					    "TPTE at %p  IS ZERO @ VA %08x\n",
4464 					    pte, pv->pv_va);
4465 					panic("bad pte");
4466 				}
4467 
4468 /*
4469  * We cannot remove wired pages from a process' mapping at this time
4470  */
4471 				if (tpte & PG_W) {
4472 					allfree = 0;
4473 					continue;
4474 				}
4475 
4476 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4477 				KASSERT(m->phys_addr == (tpte & PG_FRAME),
4478 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4479 				    m, (uintmax_t)m->phys_addr,
4480 				    (uintmax_t)tpte));
4481 
4482 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4483 				    m < &vm_page_array[vm_page_array_size],
4484 				    ("pmap_remove_pages: bad tpte %#jx",
4485 				    (uintmax_t)tpte));
4486 
4487 				pte_clear(pte);
4488 
4489 				/*
4490 				 * Update the vm_page_t clean/reference bits.
4491 				 */
4492 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4493 					if ((tpte & PG_PS) != 0) {
4494 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4495 							vm_page_dirty(mt);
4496 					} else
4497 						vm_page_dirty(m);
4498 				}
4499 
4500 				/* Mark free */
4501 				PV_STAT(pv_entry_frees++);
4502 				PV_STAT(pv_entry_spare++);
4503 				pv_entry_count--;
4504 				pc->pc_map[field] |= bitmask;
4505 				if ((tpte & PG_PS) != 0) {
4506 					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
4507 					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
4508 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4509 					if (TAILQ_EMPTY(&pvh->pv_list)) {
4510 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
4511 							if (TAILQ_EMPTY(&mt->md.pv_list))
4512 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
4513 					}
4514 					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
4515 					if (mpte != NULL) {
4516 						pmap_remove_pt_page(pmap, mpte);
4517 						pmap->pm_stats.resident_count--;
4518 						KASSERT(mpte->wire_count == NPTEPG,
4519 						    ("pmap_remove_pages: pte page wire count error"));
4520 						mpte->wire_count = 0;
4521 						pmap_add_delayed_free_list(mpte, &free, FALSE);
4522 						atomic_subtract_int(&vm_cnt.v_wire_count, 1);
4523 					}
4524 				} else {
4525 					pmap->pm_stats.resident_count--;
4526 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4527 					if (TAILQ_EMPTY(&m->md.pv_list) &&
4528 					    (m->flags & PG_FICTITIOUS) == 0) {
4529 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4530 						if (TAILQ_EMPTY(&pvh->pv_list))
4531 							vm_page_aflag_clear(m, PGA_WRITEABLE);
4532 					}
4533 					pmap_unuse_pt(pmap, pv->pv_va, &free);
4534 				}
4535 			}
4536 		}
4537 		if (allfree) {
4538 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4539 			free_pv_chunk(pc);
4540 		}
4541 	}
4542 	sched_unpin();
4543 	pmap_invalidate_all(pmap);
4544 	rw_wunlock(&pvh_global_lock);
4545 	PMAP_UNLOCK(pmap);
4546 	pmap_free_zero_pages(&free);
4547 }
4548 
4549 /*
4550  *	pmap_is_modified:
4551  *
4552  *	Return whether or not the specified physical page was modified
4553  *	in any physical maps.
4554  */
4555 boolean_t
4556 pmap_is_modified(vm_page_t m)
4557 {
4558 	boolean_t rv;
4559 
4560 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4561 	    ("pmap_is_modified: page %p is not managed", m));
4562 
4563 	/*
4564 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4565 	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4566 	 * is clear, no PTEs can have PG_M set.
4567 	 */
4568 	VM_OBJECT_ASSERT_WLOCKED(m->object);
4569 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4570 		return (FALSE);
4571 	rw_wlock(&pvh_global_lock);
4572 	rv = pmap_is_modified_pvh(&m->md) ||
4573 	    ((m->flags & PG_FICTITIOUS) == 0 &&
4574 	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4575 	rw_wunlock(&pvh_global_lock);
4576 	return (rv);
4577 }
4578 
4579 /*
4580  * Returns TRUE if any of the given mappings were used to modify
4581  * physical memory.  Otherwise, returns FALSE.  Both page and 4mpage
4582  * mappings are supported.
4583  */
4584 static boolean_t
4585 pmap_is_modified_pvh(struct md_page *pvh)
4586 {
4587 	pv_entry_t pv;
4588 	pt_entry_t *pte;
4589 	pmap_t pmap;
4590 	boolean_t rv;
4591 
4592 	rw_assert(&pvh_global_lock, RA_WLOCKED);
4593 	rv = FALSE;
4594 	sched_pin();
4595 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4596 		pmap = PV_PMAP(pv);
4597 		PMAP_LOCK(pmap);
4598 		pte = pmap_pte_quick(pmap, pv->pv_va);
4599 		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4600 		PMAP_UNLOCK(pmap);
4601 		if (rv)
4602 			break;
4603 	}
4604 	sched_unpin();
4605 	return (rv);
4606 }
4607 
4608 /*
4609  *	pmap_is_prefaultable:
4610  *
4611  *	Return whether or not the specified virtual address is eligible
4612  *	for prefault.
4613  */
4614 boolean_t
4615 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4616 {
4617 	pd_entry_t *pde;
4618 	pt_entry_t *pte;
4619 	boolean_t rv;
4620 
4621 	rv = FALSE;
4622 	PMAP_LOCK(pmap);
4623 	pde = pmap_pde(pmap, addr);
4624 	if (*pde != 0 && (*pde & PG_PS) == 0) {
4625 		pte = vtopte(addr);
4626 		rv = *pte == 0;
4627 	}
4628 	PMAP_UNLOCK(pmap);
4629 	return (rv);
4630 }
4631 
4632 /*
4633  *	pmap_is_referenced:
4634  *
4635  *	Return whether or not the specified physical page was referenced
4636  *	in any physical maps.
4637  */
4638 boolean_t
4639 pmap_is_referenced(vm_page_t m)
4640 {
4641 	boolean_t rv;
4642 
4643 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4644 	    ("pmap_is_referenced: page %p is not managed", m));
4645 	rw_wlock(&pvh_global_lock);
4646 	rv = pmap_is_referenced_pvh(&m->md) ||
4647 	    ((m->flags & PG_FICTITIOUS) == 0 &&
4648 	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4649 	rw_wunlock(&pvh_global_lock);
4650 	return (rv);
4651 }
4652 
4653 /*
4654  * Returns TRUE if any of the given mappings were referenced and FALSE
4655  * otherwise.  Both page and 4mpage mappings are supported.
4656  */
4657 static boolean_t
4658 pmap_is_referenced_pvh(struct md_page *pvh)
4659 {
4660 	pv_entry_t pv;
4661 	pt_entry_t *pte;
4662 	pmap_t pmap;
4663 	boolean_t rv;
4664 
4665 	rw_assert(&pvh_global_lock, RA_WLOCKED);
4666 	rv = FALSE;
4667 	sched_pin();
4668 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4669 		pmap = PV_PMAP(pv);
4670 		PMAP_LOCK(pmap);
4671 		pte = pmap_pte_quick(pmap, pv->pv_va);
4672 		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4673 		PMAP_UNLOCK(pmap);
4674 		if (rv)
4675 			break;
4676 	}
4677 	sched_unpin();
4678 	return (rv);
4679 }
4680 
4681 /*
4682  * Clear the write and modified bits in each of the given page's mappings.
4683  */
4684 void
4685 pmap_remove_write(vm_page_t m)
4686 {
4687 	struct md_page *pvh;
4688 	pv_entry_t next_pv, pv;
4689 	pmap_t pmap;
4690 	pd_entry_t *pde;
4691 	pt_entry_t oldpte, *pte;
4692 	vm_offset_t va;
4693 
4694 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4695 	    ("pmap_remove_write: page %p is not managed", m));
4696 
4697 	/*
4698 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4699 	 * set by another thread while the object is locked.  Thus,
4700 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
4701 	 */
4702 	VM_OBJECT_ASSERT_WLOCKED(m->object);
4703 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4704 		return;
4705 	rw_wlock(&pvh_global_lock);
4706 	sched_pin();
4707 	if ((m->flags & PG_FICTITIOUS) != 0)
4708 		goto small_mappings;
4709 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4710 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4711 		va = pv->pv_va;
4712 		pmap = PV_PMAP(pv);
4713 		PMAP_LOCK(pmap);
4714 		pde = pmap_pde(pmap, va);
4715 		if ((*pde & PG_RW) != 0)
4716 			(void)pmap_demote_pde(pmap, pde, va);
4717 		PMAP_UNLOCK(pmap);
4718 	}
4719 small_mappings:
4720 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4721 		pmap = PV_PMAP(pv);
4722 		PMAP_LOCK(pmap);
4723 		pde = pmap_pde(pmap, pv->pv_va);
4724 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4725 		    " a 4mpage in page %p's pv list", m));
4726 		pte = pmap_pte_quick(pmap, pv->pv_va);
4727 retry:
4728 		oldpte = *pte;
4729 		if ((oldpte & PG_RW) != 0) {
4730 			/*
4731 			 * Regardless of whether a pte is 32 or 64 bits
4732 			 * in size, PG_RW and PG_M are among the least
4733 			 * significant 32 bits.
4734 			 */
4735 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4736 			    oldpte & ~(PG_RW | PG_M)))
4737 				goto retry;
4738 			if ((oldpte & PG_M) != 0)
4739 				vm_page_dirty(m);
4740 			pmap_invalidate_page(pmap, pv->pv_va);
4741 		}
4742 		PMAP_UNLOCK(pmap);
4743 	}
4744 	vm_page_aflag_clear(m, PGA_WRITEABLE);
4745 	sched_unpin();
4746 	rw_wunlock(&pvh_global_lock);
4747 }
4748 
4749 #define	PMAP_TS_REFERENCED_MAX	5
4750 
4751 /*
4752  *	pmap_ts_referenced:
4753  *
4754  *	Return a count of reference bits for a page, clearing those bits.
4755  *	It is not necessary for every reference bit to be cleared, but it
4756  *	is necessary that 0 only be returned when there are truly no
4757  *	reference bits set.
4758  *
4759  *	XXX: The exact number of bits to check and clear is a matter that
4760  *	should be tested and standardized at some point in the future for
4761  *	optimal aging of shared pages.
4762  */
4763 int
4764 pmap_ts_referenced(vm_page_t m)
4765 {
4766 	struct md_page *pvh;
4767 	pv_entry_t pv, pvf;
4768 	pmap_t pmap;
4769 	pd_entry_t *pde;
4770 	pt_entry_t *pte;
4771 	vm_paddr_t pa;
4772 	int rtval = 0;
4773 
4774 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4775 	    ("pmap_ts_referenced: page %p is not managed", m));
4776 	pa = VM_PAGE_TO_PHYS(m);
4777 	pvh = pa_to_pvh(pa);
4778 	rw_wlock(&pvh_global_lock);
4779 	sched_pin();
4780 	if ((m->flags & PG_FICTITIOUS) != 0 ||
4781 	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4782 		goto small_mappings;
4783 	pv = pvf;
4784 	do {
4785 		pmap = PV_PMAP(pv);
4786 		PMAP_LOCK(pmap);
4787 		pde = pmap_pde(pmap, pv->pv_va);
4788 		if ((*pde & PG_A) != 0) {
4789 			/*
4790 			 * Since this reference bit is shared by either 1024
4791 			 * or 512 4KB pages, it should not be cleared every
4792 			 * time it is tested.  Apply a simple "hash" function
4793 			 * on the physical page number, the virtual superpage
4794 			 * number, and the pmap address to select one 4KB page
4795 			 * out of the 1024 or 512 on which testing the
4796 			 * reference bit will result in clearing that bit.
4797 			 * This function is designed to avoid the selection of
4798 			 * the same 4KB page for every 2- or 4MB page mapping.
4799 			 *
4800 			 * On demotion, a mapping that hasn't been referenced
4801 			 * is simply destroyed.  To avoid the possibility of a
4802 			 * subsequent page fault on a demoted wired mapping,
4803 			 * always leave its reference bit set.  Moreover,
4804 			 * since the superpage is wired, the current state of
4805 			 * its reference bit won't affect page replacement.
4806 			 */
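			/*
			 * Concretely, the test below clears PG_A for roughly
			 * one out of every NPTEPG superpage mappings examined
			 * (about 1 in 1024 without PAE, 1 in 512 with PAE),
			 * and only when the mapping is not wired.
			 */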
4807 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
4808 			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
4809 			    (*pde & PG_W) == 0) {
4810 				atomic_clear_int((u_int *)pde, PG_A);
4811 				pmap_invalidate_page(pmap, pv->pv_va);
4812 			}
4813 			rtval++;
4814 		}
4815 		PMAP_UNLOCK(pmap);
4816 		/* Rotate the PV list if it has more than one entry. */
4817 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4818 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4819 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4820 		}
4821 		if (rtval >= PMAP_TS_REFERENCED_MAX)
4822 			goto out;
4823 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4824 small_mappings:
4825 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4826 		goto out;
4827 	pv = pvf;
4828 	do {
4829 		pmap = PV_PMAP(pv);
4830 		PMAP_LOCK(pmap);
4831 		pde = pmap_pde(pmap, pv->pv_va);
4832 		KASSERT((*pde & PG_PS) == 0,
4833 		    ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
4834 		    m));
4835 		pte = pmap_pte_quick(pmap, pv->pv_va);
4836 		if ((*pte & PG_A) != 0) {
4837 			atomic_clear_int((u_int *)pte, PG_A);
4838 			pmap_invalidate_page(pmap, pv->pv_va);
4839 			rtval++;
4840 		}
4841 		PMAP_UNLOCK(pmap);
4842 		/* Rotate the PV list if it has more than one entry. */
4843 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4844 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4845 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4846 		}
4847 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
4848 	    PMAP_TS_REFERENCED_MAX);
4849 out:
4850 	sched_unpin();
4851 	rw_wunlock(&pvh_global_lock);
4852 	return (rtval);
4853 }
4854 
4855 /*
4856  *	Apply the given advice to the specified range of addresses within the
4857  *	given pmap.  Depending on the advice, clear the referenced and/or
4858  *	modified flags in each mapping and set the mapped page's dirty field.
4859  */
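/*
 * For both MADV_DONTNEED and MADV_FREE this clears PG_A, and PG_M on
 * writable, modified mappings; MADV_DONTNEED additionally transfers the
 * modified state to the vm_page's dirty field so that later
 * pmap_is_modified() calls can be avoided.
 */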
4860 void
4861 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4862 {
4863 	pd_entry_t oldpde, *pde;
4864 	pt_entry_t *pte;
4865 	vm_offset_t pdnxt;
4866 	vm_page_t m;
4867 	boolean_t anychanged, pv_lists_locked;
4868 
4869 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
4870 		return;
4871 	if (pmap_is_current(pmap))
4872 		pv_lists_locked = FALSE;
4873 	else {
4874 		pv_lists_locked = TRUE;
4875 resume:
4876 		rw_wlock(&pvh_global_lock);
4877 		sched_pin();
4878 	}
4879 	anychanged = FALSE;
4880 	PMAP_LOCK(pmap);
4881 	for (; sva < eva; sva = pdnxt) {
4882 		pdnxt = (sva + NBPDR) & ~PDRMASK;
4883 		if (pdnxt < sva)
4884 			pdnxt = eva;
4885 		pde = pmap_pde(pmap, sva);
4886 		oldpde = *pde;
4887 		if ((oldpde & PG_V) == 0)
4888 			continue;
4889 		else if ((oldpde & PG_PS) != 0) {
4890 			if ((oldpde & PG_MANAGED) == 0)
4891 				continue;
4892 			if (!pv_lists_locked) {
4893 				pv_lists_locked = TRUE;
4894 				if (!rw_try_wlock(&pvh_global_lock)) {
4895 					if (anychanged)
4896 						pmap_invalidate_all(pmap);
4897 					PMAP_UNLOCK(pmap);
4898 					goto resume;
4899 				}
4900 				sched_pin();
4901 			}
4902 			if (!pmap_demote_pde(pmap, pde, sva)) {
4903 				/*
4904 				 * The large page mapping was destroyed.
4905 				 */
4906 				continue;
4907 			}
4908 
4909 			/*
4910 			 * Unless the page mappings are wired, remove the
4911 			 * mapping to a single page so that a subsequent
4912 			 * access may repromote.  Since the underlying page
4913 			 * table page is fully populated, this removal never
4914 			 * frees a page table page.
4915 			 */
4916 			if ((oldpde & PG_W) == 0) {
4917 				pte = pmap_pte_quick(pmap, sva);
4918 				KASSERT((*pte & PG_V) != 0,
4919 				    ("pmap_advise: invalid PTE"));
4920 				pmap_remove_pte(pmap, pte, sva, NULL);
4921 				anychanged = TRUE;
4922 			}
4923 		}
4924 		if (pdnxt > eva)
4925 			pdnxt = eva;
4926 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
4927 		    sva += PAGE_SIZE) {
4928 			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
4929 			    PG_V))
4930 				continue;
4931 			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4932 				if (advice == MADV_DONTNEED) {
4933 					/*
4934 					 * Future calls to pmap_is_modified()
4935 					 * can be avoided by making the page
4936 					 * dirty now.
4937 					 */
4938 					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
4939 					vm_page_dirty(m);
4940 				}
4941 				atomic_clear_int((u_int *)pte, PG_M | PG_A);
4942 			} else if ((*pte & PG_A) != 0)
4943 				atomic_clear_int((u_int *)pte, PG_A);
4944 			else
4945 				continue;
4946 			if ((*pte & PG_G) != 0)
4947 				pmap_invalidate_page(pmap, sva);
4948 			else
4949 				anychanged = TRUE;
4950 		}
4951 	}
4952 	if (anychanged)
4953 		pmap_invalidate_all(pmap);
4954 	if (pv_lists_locked) {
4955 		sched_unpin();
4956 		rw_wunlock(&pvh_global_lock);
4957 	}
4958 	PMAP_UNLOCK(pmap);
4959 }
4960 
4961 /*
4962  *	Clear the modify bits on the specified physical page.
4963  */
4964 void
4965 pmap_clear_modify(vm_page_t m)
4966 {
4967 	struct md_page *pvh;
4968 	pv_entry_t next_pv, pv;
4969 	pmap_t pmap;
4970 	pd_entry_t oldpde, *pde;
4971 	pt_entry_t oldpte, *pte;
4972 	vm_offset_t va;
4973 
4974 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4975 	    ("pmap_clear_modify: page %p is not managed", m));
4976 	VM_OBJECT_ASSERT_WLOCKED(m->object);
4977 	KASSERT(!vm_page_xbusied(m),
4978 	    ("pmap_clear_modify: page %p is exclusive busied", m));
4979 
4980 	/*
4981 	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
4982 	 * If the object containing the page is locked and the page is not
4983 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
4984 	 */
4985 	if ((m->aflags & PGA_WRITEABLE) == 0)
4986 		return;
4987 	rw_wlock(&pvh_global_lock);
4988 	sched_pin();
4989 	if ((m->flags & PG_FICTITIOUS) != 0)
4990 		goto small_mappings;
4991 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4992 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4993 		va = pv->pv_va;
4994 		pmap = PV_PMAP(pv);
4995 		PMAP_LOCK(pmap);
4996 		pde = pmap_pde(pmap, va);
4997 		oldpde = *pde;
4998 		if ((oldpde & PG_RW) != 0) {
4999 			if (pmap_demote_pde(pmap, pde, va)) {
5000 				if ((oldpde & PG_W) == 0) {
5001 					/*
5002 					 * Write protect the mapping to a
5003 					 * single page so that a subsequent
5004 					 * write access may repromote.
5005 					 */
5006 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5007 					    PG_PS_FRAME);
5008 					pte = pmap_pte_quick(pmap, va);
5009 					oldpte = *pte;
5010 					if ((oldpte & PG_V) != 0) {
5011 						/*
5012 						 * Regardless of whether a pte is 32 or 64 bits
5013 						 * in size, PG_RW and PG_M are among the least
5014 						 * significant 32 bits.
5015 						 */
5016 						while (!atomic_cmpset_int((u_int *)pte,
5017 						    oldpte,
5018 						    oldpte & ~(PG_M | PG_RW)))
5019 							oldpte = *pte;
5020 						vm_page_dirty(m);
5021 						pmap_invalidate_page(pmap, va);
5022 					}
5023 				}
5024 			}
5025 		}
5026 		PMAP_UNLOCK(pmap);
5027 	}
5028 small_mappings:
5029 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5030 		pmap = PV_PMAP(pv);
5031 		PMAP_LOCK(pmap);
5032 		pde = pmap_pde(pmap, pv->pv_va);
5033 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
5034 		    " a 4mpage in page %p's pv list", m));
5035 		pte = pmap_pte_quick(pmap, pv->pv_va);
5036 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5037 			/*
5038 			 * Regardless of whether a pte is 32 or 64 bits
5039 			 * in size, PG_M is among the least significant
5040 			 * 32 bits.
5041 			 */
5042 			atomic_clear_int((u_int *)pte, PG_M);
5043 			pmap_invalidate_page(pmap, pv->pv_va);
5044 		}
5045 		PMAP_UNLOCK(pmap);
5046 	}
5047 	sched_unpin();
5048 	rw_wunlock(&pvh_global_lock);
5049 }
5050 
5051 /*
5052  * Miscellaneous support routines follow
5053  */
5054 
5055 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
5056 static __inline void
5057 pmap_pte_attr(pt_entry_t *pte, int cache_bits)
5058 {
5059 	u_int opte, npte;
5060 
5061 	/*
5062 	 * The cache mode bits are all in the low 32-bits of the
5063 	 * PTE, so we can just spin on updating the low 32-bits.
5064 	 */
5065 	do {
5066 		opte = *(u_int *)pte;
5067 		npte = opte & ~PG_PTE_CACHE;
5068 		npte |= cache_bits;
5069 	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
5070 }
5071 
5072 /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
5073 static __inline void
5074 pmap_pde_attr(pd_entry_t *pde, int cache_bits)
5075 {
5076 	u_int opde, npde;
5077 
5078 	/*
5079 	 * The cache mode bits are all in the low 32-bits of the
5080 	 * PDE, so we can just spin on updating the low 32-bits.
5081 	 */
5082 	do {
5083 		opde = *(u_int *)pde;
5084 		npde = opde & ~PG_PDE_CACHE;
5085 		npde |= cache_bits;
5086 	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
5087 }
5088 
5089 /*
5090  * Map a set of physical memory pages into the kernel virtual
5091  * address space. Return a pointer to where it is mapped. This
5092  * routine is intended to be used for mapping device memory,
5093  * NOT real memory.
5094  */
5095 void *
5096 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5097 {
5098 	struct pmap_preinit_mapping *ppim;
5099 	vm_offset_t va, offset;
5100 	vm_size_t tmpsize;
5101 	int i;
5102 
5103 	offset = pa & PAGE_MASK;
5104 	size = round_page(offset + size);
5105 	pa = pa & PG_FRAME;
5106 
5107 	if (pa < KERNLOAD && pa + size <= KERNLOAD)
5108 		va = KERNBASE + pa;
5109 	else if (!pmap_initialized) {
5110 		va = 0;
5111 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5112 			ppim = pmap_preinit_mapping + i;
5113 			if (ppim->va == 0) {
5114 				ppim->pa = pa;
5115 				ppim->sz = size;
5116 				ppim->mode = mode;
5117 				ppim->va = virtual_avail;
5118 				virtual_avail += size;
5119 				va = ppim->va;
5120 				break;
5121 			}
5122 		}
5123 		if (va == 0)
5124 			panic("%s: too many preinit mappings", __func__);
5125 	} else {
5126 		/*
5127 		 * If we have a preinit mapping, re-use it.
5128 		 */
5129 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5130 			ppim = pmap_preinit_mapping + i;
5131 			if (ppim->pa == pa && ppim->sz == size &&
5132 			    ppim->mode == mode)
5133 				return ((void *)(ppim->va + offset));
5134 		}
5135 		va = kva_alloc(size);
5136 		if (va == 0)
5137 			panic("%s: Couldn't allocate KVA", __func__);
5138 	}
5139 	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
5140 		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
5141 	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
5142 	pmap_invalidate_cache_range(va, va + size, FALSE);
5143 	return ((void *)(va + offset));
5144 }
5145 
5146 void *
5147 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5148 {
5149 
5150 	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5151 }
5152 
5153 void *
5154 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5155 {
5156 
5157 	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5158 }
5159 
5160 void
5161 pmap_unmapdev(vm_offset_t va, vm_size_t size)
5162 {
5163 	struct pmap_preinit_mapping *ppim;
5164 	vm_offset_t offset;
5165 	int i;
5166 
5167 	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
5168 		return;
5169 	offset = va & PAGE_MASK;
5170 	size = round_page(offset + size);
5171 	va = trunc_page(va);
5172 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5173 		ppim = pmap_preinit_mapping + i;
5174 		if (ppim->va == va && ppim->sz == size) {
5175 			if (pmap_initialized)
5176 				return;
5177 			ppim->pa = 0;
5178 			ppim->va = 0;
5179 			ppim->sz = 0;
5180 			ppim->mode = 0;
5181 			if (va + size == virtual_avail)
5182 				virtual_avail = va;
5183 			return;
5184 		}
5185 	}
5186 	if (pmap_initialized)
5187 		kva_free(va, size);
5188 }
5189 
5190 /*
5191  * Sets the memory attribute for the specified page.
5192  */
5193 void
5194 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5195 {
5196 
5197 	m->md.pat_mode = ma;
5198 	if ((m->flags & PG_FICTITIOUS) != 0)
5199 		return;
5200 
5201 	/*
5202 	 * If "m" is a normal page, flush it from the cache.
5203 	 * See pmap_invalidate_cache_range().
5204 	 *
5205 	 * First, try to find an existing mapping of the page by sf
5206 	 * buffer. sf_buf_invalidate_cache() modifies the mapping and
5207 	 * flushes the cache.
5208 	 */
5209 	if (sf_buf_invalidate_cache(m))
5210 		return;
5211 
5212 	/*
5213 	 * If the page is not mapped by an sf buffer and the CPU does not
5214 	 * support self-snoop, map the page transiently and do the
5215 	 * invalidation.  In the worst case, the whole cache is flushed by
5216 	 * pmap_invalidate_cache_range().
5217 	 */
5218 	if ((cpu_feature & CPUID_SS) == 0)
5219 		pmap_flush_page(m);
5220 }
5221 
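/*
 * Flush the given page from the CPU caches.  The page is mapped through
 * the per-CPU CMAP2 window and flushed one cache line at a time with
 * clflush; if clflush is not available, the entire cache is flushed
 * instead.
 */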
5222 static void
5223 pmap_flush_page(vm_page_t m)
5224 {
5225 	struct sysmaps *sysmaps;
5226 	vm_offset_t sva, eva;
5227 
5228 	if ((cpu_feature & CPUID_CLFSH) != 0) {
5229 		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
5230 		mtx_lock(&sysmaps->lock);
5231 		if (*sysmaps->CMAP2)
5232 			panic("pmap_flush_page: CMAP2 busy");
5233 		sched_pin();
5234 		*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5235 		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5236 		invlcaddr(sysmaps->CADDR2);
5237 		sva = (vm_offset_t)sysmaps->CADDR2;
5238 		eva = sva + PAGE_SIZE;
5239 
5240 		/*
5241 		 * Use mfence despite the ordering implied by
5242 		 * mtx_{un,}lock() because clflush is not guaranteed
5243 		 * to be ordered by any other instruction.
5244 		 */
5245 		mfence();
5246 		for (; sva < eva; sva += cpu_clflush_line_size)
5247 			clflush(sva);
5248 		mfence();
5249 		*sysmaps->CMAP2 = 0;
5250 		sched_unpin();
5251 		mtx_unlock(&sysmaps->lock);
5252 	} else
5253 		pmap_invalidate_cache();
5254 }
5255 
5256 /*
5257  * Changes the specified virtual address range's memory type to that given by
5258  * the parameter "mode".  The specified virtual address range must be
5259  * completely contained within the kernel map.
5260  *
5261  * Returns zero if the change completed successfully, and either EINVAL or
5262  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5263  * of the virtual address range was not mapped, and ENOMEM is returned if
5264  * there was insufficient memory available to complete the change.
5265  */
5266 int
5267 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5268 {
5269 	vm_offset_t base, offset, tmpva;
5270 	pd_entry_t *pde;
5271 	pt_entry_t *pte;
5272 	int cache_bits_pte, cache_bits_pde;
5273 	boolean_t changed;
5274 
5275 	base = trunc_page(va);
5276 	offset = va & PAGE_MASK;
5277 	size = round_page(offset + size);
5278 
5279 	/*
5280 	 * Only supported on kernel virtual addresses above the recursive map.
5281 	 */
5282 	if (base < VM_MIN_KERNEL_ADDRESS)
5283 		return (EINVAL);
5284 
5285 	cache_bits_pde = pmap_cache_bits(mode, 1);
5286 	cache_bits_pte = pmap_cache_bits(mode, 0);
5287 	changed = FALSE;
5288 
5289 	/*
5290 	 * Pages that aren't mapped aren't supported.  Also break down
5291 	 * 2/4MB pages into 4KB pages if required.
5292 	 */
5293 	PMAP_LOCK(kernel_pmap);
5294 	for (tmpva = base; tmpva < base + size; ) {
5295 		pde = pmap_pde(kernel_pmap, tmpva);
5296 		if (*pde == 0) {
5297 			PMAP_UNLOCK(kernel_pmap);
5298 			return (EINVAL);
5299 		}
5300 		if (*pde & PG_PS) {
5301 			/*
5302 			 * If the current 2/4MB page already has
5303 			 * the required memory type, then we need not
5304 			 * demote this page.  Just increment tmpva to
5305 			 * the next 2/4MB page frame.
5306 			 */
5307 			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5308 				tmpva = trunc_4mpage(tmpva) + NBPDR;
5309 				continue;
5310 			}
5311 
5312 			/*
5313 			 * If the current offset aligns with a 2/4MB
5314 			 * page frame and there is at least 2/4MB left
5315 			 * within the range, then we need not break
5316 			 * down this page into 4KB pages.
5317 			 */
5318 			if ((tmpva & PDRMASK) == 0 &&
5319 			    tmpva + PDRMASK < base + size) {
5320 				tmpva += NBPDR;
5321 				continue;
5322 			}
5323 			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5324 				PMAP_UNLOCK(kernel_pmap);
5325 				return (ENOMEM);
5326 			}
5327 		}
5328 		pte = vtopte(tmpva);
5329 		if (*pte == 0) {
5330 			PMAP_UNLOCK(kernel_pmap);
5331 			return (EINVAL);
5332 		}
5333 		tmpva += PAGE_SIZE;
5334 	}
5335 	PMAP_UNLOCK(kernel_pmap);
5336 
5337 	/*
5338 	 * Ok, all the pages exist, so run through them updating their
5339 	 * cache mode if required.
5340 	 */
5341 	for (tmpva = base; tmpva < base + size; ) {
5342 		pde = pmap_pde(kernel_pmap, tmpva);
5343 		if (*pde & PG_PS) {
5344 			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5345 				pmap_pde_attr(pde, cache_bits_pde);
5346 				changed = TRUE;
5347 			}
5348 			tmpva = trunc_4mpage(tmpva) + NBPDR;
5349 		} else {
5350 			pte = vtopte(tmpva);
5351 			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5352 				pmap_pte_attr(pte, cache_bits_pte);
5353 				changed = TRUE;
5354 			}
5355 			tmpva += PAGE_SIZE;
5356 		}
5357 	}
5358 
5359 	/*
5360 	 * Flush the CPU caches to make sure no data remains cached with
5361 	 * the old memory type.
5362 	 */
5363 	if (changed) {
5364 		pmap_invalidate_range(kernel_pmap, base, tmpva);
5365 		pmap_invalidate_cache_range(base, tmpva, FALSE);
5366 	}
5367 	return (0);
5368 }
5369 
5370 /*
5371  * Perform the pmap work for mincore(2)
5372  */
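/*
 * The returned value is a combination of MINCORE_INCORE, MINCORE_SUPER,
 * MINCORE_MODIFIED[_OTHER], and MINCORE_REFERENCED[_OTHER] bits derived
 * from the PDE/PTE for "addr", and *locked_pa is set when the backing
 * managed page must be inspected by the caller.
 */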
5373 int
5374 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5375 {
5376 	pd_entry_t *pdep;
5377 	pt_entry_t *ptep, pte;
5378 	vm_paddr_t pa;
5379 	int val;
5380 
5381 	PMAP_LOCK(pmap);
5382 retry:
5383 	pdep = pmap_pde(pmap, addr);
5384 	if (*pdep != 0) {
5385 		if (*pdep & PG_PS) {
5386 			pte = *pdep;
5387 			/* Compute the physical address of the 4KB page. */
5388 			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5389 			    PG_FRAME;
5390 			val = MINCORE_SUPER;
5391 		} else {
5392 			ptep = pmap_pte(pmap, addr);
5393 			pte = *ptep;
5394 			pmap_pte_release(ptep);
5395 			pa = pte & PG_FRAME;
5396 			val = 0;
5397 		}
5398 	} else {
5399 		pte = 0;
5400 		pa = 0;
5401 		val = 0;
5402 	}
5403 	if ((pte & PG_V) != 0) {
5404 		val |= MINCORE_INCORE;
5405 		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5406 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5407 		if ((pte & PG_A) != 0)
5408 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5409 	}
5410 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5411 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5412 	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5413 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5414 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5415 			goto retry;
5416 	} else
5417 		PA_UNLOCK_COND(*locked_pa);
5418 	PMAP_UNLOCK(pmap);
5419 	return (val);
5420 }
5421 
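/*
 * Make the given thread's pmap the active pmap on the current CPU:
 * update the pm_active masks, load %cr3 with the page directory (or,
 * under PAE, the page directory pointer table), and record the new
 * curpmap.
 */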
5422 void
5423 pmap_activate(struct thread *td)
5424 {
5425 	pmap_t	pmap, oldpmap;
5426 	u_int	cpuid;
5427 	u_int32_t  cr3;
5428 
5429 	critical_enter();
5430 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5431 	oldpmap = PCPU_GET(curpmap);
5432 	cpuid = PCPU_GET(cpuid);
5433 #if defined(SMP)
5434 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5435 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5436 #else
5437 	CPU_CLR(cpuid, &oldpmap->pm_active);
5438 	CPU_SET(cpuid, &pmap->pm_active);
5439 #endif
5440 #if defined(PAE) || defined(PAE_TABLES)
5441 	cr3 = vtophys(pmap->pm_pdpt);
5442 #else
5443 	cr3 = vtophys(pmap->pm_pdir);
5444 #endif
5445 	/*
5446 	 * pmap_activate is for the current thread on the current cpu
5447 	 */
5448 	td->td_pcb->pcb_cr3 = cr3;
5449 	load_cr3(cr3);
5450 	PCPU_SET(curpmap, pmap);
5451 	critical_exit();
5452 }
5453 
5454 void
5455 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5456 {
5457 }
5458 
5459 /*
5460  *	Increase the starting virtual address of the given mapping if a
5461  *	different alignment might result in more superpage mappings.
5462  */
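/*
 * For example (values purely illustrative), with 4MB superpages a mapping
 * of object offset 0x00401000 is adjusted so that
 * (*addr & PDRMASK) == 0x1000, i.e., the virtual address has the same
 * offset within a superpage as the object offset, which lets the interior
 * of a sufficiently large mapping be promoted to 4MB pages.
 */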
5463 void
5464 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5465     vm_offset_t *addr, vm_size_t size)
5466 {
5467 	vm_offset_t superpage_offset;
5468 
5469 	if (size < NBPDR)
5470 		return;
5471 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5472 		offset += ptoa(object->pg_color);
5473 	superpage_offset = offset & PDRMASK;
5474 	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5475 	    (*addr & PDRMASK) == superpage_offset)
5476 		return;
5477 	if ((*addr & PDRMASK) < superpage_offset)
5478 		*addr = (*addr & ~PDRMASK) + superpage_offset;
5479 	else
5480 		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5481 }
5482 
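/*
 * Map a single page at the per-CPU quick-map address for short-lived
 * access from within a critical section; pmap_quick_remove_page()
 * undoes the mapping and exits the critical section.
 */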
5483 vm_offset_t
5484 pmap_quick_enter_page(vm_page_t m)
5485 {
5486 	vm_offset_t qaddr;
5487 	pt_entry_t *pte;
5488 
5489 	critical_enter();
5490 	qaddr = PCPU_GET(qmap_addr);
5491 	pte = vtopte(qaddr);
5492 
5493 	KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy"));
5494 	*pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
5495 	    pmap_cache_bits(pmap_page_get_memattr(m), 0);
5496 	invlpg(qaddr);
5497 
5498 	return (qaddr);
5499 }
5500 
5501 void
5502 pmap_quick_remove_page(vm_offset_t addr)
5503 {
5504 	vm_offset_t qaddr;
5505 	pt_entry_t *pte;
5506 
5507 	qaddr = PCPU_GET(qmap_addr);
5508 	pte = vtopte(qaddr);
5509 
5510 	KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
5511 	KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address"));
5512 
5513 	*pte = 0;
5514 	critical_exit();
5515 }
5516 
5517 #if defined(PMAP_DEBUG)
5518 int pmap_pid_dump(int pid)
5519 {
5520 	pmap_t pmap;
5521 	struct proc *p;
5522 	int npte = 0;
5523 	int index;
5524 
5525 	sx_slock(&allproc_lock);
5526 	FOREACH_PROC_IN_SYSTEM(p) {
5527 		if (p->p_pid != pid)
5528 			continue;
5529 
5530 		if (p->p_vmspace) {
5531 			int i,j;
5532 			index = 0;
5533 			pmap = vmspace_pmap(p->p_vmspace);
5534 			for (i = 0; i < NPDEPTD; i++) {
5535 				pd_entry_t *pde;
5536 				pt_entry_t *pte;
5537 				vm_offset_t base = i << PDRSHIFT;
5538 
5539 				pde = &pmap->pm_pdir[i];
5540 				if (pde && pmap_pde_v(pde)) {
5541 					for (j = 0; j < NPTEPG; j++) {
5542 						vm_offset_t va = base + (j << PAGE_SHIFT);
5543 						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
5544 							if (index) {
5545 								index = 0;
5546 								printf("\n");
5547 							}
5548 							sx_sunlock(&allproc_lock);
5549 							return (npte);
5550 						}
5551 						pte = pmap_pte(pmap, va);
5552 						if (pte && pmap_pte_v(pte)) {
5553 							pt_entry_t pa;
5554 							vm_page_t m;
5555 							pa = *pte;
5556 							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
5557 							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
5558 								va, pa, m->hold_count, m->wire_count, m->flags);
5559 							npte++;
5560 							index++;
5561 							if (index >= 2) {
5562 								index = 0;
5563 								printf("\n");
5564 							} else {
5565 								printf(" ");
5566 							}
5567 						}
5568 					}
5569 				}
5570 			}
5571 		}
5572 	}
5573 	sx_sunlock(&allproc_lock);
5574 	return (npte);
5575 }
5576 #endif
5577