xref: /freebsd/sys/i386/i386/pmap.c (revision 39beb93c)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
9  * All rights reserved.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * the Systems Programming Group of the University of Utah Computer
13  * Science Department and William Jolitz of UUNET Technologies Inc.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 3. All advertising materials mentioning features or use of this software
24  *    must display the following acknowledgement:
25  *	This product includes software developed by the University of
26  *	California, Berkeley and its contributors.
27  * 4. Neither the name of the University nor the names of its contributors
28  *    may be used to endorse or promote products derived from this software
29  *    without specific prior written permission.
30  *
31  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41  * SUCH DAMAGE.
42  *
43  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44  */
45 /*-
46  * Copyright (c) 2003 Networks Associates Technology, Inc.
47  * All rights reserved.
48  *
49  * This software was developed for the FreeBSD Project by Jake Burkholder,
50  * Safeport Network Services, and Network Associates Laboratories, the
51  * Security Research Division of Network Associates, Inc. under
52  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53  * CHATS research program.
54  *
55  * Redistribution and use in source and binary forms, with or without
56  * modification, are permitted provided that the following conditions
57  * are met:
58  * 1. Redistributions of source code must retain the above copyright
59  *    notice, this list of conditions and the following disclaimer.
60  * 2. Redistributions in binary form must reproduce the above copyright
61  *    notice, this list of conditions and the following disclaimer in the
62  *    documentation and/or other materials provided with the distribution.
63  *
64  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74  * SUCH DAMAGE.
75  */
76 
77 #include <sys/cdefs.h>
78 __FBSDID("$FreeBSD$");
79 
80 /*
81  *	Manages physical address maps.
82  *
83  *	In addition to hardware address maps, this
84  *	module is called upon to provide software-use-only
85  *	maps which may or may not be stored in the same
86  *	form as hardware maps.  These pseudo-maps are
87  *	used to store intermediate results from copy
88  *	operations to and from address spaces.
89  *
90  *	Since the information managed by this module is
91  *	also stored by the logical address mapping module,
92  *	this module may throw away valid virtual-to-physical
93  *	mappings at almost any time.  However, invalidations
94  *	of virtual-to-physical mappings must be done as
95  *	requested.
96  *
97  *	In order to cope with hardware architectures which
98  *	make virtual-to-physical map invalidates expensive,
99  *	this module may delay invalidate or reduced protection
100  *	operations until such time as they are actually
101  *	necessary.  This module is given full information as
102  *	to which processors are currently using which maps,
103  *	and to when physical maps must be made correct.
104  */
105 
106 #include "opt_cpu.h"
107 #include "opt_pmap.h"
108 #include "opt_msgbuf.h"
109 #include "opt_smp.h"
110 #include "opt_xbox.h"
111 
112 #include <sys/param.h>
113 #include <sys/systm.h>
114 #include <sys/kernel.h>
115 #include <sys/ktr.h>
116 #include <sys/lock.h>
117 #include <sys/malloc.h>
118 #include <sys/mman.h>
119 #include <sys/msgbuf.h>
120 #include <sys/mutex.h>
121 #include <sys/proc.h>
122 #include <sys/sx.h>
123 #include <sys/vmmeter.h>
124 #include <sys/sched.h>
125 #include <sys/sysctl.h>
126 #ifdef SMP
127 #include <sys/smp.h>
128 #endif
129 
130 #include <vm/vm.h>
131 #include <vm/vm_param.h>
132 #include <vm/vm_kern.h>
133 #include <vm/vm_page.h>
134 #include <vm/vm_map.h>
135 #include <vm/vm_object.h>
136 #include <vm/vm_extern.h>
137 #include <vm/vm_pageout.h>
138 #include <vm/vm_pager.h>
139 #include <vm/vm_reserv.h>
140 #include <vm/uma.h>
141 
142 #include <machine/cpu.h>
143 #include <machine/cputypes.h>
144 #include <machine/md_var.h>
145 #include <machine/pcb.h>
146 #include <machine/specialreg.h>
147 #ifdef SMP
148 #include <machine/smp.h>
149 #endif
150 
151 #ifdef XBOX
152 #include <machine/xbox.h>
153 #endif
154 
155 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
156 #define CPU_ENABLE_SSE
157 #endif
158 
159 #ifndef PMAP_SHPGPERPROC
160 #define PMAP_SHPGPERPROC 200
161 #endif
162 
163 #if !defined(DIAGNOSTIC)
164 #define PMAP_INLINE	__gnu89_inline
165 #else
166 #define PMAP_INLINE
167 #endif
168 
169 #define PV_STATS
170 #ifdef PV_STATS
171 #define PV_STAT(x)	do { x ; } while (0)
172 #else
173 #define PV_STAT(x)	do { } while (0)
174 #endif
175 
176 #define	pa_index(pa)	((pa) >> PDRSHIFT)
177 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
178 
179 /*
180  * Get PDEs and PTEs for user/kernel address space
181  */
182 #define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
183 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
184 
185 #define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
186 #define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
187 #define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
188 #define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
189 #define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
190 
191 #define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
192     atomic_clear_int((u_int *)(pte), PG_W))
193 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
194 
195 struct pmap kernel_pmap_store;
196 LIST_HEAD(pmaplist, pmap);
197 static struct pmaplist allpmaps;
198 static struct mtx allpmaps_lock;
199 
200 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
201 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
202 int pgeflag = 0;		/* PG_G or-in */
203 int pseflag = 0;		/* PG_PS or-in */
204 
205 static int nkpt;
206 vm_offset_t kernel_vm_end;
207 extern u_int32_t KERNend;
208 
209 #ifdef PAE
210 pt_entry_t pg_nx;
211 static uma_zone_t pdptzone;
212 #endif
213 
214 static int pat_works;			/* Is page attribute table sane? */
215 
216 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
217 
218 static int pg_ps_enabled;
219 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RD, &pg_ps_enabled, 0,
220     "Are large page mappings enabled?");
221 
222 /*
223  * Data for the pv entry allocation mechanism
224  */
225 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
226 static struct md_page *pv_table;
227 static int shpgperproc = PMAP_SHPGPERPROC;
228 
229 struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
230 int pv_maxchunks;			/* How many chunks we have KVA for */
231 vm_offset_t pv_vafree;			/* freelist stored in the PTE */
232 
233 /*
234  * All those kernel PT submaps that BSD is so fond of
235  */
236 struct sysmaps {
237 	struct	mtx lock;
238 	pt_entry_t *CMAP1;
239 	pt_entry_t *CMAP2;
240 	caddr_t	CADDR1;
241 	caddr_t	CADDR2;
242 };
243 static struct sysmaps sysmaps_pcpu[MAXCPU];
244 pt_entry_t *CMAP1 = 0;
245 static pt_entry_t *CMAP3;
246 caddr_t CADDR1 = 0, ptvmmap = 0;
247 static caddr_t CADDR3;
248 struct msgbuf *msgbufp = 0;
249 
250 /*
251  * Crashdump maps.
252  */
253 static caddr_t crashdumpmap;
254 
255 static pt_entry_t *PMAP1 = 0, *PMAP2;
256 static pt_entry_t *PADDR1 = 0, *PADDR2;
257 #ifdef SMP
258 static int PMAP1cpu;
259 static int PMAP1changedcpu;
260 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
261 	   &PMAP1changedcpu, 0,
262 	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
263 #endif
264 static int PMAP1changed;
265 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
266 	   &PMAP1changed, 0,
267 	   "Number of times pmap_pte_quick changed PMAP1");
268 static int PMAP1unchanged;
269 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
270 	   &PMAP1unchanged, 0,
271 	   "Number of times pmap_pte_quick didn't change PMAP1");
272 static struct mtx PMAP2mutex;
273 
274 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
275 static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
276 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
277 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
278 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
279 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
280 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
281 		    vm_offset_t va);
282 static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
283 
284 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
285 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
286     vm_prot_t prot);
287 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
288     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
289 static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
290 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
291 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
292 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
293 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
294 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
295     vm_prot_t prot);
296 static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
297     vm_page_t *free);
298 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
299     vm_page_t *free);
300 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
301 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
302     vm_page_t *free);
303 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
304 					vm_offset_t va);
305 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
306 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
307     vm_page_t m);
308 
309 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
310 
311 static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
312 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free);
313 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
314 static void pmap_pte_release(pt_entry_t *pte);
315 static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
316 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
317 #ifdef PAE
318 static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
319 #endif
320 
321 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
322 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
323 
324 /*
325  * If you get an error here, then you set KVA_PAGES wrong! See the
326  * description of KVA_PAGES in sys/i386/include/pmap.h. It must be
327  * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE.
328  */
329 CTASSERT(KERNBASE % (1 << 24) == 0);
330 
331 /*
332  * Move the kernel virtual free pointer to the next
333  * 4MB.  This is used to help improve performance
334  * by using a large (4MB) page for much of the kernel
335  * (.text, .data, .bss)
336  */
337 static vm_offset_t
338 pmap_kmem_choose(vm_offset_t addr)
339 {
340 	vm_offset_t newaddr = addr;
341 
342 #ifndef DISABLE_PSE
343 	if (cpu_feature & CPUID_PSE)
344 		newaddr = (addr + PDRMASK) & ~PDRMASK;
345 #endif
346 	return newaddr;
347 }
348 
349 /*
350  *	Bootstrap the system enough to run with virtual memory.
351  *
352  *	On the i386 this is called after mapping has already been enabled
353  *	and just syncs the pmap module with what has already been done.
354  *	[We can't call it easily with mapping off since the kernel is not
355  *	mapped with PA == VA, hence we would have to relocate every address
356  *	from the linked base (virtual) address "KERNBASE" to the actual
357  *	(physical) address starting relative to 0]
358  */
359 void
360 pmap_bootstrap(vm_paddr_t firstaddr)
361 {
362 	vm_offset_t va;
363 	pt_entry_t *pte, *unused;
364 	struct sysmaps *sysmaps;
365 	int i;
366 
367 	/*
368 	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
369 	 * large. It should instead be correctly calculated in locore.s and
370 	 * not based on 'first' (which is a physical address, not a virtual
371 	 * address, for the start of unused physical memory). The kernel
372 	 * page tables are NOT double mapped and thus should not be included
373 	 * in this calculation.
374 	 */
375 	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
376 	virtual_avail = pmap_kmem_choose(virtual_avail);
377 
378 	virtual_end = VM_MAX_KERNEL_ADDRESS;
379 
380 	/*
381 	 * Initialize the kernel pmap (which is statically allocated).
382 	 */
383 	PMAP_LOCK_INIT(kernel_pmap);
384 	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
385 #ifdef PAE
386 	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
387 #endif
388 	kernel_pmap->pm_root = NULL;
389 	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
390 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
391 	LIST_INIT(&allpmaps);
392 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
393 	mtx_lock_spin(&allpmaps_lock);
394 	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
395 	mtx_unlock_spin(&allpmaps_lock);
396 	nkpt = NKPT;
397 
398 	/*
399 	 * Reserve some special page table entries/VA space for temporary
400 	 * mapping of pages.
401 	 */
402 #define	SYSMAP(c, p, v, n)	\
403 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
404 
405 	va = virtual_avail;
406 	pte = vtopte(va);
407 
408 	/*
409 	 * CMAP1/CMAP2 are used for zeroing and copying pages.
410 	 * CMAP3 is used for the idle process page zeroing.
411 	 */
412 	for (i = 0; i < MAXCPU; i++) {
413 		sysmaps = &sysmaps_pcpu[i];
414 		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
415 		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
416 		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
417 	}
418 	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
419 	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
420 	*CMAP3 = 0;
421 
422 	/*
423 	 * Crashdump maps.
424 	 */
425 	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
426 
427 	/*
428 	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
429 	 */
430 	SYSMAP(caddr_t, unused, ptvmmap, 1)
431 
432 	/*
433 	 * msgbufp is used to map the system message buffer.
434 	 */
435 	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
436 
437 	/*
438 	 * ptemap is used for pmap_pte_quick
439 	 */
440 	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
441 	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
442 
443 	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
444 
445 	virtual_avail = va;
446 
447 	*CMAP1 = 0;
448 
449 	/*
450 	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
451 	 * physical memory region that is used by the ACPI wakeup code.  This
452 	 * mapping must not have PG_G set.
453 	 */
454 #ifdef XBOX
455 	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
456 	 * an early stadium, we cannot yet neatly map video memory ... :-(
457 	 * Better fixes are very welcome! */
458 	if (!arch_i386_is_xbox)
459 #endif
460 	for (i = 1; i < NKPT; i++)
461 		PTD[i] = 0;
462 
463 	/* Initialize the PAT MSR if present. */
464 	pmap_init_pat();
465 
466 	/* Turn on PG_G on kernel page(s) */
467 	pmap_set_pg();
468 }
469 
470 /*
471  * Setup the PAT MSR.
472  */
473 void
474 pmap_init_pat(void)
475 {
476 	uint64_t pat_msr;
477 
478 	/* Bail if this CPU doesn't implement PAT. */
479 	if (!(cpu_feature & CPUID_PAT))
480 		return;
481 
482 	if (cpu_vendor_id != CPU_VENDOR_INTEL ||
483 	    (I386_CPU_FAMILY(cpu_id) == 6 && I386_CPU_MODEL(cpu_id) >= 0xe)) {
484 		/*
485 		 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
486 		 * Program 4 and 5 as WP and WC.
487 		 * Leave 6 and 7 as UC and UC-.
488 		 */
489 		pat_msr = rdmsr(MSR_PAT);
490 		pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
491 		pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
492 		    PAT_VALUE(5, PAT_WRITE_COMBINING);
493 		pat_works = 1;
494 	} else {
495 		/*
496 		 * Due to some Intel errata, we can only safely use the lower 4
497 		 * PAT entries.  Thus, just replace PAT Index 2 with WC instead
498 		 * of UC-.
499 		 *
500 		 *   Intel Pentium III Processor Specification Update
501 		 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
502 		 * or Mode C Paging)
503 		 *
504 		 *   Intel Pentium IV  Processor Specification Update
505 		 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
506 		 */
507 		pat_msr = rdmsr(MSR_PAT);
508 		pat_msr &= ~PAT_MASK(2);
509 		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
510 		pat_works = 0;
511 	}
512 	wrmsr(MSR_PAT, pat_msr);
513 }
514 
515 /*
516  * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
517  */
518 void
519 pmap_set_pg(void)
520 {
521 	pd_entry_t pdir;
522 	pt_entry_t *pte;
523 	vm_offset_t va, endva;
524 	int i;
525 
526 	if (pgeflag == 0)
527 		return;
528 
529 	i = KERNLOAD/NBPDR;
530 	endva = KERNBASE + KERNend;
531 
532 	if (pseflag) {
533 		va = KERNBASE + KERNLOAD;
534 		while (va  < endva) {
535 			pdir = kernel_pmap->pm_pdir[KPTDI+i];
536 			pdir |= pgeflag;
537 			kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
538 			invltlb();	/* Play it safe, invltlb() every time */
539 			i++;
540 			va += NBPDR;
541 		}
542 	} else {
543 		va = (vm_offset_t)btext;
544 		while (va < endva) {
545 			pte = vtopte(va);
546 			if (*pte)
547 				*pte |= pgeflag;
548 			invltlb();	/* Play it safe, invltlb() every time */
549 			va += PAGE_SIZE;
550 		}
551 	}
552 }
553 
554 /*
555  * Initialize a vm_page's machine-dependent fields.
556  */
557 void
558 pmap_page_init(vm_page_t m)
559 {
560 
561 	TAILQ_INIT(&m->md.pv_list);
562 }
563 
564 #ifdef PAE
565 
566 static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt");
567 
568 static void *
569 pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
570 {
571 
572 	/* Inform UMA that this allocator uses kernel_map/object. */
573 	*flags = UMA_SLAB_KERNEL;
574 	return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL,
575 	    1, 0));
576 }
577 #endif
578 
579 /*
580  * ABuse the pte nodes for unmapped kva to thread a kva freelist through.
581  * Requirements:
582  *  - Must deal with pages in order to ensure that none of the PG_* bits
583  *    are ever set, PG_V in particular.
584  *  - Assumes we can write to ptes without pte_store() atomic ops, even
585  *    on PAE systems.  This should be ok.
586  *  - Assumes nothing will ever test these addresses for 0 to indicate
587  *    no mapping instead of correctly checking PG_V.
588  *  - Assumes a vm_offset_t will fit in a pte (true for i386).
589  * Because PG_V is never set, there can be no mappings to invalidate.
590  */
591 static vm_offset_t
592 pmap_ptelist_alloc(vm_offset_t *head)
593 {
594 	pt_entry_t *pte;
595 	vm_offset_t va;
596 
597 	va = *head;
598 	if (va == 0)
599 		return (va);	/* Out of memory */
600 	pte = vtopte(va);
601 	*head = *pte;
602 	if (*head & PG_V)
603 		panic("pmap_ptelist_alloc: va with PG_V set!");
604 	*pte = 0;
605 	return (va);
606 }
607 
608 static void
609 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
610 {
611 	pt_entry_t *pte;
612 
613 	if (va & PG_V)
614 		panic("pmap_ptelist_free: freeing va with PG_V set!");
615 	pte = vtopte(va);
616 	*pte = *head;		/* virtual! PG_V is 0 though */
617 	*head = va;
618 }
619 
620 static void
621 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
622 {
623 	int i;
624 	vm_offset_t va;
625 
626 	*head = 0;
627 	for (i = npages - 1; i >= 0; i--) {
628 		va = (vm_offset_t)base + i * PAGE_SIZE;
629 		pmap_ptelist_free(head, va);
630 	}
631 }
632 
633 
634 /*
635  *	Initialize the pmap module.
636  *	Called by vm_init, to initialize any structures that the pmap
637  *	system needs to map virtual memory.
638  */
639 void
640 pmap_init(void)
641 {
642 	vm_page_t mpte;
643 	vm_size_t s;
644 	int i, pv_npg;
645 
646 	/*
647 	 * Initialize the vm page array entries for the kernel pmap's
648 	 * page table pages.
649 	 */
650 	for (i = 0; i < nkpt; i++) {
651 		mpte = PHYS_TO_VM_PAGE(PTD[i + KPTDI] & PG_FRAME);
652 		KASSERT(mpte >= vm_page_array &&
653 		    mpte < &vm_page_array[vm_page_array_size],
654 		    ("pmap_init: page table page is out of range"));
655 		mpte->pindex = i + KPTDI;
656 		mpte->phys_addr = PTD[i + KPTDI] & PG_FRAME;
657 	}
658 
659 	/*
660 	 * Initialize the address space (zone) for the pv entries.  Set a
661 	 * high water mark so that the system can recover from excessive
662 	 * numbers of pv entries.
663 	 */
664 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
665 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
666 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
667 	pv_entry_max = roundup(pv_entry_max, _NPCPV);
668 	pv_entry_high_water = 9 * (pv_entry_max / 10);
669 
670 	/*
671 	 * Are large page mappings enabled?
672 	 */
673 	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
674 
675 	/*
676 	 * Calculate the size of the pv head table for superpages.
677 	 */
678 	for (i = 0; phys_avail[i + 1]; i += 2);
679 	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;
680 
681 	/*
682 	 * Allocate memory for the pv head table for superpages.
683 	 */
684 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
685 	s = round_page(s);
686 	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
687 	for (i = 0; i < pv_npg; i++)
688 		TAILQ_INIT(&pv_table[i].pv_list);
689 
690 	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
691 	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
692 	    PAGE_SIZE * pv_maxchunks);
693 	if (pv_chunkbase == NULL)
694 		panic("pmap_init: not enough kvm for pv chunks");
695 	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
696 #ifdef PAE
697 	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
698 	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
699 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
700 	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
701 #endif
702 }
703 
704 
705 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
706 	"Max number of PV entries");
707 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
708 	"Page share factor per proc");
709 
710 SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
711     "2/4MB page mapping counters");
712 
713 static u_long pmap_pde_demotions;
714 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
715     &pmap_pde_demotions, 0, "2/4MB page demotions");
716 
717 static u_long pmap_pde_mappings;
718 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
719     &pmap_pde_mappings, 0, "2/4MB page mappings");
720 
721 static u_long pmap_pde_p_failures;
722 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
723     &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
724 
725 static u_long pmap_pde_promotions;
726 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
727     &pmap_pde_promotions, 0, "2/4MB page promotions");
728 
729 /***************************************************
730  * Low level helper routines.....
731  ***************************************************/
732 
733 /*
734  * Determine the appropriate bits to set in a PTE or PDE for a specified
735  * caching mode.
736  */
737 static int
738 pmap_cache_bits(int mode, boolean_t is_pde)
739 {
740 	int pat_flag, pat_index, cache_bits;
741 
742 	/* The PAT bit is different for PTE's and PDE's. */
743 	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
744 
745 	/* If we don't support PAT, map extended modes to older ones. */
746 	if (!(cpu_feature & CPUID_PAT)) {
747 		switch (mode) {
748 		case PAT_UNCACHEABLE:
749 		case PAT_WRITE_THROUGH:
750 		case PAT_WRITE_BACK:
751 			break;
752 		case PAT_UNCACHED:
753 		case PAT_WRITE_COMBINING:
754 		case PAT_WRITE_PROTECTED:
755 			mode = PAT_UNCACHEABLE;
756 			break;
757 		}
758 	}
759 
760 	/* Map the caching mode to a PAT index. */
761 	if (pat_works) {
762 		switch (mode) {
763 		case PAT_UNCACHEABLE:
764 			pat_index = 3;
765 			break;
766 		case PAT_WRITE_THROUGH:
767 			pat_index = 1;
768 			break;
769 		case PAT_WRITE_BACK:
770 			pat_index = 0;
771 			break;
772 		case PAT_UNCACHED:
773 			pat_index = 2;
774 			break;
775 		case PAT_WRITE_COMBINING:
776 			pat_index = 5;
777 			break;
778 		case PAT_WRITE_PROTECTED:
779 			pat_index = 4;
780 			break;
781 		default:
782 			panic("Unknown caching mode %d\n", mode);
783 		}
784 	} else {
785 		switch (mode) {
786 		case PAT_UNCACHED:
787 		case PAT_UNCACHEABLE:
788 		case PAT_WRITE_PROTECTED:
789 			pat_index = 3;
790 			break;
791 		case PAT_WRITE_THROUGH:
792 			pat_index = 1;
793 			break;
794 		case PAT_WRITE_BACK:
795 			pat_index = 0;
796 			break;
797 		case PAT_WRITE_COMBINING:
798 			pat_index = 2;
799 			break;
800 		default:
801 			panic("Unknown caching mode %d\n", mode);
802 		}
803 	}
804 
805 	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
806 	cache_bits = 0;
807 	if (pat_index & 0x4)
808 		cache_bits |= pat_flag;
809 	if (pat_index & 0x2)
810 		cache_bits |= PG_NC_PCD;
811 	if (pat_index & 0x1)
812 		cache_bits |= PG_NC_PWT;
813 	return (cache_bits);
814 }
815 #ifdef SMP
816 /*
817  * For SMP, these functions have to use the IPI mechanism for coherence.
818  *
819  * N.B.: Before calling any of the following TLB invalidation functions,
820  * the calling processor must ensure that all stores updating a non-
821  * kernel page table are globally performed.  Otherwise, another
822  * processor could cache an old, pre-update entry without being
823  * invalidated.  This can happen one of two ways: (1) The pmap becomes
824  * active on another processor after its pm_active field is checked by
825  * one of the following functions but before a store updating the page
826  * table is globally performed. (2) The pmap becomes active on another
827  * processor before its pm_active field is checked but due to
828  * speculative loads one of the following functions stills reads the
829  * pmap as inactive on the other processor.
830  *
831  * The kernel page table is exempt because its pm_active field is
832  * immutable.  The kernel page table is always active on every
833  * processor.
834  */
835 void
836 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
837 {
838 	u_int cpumask;
839 	u_int other_cpus;
840 
841 	sched_pin();
842 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
843 		invlpg(va);
844 		smp_invlpg(va);
845 	} else {
846 		cpumask = PCPU_GET(cpumask);
847 		other_cpus = PCPU_GET(other_cpus);
848 		if (pmap->pm_active & cpumask)
849 			invlpg(va);
850 		if (pmap->pm_active & other_cpus)
851 			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
852 	}
853 	sched_unpin();
854 }
855 
856 void
857 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
858 {
859 	u_int cpumask;
860 	u_int other_cpus;
861 	vm_offset_t addr;
862 
863 	sched_pin();
864 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
865 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
866 			invlpg(addr);
867 		smp_invlpg_range(sva, eva);
868 	} else {
869 		cpumask = PCPU_GET(cpumask);
870 		other_cpus = PCPU_GET(other_cpus);
871 		if (pmap->pm_active & cpumask)
872 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
873 				invlpg(addr);
874 		if (pmap->pm_active & other_cpus)
875 			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
876 			    sva, eva);
877 	}
878 	sched_unpin();
879 }
880 
881 void
882 pmap_invalidate_all(pmap_t pmap)
883 {
884 	u_int cpumask;
885 	u_int other_cpus;
886 
887 	sched_pin();
888 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
889 		invltlb();
890 		smp_invltlb();
891 	} else {
892 		cpumask = PCPU_GET(cpumask);
893 		other_cpus = PCPU_GET(other_cpus);
894 		if (pmap->pm_active & cpumask)
895 			invltlb();
896 		if (pmap->pm_active & other_cpus)
897 			smp_masked_invltlb(pmap->pm_active & other_cpus);
898 	}
899 	sched_unpin();
900 }
901 
902 void
903 pmap_invalidate_cache(void)
904 {
905 
906 	sched_pin();
907 	wbinvd();
908 	smp_cache_flush();
909 	sched_unpin();
910 }
911 #else /* !SMP */
912 /*
913  * Normal, non-SMP, 486+ invalidation functions.
914  * We inline these within pmap.c for speed.
915  */
916 PMAP_INLINE void
917 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
918 {
919 
920 	if (pmap == kernel_pmap || pmap->pm_active)
921 		invlpg(va);
922 }
923 
924 PMAP_INLINE void
925 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
926 {
927 	vm_offset_t addr;
928 
929 	if (pmap == kernel_pmap || pmap->pm_active)
930 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
931 			invlpg(addr);
932 }
933 
934 PMAP_INLINE void
935 pmap_invalidate_all(pmap_t pmap)
936 {
937 
938 	if (pmap == kernel_pmap || pmap->pm_active)
939 		invltlb();
940 }
941 
942 PMAP_INLINE void
943 pmap_invalidate_cache(void)
944 {
945 
946 	wbinvd();
947 }
948 #endif /* !SMP */
949 
950 /*
951  * Are we current address space or kernel?  N.B. We return FALSE when
952  * a pmap's page table is in use because a kernel thread is borrowing
953  * it.  The borrowed page table can change spontaneously, making any
954  * dependence on its continued use subject to a race condition.
955  */
956 static __inline int
957 pmap_is_current(pmap_t pmap)
958 {
959 
960 	return (pmap == kernel_pmap ||
961 		(pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
962 	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
963 }
964 
965 /*
966  * If the given pmap is not the current or kernel pmap, the returned pte must
967  * be released by passing it to pmap_pte_release().
968  */
969 pt_entry_t *
970 pmap_pte(pmap_t pmap, vm_offset_t va)
971 {
972 	pd_entry_t newpf;
973 	pd_entry_t *pde;
974 
975 	pde = pmap_pde(pmap, va);
976 	if (*pde & PG_PS)
977 		return (pde);
978 	if (*pde != 0) {
979 		/* are we current address space or kernel? */
980 		if (pmap_is_current(pmap))
981 			return (vtopte(va));
982 		mtx_lock(&PMAP2mutex);
983 		newpf = *pde & PG_FRAME;
984 		if ((*PMAP2 & PG_FRAME) != newpf) {
985 			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
986 			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
987 		}
988 		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
989 	}
990 	return (0);
991 }
992 
993 /*
994  * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
995  * being NULL.
996  */
997 static __inline void
998 pmap_pte_release(pt_entry_t *pte)
999 {
1000 
1001 	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1002 		mtx_unlock(&PMAP2mutex);
1003 }
1004 
1005 static __inline void
1006 invlcaddr(void *caddr)
1007 {
1008 
1009 	invlpg((u_int)caddr);
1010 }
1011 
1012 /*
1013  * Super fast pmap_pte routine best used when scanning
1014  * the pv lists.  This eliminates many coarse-grained
1015  * invltlb calls.  Note that many of the pv list
1016  * scans are across different pmaps.  It is very wasteful
1017  * to do an entire invltlb for checking a single mapping.
1018  *
1019  * If the given pmap is not the current pmap, vm_page_queue_mtx
1020  * must be held and curthread pinned to a CPU.
1021  */
1022 static pt_entry_t *
1023 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1024 {
1025 	pd_entry_t newpf;
1026 	pd_entry_t *pde;
1027 
1028 	pde = pmap_pde(pmap, va);
1029 	if (*pde & PG_PS)
1030 		return (pde);
1031 	if (*pde != 0) {
1032 		/* are we current address space or kernel? */
1033 		if (pmap_is_current(pmap))
1034 			return (vtopte(va));
1035 		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1036 		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1037 		newpf = *pde & PG_FRAME;
1038 		if ((*PMAP1 & PG_FRAME) != newpf) {
1039 			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1040 #ifdef SMP
1041 			PMAP1cpu = PCPU_GET(cpuid);
1042 #endif
1043 			invlcaddr(PADDR1);
1044 			PMAP1changed++;
1045 		} else
1046 #ifdef SMP
1047 		if (PMAP1cpu != PCPU_GET(cpuid)) {
1048 			PMAP1cpu = PCPU_GET(cpuid);
1049 			invlcaddr(PADDR1);
1050 			PMAP1changedcpu++;
1051 		} else
1052 #endif
1053 			PMAP1unchanged++;
1054 		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1055 	}
1056 	return (0);
1057 }
1058 
1059 /*
1060  *	Routine:	pmap_extract
1061  *	Function:
1062  *		Extract the physical page address associated
1063  *		with the given map/virtual_address pair.
1064  */
1065 vm_paddr_t
1066 pmap_extract(pmap_t pmap, vm_offset_t va)
1067 {
1068 	vm_paddr_t rtval;
1069 	pt_entry_t *pte;
1070 	pd_entry_t pde;
1071 
1072 	rtval = 0;
1073 	PMAP_LOCK(pmap);
1074 	pde = pmap->pm_pdir[va >> PDRSHIFT];
1075 	if (pde != 0) {
1076 		if ((pde & PG_PS) != 0)
1077 			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1078 		else {
1079 			pte = pmap_pte(pmap, va);
1080 			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1081 			pmap_pte_release(pte);
1082 		}
1083 	}
1084 	PMAP_UNLOCK(pmap);
1085 	return (rtval);
1086 }
1087 
1088 /*
1089  *	Routine:	pmap_extract_and_hold
1090  *	Function:
1091  *		Atomically extract and hold the physical page
1092  *		with the given pmap and virtual address pair
1093  *		if that mapping permits the given protection.
1094  */
1095 vm_page_t
1096 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1097 {
1098 	pd_entry_t pde;
1099 	pt_entry_t pte;
1100 	vm_page_t m;
1101 
1102 	m = NULL;
1103 	vm_page_lock_queues();
1104 	PMAP_LOCK(pmap);
1105 	pde = *pmap_pde(pmap, va);
1106 	if (pde != 0) {
1107 		if (pde & PG_PS) {
1108 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1109 				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1110 				    (va & PDRMASK));
1111 				vm_page_hold(m);
1112 			}
1113 		} else {
1114 			sched_pin();
1115 			pte = *pmap_pte_quick(pmap, va);
1116 			if (pte != 0 &&
1117 			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1118 				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1119 				vm_page_hold(m);
1120 			}
1121 			sched_unpin();
1122 		}
1123 	}
1124 	vm_page_unlock_queues();
1125 	PMAP_UNLOCK(pmap);
1126 	return (m);
1127 }
1128 
1129 /***************************************************
1130  * Low level mapping routines.....
1131  ***************************************************/
1132 
1133 /*
1134  * Add a wired page to the kva.
1135  * Note: not SMP coherent.
1136  */
1137 PMAP_INLINE void
1138 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1139 {
1140 	pt_entry_t *pte;
1141 
1142 	pte = vtopte(va);
1143 	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1144 }
1145 
1146 static __inline void
1147 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1148 {
1149 	pt_entry_t *pte;
1150 
1151 	pte = vtopte(va);
1152 	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1153 }
1154 
1155 /*
1156  * Remove a page from the kernel pagetables.
1157  * Note: not SMP coherent.
1158  */
1159 PMAP_INLINE void
1160 pmap_kremove(vm_offset_t va)
1161 {
1162 	pt_entry_t *pte;
1163 
1164 	pte = vtopte(va);
1165 	pte_clear(pte);
1166 }
1167 
1168 /*
1169  *	Used to map a range of physical addresses into kernel
1170  *	virtual address space.
1171  *
1172  *	The value passed in '*virt' is a suggested virtual address for
1173  *	the mapping. Architectures which can support a direct-mapped
1174  *	physical to virtual region can return the appropriate address
1175  *	within that region, leaving '*virt' unchanged. Other
1176  *	architectures should map the pages starting at '*virt' and
1177  *	update '*virt' with the first usable address after the mapped
1178  *	region.
1179  */
1180 vm_offset_t
1181 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1182 {
1183 	vm_offset_t va, sva;
1184 
1185 	va = sva = *virt;
1186 	while (start < end) {
1187 		pmap_kenter(va, start);
1188 		va += PAGE_SIZE;
1189 		start += PAGE_SIZE;
1190 	}
1191 	pmap_invalidate_range(kernel_pmap, sva, va);
1192 	*virt = va;
1193 	return (sva);
1194 }
1195 
1196 
1197 /*
1198  * Add a list of wired pages to the kva
1199  * this routine is only used for temporary
1200  * kernel mappings that do not need to have
1201  * page modification or references recorded.
1202  * Note that old mappings are simply written
1203  * over.  The page *must* be wired.
1204  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1205  */
1206 void
1207 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1208 {
1209 	pt_entry_t *endpte, oldpte, *pte;
1210 
1211 	oldpte = 0;
1212 	pte = vtopte(sva);
1213 	endpte = pte + count;
1214 	while (pte < endpte) {
1215 		oldpte |= *pte;
1216 		pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag | PG_RW | PG_V);
1217 		pte++;
1218 		ma++;
1219 	}
1220 	if ((oldpte & PG_V) != 0)
1221 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1222 		    PAGE_SIZE);
1223 }
1224 
1225 /*
1226  * This routine tears out page mappings from the
1227  * kernel -- it is meant only for temporary mappings.
1228  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1229  */
1230 void
1231 pmap_qremove(vm_offset_t sva, int count)
1232 {
1233 	vm_offset_t va;
1234 
1235 	va = sva;
1236 	while (count-- > 0) {
1237 		pmap_kremove(va);
1238 		va += PAGE_SIZE;
1239 	}
1240 	pmap_invalidate_range(kernel_pmap, sva, va);
1241 }
1242 
1243 /***************************************************
1244  * Page table page management routines.....
1245  ***************************************************/
1246 static __inline void
1247 pmap_free_zero_pages(vm_page_t free)
1248 {
1249 	vm_page_t m;
1250 
1251 	while (free != NULL) {
1252 		m = free;
1253 		free = m->right;
1254 		/* Preserve the page's PG_ZERO setting. */
1255 		vm_page_free_toq(m);
1256 	}
1257 }
1258 
1259 /*
1260  * Schedule the specified unused page table page to be freed.  Specifically,
1261  * add the page to the specified list of pages that will be released to the
1262  * physical memory manager after the TLB has been updated.
1263  */
1264 static __inline void
1265 pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
1266 {
1267 
1268 	if (set_PG_ZERO)
1269 		m->flags |= PG_ZERO;
1270 	else
1271 		m->flags &= ~PG_ZERO;
1272 	m->right = *free;
1273 	*free = m;
1274 }
1275 
1276 /*
1277  * Inserts the specified page table page into the specified pmap's collection
1278  * of idle page table pages.  Each of a pmap's page table pages is responsible
1279  * for mapping a distinct range of virtual addresses.  The pmap's collection is
1280  * ordered by this virtual address range.
1281  */
1282 static void
1283 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1284 {
1285 	vm_page_t root;
1286 
1287 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1288 	root = pmap->pm_root;
1289 	if (root == NULL) {
1290 		mpte->left = NULL;
1291 		mpte->right = NULL;
1292 	} else {
1293 		root = vm_page_splay(mpte->pindex, root);
1294 		if (mpte->pindex < root->pindex) {
1295 			mpte->left = root->left;
1296 			mpte->right = root;
1297 			root->left = NULL;
1298 		} else if (mpte->pindex == root->pindex)
1299 			panic("pmap_insert_pt_page: pindex already inserted");
1300 		else {
1301 			mpte->right = root->right;
1302 			mpte->left = root;
1303 			root->right = NULL;
1304 		}
1305 	}
1306 	pmap->pm_root = mpte;
1307 }
1308 
1309 /*
1310  * Looks for a page table page mapping the specified virtual address in the
1311  * specified pmap's collection of idle page table pages.  Returns NULL if there
1312  * is no page table page corresponding to the specified virtual address.
1313  */
1314 static vm_page_t
1315 pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
1316 {
1317 	vm_page_t mpte;
1318 	vm_pindex_t pindex = va >> PDRSHIFT;
1319 
1320 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1321 	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
1322 		mpte = vm_page_splay(pindex, mpte);
1323 		if ((pmap->pm_root = mpte)->pindex != pindex)
1324 			mpte = NULL;
1325 	}
1326 	return (mpte);
1327 }
1328 
1329 /*
1330  * Removes the specified page table page from the specified pmap's collection
1331  * of idle page table pages.  The specified page table page must be a member of
1332  * the pmap's collection.
1333  */
1334 static void
1335 pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
1336 {
1337 	vm_page_t root;
1338 
1339 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1340 	if (mpte != pmap->pm_root)
1341 		vm_page_splay(mpte->pindex, pmap->pm_root);
1342 	if (mpte->left == NULL)
1343 		root = mpte->right;
1344 	else {
1345 		root = vm_page_splay(mpte->pindex, mpte->left);
1346 		root->right = mpte->right;
1347 	}
1348 	pmap->pm_root = root;
1349 }
1350 
1351 /*
1352  * This routine unholds page table pages, and if the hold count
1353  * drops to zero, then it decrements the wire count.
1354  */
1355 static __inline int
1356 pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1357 {
1358 
1359 	--m->wire_count;
1360 	if (m->wire_count == 0)
1361 		return _pmap_unwire_pte_hold(pmap, m, free);
1362 	else
1363 		return 0;
1364 }
1365 
1366 static int
1367 _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1368 {
1369 	vm_offset_t pteva;
1370 
1371 	/*
1372 	 * unmap the page table page
1373 	 */
1374 	pmap->pm_pdir[m->pindex] = 0;
1375 	--pmap->pm_stats.resident_count;
1376 
1377 	/*
1378 	 * This is a release store so that the ordinary store unmapping
1379 	 * the page table page is globally performed before TLB shoot-
1380 	 * down is begun.
1381 	 */
1382 	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1383 
1384 	/*
1385 	 * Do an invltlb to make the invalidated mapping
1386 	 * take effect immediately.
1387 	 */
1388 	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1389 	pmap_invalidate_page(pmap, pteva);
1390 
1391 	/*
1392 	 * Put page on a list so that it is released after
1393 	 * *ALL* TLB shootdown is done
1394 	 */
1395 	pmap_add_delayed_free_list(m, free, TRUE);
1396 
1397 	return 1;
1398 }
1399 
1400 /*
1401  * After removing a page table entry, this routine is used to
1402  * conditionally free the page, and manage the hold/wire counts.
1403  */
1404 static int
1405 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1406 {
1407 	pd_entry_t ptepde;
1408 	vm_page_t mpte;
1409 
1410 	if (va >= VM_MAXUSER_ADDRESS)
1411 		return 0;
1412 	ptepde = *pmap_pde(pmap, va);
1413 	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1414 	return pmap_unwire_pte_hold(pmap, mpte, free);
1415 }
1416 
1417 void
1418 pmap_pinit0(pmap_t pmap)
1419 {
1420 
1421 	PMAP_LOCK_INIT(pmap);
1422 	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1423 #ifdef PAE
1424 	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1425 #endif
1426 	pmap->pm_root = NULL;
1427 	pmap->pm_active = 0;
1428 	PCPU_SET(curpmap, pmap);
1429 	TAILQ_INIT(&pmap->pm_pvchunk);
1430 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1431 	mtx_lock_spin(&allpmaps_lock);
1432 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1433 	mtx_unlock_spin(&allpmaps_lock);
1434 }
1435 
1436 /*
1437  * Initialize a preallocated and zeroed pmap structure,
1438  * such as one in a vmspace structure.
1439  */
1440 int
1441 pmap_pinit(pmap_t pmap)
1442 {
1443 	vm_page_t m, ptdpg[NPGPTD];
1444 	vm_paddr_t pa;
1445 	static int color;
1446 	int i;
1447 
1448 	PMAP_LOCK_INIT(pmap);
1449 
1450 	/*
1451 	 * No need to allocate page table space yet but we do need a valid
1452 	 * page directory table.
1453 	 */
1454 	if (pmap->pm_pdir == NULL) {
1455 		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1456 		    NBPTD);
1457 
1458 		if (pmap->pm_pdir == NULL) {
1459 			PMAP_LOCK_DESTROY(pmap);
1460 			return (0);
1461 		}
1462 #ifdef PAE
1463 		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1464 		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1465 		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1466 		    ("pmap_pinit: pdpt misaligned"));
1467 		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1468 		    ("pmap_pinit: pdpt above 4g"));
1469 #endif
1470 		pmap->pm_root = NULL;
1471 	}
1472 	KASSERT(pmap->pm_root == NULL,
1473 	    ("pmap_pinit: pmap has reserved page table page(s)"));
1474 
1475 	/*
1476 	 * allocate the page directory page(s)
1477 	 */
1478 	for (i = 0; i < NPGPTD;) {
1479 		m = vm_page_alloc(NULL, color++,
1480 		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1481 		    VM_ALLOC_ZERO);
1482 		if (m == NULL)
1483 			VM_WAIT;
1484 		else {
1485 			ptdpg[i++] = m;
1486 		}
1487 	}
1488 
1489 	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1490 
1491 	for (i = 0; i < NPGPTD; i++) {
1492 		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1493 			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1494 	}
1495 
1496 	mtx_lock_spin(&allpmaps_lock);
1497 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1498 	mtx_unlock_spin(&allpmaps_lock);
1499 	/* Wire in kernel global address entries. */
1500 	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1501 
1502 	/* install self-referential address mapping entry(s) */
1503 	for (i = 0; i < NPGPTD; i++) {
1504 		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1505 		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1506 #ifdef PAE
1507 		pmap->pm_pdpt[i] = pa | PG_V;
1508 #endif
1509 	}
1510 
1511 	pmap->pm_active = 0;
1512 	TAILQ_INIT(&pmap->pm_pvchunk);
1513 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1514 
1515 	return (1);
1516 }
1517 
1518 /*
1519  * this routine is called if the page table page is not
1520  * mapped correctly.
1521  */
1522 static vm_page_t
1523 _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1524 {
1525 	vm_paddr_t ptepa;
1526 	vm_page_t m;
1527 
1528 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1529 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1530 	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1531 
1532 	/*
1533 	 * Allocate a page table page.
1534 	 */
1535 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1536 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1537 		if (flags & M_WAITOK) {
1538 			PMAP_UNLOCK(pmap);
1539 			vm_page_unlock_queues();
1540 			VM_WAIT;
1541 			vm_page_lock_queues();
1542 			PMAP_LOCK(pmap);
1543 		}
1544 
1545 		/*
1546 		 * Indicate the need to retry.  While waiting, the page table
1547 		 * page may have been allocated.
1548 		 */
1549 		return (NULL);
1550 	}
1551 	if ((m->flags & PG_ZERO) == 0)
1552 		pmap_zero_page(m);
1553 
1554 	/*
1555 	 * Map the pagetable page into the process address space, if
1556 	 * it isn't already there.
1557 	 */
1558 
1559 	pmap->pm_stats.resident_count++;
1560 
1561 	ptepa = VM_PAGE_TO_PHYS(m);
1562 	pmap->pm_pdir[ptepindex] =
1563 		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1564 
1565 	return m;
1566 }
1567 
1568 static vm_page_t
1569 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1570 {
1571 	unsigned ptepindex;
1572 	pd_entry_t ptepa;
1573 	vm_page_t m;
1574 
1575 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1576 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1577 	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1578 
1579 	/*
1580 	 * Calculate pagetable page index
1581 	 */
1582 	ptepindex = va >> PDRSHIFT;
1583 retry:
1584 	/*
1585 	 * Get the page directory entry
1586 	 */
1587 	ptepa = pmap->pm_pdir[ptepindex];
1588 
1589 	/*
1590 	 * This supports switching from a 4MB page to a
1591 	 * normal 4K page.
1592 	 */
1593 	if (ptepa & PG_PS) {
1594 		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
1595 		ptepa = pmap->pm_pdir[ptepindex];
1596 	}
1597 
1598 	/*
1599 	 * If the page table page is mapped, we just increment the
1600 	 * hold count, and activate it.
1601 	 */
1602 	if (ptepa) {
1603 		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1604 		m->wire_count++;
1605 	} else {
1606 		/*
1607 		 * Here if the pte page isn't mapped, or if it has
1608 		 * been deallocated.
1609 		 */
1610 		m = _pmap_allocpte(pmap, ptepindex, flags);
1611 		if (m == NULL && (flags & M_WAITOK))
1612 			goto retry;
1613 	}
1614 	return (m);
1615 }
1616 
1617 
1618 /***************************************************
1619 * Pmap allocation/deallocation routines.
1620  ***************************************************/
1621 
1622 #ifdef SMP
1623 /*
1624  * Deal with a SMP shootdown of other users of the pmap that we are
1625  * trying to dispose of.  This can be a bit hairy.
1626  */
1627 static u_int *lazymask;
1628 static u_int lazyptd;
1629 static volatile u_int lazywait;
1630 
1631 void pmap_lazyfix_action(void);
1632 
1633 void
1634 pmap_lazyfix_action(void)
1635 {
1636 	u_int mymask = PCPU_GET(cpumask);
1637 
1638 #ifdef COUNT_IPIS
1639 	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1640 #endif
1641 	if (rcr3() == lazyptd)
1642 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1643 	atomic_clear_int(lazymask, mymask);
1644 	atomic_store_rel_int(&lazywait, 1);
1645 }
1646 
1647 static void
1648 pmap_lazyfix_self(u_int mymask)
1649 {
1650 
1651 	if (rcr3() == lazyptd)
1652 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1653 	atomic_clear_int(lazymask, mymask);
1654 }
1655 
1656 
1657 static void
1658 pmap_lazyfix(pmap_t pmap)
1659 {
1660 	u_int mymask;
1661 	u_int mask;
1662 	u_int spins;
1663 
1664 	while ((mask = pmap->pm_active) != 0) {
1665 		spins = 50000000;
1666 		mask = mask & -mask;	/* Find least significant set bit */
1667 		mtx_lock_spin(&smp_ipi_mtx);
1668 #ifdef PAE
1669 		lazyptd = vtophys(pmap->pm_pdpt);
1670 #else
1671 		lazyptd = vtophys(pmap->pm_pdir);
1672 #endif
1673 		mymask = PCPU_GET(cpumask);
1674 		if (mask == mymask) {
1675 			lazymask = &pmap->pm_active;
1676 			pmap_lazyfix_self(mymask);
1677 		} else {
1678 			atomic_store_rel_int((u_int *)&lazymask,
1679 			    (u_int)&pmap->pm_active);
1680 			atomic_store_rel_int(&lazywait, 0);
1681 			ipi_selected(mask, IPI_LAZYPMAP);
1682 			while (lazywait == 0) {
1683 				ia32_pause();
1684 				if (--spins == 0)
1685 					break;
1686 			}
1687 		}
1688 		mtx_unlock_spin(&smp_ipi_mtx);
1689 		if (spins == 0)
1690 			printf("pmap_lazyfix: spun for 50000000\n");
1691 	}
1692 }
1693 
1694 #else	/* SMP */
1695 
1696 /*
1697  * Cleaning up on uniprocessor is easy.  For various reasons, we're
1698  * unlikely to have to even execute this code, including the fact
1699  * that the cleanup is deferred until the parent does a wait(2), which
1700  * means that another userland process has run.
1701  */
1702 static void
1703 pmap_lazyfix(pmap_t pmap)
1704 {
1705 	u_int cr3;
1706 
1707 	cr3 = vtophys(pmap->pm_pdir);
1708 	if (cr3 == rcr3()) {
1709 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1710 		pmap->pm_active &= ~(PCPU_GET(cpumask));
1711 	}
1712 }
1713 #endif	/* SMP */
1714 
1715 /*
1716  * Release any resources held by the given physical map.
1717  * Called when a pmap initialized by pmap_pinit is being released.
1718  * Should only be called if the map contains no valid mappings.
1719  */
1720 void
1721 pmap_release(pmap_t pmap)
1722 {
1723 	vm_page_t m, ptdpg[NPGPTD];
1724 	int i;
1725 
1726 	KASSERT(pmap->pm_stats.resident_count == 0,
1727 	    ("pmap_release: pmap resident count %ld != 0",
1728 	    pmap->pm_stats.resident_count));
1729 	KASSERT(pmap->pm_root == NULL,
1730 	    ("pmap_release: pmap has reserved page table page(s)"));
1731 
1732 	pmap_lazyfix(pmap);
1733 	mtx_lock_spin(&allpmaps_lock);
1734 	LIST_REMOVE(pmap, pm_list);
1735 	mtx_unlock_spin(&allpmaps_lock);
1736 
1737 	for (i = 0; i < NPGPTD; i++)
1738 		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
1739 		    PG_FRAME);
1740 
1741 	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1742 	    sizeof(*pmap->pm_pdir));
1743 
1744 	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1745 
1746 	for (i = 0; i < NPGPTD; i++) {
1747 		m = ptdpg[i];
1748 #ifdef PAE
1749 		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1750 		    ("pmap_release: got wrong ptd page"));
1751 #endif
1752 		m->wire_count--;
1753 		atomic_subtract_int(&cnt.v_wire_count, 1);
1754 		vm_page_free_zero(m);
1755 	}
1756 	PMAP_LOCK_DESTROY(pmap);
1757 }
1758 
1759 static int
1760 kvm_size(SYSCTL_HANDLER_ARGS)
1761 {
1762 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1763 
1764 	return sysctl_handle_long(oidp, &ksize, 0, req);
1765 }
1766 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1767     0, 0, kvm_size, "IU", "Size of KVM");
1768 
1769 static int
1770 kvm_free(SYSCTL_HANDLER_ARGS)
1771 {
1772 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1773 
1774 	return sysctl_handle_long(oidp, &kfree, 0, req);
1775 }
1776 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1777     0, 0, kvm_free, "IU", "Amount of KVM free");
1778 
1779 /*
1780  * grow the number of kernel page table entries, if needed
1781  */
1782 void
1783 pmap_growkernel(vm_offset_t addr)
1784 {
1785 	struct pmap *pmap;
1786 	vm_paddr_t ptppaddr;
1787 	vm_page_t nkpg;
1788 	pd_entry_t newpdir;
1789 	pt_entry_t *pde;
1790 
1791 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1792 	if (kernel_vm_end == 0) {
1793 		kernel_vm_end = KERNBASE;
1794 		nkpt = 0;
1795 		while (pdir_pde(PTD, kernel_vm_end)) {
1796 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1797 			nkpt++;
1798 			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1799 				kernel_vm_end = kernel_map->max_offset;
1800 				break;
1801 			}
1802 		}
1803 	}
1804 	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1805 	if (addr - 1 >= kernel_map->max_offset)
1806 		addr = kernel_map->max_offset;
1807 	while (kernel_vm_end < addr) {
1808 		if (pdir_pde(PTD, kernel_vm_end)) {
1809 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1810 			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1811 				kernel_vm_end = kernel_map->max_offset;
1812 				break;
1813 			}
1814 			continue;
1815 		}
1816 
1817 		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
1818 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1819 		    VM_ALLOC_ZERO);
1820 		if (nkpg == NULL)
1821 			panic("pmap_growkernel: no memory to grow kernel");
1822 
1823 		nkpt++;
1824 
1825 		if ((nkpg->flags & PG_ZERO) == 0)
1826 			pmap_zero_page(nkpg);
1827 		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1828 		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1829 		pdir_pde(PTD, kernel_vm_end) = newpdir;
1830 
1831 		mtx_lock_spin(&allpmaps_lock);
1832 		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1833 			pde = pmap_pde(pmap, kernel_vm_end);
1834 			pde_store(pde, newpdir);
1835 		}
1836 		mtx_unlock_spin(&allpmaps_lock);
1837 		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1838 		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1839 			kernel_vm_end = kernel_map->max_offset;
1840 			break;
1841 		}
1842 	}
1843 }
1844 
1845 
1846 /***************************************************
1847  * page management routines.
1848  ***************************************************/
1849 
1850 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1851 CTASSERT(_NPCM == 11);
1852 
1853 static __inline struct pv_chunk *
1854 pv_to_chunk(pv_entry_t pv)
1855 {
1856 
1857 	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
1858 }
1859 
1860 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1861 
1862 #define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
1863 #define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
1864 
1865 static uint32_t pc_freemask[11] = {
1866 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1867 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1868 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1869 	PC_FREE0_9, PC_FREE10
1870 };
1871 
1872 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1873 	"Current number of pv entries");
1874 
1875 #ifdef PV_STATS
1876 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1877 
1878 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1879 	"Current number of pv entry chunks");
1880 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1881 	"Current number of pv entry chunks allocated");
1882 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1883 	"Current number of pv entry chunks frees");
1884 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1885 	"Number of times tried to get a chunk page but failed.");
1886 
1887 static long pv_entry_frees, pv_entry_allocs;
1888 static int pv_entry_spare;
1889 
1890 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1891 	"Current number of pv entry frees");
1892 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1893 	"Current number of pv entry allocs");
1894 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1895 	"Current number of spare pv entries");
1896 
1897 static int pmap_collect_inactive, pmap_collect_active;
1898 
1899 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
1900 	"Current number times pmap_collect called on inactive queue");
1901 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
1902 	"Current number times pmap_collect called on active queue");
1903 #endif
1904 
1905 /*
1906  * We are in a serious low memory condition.  Resort to
1907  * drastic measures to free some pages so we can allocate
1908  * another pv entry chunk.  This is normally called to
1909  * unmap inactive pages, and if necessary, active pages.
1910  */
1911 static void
1912 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
1913 {
1914 	struct md_page *pvh;
1915 	pd_entry_t *pde;
1916 	pmap_t pmap;
1917 	pt_entry_t *pte, tpte;
1918 	pv_entry_t next_pv, pv;
1919 	vm_offset_t va;
1920 	vm_page_t m, free;
1921 
1922 	sched_pin();
1923 	TAILQ_FOREACH(m, &vpq->pl, pageq) {
1924 		if (m->hold_count || m->busy)
1925 			continue;
1926 		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1927 			va = pv->pv_va;
1928 			pmap = PV_PMAP(pv);
1929 			/* Avoid deadlock and lock recursion. */
1930 			if (pmap > locked_pmap)
1931 				PMAP_LOCK(pmap);
1932 			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1933 				continue;
1934 			pmap->pm_stats.resident_count--;
1935 			pde = pmap_pde(pmap, va);
1936 			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
1937 			    " a 4mpage in page %p's pv list", m));
1938 			pte = pmap_pte_quick(pmap, va);
1939 			tpte = pte_load_clear(pte);
1940 			KASSERT((tpte & PG_W) == 0,
1941 			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
1942 			if (tpte & PG_A)
1943 				vm_page_flag_set(m, PG_REFERENCED);
1944 			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
1945 				vm_page_dirty(m);
1946 			free = NULL;
1947 			pmap_unuse_pt(pmap, va, &free);
1948 			pmap_invalidate_page(pmap, va);
1949 			pmap_free_zero_pages(free);
1950 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1951 			if (TAILQ_EMPTY(&m->md.pv_list)) {
1952 				pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1953 				if (TAILQ_EMPTY(&pvh->pv_list))
1954 					vm_page_flag_clear(m, PG_WRITEABLE);
1955 			}
1956 			free_pv_entry(pmap, pv);
1957 			if (pmap != locked_pmap)
1958 				PMAP_UNLOCK(pmap);
1959 		}
1960 	}
1961 	sched_unpin();
1962 }
1963 
1964 
1965 /*
1966  * free the pv_entry back to the free list
1967  */
1968 static void
1969 free_pv_entry(pmap_t pmap, pv_entry_t pv)
1970 {
1971 	vm_page_t m;
1972 	struct pv_chunk *pc;
1973 	int idx, field, bit;
1974 
1975 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1976 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1977 	PV_STAT(pv_entry_frees++);
1978 	PV_STAT(pv_entry_spare++);
1979 	pv_entry_count--;
1980 	pc = pv_to_chunk(pv);
1981 	idx = pv - &pc->pc_pventry[0];
1982 	field = idx / 32;
1983 	bit = idx % 32;
1984 	pc->pc_map[field] |= 1ul << bit;
1985 	/* move to head of list */
1986 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1987 	for (idx = 0; idx < _NPCM; idx++)
1988 		if (pc->pc_map[idx] != pc_freemask[idx]) {
1989 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1990 			return;
1991 		}
1992 	PV_STAT(pv_entry_spare -= _NPCPV);
1993 	PV_STAT(pc_chunk_count--);
1994 	PV_STAT(pc_chunk_frees++);
1995 	/* entire chunk is free, return it */
1996 	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
1997 	pmap_qremove((vm_offset_t)pc, 1);
1998 	vm_page_unwire(m, 0);
1999 	vm_page_free(m);
2000 	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2001 }
2002 
2003 /*
2004  * get a new pv_entry, allocating a block from the system
2005  * when needed.
2006  */
2007 static pv_entry_t
2008 get_pv_entry(pmap_t pmap, int try)
2009 {
2010 	static const struct timeval printinterval = { 60, 0 };
2011 	static struct timeval lastprint;
2012 	static vm_pindex_t colour;
2013 	struct vpgqueues *pq;
2014 	int bit, field;
2015 	pv_entry_t pv;
2016 	struct pv_chunk *pc;
2017 	vm_page_t m;
2018 
2019 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2020 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2021 	PV_STAT(pv_entry_allocs++);
2022 	pv_entry_count++;
2023 	if (pv_entry_count > pv_entry_high_water)
2024 		if (ratecheck(&lastprint, &printinterval))
2025 			printf("Approaching the limit on PV entries, consider "
2026 			    "increasing either the vm.pmap.shpgperproc or the "
2027 			    "vm.pmap.pv_entry_max tunable.\n");
2028 	pq = NULL;
2029 retry:
2030 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2031 	if (pc != NULL) {
2032 		for (field = 0; field < _NPCM; field++) {
2033 			if (pc->pc_map[field]) {
2034 				bit = bsfl(pc->pc_map[field]);
2035 				break;
2036 			}
2037 		}
2038 		if (field < _NPCM) {
2039 			pv = &pc->pc_pventry[field * 32 + bit];
2040 			pc->pc_map[field] &= ~(1ul << bit);
2041 			/* If this was the last item, move it to tail */
2042 			for (field = 0; field < _NPCM; field++)
2043 				if (pc->pc_map[field] != 0) {
2044 					PV_STAT(pv_entry_spare--);
2045 					return (pv);	/* not full, return */
2046 				}
2047 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2048 			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2049 			PV_STAT(pv_entry_spare--);
2050 			return (pv);
2051 		}
2052 	}
2053 	/*
2054 	 * Access to the ptelist "pv_vafree" is synchronized by the page
2055 	 * queues lock.  If "pv_vafree" is currently non-empty, it will
2056 	 * remain non-empty until pmap_ptelist_alloc() completes.
2057 	 */
2058 	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, colour, (pq ==
2059 	    &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) |
2060 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2061 		if (try) {
2062 			pv_entry_count--;
2063 			PV_STAT(pc_chunk_tryfail++);
2064 			return (NULL);
2065 		}
2066 		/*
2067 		 * Reclaim pv entries: At first, destroy mappings to
2068 		 * inactive pages.  After that, if a pv chunk entry
2069 		 * is still needed, destroy mappings to active pages.
2070 		 */
2071 		if (pq == NULL) {
2072 			PV_STAT(pmap_collect_inactive++);
2073 			pq = &vm_page_queues[PQ_INACTIVE];
2074 		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
2075 			PV_STAT(pmap_collect_active++);
2076 			pq = &vm_page_queues[PQ_ACTIVE];
2077 		} else
2078 			panic("get_pv_entry: increase vm.pmap.shpgperproc");
2079 		pmap_collect(pmap, pq);
2080 		goto retry;
2081 	}
2082 	PV_STAT(pc_chunk_count++);
2083 	PV_STAT(pc_chunk_allocs++);
2084 	colour++;
2085 	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2086 	pmap_qenter((vm_offset_t)pc, &m, 1);
2087 	pc->pc_pmap = pmap;
2088 	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2089 	for (field = 1; field < _NPCM; field++)
2090 		pc->pc_map[field] = pc_freemask[field];
2091 	pv = &pc->pc_pventry[0];
2092 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2093 	PV_STAT(pv_entry_spare += _NPCPV - 1);
2094 	return (pv);
2095 }
2096 
2097 static __inline pv_entry_t
2098 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2099 {
2100 	pv_entry_t pv;
2101 
2102 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2103 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
2104 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2105 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
2106 			break;
2107 		}
2108 	}
2109 	return (pv);
2110 }
2111 
2112 static void
2113 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2114 {
2115 	struct md_page *pvh;
2116 	pv_entry_t pv;
2117 	vm_offset_t va_last;
2118 	vm_page_t m;
2119 
2120 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2121 	KASSERT((pa & PDRMASK) == 0,
2122 	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2123 
2124 	/*
2125 	 * Transfer the 4mpage's pv entry for this mapping to the first
2126 	 * page's pv list.
2127 	 */
2128 	pvh = pa_to_pvh(pa);
2129 	va = trunc_4mpage(va);
2130 	pv = pmap_pvh_remove(pvh, pmap, va);
2131 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2132 	m = PHYS_TO_VM_PAGE(pa);
2133 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2134 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2135 	va_last = va + NBPDR - PAGE_SIZE;
2136 	do {
2137 		m++;
2138 		KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
2139 		    ("pmap_pv_demote_pde: page %p is not managed", m));
2140 		va += PAGE_SIZE;
2141 		pmap_insert_entry(pmap, va, m);
2142 	} while (va < va_last);
2143 }
2144 
2145 static void
2146 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2147 {
2148 	struct md_page *pvh;
2149 	pv_entry_t pv;
2150 	vm_offset_t va_last;
2151 	vm_page_t m;
2152 
2153 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2154 	KASSERT((pa & PDRMASK) == 0,
2155 	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
2156 
2157 	/*
2158 	 * Transfer the first page's pv entry for this mapping to the
2159 	 * 4mpage's pv list.  Aside from avoiding the cost of a call
2160 	 * to get_pv_entry(), a transfer avoids the possibility that
2161 	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
2162 	 * removes one of the mappings that is being promoted.
2163 	 */
2164 	m = PHYS_TO_VM_PAGE(pa);
2165 	va = trunc_4mpage(va);
2166 	pv = pmap_pvh_remove(&m->md, pmap, va);
2167 	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
2168 	pvh = pa_to_pvh(pa);
2169 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2170 	/* Free the remaining NPTEPG - 1 pv entries. */
2171 	va_last = va + NBPDR - PAGE_SIZE;
2172 	do {
2173 		m++;
2174 		va += PAGE_SIZE;
2175 		pmap_pvh_free(&m->md, pmap, va);
2176 	} while (va < va_last);
2177 }
2178 
2179 static void
2180 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2181 {
2182 	pv_entry_t pv;
2183 
2184 	pv = pmap_pvh_remove(pvh, pmap, va);
2185 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2186 	free_pv_entry(pmap, pv);
2187 }
2188 
2189 static void
2190 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2191 {
2192 	struct md_page *pvh;
2193 
2194 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2195 	pmap_pvh_free(&m->md, pmap, va);
2196 	if (TAILQ_EMPTY(&m->md.pv_list)) {
2197 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2198 		if (TAILQ_EMPTY(&pvh->pv_list))
2199 			vm_page_flag_clear(m, PG_WRITEABLE);
2200 	}
2201 }
2202 
2203 /*
2204  * Create a pv entry for page at pa for
2205  * (pmap, va).
2206  */
2207 static void
2208 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2209 {
2210 	pv_entry_t pv;
2211 
2212 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2213 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2214 	pv = get_pv_entry(pmap, FALSE);
2215 	pv->pv_va = va;
2216 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2217 }
2218 
2219 /*
2220  * Conditionally create a pv entry.
2221  */
2222 static boolean_t
2223 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2224 {
2225 	pv_entry_t pv;
2226 
2227 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2228 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2229 	if (pv_entry_count < pv_entry_high_water &&
2230 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2231 		pv->pv_va = va;
2232 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2233 		return (TRUE);
2234 	} else
2235 		return (FALSE);
2236 }
2237 
2238 /*
2239  * Create the pv entries for each of the pages within a superpage.
2240  */
2241 static boolean_t
2242 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2243 {
2244 	struct md_page *pvh;
2245 	pv_entry_t pv;
2246 
2247 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2248 	if (pv_entry_count < pv_entry_high_water &&
2249 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2250 		pv->pv_va = va;
2251 		pvh = pa_to_pvh(pa);
2252 		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
2253 		return (TRUE);
2254 	} else
2255 		return (FALSE);
2256 }
2257 
2258 /*
2259  * Tries to demote a 2- or 4MB page mapping.
2260  */
2261 static boolean_t
2262 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2263 {
2264 	pd_entry_t newpde, oldpde;
2265 	pmap_t allpmaps_entry;
2266 	pt_entry_t *firstpte, newpte, *pte;
2267 	vm_paddr_t mptepa;
2268 	vm_page_t free, mpte;
2269 
2270 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2271 	mpte = pmap_lookup_pt_page(pmap, va);
2272 	if (mpte != NULL)
2273 		pmap_remove_pt_page(pmap, mpte);
2274 	else {
2275 		KASSERT((*pde & PG_W) == 0,
2276 		    ("pmap_demote_pde: page table page for a wired mapping"
2277 		    " is missing"));
2278 		free = NULL;
2279 		pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free);
2280 		pmap_invalidate_page(pmap, trunc_4mpage(va));
2281 		pmap_free_zero_pages(free);
2282 		CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2283 		    " in pmap %p", va, pmap);
2284 		return (FALSE);
2285 	}
2286 	mptepa = VM_PAGE_TO_PHYS(mpte);
2287 
2288 	/*
2289 	 * Temporarily map the page table page (mpte) into the kernel's
2290 	 * address space at either PADDR1 or PADDR2.
2291 	 */
2292 	if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) {
2293 		if ((*PMAP1 & PG_FRAME) != mptepa) {
2294 			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2295 #ifdef SMP
2296 			PMAP1cpu = PCPU_GET(cpuid);
2297 #endif
2298 			invlcaddr(PADDR1);
2299 			PMAP1changed++;
2300 		} else
2301 #ifdef SMP
2302 		if (PMAP1cpu != PCPU_GET(cpuid)) {
2303 			PMAP1cpu = PCPU_GET(cpuid);
2304 			invlcaddr(PADDR1);
2305 			PMAP1changedcpu++;
2306 		} else
2307 #endif
2308 			PMAP1unchanged++;
2309 		firstpte = PADDR1;
2310 	} else {
2311 		mtx_lock(&PMAP2mutex);
2312 		if ((*PMAP2 & PG_FRAME) != mptepa) {
2313 			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2314 			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2315 		}
2316 		firstpte = PADDR2;
2317 	}
2318 	oldpde = *pde;
2319 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2320 	KASSERT((oldpde & (PG_A | PG_V)) == (PG_A | PG_V),
2321 	    ("pmap_demote_pde: oldpde is missing PG_A and/or PG_V"));
2322 	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2323 	    ("pmap_demote_pde: oldpde is missing PG_M"));
2324 	KASSERT((oldpde & PG_PS) != 0,
2325 	    ("pmap_demote_pde: oldpde is missing PG_PS"));
2326 	newpte = oldpde & ~PG_PS;
2327 	if ((newpte & PG_PDE_PAT) != 0)
2328 		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2329 
2330 	/*
2331 	 * If the mapping has changed attributes, update the page table
2332 	 * entries.
2333 	 */
2334 	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2335 	    ("pmap_demote_pde: firstpte and newpte map different physical"
2336 	    " addresses"));
2337 	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2338 		for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2339 			*pte = newpte;
2340 			newpte += PAGE_SIZE;
2341 		}
2342 
2343 	/*
2344 	 * Demote the mapping.  This pmap is locked.  The old PDE has
2345 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2346 	 * set.  Thus, there is no danger of a race with another
2347 	 * processor changing the setting of PG_A and/or PG_M between
2348 	 * the read above and the store below.
2349 	 */
2350 	if (pmap == kernel_pmap) {
2351 		/*
2352 		 * A harmless race exists between this loop and the bcopy()
2353 		 * in pmap_pinit() that initializes the kernel segment of
2354 		 * the new page table.  Specifically, that bcopy() may copy
2355 		 * the new PDE from the PTD, which is first in allpmaps, to
2356 		 * the new page table before this loop updates that new
2357 		 * page table.
2358 		 */
2359 		mtx_lock_spin(&allpmaps_lock);
2360 		LIST_FOREACH(allpmaps_entry, &allpmaps, pm_list) {
2361 			pde = pmap_pde(allpmaps_entry, va);
2362 			KASSERT(*pde == newpde || (*pde & PG_PTE_PROMOTE) ==
2363 			    (oldpde & PG_PTE_PROMOTE),
2364 			    ("pmap_demote_pde: pde was %#jx, expected %#jx",
2365 			    (uintmax_t)*pde, (uintmax_t)oldpde));
2366 			pde_store(pde, newpde);
2367 		}
2368 		mtx_unlock_spin(&allpmaps_lock);
2369 	} else
2370 		pde_store(pde, newpde);
2371 	if (firstpte == PADDR2)
2372 		mtx_unlock(&PMAP2mutex);
2373 
2374 	/*
2375 	 * Invalidate the recursive mapping of the page table page.
2376 	 */
2377 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2378 
2379 	/*
2380 	 * Demote the pv entry.  This depends on the earlier demotion
2381 	 * of the mapping.  Specifically, the (re)creation of a per-
2382 	 * page pv entry might trigger the execution of pmap_collect(),
2383 	 * which might reclaim a newly (re)created per-page pv entry
2384 	 * and destroy the associated mapping.  In order to destroy
2385 	 * the mapping, the PDE must have already changed from mapping
2386 	 * the 2mpage to referencing the page table page.
2387 	 */
2388 	if ((oldpde & PG_MANAGED) != 0)
2389 		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2390 
2391 	pmap_pde_demotions++;
2392 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2393 	    " in pmap %p", va, pmap);
2394 	return (TRUE);
2395 }
2396 
2397 /*
2398  * pmap_remove_pde: do the things to unmap a superpage in a process
2399  */
2400 static void
2401 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2402     vm_page_t *free)
2403 {
2404 	struct md_page *pvh;
2405 	pd_entry_t oldpde;
2406 	vm_offset_t eva, va;
2407 	vm_page_t m, mpte;
2408 
2409 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2410 	KASSERT((sva & PDRMASK) == 0,
2411 	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2412 	oldpde = pte_load_clear(pdq);
2413 	if (oldpde & PG_W)
2414 		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2415 
2416 	/*
2417 	 * Machines that don't support invlpg, also don't support
2418 	 * PG_G.
2419 	 */
2420 	if (oldpde & PG_G)
2421 		pmap_invalidate_page(kernel_pmap, sva);
2422 	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2423 	if (oldpde & PG_MANAGED) {
2424 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2425 		pmap_pvh_free(pvh, pmap, sva);
2426 		eva = sva + NBPDR;
2427 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2428 		    va < eva; va += PAGE_SIZE, m++) {
2429 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2430 				vm_page_dirty(m);
2431 			if (oldpde & PG_A)
2432 				vm_page_flag_set(m, PG_REFERENCED);
2433 			if (TAILQ_EMPTY(&m->md.pv_list) &&
2434 			    TAILQ_EMPTY(&pvh->pv_list))
2435 				vm_page_flag_clear(m, PG_WRITEABLE);
2436 		}
2437 	}
2438 	if (pmap == kernel_pmap) {
2439 		if (!pmap_demote_pde(pmap, pdq, sva))
2440 			panic("pmap_remove_pde: failed demotion");
2441 	} else {
2442 		mpte = pmap_lookup_pt_page(pmap, sva);
2443 		if (mpte != NULL) {
2444 			pmap_remove_pt_page(pmap, mpte);
2445 			KASSERT(mpte->wire_count == NPTEPG,
2446 			    ("pmap_remove_pde: pte page wire count error"));
2447 			mpte->wire_count = 0;
2448 			pmap_add_delayed_free_list(mpte, free, FALSE);
2449 			atomic_subtract_int(&cnt.v_wire_count, 1);
2450 		}
2451 	}
2452 }
2453 
2454 /*
2455  * pmap_remove_pte: do the things to unmap a page in a process
2456  */
2457 static int
2458 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
2459 {
2460 	pt_entry_t oldpte;
2461 	vm_page_t m;
2462 
2463 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2464 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2465 	oldpte = pte_load_clear(ptq);
2466 	if (oldpte & PG_W)
2467 		pmap->pm_stats.wired_count -= 1;
2468 	/*
2469 	 * Machines that don't support invlpg, also don't support
2470 	 * PG_G.
2471 	 */
2472 	if (oldpte & PG_G)
2473 		pmap_invalidate_page(kernel_pmap, va);
2474 	pmap->pm_stats.resident_count -= 1;
2475 	if (oldpte & PG_MANAGED) {
2476 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2477 		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2478 			vm_page_dirty(m);
2479 		if (oldpte & PG_A)
2480 			vm_page_flag_set(m, PG_REFERENCED);
2481 		pmap_remove_entry(pmap, m, va);
2482 	}
2483 	return (pmap_unuse_pt(pmap, va, free));
2484 }
2485 
2486 /*
2487  * Remove a single page from a process address space
2488  */
2489 static void
2490 pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2491 {
2492 	pt_entry_t *pte;
2493 
2494 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2495 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2496 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2497 	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2498 		return;
2499 	pmap_remove_pte(pmap, pte, va, free);
2500 	pmap_invalidate_page(pmap, va);
2501 }
2502 
2503 /*
2504  *	Remove the given range of addresses from the specified map.
2505  *
2506  *	It is assumed that the start and end are properly
2507  *	rounded to the page size.
2508  */
2509 void
2510 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2511 {
2512 	vm_offset_t pdnxt;
2513 	pd_entry_t ptpaddr;
2514 	pt_entry_t *pte;
2515 	vm_page_t free = NULL;
2516 	int anyvalid;
2517 
2518 	/*
2519 	 * Perform an unsynchronized read.  This is, however, safe.
2520 	 */
2521 	if (pmap->pm_stats.resident_count == 0)
2522 		return;
2523 
2524 	anyvalid = 0;
2525 
2526 	vm_page_lock_queues();
2527 	sched_pin();
2528 	PMAP_LOCK(pmap);
2529 
2530 	/*
2531 	 * special handling of removing one page.  a very
2532 	 * common operation and easy to short circuit some
2533 	 * code.
2534 	 */
2535 	if ((sva + PAGE_SIZE == eva) &&
2536 	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2537 		pmap_remove_page(pmap, sva, &free);
2538 		goto out;
2539 	}
2540 
2541 	for (; sva < eva; sva = pdnxt) {
2542 		unsigned pdirindex;
2543 
2544 		/*
2545 		 * Calculate index for next page table.
2546 		 */
2547 		pdnxt = (sva + NBPDR) & ~PDRMASK;
2548 		if (pdnxt < sva)
2549 			pdnxt = eva;
2550 		if (pmap->pm_stats.resident_count == 0)
2551 			break;
2552 
2553 		pdirindex = sva >> PDRSHIFT;
2554 		ptpaddr = pmap->pm_pdir[pdirindex];
2555 
2556 		/*
2557 		 * Weed out invalid mappings. Note: we assume that the page
2558 		 * directory table is always allocated, and in kernel virtual.
2559 		 */
2560 		if (ptpaddr == 0)
2561 			continue;
2562 
2563 		/*
2564 		 * Check for large page.
2565 		 */
2566 		if ((ptpaddr & PG_PS) != 0) {
2567 			/*
2568 			 * Are we removing the entire large page?  If not,
2569 			 * demote the mapping and fall through.
2570 			 */
2571 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2572 				/*
2573 				 * The TLB entry for a PG_G mapping is
2574 				 * invalidated by pmap_remove_pde().
2575 				 */
2576 				if ((ptpaddr & PG_G) == 0)
2577 					anyvalid = 1;
2578 				pmap_remove_pde(pmap,
2579 				    &pmap->pm_pdir[pdirindex], sva, &free);
2580 				continue;
2581 			} else if (!pmap_demote_pde(pmap,
2582 			    &pmap->pm_pdir[pdirindex], sva)) {
2583 				/* The large page mapping was destroyed. */
2584 				continue;
2585 			}
2586 		}
2587 
2588 		/*
2589 		 * Limit our scan to either the end of the va represented
2590 		 * by the current page table page, or to the end of the
2591 		 * range being removed.
2592 		 */
2593 		if (pdnxt > eva)
2594 			pdnxt = eva;
2595 
2596 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2597 		    sva += PAGE_SIZE) {
2598 			if (*pte == 0)
2599 				continue;
2600 
2601 			/*
2602 			 * The TLB entry for a PG_G mapping is invalidated
2603 			 * by pmap_remove_pte().
2604 			 */
2605 			if ((*pte & PG_G) == 0)
2606 				anyvalid = 1;
2607 			if (pmap_remove_pte(pmap, pte, sva, &free))
2608 				break;
2609 		}
2610 	}
2611 out:
2612 	sched_unpin();
2613 	if (anyvalid)
2614 		pmap_invalidate_all(pmap);
2615 	vm_page_unlock_queues();
2616 	PMAP_UNLOCK(pmap);
2617 	pmap_free_zero_pages(free);
2618 }
2619 
2620 /*
2621  *	Routine:	pmap_remove_all
2622  *	Function:
2623  *		Removes this physical page from
2624  *		all physical maps in which it resides.
2625  *		Reflects back modify bits to the pager.
2626  *
2627  *	Notes:
2628  *		Original versions of this routine were very
2629  *		inefficient because they iteratively called
2630  *		pmap_remove (slow...)
2631  */
2632 
2633 void
2634 pmap_remove_all(vm_page_t m)
2635 {
2636 	struct md_page *pvh;
2637 	pv_entry_t pv;
2638 	pmap_t pmap;
2639 	pt_entry_t *pte, tpte;
2640 	pd_entry_t *pde;
2641 	vm_offset_t va;
2642 	vm_page_t free;
2643 
2644 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
2645 	    ("pmap_remove_all: page %p is fictitious", m));
2646 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2647 	sched_pin();
2648 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2649 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2650 		va = pv->pv_va;
2651 		pmap = PV_PMAP(pv);
2652 		PMAP_LOCK(pmap);
2653 		pde = pmap_pde(pmap, va);
2654 		(void)pmap_demote_pde(pmap, pde, va);
2655 		PMAP_UNLOCK(pmap);
2656 	}
2657 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2658 		pmap = PV_PMAP(pv);
2659 		PMAP_LOCK(pmap);
2660 		pmap->pm_stats.resident_count--;
2661 		pde = pmap_pde(pmap, pv->pv_va);
2662 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
2663 		    " a 4mpage in page %p's pv list", m));
2664 		pte = pmap_pte_quick(pmap, pv->pv_va);
2665 		tpte = pte_load_clear(pte);
2666 		if (tpte & PG_W)
2667 			pmap->pm_stats.wired_count--;
2668 		if (tpte & PG_A)
2669 			vm_page_flag_set(m, PG_REFERENCED);
2670 
2671 		/*
2672 		 * Update the vm_page_t clean and reference bits.
2673 		 */
2674 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2675 			vm_page_dirty(m);
2676 		free = NULL;
2677 		pmap_unuse_pt(pmap, pv->pv_va, &free);
2678 		pmap_invalidate_page(pmap, pv->pv_va);
2679 		pmap_free_zero_pages(free);
2680 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2681 		free_pv_entry(pmap, pv);
2682 		PMAP_UNLOCK(pmap);
2683 	}
2684 	vm_page_flag_clear(m, PG_WRITEABLE);
2685 	sched_unpin();
2686 }
2687 
2688 /*
2689  * pmap_protect_pde: do the things to protect a 4mpage in a process
2690  */
2691 static boolean_t
2692 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
2693 {
2694 	pd_entry_t newpde, oldpde;
2695 	vm_offset_t eva, va;
2696 	vm_page_t m;
2697 	boolean_t anychanged;
2698 
2699 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2700 	KASSERT((sva & PDRMASK) == 0,
2701 	    ("pmap_protect_pde: sva is not 4mpage aligned"));
2702 	anychanged = FALSE;
2703 retry:
2704 	oldpde = newpde = *pde;
2705 	if (oldpde & PG_MANAGED) {
2706 		eva = sva + NBPDR;
2707 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2708 		    va < eva; va += PAGE_SIZE, m++) {
2709 			/*
2710 			 * In contrast to the analogous operation on a 4KB page
2711 			 * mapping, the mapping's PG_A flag is not cleared and
2712 			 * the page's PG_REFERENCED flag is not set.  The
2713 			 * reason is that pmap_demote_pde() expects that a 2/4MB
2714 			 * page mapping with a stored page table page has PG_A
2715 			 * set.
2716 			 */
2717 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2718 				vm_page_dirty(m);
2719 		}
2720 	}
2721 	if ((prot & VM_PROT_WRITE) == 0)
2722 		newpde &= ~(PG_RW | PG_M);
2723 #ifdef PAE
2724 	if ((prot & VM_PROT_EXECUTE) == 0)
2725 		newpde |= pg_nx;
2726 #endif
2727 	if (newpde != oldpde) {
2728 		if (!pde_cmpset(pde, oldpde, newpde))
2729 			goto retry;
2730 		if (oldpde & PG_G)
2731 			pmap_invalidate_page(pmap, sva);
2732 		else
2733 			anychanged = TRUE;
2734 	}
2735 	return (anychanged);
2736 }
2737 
2738 /*
2739  *	Set the physical protection on the
2740  *	specified range of this map as requested.
2741  */
2742 void
2743 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2744 {
2745 	vm_offset_t pdnxt;
2746 	pd_entry_t ptpaddr;
2747 	pt_entry_t *pte;
2748 	int anychanged;
2749 
2750 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2751 		pmap_remove(pmap, sva, eva);
2752 		return;
2753 	}
2754 
2755 #ifdef PAE
2756 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
2757 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
2758 		return;
2759 #else
2760 	if (prot & VM_PROT_WRITE)
2761 		return;
2762 #endif
2763 
2764 	anychanged = 0;
2765 
2766 	vm_page_lock_queues();
2767 	sched_pin();
2768 	PMAP_LOCK(pmap);
2769 	for (; sva < eva; sva = pdnxt) {
2770 		pt_entry_t obits, pbits;
2771 		unsigned pdirindex;
2772 
2773 		pdnxt = (sva + NBPDR) & ~PDRMASK;
2774 		if (pdnxt < sva)
2775 			pdnxt = eva;
2776 
2777 		pdirindex = sva >> PDRSHIFT;
2778 		ptpaddr = pmap->pm_pdir[pdirindex];
2779 
2780 		/*
2781 		 * Weed out invalid mappings. Note: we assume that the page
2782 		 * directory table is always allocated, and in kernel virtual.
2783 		 */
2784 		if (ptpaddr == 0)
2785 			continue;
2786 
2787 		/*
2788 		 * Check for large page.
2789 		 */
2790 		if ((ptpaddr & PG_PS) != 0) {
2791 			/*
2792 			 * Are we protecting the entire large page?  If not,
2793 			 * demote the mapping and fall through.
2794 			 */
2795 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
2796 				/*
2797 				 * The TLB entry for a PG_G mapping is
2798 				 * invalidated by pmap_protect_pde().
2799 				 */
2800 				if (pmap_protect_pde(pmap,
2801 				    &pmap->pm_pdir[pdirindex], sva, prot))
2802 					anychanged = 1;
2803 				continue;
2804 			} else if (!pmap_demote_pde(pmap,
2805 			    &pmap->pm_pdir[pdirindex], sva)) {
2806 				/* The large page mapping was destroyed. */
2807 				continue;
2808 			}
2809 		}
2810 
2811 		if (pdnxt > eva)
2812 			pdnxt = eva;
2813 
2814 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2815 		    sva += PAGE_SIZE) {
2816 			vm_page_t m;
2817 
2818 retry:
2819 			/*
2820 			 * Regardless of whether a pte is 32 or 64 bits in
2821 			 * size, PG_RW, PG_A, and PG_M are among the least
2822 			 * significant 32 bits.
2823 			 */
2824 			obits = pbits = *pte;
2825 			if ((pbits & PG_V) == 0)
2826 				continue;
2827 			if (pbits & PG_MANAGED) {
2828 				m = NULL;
2829 				if (pbits & PG_A) {
2830 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
2831 					vm_page_flag_set(m, PG_REFERENCED);
2832 					pbits &= ~PG_A;
2833 				}
2834 				if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2835 					if (m == NULL)
2836 						m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
2837 					vm_page_dirty(m);
2838 				}
2839 			}
2840 
2841 			if ((prot & VM_PROT_WRITE) == 0)
2842 				pbits &= ~(PG_RW | PG_M);
2843 #ifdef PAE
2844 			if ((prot & VM_PROT_EXECUTE) == 0)
2845 				pbits |= pg_nx;
2846 #endif
2847 
2848 			if (pbits != obits) {
2849 #ifdef PAE
2850 				if (!atomic_cmpset_64(pte, obits, pbits))
2851 					goto retry;
2852 #else
2853 				if (!atomic_cmpset_int((u_int *)pte, obits,
2854 				    pbits))
2855 					goto retry;
2856 #endif
2857 				if (obits & PG_G)
2858 					pmap_invalidate_page(pmap, sva);
2859 				else
2860 					anychanged = 1;
2861 			}
2862 		}
2863 	}
2864 	sched_unpin();
2865 	if (anychanged)
2866 		pmap_invalidate_all(pmap);
2867 	vm_page_unlock_queues();
2868 	PMAP_UNLOCK(pmap);
2869 }
2870 
2871 /*
2872  * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are
2873  * within a single page table page (PTP) to a single 2- or 4MB page mapping.
2874  * For promotion to occur, two conditions must be met: (1) the 4KB page
2875  * mappings must map aligned, contiguous physical memory and (2) the 4KB page
2876  * mappings must have identical characteristics.
2877  *
2878  * Managed (PG_MANAGED) mappings within the kernel address space are not
2879  * promoted.  The reason is that kernel PDEs are replicated in each pmap but
2880  * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
2881  * pmap.
2882  */
2883 static void
2884 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2885 {
2886 	pd_entry_t newpde;
2887 	pmap_t allpmaps_entry;
2888 	pt_entry_t *firstpte, oldpte, pa, *pte;
2889 	vm_offset_t oldpteva;
2890 	vm_page_t mpte;
2891 
2892 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2893 
2894 	/*
2895 	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
2896 	 * either invalid, unused, or does not map the first 4KB physical page
2897 	 * within a 2- or 4MB page.
2898 	 */
2899 	firstpte = vtopte(trunc_4mpage(va));
2900 setpde:
2901 	newpde = *firstpte;
2902 	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
2903 		pmap_pde_p_failures++;
2904 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
2905 		    " in pmap %p", va, pmap);
2906 		return;
2907 	}
2908 	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
2909 		pmap_pde_p_failures++;
2910 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
2911 		    " in pmap %p", va, pmap);
2912 		return;
2913 	}
2914 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
2915 		/*
2916 		 * When PG_M is already clear, PG_RW can be cleared without
2917 		 * a TLB invalidation.
2918 		 */
2919 		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
2920 		    ~PG_RW))
2921 			goto setpde;
2922 		newpde &= ~PG_RW;
2923 	}
2924 
2925 	/*
2926 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
2927 	 * PTE maps an unexpected 4KB physical page or does not have identical
2928 	 * characteristics to the first PTE.
2929 	 */
2930 	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
2931 	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
2932 setpte:
2933 		oldpte = *pte;
2934 		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
2935 			pmap_pde_p_failures++;
2936 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
2937 			    " in pmap %p", va, pmap);
2938 			return;
2939 		}
2940 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
2941 			/*
2942 			 * When PG_M is already clear, PG_RW can be cleared
2943 			 * without a TLB invalidation.
2944 			 */
2945 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
2946 			    oldpte & ~PG_RW))
2947 				goto setpte;
2948 			oldpte &= ~PG_RW;
2949 			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
2950 			    (va & ~PDRMASK);
2951 			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
2952 			    " in pmap %p", oldpteva, pmap);
2953 		}
2954 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
2955 			pmap_pde_p_failures++;
2956 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
2957 			    " in pmap %p", va, pmap);
2958 			return;
2959 		}
2960 		pa -= PAGE_SIZE;
2961 	}
2962 
2963 	/*
2964 	 * Save the page table page in its current state until the PDE
2965 	 * mapping the superpage is demoted by pmap_demote_pde() or
2966 	 * destroyed by pmap_remove_pde().
2967 	 */
2968 	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
2969 	KASSERT(mpte >= vm_page_array &&
2970 	    mpte < &vm_page_array[vm_page_array_size],
2971 	    ("pmap_promote_pde: page table page is out of range"));
2972 	KASSERT(mpte->pindex == va >> PDRSHIFT,
2973 	    ("pmap_promote_pde: page table page's pindex is wrong"));
2974 	pmap_insert_pt_page(pmap, mpte);
2975 
2976 	/*
2977 	 * Promote the pv entries.
2978 	 */
2979 	if ((newpde & PG_MANAGED) != 0)
2980 		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
2981 
2982 	/*
2983 	 * Propagate the PAT index to its proper position.
2984 	 */
2985 	if ((newpde & PG_PTE_PAT) != 0)
2986 		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
2987 
2988 	/*
2989 	 * Map the superpage.
2990 	 */
2991 	if (pmap == kernel_pmap) {
2992 		mtx_lock_spin(&allpmaps_lock);
2993 		LIST_FOREACH(allpmaps_entry, &allpmaps, pm_list) {
2994 			pde = pmap_pde(allpmaps_entry, va);
2995 			pde_store(pde, PG_PS | newpde);
2996 		}
2997 		mtx_unlock_spin(&allpmaps_lock);
2998 	} else
2999 		pde_store(pde, PG_PS | newpde);
3000 
3001 	pmap_pde_promotions++;
3002 	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
3003 	    " in pmap %p", va, pmap);
3004 }
3005 
3006 /*
3007  *	Insert the given physical page (p) at
3008  *	the specified virtual address (v) in the
3009  *	target physical map with the protection requested.
3010  *
3011  *	If specified, the page will be wired down, meaning
3012  *	that the related pte can not be reclaimed.
3013  *
3014  *	NB:  This is the only routine which MAY NOT lazy-evaluate
3015  *	or lose information.  That is, this routine must actually
3016  *	insert this page into the given map NOW.
3017  */
3018 void
3019 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
3020     vm_prot_t prot, boolean_t wired)
3021 {
3022 	vm_paddr_t pa;
3023 	pd_entry_t *pde;
3024 	pt_entry_t *pte;
3025 	vm_paddr_t opa;
3026 	pt_entry_t origpte, newpte;
3027 	vm_page_t mpte, om;
3028 	boolean_t invlva;
3029 
3030 	va = trunc_page(va);
3031 	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3032 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3033 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va));
3034 
3035 	mpte = NULL;
3036 
3037 	vm_page_lock_queues();
3038 	PMAP_LOCK(pmap);
3039 	sched_pin();
3040 
3041 	/*
3042 	 * In the case that a page table page is not
3043 	 * resident, we are creating it here.
3044 	 */
3045 	if (va < VM_MAXUSER_ADDRESS) {
3046 		mpte = pmap_allocpte(pmap, va, M_WAITOK);
3047 	}
3048 
3049 	pde = pmap_pde(pmap, va);
3050 	if ((*pde & PG_PS) != 0)
3051 		panic("pmap_enter: attempted pmap_enter on 4MB page");
3052 	pte = pmap_pte_quick(pmap, va);
3053 
3054 	/*
3055 	 * Page Directory table entry not valid, we need a new PT page
3056 	 */
3057 	if (pte == NULL) {
3058 		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3059 			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3060 	}
3061 
3062 	pa = VM_PAGE_TO_PHYS(m);
3063 	om = NULL;
3064 	origpte = *pte;
3065 	opa = origpte & PG_FRAME;
3066 
3067 	/*
3068 	 * Mapping has not changed, must be protection or wiring change.
3069 	 */
3070 	if (origpte && (opa == pa)) {
3071 		/*
3072 		 * Wiring change, just update stats. We don't worry about
3073 		 * wiring PT pages as they remain resident as long as there
3074 		 * are valid mappings in them. Hence, if a user page is wired,
3075 		 * the PT page will be also.
3076 		 */
3077 		if (wired && ((origpte & PG_W) == 0))
3078 			pmap->pm_stats.wired_count++;
3079 		else if (!wired && (origpte & PG_W))
3080 			pmap->pm_stats.wired_count--;
3081 
3082 		/*
3083 		 * Remove extra pte reference
3084 		 */
3085 		if (mpte)
3086 			mpte->wire_count--;
3087 
3088 		/*
3089 		 * We might be turning off write access to the page,
3090 		 * so we go ahead and sense modify status.
3091 		 */
3092 		if (origpte & PG_MANAGED) {
3093 			om = m;
3094 			pa |= PG_MANAGED;
3095 		}
3096 		goto validate;
3097 	}
3098 	/*
3099 	 * Mapping has changed, invalidate old range and fall through to
3100 	 * handle validating new mapping.
3101 	 */
3102 	if (opa) {
3103 		if (origpte & PG_W)
3104 			pmap->pm_stats.wired_count--;
3105 		if (origpte & PG_MANAGED) {
3106 			om = PHYS_TO_VM_PAGE(opa);
3107 			pmap_remove_entry(pmap, om, va);
3108 		}
3109 		if (mpte != NULL) {
3110 			mpte->wire_count--;
3111 			KASSERT(mpte->wire_count > 0,
3112 			    ("pmap_enter: missing reference to page table page,"
3113 			     " va: 0x%x", va));
3114 		}
3115 	} else
3116 		pmap->pm_stats.resident_count++;
3117 
3118 	/*
3119 	 * Enter on the PV list if part of our managed memory.
3120 	 */
3121 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3122 		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3123 		    ("pmap_enter: managed mapping within the clean submap"));
3124 		pmap_insert_entry(pmap, va, m);
3125 		pa |= PG_MANAGED;
3126 	}
3127 
3128 	/*
3129 	 * Increment counters
3130 	 */
3131 	if (wired)
3132 		pmap->pm_stats.wired_count++;
3133 
3134 validate:
3135 	/*
3136 	 * Now validate mapping with desired protection/wiring.
3137 	 */
3138 	newpte = (pt_entry_t)(pa | PG_V);
3139 	if ((prot & VM_PROT_WRITE) != 0) {
3140 		newpte |= PG_RW;
3141 		vm_page_flag_set(m, PG_WRITEABLE);
3142 	}
3143 #ifdef PAE
3144 	if ((prot & VM_PROT_EXECUTE) == 0)
3145 		newpte |= pg_nx;
3146 #endif
3147 	if (wired)
3148 		newpte |= PG_W;
3149 	if (va < VM_MAXUSER_ADDRESS)
3150 		newpte |= PG_U;
3151 	if (pmap == kernel_pmap)
3152 		newpte |= pgeflag;
3153 
3154 	/*
3155 	 * if the mapping or permission bits are different, we need
3156 	 * to update the pte.
3157 	 */
3158 	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3159 		newpte |= PG_A;
3160 		if ((access & VM_PROT_WRITE) != 0)
3161 			newpte |= PG_M;
3162 		if (origpte & PG_V) {
3163 			invlva = FALSE;
3164 			origpte = pte_load_store(pte, newpte);
3165 			if (origpte & PG_A) {
3166 				if (origpte & PG_MANAGED)
3167 					vm_page_flag_set(om, PG_REFERENCED);
3168 				if (opa != VM_PAGE_TO_PHYS(m))
3169 					invlva = TRUE;
3170 #ifdef PAE
3171 				if ((origpte & PG_NX) == 0 &&
3172 				    (newpte & PG_NX) != 0)
3173 					invlva = TRUE;
3174 #endif
3175 			}
3176 			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3177 				if ((origpte & PG_MANAGED) != 0)
3178 					vm_page_dirty(om);
3179 				if ((prot & VM_PROT_WRITE) == 0)
3180 					invlva = TRUE;
3181 			}
3182 			if (invlva)
3183 				pmap_invalidate_page(pmap, va);
3184 		} else
3185 			pte_store(pte, newpte);
3186 	}
3187 
3188 	/*
3189 	 * If both the page table page and the reservation are fully
3190 	 * populated, then attempt promotion.
3191 	 */
3192 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3193 	    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0)
3194 		pmap_promote_pde(pmap, pde, va);
3195 
3196 	sched_unpin();
3197 	vm_page_unlock_queues();
3198 	PMAP_UNLOCK(pmap);
3199 }
3200 
3201 /*
3202  * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3203  * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3204  * blocking, (2) a mapping already exists at the specified virtual address, or
3205  * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3206  */
3207 static boolean_t
3208 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3209 {
3210 	pd_entry_t *pde, newpde;
3211 
3212 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3213 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3214 	pde = pmap_pde(pmap, va);
3215 	if (*pde != 0) {
3216 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3217 		    " in pmap %p", va, pmap);
3218 		return (FALSE);
3219 	}
3220 	newpde = VM_PAGE_TO_PHYS(m) | PG_PS | PG_V;
3221 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
3222 		newpde |= PG_MANAGED;
3223 
3224 		/*
3225 		 * Abort this mapping if its PV entry could not be created.
3226 		 */
3227 		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3228 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3229 			    " in pmap %p", va, pmap);
3230 			return (FALSE);
3231 		}
3232 	}
3233 #ifdef PAE
3234 	if ((prot & VM_PROT_EXECUTE) == 0)
3235 		newpde |= pg_nx;
3236 #endif
3237 	if (va < VM_MAXUSER_ADDRESS)
3238 		newpde |= PG_U;
3239 
3240 	/*
3241 	 * Increment counters.
3242 	 */
3243 	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3244 
3245 	/*
3246 	 * Map the superpage.
3247 	 */
3248 	pde_store(pde, newpde);
3249 
3250 	pmap_pde_mappings++;
3251 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3252 	    " in pmap %p", va, pmap);
3253 	return (TRUE);
3254 }
3255 
3256 /*
3257  * Maps a sequence of resident pages belonging to the same object.
3258  * The sequence begins with the given page m_start.  This page is
3259  * mapped at the given virtual address start.  Each subsequent page is
3260  * mapped at a virtual address that is offset from start by the same
3261  * amount as the page is offset from m_start within the object.  The
3262  * last page in the sequence is the page with the largest offset from
3263  * m_start that can be mapped at a virtual address less than the given
3264  * virtual address end.  Not every virtual page between start and end
3265  * is mapped; only those for which a resident page exists with the
3266  * corresponding offset from m_start are mapped.
3267  */
3268 void
3269 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3270     vm_page_t m_start, vm_prot_t prot)
3271 {
3272 	vm_offset_t va;
3273 	vm_page_t m, mpte;
3274 	vm_pindex_t diff, psize;
3275 
3276 	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3277 	psize = atop(end - start);
3278 	mpte = NULL;
3279 	m = m_start;
3280 	PMAP_LOCK(pmap);
3281 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3282 		va = start + ptoa(diff);
3283 		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3284 		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3285 		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3286 		    pmap_enter_pde(pmap, va, m, prot))
3287 			m = &m[NBPDR / PAGE_SIZE - 1];
3288 		else
3289 			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3290 			    mpte);
3291 		m = TAILQ_NEXT(m, listq);
3292 	}
3293  	PMAP_UNLOCK(pmap);
3294 }
3295 
3296 /*
3297  * this code makes some *MAJOR* assumptions:
3298  * 1. Current pmap & pmap exists.
3299  * 2. Not wired.
3300  * 3. Read access.
3301  * 4. No page table pages.
3302  * but is *MUCH* faster than pmap_enter...
3303  */
3304 
3305 void
3306 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3307 {
3308 
3309 	PMAP_LOCK(pmap);
3310 	(void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3311 	PMAP_UNLOCK(pmap);
3312 }
3313 
3314 static vm_page_t
3315 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3316     vm_prot_t prot, vm_page_t mpte)
3317 {
3318 	pt_entry_t *pte;
3319 	vm_paddr_t pa;
3320 	vm_page_t free;
3321 
3322 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3323 	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
3324 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3325 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3326 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3327 
3328 	/*
3329 	 * In the case that a page table page is not
3330 	 * resident, we are creating it here.
3331 	 */
3332 	if (va < VM_MAXUSER_ADDRESS) {
3333 		unsigned ptepindex;
3334 		pd_entry_t ptepa;
3335 
3336 		/*
3337 		 * Calculate pagetable page index
3338 		 */
3339 		ptepindex = va >> PDRSHIFT;
3340 		if (mpte && (mpte->pindex == ptepindex)) {
3341 			mpte->wire_count++;
3342 		} else {
3343 			/*
3344 			 * Get the page directory entry
3345 			 */
3346 			ptepa = pmap->pm_pdir[ptepindex];
3347 
3348 			/*
3349 			 * If the page table page is mapped, we just increment
3350 			 * the hold count, and activate it.
3351 			 */
3352 			if (ptepa) {
3353 				if (ptepa & PG_PS)
3354 					return (NULL);
3355 				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3356 				mpte->wire_count++;
3357 			} else {
3358 				mpte = _pmap_allocpte(pmap, ptepindex,
3359 				    M_NOWAIT);
3360 				if (mpte == NULL)
3361 					return (mpte);
3362 			}
3363 		}
3364 	} else {
3365 		mpte = NULL;
3366 	}
3367 
3368 	/*
3369 	 * This call to vtopte makes the assumption that we are
3370 	 * entering the page into the current pmap.  In order to support
3371 	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3372 	 * But that isn't as quick as vtopte.
3373 	 */
3374 	pte = vtopte(va);
3375 	if (*pte) {
3376 		if (mpte != NULL) {
3377 			mpte->wire_count--;
3378 			mpte = NULL;
3379 		}
3380 		return (mpte);
3381 	}
3382 
3383 	/*
3384 	 * Enter on the PV list if part of our managed memory.
3385 	 */
3386 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
3387 	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3388 		if (mpte != NULL) {
3389 			free = NULL;
3390 			if (pmap_unwire_pte_hold(pmap, mpte, &free)) {
3391 				pmap_invalidate_page(pmap, va);
3392 				pmap_free_zero_pages(free);
3393 			}
3394 
3395 			mpte = NULL;
3396 		}
3397 		return (mpte);
3398 	}
3399 
3400 	/*
3401 	 * Increment counters
3402 	 */
3403 	pmap->pm_stats.resident_count++;
3404 
3405 	pa = VM_PAGE_TO_PHYS(m);
3406 #ifdef PAE
3407 	if ((prot & VM_PROT_EXECUTE) == 0)
3408 		pa |= pg_nx;
3409 #endif
3410 
3411 	/*
3412 	 * Now validate mapping with RO protection
3413 	 */
3414 	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
3415 		pte_store(pte, pa | PG_V | PG_U);
3416 	else
3417 		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3418 	return mpte;
3419 }
3420 
3421 /*
3422  * Make a temporary mapping for a physical address.  This is only intended
3423  * to be used for panic dumps.
3424  */
3425 void *
3426 pmap_kenter_temporary(vm_paddr_t pa, int i)
3427 {
3428 	vm_offset_t va;
3429 
3430 	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3431 	pmap_kenter(va, pa);
3432 	invlpg(va);
3433 	return ((void *)crashdumpmap);
3434 }
3435 
3436 /*
3437  * This code maps large physical mmap regions into the
3438  * processor address space.  Note that some shortcuts
3439  * are taken, but the code works.
3440  */
3441 void
3442 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3443     vm_pindex_t pindex, vm_size_t size)
3444 {
3445 	vm_page_t p;
3446 
3447 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
3448 	KASSERT(object->type == OBJT_DEVICE,
3449 	    ("pmap_object_init_pt: non-device object"));
3450 	if (pseflag &&
3451 	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
3452 		int i;
3453 		vm_page_t m[1];
3454 		unsigned int ptepindex;
3455 		int npdes;
3456 		pd_entry_t ptepa;
3457 
3458 		PMAP_LOCK(pmap);
3459 		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
3460 			goto out;
3461 		PMAP_UNLOCK(pmap);
3462 retry:
3463 		p = vm_page_lookup(object, pindex);
3464 		if (p != NULL) {
3465 			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
3466 				goto retry;
3467 		} else {
3468 			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
3469 			if (p == NULL)
3470 				return;
3471 			m[0] = p;
3472 
3473 			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
3474 				vm_page_lock_queues();
3475 				vm_page_free(p);
3476 				vm_page_unlock_queues();
3477 				return;
3478 			}
3479 
3480 			p = vm_page_lookup(object, pindex);
3481 			vm_page_wakeup(p);
3482 		}
3483 
3484 		ptepa = VM_PAGE_TO_PHYS(p);
3485 		if (ptepa & (NBPDR - 1))
3486 			return;
3487 
3488 		p->valid = VM_PAGE_BITS_ALL;
3489 
3490 		PMAP_LOCK(pmap);
3491 		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
3492 		npdes = size >> PDRSHIFT;
3493 		for(i = 0; i < npdes; i++) {
3494 			pde_store(&pmap->pm_pdir[ptepindex],
3495 			    ptepa | PG_U | PG_RW | PG_V | PG_PS);
3496 			ptepa += NBPDR;
3497 			ptepindex += 1;
3498 		}
3499 		pmap_invalidate_all(pmap);
3500 out:
3501 		PMAP_UNLOCK(pmap);
3502 	}
3503 }
3504 
3505 /*
3506  *	Routine:	pmap_change_wiring
3507  *	Function:	Change the wiring attribute for a map/virtual-address
3508  *			pair.
3509  *	In/out conditions:
3510  *			The mapping must already exist in the pmap.
3511  */
3512 void
3513 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
3514 {
3515 	pd_entry_t *pde;
3516 	pt_entry_t *pte;
3517 	boolean_t are_queues_locked;
3518 
3519 	are_queues_locked = FALSE;
3520 retry:
3521 	PMAP_LOCK(pmap);
3522 	pde = pmap_pde(pmap, va);
3523 	if ((*pde & PG_PS) != 0) {
3524 		if (!wired != ((*pde & PG_W) == 0)) {
3525 			if (!are_queues_locked) {
3526 				are_queues_locked = TRUE;
3527 				if (!mtx_trylock(&vm_page_queue_mtx)) {
3528 					PMAP_UNLOCK(pmap);
3529 					vm_page_lock_queues();
3530 					goto retry;
3531 				}
3532 			}
3533 			if (!pmap_demote_pde(pmap, pde, va))
3534 				panic("pmap_change_wiring: demotion failed");
3535 		} else
3536 			goto out;
3537 	}
3538 	pte = pmap_pte(pmap, va);
3539 
3540 	if (wired && !pmap_pte_w(pte))
3541 		pmap->pm_stats.wired_count++;
3542 	else if (!wired && pmap_pte_w(pte))
3543 		pmap->pm_stats.wired_count--;
3544 
3545 	/*
3546 	 * Wiring is not a hardware characteristic so there is no need to
3547 	 * invalidate TLB.
3548 	 */
3549 	pmap_pte_set_w(pte, wired);
3550 	pmap_pte_release(pte);
3551 out:
3552 	if (are_queues_locked)
3553 		vm_page_unlock_queues();
3554 	PMAP_UNLOCK(pmap);
3555 }
3556 
3557 
3558 
3559 /*
3560  *	Copy the range specified by src_addr/len
3561  *	from the source map to the range dst_addr/len
3562  *	in the destination map.
3563  *
3564  *	This routine is only advisory and need not do anything.
3565  */
3566 
3567 void
3568 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3569     vm_offset_t src_addr)
3570 {
3571 	vm_page_t   free;
3572 	vm_offset_t addr;
3573 	vm_offset_t end_addr = src_addr + len;
3574 	vm_offset_t pdnxt;
3575 
3576 	if (dst_addr != src_addr)
3577 		return;
3578 
3579 	if (!pmap_is_current(src_pmap))
3580 		return;
3581 
3582 	vm_page_lock_queues();
3583 	if (dst_pmap < src_pmap) {
3584 		PMAP_LOCK(dst_pmap);
3585 		PMAP_LOCK(src_pmap);
3586 	} else {
3587 		PMAP_LOCK(src_pmap);
3588 		PMAP_LOCK(dst_pmap);
3589 	}
3590 	sched_pin();
3591 	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
3592 		pt_entry_t *src_pte, *dst_pte;
3593 		vm_page_t dstmpte, srcmpte;
3594 		pd_entry_t srcptepaddr;
3595 		unsigned ptepindex;
3596 
3597 		KASSERT(addr < UPT_MIN_ADDRESS,
3598 		    ("pmap_copy: invalid to pmap_copy page tables"));
3599 
3600 		pdnxt = (addr + NBPDR) & ~PDRMASK;
3601 		if (pdnxt < addr)
3602 			pdnxt = end_addr;
3603 		ptepindex = addr >> PDRSHIFT;
3604 
3605 		srcptepaddr = src_pmap->pm_pdir[ptepindex];
3606 		if (srcptepaddr == 0)
3607 			continue;
3608 
3609 		if (srcptepaddr & PG_PS) {
3610 			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
3611 			    ((srcptepaddr & PG_MANAGED) == 0 ||
3612 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
3613 			    PG_PS_FRAME))) {
3614 				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
3615 				    ~PG_W;
3616 				dst_pmap->pm_stats.resident_count +=
3617 				    NBPDR / PAGE_SIZE;
3618 			}
3619 			continue;
3620 		}
3621 
3622 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
3623 		KASSERT(srcmpte->wire_count > 0,
3624 		    ("pmap_copy: source page table page is unused"));
3625 
3626 		if (pdnxt > end_addr)
3627 			pdnxt = end_addr;
3628 
3629 		src_pte = vtopte(addr);
3630 		while (addr < pdnxt) {
3631 			pt_entry_t ptetemp;
3632 			ptetemp = *src_pte;
3633 			/*
3634 			 * we only virtual copy managed pages
3635 			 */
3636 			if ((ptetemp & PG_MANAGED) != 0) {
3637 				dstmpte = pmap_allocpte(dst_pmap, addr,
3638 				    M_NOWAIT);
3639 				if (dstmpte == NULL)
3640 					break;
3641 				dst_pte = pmap_pte_quick(dst_pmap, addr);
3642 				if (*dst_pte == 0 &&
3643 				    pmap_try_insert_pv_entry(dst_pmap, addr,
3644 				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
3645 					/*
3646 					 * Clear the wired, modified, and
3647 					 * accessed (referenced) bits
3648 					 * during the copy.
3649 					 */
3650 					*dst_pte = ptetemp & ~(PG_W | PG_M |
3651 					    PG_A);
3652 					dst_pmap->pm_stats.resident_count++;
3653 	 			} else {
3654 					free = NULL;
3655 					if (pmap_unwire_pte_hold( dst_pmap,
3656 					    dstmpte, &free)) {
3657 						pmap_invalidate_page(dst_pmap,
3658 						    addr);
3659 						pmap_free_zero_pages(free);
3660 					}
3661 				}
3662 				if (dstmpte->wire_count >= srcmpte->wire_count)
3663 					break;
3664 			}
3665 			addr += PAGE_SIZE;
3666 			src_pte++;
3667 		}
3668 	}
3669 	sched_unpin();
3670 	vm_page_unlock_queues();
3671 	PMAP_UNLOCK(src_pmap);
3672 	PMAP_UNLOCK(dst_pmap);
3673 }
3674 
3675 static __inline void
3676 pagezero(void *page)
3677 {
3678 #if defined(I686_CPU)
3679 	if (cpu_class == CPUCLASS_686) {
3680 #if defined(CPU_ENABLE_SSE)
3681 		if (cpu_feature & CPUID_SSE2)
3682 			sse2_pagezero(page);
3683 		else
3684 #endif
3685 			i686_pagezero(page);
3686 	} else
3687 #endif
3688 		bzero(page, PAGE_SIZE);
3689 }
3690 
3691 /*
3692  *	pmap_zero_page zeros the specified hardware page by mapping
3693  *	the page into KVM and using bzero to clear its contents.
3694  */
3695 void
3696 pmap_zero_page(vm_page_t m)
3697 {
3698 	struct sysmaps *sysmaps;
3699 
3700 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3701 	mtx_lock(&sysmaps->lock);
3702 	if (*sysmaps->CMAP2)
3703 		panic("pmap_zero_page: CMAP2 busy");
3704 	sched_pin();
3705 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
3706 	invlcaddr(sysmaps->CADDR2);
3707 	pagezero(sysmaps->CADDR2);
3708 	*sysmaps->CMAP2 = 0;
3709 	sched_unpin();
3710 	mtx_unlock(&sysmaps->lock);
3711 }
3712 
3713 /*
3714  *	pmap_zero_page_area zeros the specified hardware page by mapping
3715  *	the page into KVM and using bzero to clear its contents.
3716  *
3717  *	off and size may not cover an area beyond a single hardware page.
3718  */
3719 void
3720 pmap_zero_page_area(vm_page_t m, int off, int size)
3721 {
3722 	struct sysmaps *sysmaps;
3723 
3724 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3725 	mtx_lock(&sysmaps->lock);
3726 	if (*sysmaps->CMAP2)
3727 		panic("pmap_zero_page: CMAP2 busy");
3728 	sched_pin();
3729 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
3730 	invlcaddr(sysmaps->CADDR2);
3731 	if (off == 0 && size == PAGE_SIZE)
3732 		pagezero(sysmaps->CADDR2);
3733 	else
3734 		bzero((char *)sysmaps->CADDR2 + off, size);
3735 	*sysmaps->CMAP2 = 0;
3736 	sched_unpin();
3737 	mtx_unlock(&sysmaps->lock);
3738 }
3739 
3740 /*
3741  *	pmap_zero_page_idle zeros the specified hardware page by mapping
3742  *	the page into KVM and using bzero to clear its contents.  This
3743  *	is intended to be called from the vm_pagezero process only and
3744  *	outside of Giant.
3745  */
3746 void
3747 pmap_zero_page_idle(vm_page_t m)
3748 {
3749 
3750 	if (*CMAP3)
3751 		panic("pmap_zero_page: CMAP3 busy");
3752 	sched_pin();
3753 	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
3754 	invlcaddr(CADDR3);
3755 	pagezero(CADDR3);
3756 	*CMAP3 = 0;
3757 	sched_unpin();
3758 }
3759 
3760 /*
3761  *	pmap_copy_page copies the specified (machine independent)
3762  *	page by mapping the page into virtual memory and using
3763  *	bcopy to copy the page, one machine dependent page at a
3764  *	time.
3765  */
3766 void
3767 pmap_copy_page(vm_page_t src, vm_page_t dst)
3768 {
3769 	struct sysmaps *sysmaps;
3770 
3771 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
3772 	mtx_lock(&sysmaps->lock);
3773 	if (*sysmaps->CMAP1)
3774 		panic("pmap_copy_page: CMAP1 busy");
3775 	if (*sysmaps->CMAP2)
3776 		panic("pmap_copy_page: CMAP2 busy");
3777 	sched_pin();
3778 	invlpg((u_int)sysmaps->CADDR1);
3779 	invlpg((u_int)sysmaps->CADDR2);
3780 	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
3781 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
3782 	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
3783 	*sysmaps->CMAP1 = 0;
3784 	*sysmaps->CMAP2 = 0;
3785 	sched_unpin();
3786 	mtx_unlock(&sysmaps->lock);
3787 }
3788 
3789 /*
3790  * Returns true if the pmap's pv is one of the first
3791  * 16 pvs linked to from this page.  This count may
3792  * be changed upwards or downwards in the future; it
3793  * is only necessary that true be returned for a small
3794  * subset of pmaps for proper page aging.
3795  */
3796 boolean_t
3797 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3798 {
3799 	struct md_page *pvh;
3800 	pv_entry_t pv;
3801 	int loops = 0;
3802 
3803 	if (m->flags & PG_FICTITIOUS)
3804 		return FALSE;
3805 
3806 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3807 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3808 		if (PV_PMAP(pv) == pmap) {
3809 			return TRUE;
3810 		}
3811 		loops++;
3812 		if (loops >= 16)
3813 			break;
3814 	}
3815 	if (loops < 16) {
3816 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3817 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3818 			if (PV_PMAP(pv) == pmap)
3819 				return (TRUE);
3820 			loops++;
3821 			if (loops >= 16)
3822 				break;
3823 		}
3824 	}
3825 	return (FALSE);
3826 }
3827 
3828 /*
3829  *	pmap_page_wired_mappings:
3830  *
3831  *	Return the number of managed mappings to the given physical page
3832  *	that are wired.
3833  */
3834 int
3835 pmap_page_wired_mappings(vm_page_t m)
3836 {
3837 	int count;
3838 
3839 	count = 0;
3840 	if ((m->flags & PG_FICTITIOUS) != 0)
3841 		return (count);
3842 	count = pmap_pvh_wired_mappings(&m->md, count);
3843 	return (pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count));
3844 }
3845 
3846 /*
3847  *	pmap_pvh_wired_mappings:
3848  *
3849  *	Return the updated number "count" of managed mappings that are wired.
3850  */
3851 static int
3852 pmap_pvh_wired_mappings(struct md_page *pvh, int count)
3853 {
3854 	pmap_t pmap;
3855 	pt_entry_t *pte;
3856 	pv_entry_t pv;
3857 
3858 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3859 	sched_pin();
3860 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3861 		pmap = PV_PMAP(pv);
3862 		PMAP_LOCK(pmap);
3863 		pte = pmap_pte_quick(pmap, pv->pv_va);
3864 		if ((*pte & PG_W) != 0)
3865 			count++;
3866 		PMAP_UNLOCK(pmap);
3867 	}
3868 	sched_unpin();
3869 	return (count);
3870 }
3871 
3872 /*
3873  * Returns TRUE if the given page is mapped individually or as part of
3874  * a 4mpage.  Otherwise, returns FALSE.
3875  */
3876 boolean_t
3877 pmap_page_is_mapped(vm_page_t m)
3878 {
3879 	struct md_page *pvh;
3880 
3881 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
3882 		return (FALSE);
3883 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3884 	if (TAILQ_EMPTY(&m->md.pv_list)) {
3885 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3886 		return (!TAILQ_EMPTY(&pvh->pv_list));
3887 	} else
3888 		return (TRUE);
3889 }
3890 
3891 /*
3892  * Remove all pages from specified address space
3893  * this aids process exit speeds.  Also, this code
3894  * is special cased for current process only, but
3895  * can have the more generic (and slightly slower)
3896  * mode enabled.  This is much faster than pmap_remove
3897  * in the case of running down an entire address space.
3898  */
3899 void
3900 pmap_remove_pages(pmap_t pmap)
3901 {
3902 	pt_entry_t *pte, tpte;
3903 	vm_page_t free = NULL;
3904 	vm_page_t m, mpte, mt;
3905 	pv_entry_t pv;
3906 	struct md_page *pvh;
3907 	struct pv_chunk *pc, *npc;
3908 	int field, idx;
3909 	int32_t bit;
3910 	uint32_t inuse, bitmask;
3911 	int allfree;
3912 
3913 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
3914 		printf("warning: pmap_remove_pages called with non-current pmap\n");
3915 		return;
3916 	}
3917 	vm_page_lock_queues();
3918 	PMAP_LOCK(pmap);
3919 	sched_pin();
3920 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3921 		allfree = 1;
3922 		for (field = 0; field < _NPCM; field++) {
3923 			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
3924 			while (inuse != 0) {
3925 				bit = bsfl(inuse);
3926 				bitmask = 1UL << bit;
3927 				idx = field * 32 + bit;
3928 				pv = &pc->pc_pventry[idx];
3929 				inuse &= ~bitmask;
3930 
3931 				pte = pmap_pde(pmap, pv->pv_va);
3932 				tpte = *pte;
3933 				if ((tpte & PG_PS) == 0) {
3934 					pte = vtopte(pv->pv_va);
3935 					tpte = *pte & ~PG_PTE_PAT;
3936 				}
3937 
3938 				if (tpte == 0) {
3939 					printf(
3940 					    "TPTE at %p  IS ZERO @ VA %08x\n",
3941 					    pte, pv->pv_va);
3942 					panic("bad pte");
3943 				}
3944 
3945 /*
3946  * We cannot remove wired pages from a process' mapping at this time
3947  */
3948 				if (tpte & PG_W) {
3949 					allfree = 0;
3950 					continue;
3951 				}
3952 
3953 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
3954 				KASSERT(m->phys_addr == (tpte & PG_FRAME),
3955 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
3956 				    m, (uintmax_t)m->phys_addr,
3957 				    (uintmax_t)tpte));
3958 
3959 				KASSERT(m < &vm_page_array[vm_page_array_size],
3960 					("pmap_remove_pages: bad tpte %#jx",
3961 					(uintmax_t)tpte));
3962 
3963 				pte_clear(pte);
3964 
3965 				/*
3966 				 * Update the vm_page_t clean/reference bits.
3967 				 */
3968 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3969 					if ((tpte & PG_PS) != 0) {
3970 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3971 							vm_page_dirty(mt);
3972 					} else
3973 						vm_page_dirty(m);
3974 				}
3975 
3976 				/* Mark free */
3977 				PV_STAT(pv_entry_frees++);
3978 				PV_STAT(pv_entry_spare++);
3979 				pv_entry_count--;
3980 				pc->pc_map[field] |= bitmask;
3981 				if ((tpte & PG_PS) != 0) {
3982 					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
3983 					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
3984 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
3985 					if (TAILQ_EMPTY(&pvh->pv_list)) {
3986 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3987 							if (TAILQ_EMPTY(&mt->md.pv_list))
3988 								vm_page_flag_clear(mt, PG_WRITEABLE);
3989 					}
3990 					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
3991 					if (mpte != NULL) {
3992 						pmap_remove_pt_page(pmap, mpte);
3993 						KASSERT(mpte->wire_count == NPTEPG,
3994 						    ("pmap_remove_pages: pte page wire count error"));
3995 						mpte->wire_count = 0;
3996 						pmap_add_delayed_free_list(mpte, &free, FALSE);
3997 						atomic_subtract_int(&cnt.v_wire_count, 1);
3998 					}
3999 				} else {
4000 					pmap->pm_stats.resident_count--;
4001 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4002 					if (TAILQ_EMPTY(&m->md.pv_list)) {
4003 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4004 						if (TAILQ_EMPTY(&pvh->pv_list))
4005 							vm_page_flag_clear(m, PG_WRITEABLE);
4006 					}
4007 					pmap_unuse_pt(pmap, pv->pv_va, &free);
4008 				}
4009 			}
4010 		}
4011 		if (allfree) {
4012 			PV_STAT(pv_entry_spare -= _NPCPV);
4013 			PV_STAT(pc_chunk_count--);
4014 			PV_STAT(pc_chunk_frees++);
4015 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4016 			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
4017 			pmap_qremove((vm_offset_t)pc, 1);
4018 			vm_page_unwire(m, 0);
4019 			vm_page_free(m);
4020 			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
4021 		}
4022 	}
4023 	sched_unpin();
4024 	pmap_invalidate_all(pmap);
4025 	vm_page_unlock_queues();
4026 	PMAP_UNLOCK(pmap);
4027 	pmap_free_zero_pages(free);
4028 }
4029 
4030 /*
4031  *	pmap_is_modified:
4032  *
4033  *	Return whether or not the specified physical page was modified
4034  *	in any physical maps.
4035  */
4036 boolean_t
4037 pmap_is_modified(vm_page_t m)
4038 {
4039 
4040 	if (m->flags & PG_FICTITIOUS)
4041 		return (FALSE);
4042 	if (pmap_is_modified_pvh(&m->md))
4043 		return (TRUE);
4044 	return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4045 }
4046 
4047 /*
4048  * Returns TRUE if any of the given mappings were used to modify
4049  * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
4050  * mappings are supported.
4051  */
4052 static boolean_t
4053 pmap_is_modified_pvh(struct md_page *pvh)
4054 {
4055 	pv_entry_t pv;
4056 	pt_entry_t *pte;
4057 	pmap_t pmap;
4058 	boolean_t rv;
4059 
4060 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4061 	rv = FALSE;
4062 	sched_pin();
4063 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4064 		pmap = PV_PMAP(pv);
4065 		PMAP_LOCK(pmap);
4066 		pte = pmap_pte_quick(pmap, pv->pv_va);
4067 		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4068 		PMAP_UNLOCK(pmap);
4069 		if (rv)
4070 			break;
4071 	}
4072 	sched_unpin();
4073 	return (rv);
4074 }
4075 
4076 /*
4077  *	pmap_is_prefaultable:
4078  *
4079  *	Return whether or not the specified virtual address is elgible
4080  *	for prefault.
4081  */
4082 boolean_t
4083 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4084 {
4085 	pd_entry_t *pde;
4086 	pt_entry_t *pte;
4087 	boolean_t rv;
4088 
4089 	rv = FALSE;
4090 	PMAP_LOCK(pmap);
4091 	pde = pmap_pde(pmap, addr);
4092 	if (*pde != 0 && (*pde & PG_PS) == 0) {
4093 		pte = vtopte(addr);
4094 		rv = *pte == 0;
4095 	}
4096 	PMAP_UNLOCK(pmap);
4097 	return (rv);
4098 }
4099 
4100 /*
4101  * Clear the write and modified bits in each of the given page's mappings.
4102  */
4103 void
4104 pmap_remove_write(vm_page_t m)
4105 {
4106 	struct md_page *pvh;
4107 	pv_entry_t next_pv, pv;
4108 	pmap_t pmap;
4109 	pd_entry_t *pde;
4110 	pt_entry_t oldpte, *pte;
4111 	vm_offset_t va;
4112 
4113 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4114 	if ((m->flags & PG_FICTITIOUS) != 0 ||
4115 	    (m->flags & PG_WRITEABLE) == 0)
4116 		return;
4117 	sched_pin();
4118 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4119 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4120 		va = pv->pv_va;
4121 		pmap = PV_PMAP(pv);
4122 		PMAP_LOCK(pmap);
4123 		pde = pmap_pde(pmap, va);
4124 		if ((*pde & PG_RW) != 0)
4125 			(void)pmap_demote_pde(pmap, pde, va);
4126 		PMAP_UNLOCK(pmap);
4127 	}
4128 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4129 		pmap = PV_PMAP(pv);
4130 		PMAP_LOCK(pmap);
4131 		pde = pmap_pde(pmap, pv->pv_va);
4132 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found"
4133 		    " a 4mpage in page %p's pv list", m));
4134 		pte = pmap_pte_quick(pmap, pv->pv_va);
4135 retry:
4136 		oldpte = *pte;
4137 		if ((oldpte & PG_RW) != 0) {
4138 			/*
4139 			 * Regardless of whether a pte is 32 or 64 bits
4140 			 * in size, PG_RW and PG_M are among the least
4141 			 * significant 32 bits.
4142 			 */
4143 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4144 			    oldpte & ~(PG_RW | PG_M)))
4145 				goto retry;
4146 			if ((oldpte & PG_M) != 0)
4147 				vm_page_dirty(m);
4148 			pmap_invalidate_page(pmap, pv->pv_va);
4149 		}
4150 		PMAP_UNLOCK(pmap);
4151 	}
4152 	vm_page_flag_clear(m, PG_WRITEABLE);
4153 	sched_unpin();
4154 }
4155 
4156 /*
4157  *	pmap_ts_referenced:
4158  *
4159  *	Return a count of reference bits for a page, clearing those bits.
4160  *	It is not necessary for every reference bit to be cleared, but it
4161  *	is necessary that 0 only be returned when there are truly no
4162  *	reference bits set.
4163  *
4164  *	XXX: The exact number of bits to check and clear is a matter that
4165  *	should be tested and standardized at some point in the future for
4166  *	optimal aging of shared pages.
4167  */
4168 int
4169 pmap_ts_referenced(vm_page_t m)
4170 {
4171 	struct md_page *pvh;
4172 	pv_entry_t pv, pvf, pvn;
4173 	pmap_t pmap;
4174 	pd_entry_t oldpde, *pde;
4175 	pt_entry_t *pte;
4176 	vm_offset_t va;
4177 	int rtval = 0;
4178 
4179 	if (m->flags & PG_FICTITIOUS)
4180 		return (rtval);
4181 	sched_pin();
4182 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4183 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4184 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4185 		va = pv->pv_va;
4186 		pmap = PV_PMAP(pv);
4187 		PMAP_LOCK(pmap);
4188 		pde = pmap_pde(pmap, va);
4189 		oldpde = *pde;
4190 		if ((oldpde & PG_A) != 0) {
4191 			if (pmap_demote_pde(pmap, pde, va)) {
4192 				if ((oldpde & PG_W) == 0) {
4193 					/*
4194 					 * Remove the mapping to a single page
4195 					 * so that a subsequent access may
4196 					 * repromote.  Since the underlying
4197 					 * page table page is fully populated,
4198 					 * this removal never frees a page
4199 					 * table page.
4200 					 */
4201 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4202 					    PG_PS_FRAME);
4203 					pmap_remove_page(pmap, va, NULL);
4204 					rtval++;
4205 					if (rtval > 4) {
4206 						PMAP_UNLOCK(pmap);
4207 						return (rtval);
4208 					}
4209 				}
4210 			}
4211 		}
4212 		PMAP_UNLOCK(pmap);
4213 	}
4214 	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4215 		pvf = pv;
4216 		do {
4217 			pvn = TAILQ_NEXT(pv, pv_list);
4218 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
4219 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
4220 			pmap = PV_PMAP(pv);
4221 			PMAP_LOCK(pmap);
4222 			pde = pmap_pde(pmap, pv->pv_va);
4223 			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
4224 			    " found a 4mpage in page %p's pv list", m));
4225 			pte = pmap_pte_quick(pmap, pv->pv_va);
4226 			if ((*pte & PG_A) != 0) {
4227 				atomic_clear_int((u_int *)pte, PG_A);
4228 				pmap_invalidate_page(pmap, pv->pv_va);
4229 				rtval++;
4230 				if (rtval > 4)
4231 					pvn = NULL;
4232 			}
4233 			PMAP_UNLOCK(pmap);
4234 		} while ((pv = pvn) != NULL && pv != pvf);
4235 	}
4236 	sched_unpin();
4237 	return (rtval);
4238 }
4239 
4240 /*
4241  *	Clear the modify bits on the specified physical page.
4242  */
4243 void
4244 pmap_clear_modify(vm_page_t m)
4245 {
4246 	struct md_page *pvh;
4247 	pv_entry_t next_pv, pv;
4248 	pmap_t pmap;
4249 	pd_entry_t oldpde, *pde;
4250 	pt_entry_t oldpte, *pte;
4251 	vm_offset_t va;
4252 
4253 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4254 	if ((m->flags & PG_FICTITIOUS) != 0)
4255 		return;
4256 	sched_pin();
4257 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4258 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4259 		va = pv->pv_va;
4260 		pmap = PV_PMAP(pv);
4261 		PMAP_LOCK(pmap);
4262 		pde = pmap_pde(pmap, va);
4263 		oldpde = *pde;
4264 		if ((oldpde & PG_RW) != 0) {
4265 			if (pmap_demote_pde(pmap, pde, va)) {
4266 				if ((oldpde & PG_W) == 0) {
4267 					/*
4268 					 * Write protect the mapping to a
4269 					 * single page so that a subsequent
4270 					 * write access may repromote.
4271 					 */
4272 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
4273 					    PG_PS_FRAME);
4274 					pte = pmap_pte_quick(pmap, va);
4275 					oldpte = *pte;
4276 					if ((oldpte & PG_V) != 0) {
4277 						/*
4278 						 * Regardless of whether a pte is 32 or 64 bits
4279 						 * in size, PG_RW and PG_M are among the least
4280 						 * significant 32 bits.
4281 						 */
4282 						while (!atomic_cmpset_int((u_int *)pte,
4283 						    oldpte,
4284 						    oldpte & ~(PG_M | PG_RW)))
4285 							oldpte = *pte;
4286 						vm_page_dirty(m);
4287 						pmap_invalidate_page(pmap, va);
4288 					}
4289 				}
4290 			}
4291 		}
4292 		PMAP_UNLOCK(pmap);
4293 	}
4294 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4295 		pmap = PV_PMAP(pv);
4296 		PMAP_LOCK(pmap);
4297 		pde = pmap_pde(pmap, pv->pv_va);
4298 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
4299 		    " a 4mpage in page %p's pv list", m));
4300 		pte = pmap_pte_quick(pmap, pv->pv_va);
4301 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4302 			/*
4303 			 * Regardless of whether a pte is 32 or 64 bits
4304 			 * in size, PG_M is among the least significant
4305 			 * 32 bits.
4306 			 */
4307 			atomic_clear_int((u_int *)pte, PG_M);
4308 			pmap_invalidate_page(pmap, pv->pv_va);
4309 		}
4310 		PMAP_UNLOCK(pmap);
4311 	}
4312 	sched_unpin();
4313 }
4314 
4315 /*
4316  *	pmap_clear_reference:
4317  *
4318  *	Clear the reference bit on the specified physical page.
4319  */
4320 void
4321 pmap_clear_reference(vm_page_t m)
4322 {
4323 	struct md_page *pvh;
4324 	pv_entry_t next_pv, pv;
4325 	pmap_t pmap;
4326 	pd_entry_t oldpde, *pde;
4327 	pt_entry_t *pte;
4328 	vm_offset_t va;
4329 
4330 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4331 	if ((m->flags & PG_FICTITIOUS) != 0)
4332 		return;
4333 	sched_pin();
4334 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4335 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4336 		va = pv->pv_va;
4337 		pmap = PV_PMAP(pv);
4338 		PMAP_LOCK(pmap);
4339 		pde = pmap_pde(pmap, va);
4340 		oldpde = *pde;
4341 		if ((oldpde & PG_A) != 0) {
4342 			if (pmap_demote_pde(pmap, pde, va)) {
4343 				/*
4344 				 * Remove the mapping to a single page so
4345 				 * that a subsequent access may repromote.
4346 				 * Since the underlying page table page is
4347 				 * fully populated, this removal never frees
4348 				 * a page table page.
4349 				 */
4350 				va += VM_PAGE_TO_PHYS(m) - (oldpde &
4351 				    PG_PS_FRAME);
4352 				pmap_remove_page(pmap, va, NULL);
4353 			}
4354 		}
4355 		PMAP_UNLOCK(pmap);
4356 	}
4357 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4358 		pmap = PV_PMAP(pv);
4359 		PMAP_LOCK(pmap);
4360 		pde = pmap_pde(pmap, pv->pv_va);
4361 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
4362 		    " a 4mpage in page %p's pv list", m));
4363 		pte = pmap_pte_quick(pmap, pv->pv_va);
4364 		if ((*pte & PG_A) != 0) {
4365 			/*
4366 			 * Regardless of whether a pte is 32 or 64 bits
4367 			 * in size, PG_A is among the least significant
4368 			 * 32 bits.
4369 			 */
4370 			atomic_clear_int((u_int *)pte, PG_A);
4371 			pmap_invalidate_page(pmap, pv->pv_va);
4372 		}
4373 		PMAP_UNLOCK(pmap);
4374 	}
4375 	sched_unpin();
4376 }
4377 
4378 /*
4379  * Miscellaneous support routines follow
4380  */
4381 
4382 /*
4383  * Map a set of physical memory pages into the kernel virtual
4384  * address space. Return a pointer to where it is mapped. This
4385  * routine is intended to be used for mapping device memory,
4386  * NOT real memory.
4387  */
4388 void *
4389 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4390 {
4391 	vm_offset_t va, tmpva, offset;
4392 
4393 	offset = pa & PAGE_MASK;
4394 	size = roundup(offset + size, PAGE_SIZE);
4395 	pa = pa & PG_FRAME;
4396 
4397 	if (pa < KERNLOAD && pa + size <= KERNLOAD)
4398 		va = KERNBASE + pa;
4399 	else
4400 		va = kmem_alloc_nofault(kernel_map, size);
4401 	if (!va)
4402 		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4403 
4404 	for (tmpva = va; size > 0; ) {
4405 		pmap_kenter_attr(tmpva, pa, mode);
4406 		size -= PAGE_SIZE;
4407 		tmpva += PAGE_SIZE;
4408 		pa += PAGE_SIZE;
4409 	}
4410 	pmap_invalidate_range(kernel_pmap, va, tmpva);
4411 	pmap_invalidate_cache();
4412 	return ((void *)(va + offset));
4413 }
4414 
4415 void *
4416 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4417 {
4418 
4419 	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4420 }
4421 
4422 void *
4423 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4424 {
4425 
4426 	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4427 }
4428 
4429 void
4430 pmap_unmapdev(vm_offset_t va, vm_size_t size)
4431 {
4432 	vm_offset_t base, offset, tmpva;
4433 
4434 	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
4435 		return;
4436 	base = trunc_page(va);
4437 	offset = va & PAGE_MASK;
4438 	size = roundup(offset + size, PAGE_SIZE);
4439 	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
4440 		pmap_kremove(tmpva);
4441 	pmap_invalidate_range(kernel_pmap, va, tmpva);
4442 	kmem_free(kernel_map, base, size);
4443 }
4444 
4445 int
4446 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4447 {
4448 	vm_offset_t base, offset, tmpva;
4449 	pt_entry_t *pte;
4450 	u_int opte, npte;
4451 	pd_entry_t *pde;
4452 
4453 	base = trunc_page(va);
4454 	offset = va & PAGE_MASK;
4455 	size = roundup(offset + size, PAGE_SIZE);
4456 
4457 	/*
4458 	 * Only supported on kernel virtual addresses above the recursive map.
4459 	 */
4460 	if (base < VM_MIN_KERNEL_ADDRESS)
4461 		return (EINVAL);
4462 
4463 	/* 4MB pages and pages that aren't mapped aren't supported. */
4464 	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
4465 		pde = pmap_pde(kernel_pmap, tmpva);
4466 		if (*pde & PG_PS)
4467 			return (EINVAL);
4468 		if (*pde == 0)
4469 			return (EINVAL);
4470 		pte = vtopte(tmpva);
4471 		if (*pte == 0)
4472 			return (EINVAL);
4473 	}
4474 
4475 	/*
4476 	 * Ok, all the pages exist and are 4k, so run through them updating
4477 	 * their cache mode.
4478 	 */
4479 	for (tmpva = base; size > 0; ) {
4480 		pte = vtopte(tmpva);
4481 
4482 		/*
4483 		 * The cache mode bits are all in the low 32-bits of the
4484 		 * PTE, so we can just spin on updating the low 32-bits.
4485 		 */
4486 		do {
4487 			opte = *(u_int *)pte;
4488 			npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
4489 			npte |= pmap_cache_bits(mode, 0);
4490 		} while (npte != opte &&
4491 		    !atomic_cmpset_int((u_int *)pte, opte, npte));
4492 		tmpva += PAGE_SIZE;
4493 		size -= PAGE_SIZE;
4494 	}
4495 
4496 	/*
4497 	 * Flush CPU caches to make sure any data isn't cached that shouldn't
4498 	 * be, etc.
4499 	 */
4500 	pmap_invalidate_range(kernel_pmap, base, tmpva);
4501 	pmap_invalidate_cache();
4502 	return (0);
4503 }
4504 
4505 /*
4506  * perform the pmap work for mincore
4507  */
4508 int
4509 pmap_mincore(pmap_t pmap, vm_offset_t addr)
4510 {
4511 	pd_entry_t *pdep;
4512 	pt_entry_t *ptep, pte;
4513 	vm_paddr_t pa;
4514 	vm_page_t m;
4515 	int val = 0;
4516 
4517 	PMAP_LOCK(pmap);
4518 	pdep = pmap_pde(pmap, addr);
4519 	if (*pdep != 0) {
4520 		if (*pdep & PG_PS) {
4521 			pte = *pdep;
4522 			val = MINCORE_SUPER;
4523 			/* Compute the physical address of the 4KB page. */
4524 			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
4525 			    PG_FRAME;
4526 		} else {
4527 			ptep = pmap_pte(pmap, addr);
4528 			pte = *ptep;
4529 			pmap_pte_release(ptep);
4530 			pa = pte & PG_FRAME;
4531 		}
4532 	} else {
4533 		pte = 0;
4534 		pa = 0;
4535 	}
4536 	PMAP_UNLOCK(pmap);
4537 
4538 	if (pte != 0) {
4539 		val |= MINCORE_INCORE;
4540 		if ((pte & PG_MANAGED) == 0)
4541 			return val;
4542 
4543 		m = PHYS_TO_VM_PAGE(pa);
4544 
4545 		/*
4546 		 * Modified by us
4547 		 */
4548 		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4549 			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
4550 		else {
4551 			/*
4552 			 * Modified by someone else
4553 			 */
4554 			vm_page_lock_queues();
4555 			if (m->dirty || pmap_is_modified(m))
4556 				val |= MINCORE_MODIFIED_OTHER;
4557 			vm_page_unlock_queues();
4558 		}
4559 		/*
4560 		 * Referenced by us
4561 		 */
4562 		if (pte & PG_A)
4563 			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
4564 		else {
4565 			/*
4566 			 * Referenced by someone else
4567 			 */
4568 			vm_page_lock_queues();
4569 			if ((m->flags & PG_REFERENCED) ||
4570 			    pmap_ts_referenced(m)) {
4571 				val |= MINCORE_REFERENCED_OTHER;
4572 				vm_page_flag_set(m, PG_REFERENCED);
4573 			}
4574 			vm_page_unlock_queues();
4575 		}
4576 	}
4577 	return val;
4578 }
4579 
4580 void
4581 pmap_activate(struct thread *td)
4582 {
4583 	pmap_t	pmap, oldpmap;
4584 	u_int32_t  cr3;
4585 
4586 	critical_enter();
4587 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
4588 	oldpmap = PCPU_GET(curpmap);
4589 #if defined(SMP)
4590 	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
4591 	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
4592 #else
4593 	oldpmap->pm_active &= ~1;
4594 	pmap->pm_active |= 1;
4595 #endif
4596 #ifdef PAE
4597 	cr3 = vtophys(pmap->pm_pdpt);
4598 #else
4599 	cr3 = vtophys(pmap->pm_pdir);
4600 #endif
4601 	/*
4602 	 * pmap_activate is for the current thread on the current cpu
4603 	 */
4604 	td->td_pcb->pcb_cr3 = cr3;
4605 	load_cr3(cr3);
4606 	PCPU_SET(curpmap, pmap);
4607 	critical_exit();
4608 }
4609 
4610 /*
4611  *	Increase the starting virtual address of the given mapping if a
4612  *	different alignment might result in more superpage mappings.
4613  */
4614 void
4615 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
4616     vm_offset_t *addr, vm_size_t size)
4617 {
4618 	vm_offset_t superpage_offset;
4619 
4620 	if (size < NBPDR)
4621 		return;
4622 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
4623 		offset += ptoa(object->pg_color);
4624 	superpage_offset = offset & PDRMASK;
4625 	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
4626 	    (*addr & PDRMASK) == superpage_offset)
4627 		return;
4628 	if ((*addr & PDRMASK) < superpage_offset)
4629 		*addr = (*addr & ~PDRMASK) + superpage_offset;
4630 	else
4631 		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
4632 }
4633 
4634 
4635 #if defined(PMAP_DEBUG)
4636 pmap_pid_dump(int pid)
4637 {
4638 	pmap_t pmap;
4639 	struct proc *p;
4640 	int npte = 0;
4641 	int index;
4642 
4643 	sx_slock(&allproc_lock);
4644 	FOREACH_PROC_IN_SYSTEM(p) {
4645 		if (p->p_pid != pid)
4646 			continue;
4647 
4648 		if (p->p_vmspace) {
4649 			int i,j;
4650 			index = 0;
4651 			pmap = vmspace_pmap(p->p_vmspace);
4652 			for (i = 0; i < NPDEPTD; i++) {
4653 				pd_entry_t *pde;
4654 				pt_entry_t *pte;
4655 				vm_offset_t base = i << PDRSHIFT;
4656 
4657 				pde = &pmap->pm_pdir[i];
4658 				if (pde && pmap_pde_v(pde)) {
4659 					for (j = 0; j < NPTEPG; j++) {
4660 						vm_offset_t va = base + (j << PAGE_SHIFT);
4661 						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
4662 							if (index) {
4663 								index = 0;
4664 								printf("\n");
4665 							}
4666 							sx_sunlock(&allproc_lock);
4667 							return npte;
4668 						}
4669 						pte = pmap_pte(pmap, va);
4670 						if (pte && pmap_pte_v(pte)) {
4671 							pt_entry_t pa;
4672 							vm_page_t m;
4673 							pa = *pte;
4674 							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
4675 							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
4676 								va, pa, m->hold_count, m->wire_count, m->flags);
4677 							npte++;
4678 							index++;
4679 							if (index >= 2) {
4680 								index = 0;
4681 								printf("\n");
4682 							} else {
4683 								printf(" ");
4684 							}
4685 						}
4686 					}
4687 				}
4688 			}
4689 		}
4690 	}
4691 	sx_sunlock(&allproc_lock);
4692 	return npte;
4693 }
4694 #endif
4695 
4696 #if defined(DEBUG)
4697 
4698 static void	pads(pmap_t pm);
4699 void		pmap_pvdump(vm_offset_t pa);
4700 
4701 /* print address space of pmap*/
4702 static void
4703 pads(pmap_t pm)
4704 {
4705 	int i, j;
4706 	vm_paddr_t va;
4707 	pt_entry_t *ptep;
4708 
4709 	if (pm == kernel_pmap)
4710 		return;
4711 	for (i = 0; i < NPDEPTD; i++)
4712 		if (pm->pm_pdir[i])
4713 			for (j = 0; j < NPTEPG; j++) {
4714 				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
4715 				if (pm == kernel_pmap && va < KERNBASE)
4716 					continue;
4717 				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
4718 					continue;
4719 				ptep = pmap_pte(pm, va);
4720 				if (pmap_pte_v(ptep))
4721 					printf("%x:%x ", va, *ptep);
4722 			};
4723 
4724 }
4725 
4726 void
4727 pmap_pvdump(vm_paddr_t pa)
4728 {
4729 	pv_entry_t pv;
4730 	pmap_t pmap;
4731 	vm_page_t m;
4732 
4733 	printf("pa %x", pa);
4734 	m = PHYS_TO_VM_PAGE(pa);
4735 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4736 		pmap = PV_PMAP(pv);
4737 		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
4738 		pads(pmap);
4739 	}
4740 	printf(" ");
4741 }
4742 #endif
4743