xref: /dragonfly/sys/platform/pc64/x86_64/pmap.c (revision 938e74dc)
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * Copyright (c) 1994 John S. Dyson
4  * Copyright (c) 1994 David Greenman
5  * Copyright (c) 2003 Peter Wemm
6  * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
7  * Copyright (c) 2008, 2009 The DragonFly Project.
8  * Copyright (c) 2008, 2009 Jordan Gordeev.
9  * Copyright (c) 2011-2012 Matthew Dillon
10  * All rights reserved.
11  *
12  * This code is derived from software contributed to Berkeley by
13  * the Systems Programming Group of the University of Utah Computer
14  * Science Department and William Jolitz of UUNET Technologies Inc.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  *    notice, this list of conditions and the following disclaimer.
21  * 2. Redistributions in binary form must reproduce the above copyright
22  *    notice, this list of conditions and the following disclaimer in the
23  *    documentation and/or other materials provided with the distribution.
24  * 3. All advertising materials mentioning features or use of this software
25  *    must display the following acknowledgement:
26  *	This product includes software developed by the University of
27  *	California, Berkeley and its contributors.
28  * 4. Neither the name of the University nor the names of its contributors
29  *    may be used to endorse or promote products derived from this software
30  *    without specific prior written permission.
31  *
32  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
33  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
36  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
38  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
39  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
41  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
42  * SUCH DAMAGE.
43  */
44 /*
45  * Manage physical address maps for x86-64 systems.
46  */
47 
48 #if JG
49 #include "opt_disable_pse.h"
50 #include "opt_pmap.h"
51 #endif
52 #include "opt_msgbuf.h"
53 
54 #include <sys/param.h>
55 #include <sys/kernel.h>
56 #include <sys/proc.h>
57 #include <sys/msgbuf.h>
58 #include <sys/vmmeter.h>
59 #include <sys/mman.h>
60 #include <sys/systm.h>
61 
62 #include <vm/vm.h>
63 #include <vm/vm_param.h>
64 #include <sys/sysctl.h>
65 #include <sys/lock.h>
66 #include <vm/vm_kern.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_map.h>
69 #include <vm/vm_object.h>
70 #include <vm/vm_extern.h>
71 #include <vm/vm_pageout.h>
72 #include <vm/vm_pager.h>
73 #include <vm/vm_zone.h>
74 
75 #include <sys/user.h>
76 #include <sys/thread2.h>
77 #include <sys/sysref2.h>
78 #include <sys/spinlock2.h>
79 #include <vm/vm_page2.h>
80 
81 #include <machine/cputypes.h>
82 #include <machine/md_var.h>
83 #include <machine/specialreg.h>
84 #include <machine/smp.h>
85 #include <machine_base/apic/apicreg.h>
86 #include <machine/globaldata.h>
87 #include <machine/pmap.h>
88 #include <machine/pmap_inval.h>
89 #include <machine/inttypes.h>
90 
91 #include <ddb/ddb.h>
92 
93 #define PMAP_KEEP_PDIRS
94 #ifndef PMAP_SHPGPERPROC
95 #define PMAP_SHPGPERPROC 2000
96 #endif
97 
98 #if defined(DIAGNOSTIC)
99 #define PMAP_DIAGNOSTIC
100 #endif
101 
102 #define MINPV 2048
103 
104 /*
105  * pmap debugging will report who owns a pv lock when blocking.
106  */
107 #ifdef PMAP_DEBUG
108 
109 #define PMAP_DEBUG_DECL		,const char *func, int lineno
110 #define PMAP_DEBUG_ARGS		, __func__, __LINE__
111 #define PMAP_DEBUG_COPY		, func, lineno
112 
113 #define pv_get(pmap, pindex)		_pv_get(pmap, pindex		\
114 							PMAP_DEBUG_ARGS)
115 #define pv_lock(pv)			_pv_lock(pv			\
116 							PMAP_DEBUG_ARGS)
117 #define pv_hold_try(pv)			_pv_hold_try(pv			\
118 							PMAP_DEBUG_ARGS)
119 #define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp	\
120 							PMAP_DEBUG_ARGS)
121 
122 #else
123 
124 #define PMAP_DEBUG_DECL
125 #define PMAP_DEBUG_ARGS
126 #define PMAP_DEBUG_COPY
127 
128 #define pv_get(pmap, pindex)		_pv_get(pmap, pindex)
129 #define pv_lock(pv)			_pv_lock(pv)
130 #define pv_hold_try(pv)			_pv_hold_try(pv)
131 #define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp)
132 
133 #endif
134 
135 /*
136  * Get PDEs and PTEs for user/kernel address space
137  */
138 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
139 
140 #define pmap_pde_v(pmap, pte)		((*(pd_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
141 #define pmap_pte_w(pmap, pte)		((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0)
142 #define pmap_pte_m(pmap, pte)		((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0)
143 #define pmap_pte_u(pmap, pte)		((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0)
144 #define pmap_pte_v(pmap, pte)		((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
145 
146 /*
147  * Given a map and a machine independent protection code,
148  * convert to a vax protection code.
149  */
150 #define pte_prot(m, p)		\
151 	(m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
152 static int protection_codes[PROTECTION_CODES_SIZE];
153 
154 struct pmap kernel_pmap;
155 static TAILQ_HEAD(,pmap)	pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
156 
157 MALLOC_DEFINE(M_OBJPMAP, "objpmap", "pmaps associated with VM objects");
158 
159 vm_paddr_t avail_start;		/* PA of first available physical page */
160 vm_paddr_t avail_end;		/* PA of last available physical page */
161 vm_offset_t virtual2_start;	/* cutout free area prior to kernel start */
162 vm_offset_t virtual2_end;
163 vm_offset_t virtual_start;	/* VA of first avail page (after kernel bss) */
164 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
165 vm_offset_t KvaStart;		/* VA start of KVA space */
166 vm_offset_t KvaEnd;		/* VA end of KVA space (non-inclusive) */
167 vm_offset_t KvaSize;		/* max size of kernel virtual address space */
168 static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
169 //static int pgeflag;		/* PG_G or-in */
170 //static int pseflag;		/* PG_PS or-in */
171 uint64_t PatMsr;
172 
173 static int ndmpdp;
174 static vm_paddr_t dmaplimit;
175 static int nkpt;
176 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
177 
178 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE];	/* PAT -> PG_ bits */
179 /*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/	/* PAT -> PG_ bits */
180 
181 static uint64_t KPTbase;
182 static uint64_t KPTphys;
183 static uint64_t	KPDphys;	/* phys addr of kernel level 2 */
184 static uint64_t	KPDbase;	/* phys addr of kernel level 2 @ KERNBASE */
185 uint64_t KPDPphys;	/* phys addr of kernel level 3 */
186 uint64_t KPML4phys;	/* phys addr of kernel level 4 */
187 
188 static uint64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
189 static uint64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
190 
191 /*
192  * Data for the pv entry allocation mechanism
193  */
194 static vm_zone_t pvzone;
195 static struct vm_zone pvzone_store;
196 static struct vm_object pvzone_obj;
197 static int pv_entry_max=0, pv_entry_high_water=0;
198 static int pmap_pagedaemon_waken = 0;
199 static struct pv_entry *pvinit;
200 
201 /*
202  * All those kernel PT submaps that BSD is so fond of
203  */
204 pt_entry_t *CMAP1 = NULL, *ptmmap;
205 caddr_t CADDR1 = NULL, ptvmmap = NULL;
206 static pt_entry_t *msgbufmap;
207 struct msgbuf *msgbufp=NULL;
208 
209 /*
210  * PMAP default PG_* bits. Needed to be able to add
211  * EPT/NPT pagetable pmap_bits for the VMM module
212  */
213 uint64_t pmap_bits_default[] = {
214 		REGULAR_PMAP,					/* TYPE_IDX		0 */
215 		X86_PG_V,					/* PG_V_IDX		1 */
216 		X86_PG_RW,					/* PG_RW_IDX		2 */
217 		X86_PG_U,					/* PG_U_IDX		3 */
218 		X86_PG_A,					/* PG_A_IDX		4 */
219 		X86_PG_M,					/* PG_M_IDX		5 */
220 		X86_PG_PS,					/* PG_PS_IDX3		6 */
221 		X86_PG_G,					/* PG_G_IDX		7 */
222 		X86_PG_AVAIL1,					/* PG_AVAIL1_IDX	8 */
223 		X86_PG_AVAIL2,					/* PG_AVAIL2_IDX	9 */
224 		X86_PG_AVAIL3,					/* PG_AVAIL3_IDX	10 */
225 		X86_PG_NC_PWT | X86_PG_NC_PCD,			/* PG_N_IDX	11 */
226 };
227 /*
228  * Crashdump maps.
229  */
230 static pt_entry_t *pt_crashdumpmap;
231 static caddr_t crashdumpmap;
232 
233 #ifdef PMAP_DEBUG2
234 static int pmap_enter_debug = 0;
235 SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW,
236     &pmap_enter_debug, 0, "Debug pmap_enter's");
237 #endif
238 static int pmap_yield_count = 64;
239 SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
240     &pmap_yield_count, 0, "Yield during init_pt/release");
241 static int pmap_mmu_optimize = 0;
242 SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW,
243     &pmap_mmu_optimize, 0, "Share page table pages when possible");
244 
245 #define DISABLE_PSE
246 
247 /* Standard user access funtions */
248 extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
249     size_t *lencopied);
250 extern int std_copyin (const void *udaddr, void *kaddr, size_t len);
251 extern int std_copyout (const void *kaddr, void *udaddr, size_t len);
252 extern int std_fubyte (const void *base);
253 extern int std_subyte (void *base, int byte);
254 extern long std_fuword (const void *base);
255 extern int std_suword (void *base, long word);
256 extern int std_suword32 (void *base, int word);
257 
258 static void pv_hold(pv_entry_t pv);
259 static int _pv_hold_try(pv_entry_t pv
260 				PMAP_DEBUG_DECL);
261 static void pv_drop(pv_entry_t pv);
262 static void _pv_lock(pv_entry_t pv
263 				PMAP_DEBUG_DECL);
264 static void pv_unlock(pv_entry_t pv);
265 static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
266 				PMAP_DEBUG_DECL);
267 static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex
268 				PMAP_DEBUG_DECL);
269 static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp);
270 static pv_entry_t pv_find(pmap_t pmap, vm_pindex_t pindex);
271 static void pv_put(pv_entry_t pv);
272 static void pv_free(pv_entry_t pv);
273 static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
274 static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
275 		      pv_entry_t *pvpp);
276 static pv_entry_t pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex,
277 		      pv_entry_t *pvpp, vm_map_entry_t entry, vm_offset_t va);
278 static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
279 		      struct pmap_inval_info *info);
280 static vm_page_t pmap_remove_pv_page(pv_entry_t pv);
281 static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp);
282 
283 struct pmap_scan_info;
284 static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
285 		      pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
286 		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
287 static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info,
288 		      pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
289 		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
290 
291 static void i386_protection_init (void);
292 static void create_pagetables(vm_paddr_t *firstaddr);
293 static void pmap_remove_all (vm_page_t m);
294 static boolean_t pmap_testbit (vm_page_t m, int bit);
295 
296 static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
297 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
298 
299 static void pmap_pinit_defaults(struct pmap *pmap);
300 
301 static unsigned pdir4mb;
302 
303 static int
304 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
305 {
306 	if (pv1->pv_pindex < pv2->pv_pindex)
307 		return(-1);
308 	if (pv1->pv_pindex > pv2->pv_pindex)
309 		return(1);
310 	return(0);
311 }
312 
313 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
314              pv_entry_compare, vm_pindex_t, pv_pindex);
315 
316 static __inline
317 void
318 pmap_page_stats_adding(vm_page_t m)
319 {
320 	globaldata_t gd = mycpu;
321 
322 	if (TAILQ_EMPTY(&m->md.pv_list)) {
323 		++gd->gd_vmtotal.t_arm;
324 	} else if (TAILQ_FIRST(&m->md.pv_list) ==
325 		   TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) {
326 		++gd->gd_vmtotal.t_armshr;
327 		++gd->gd_vmtotal.t_avmshr;
328 	} else {
329 		++gd->gd_vmtotal.t_avmshr;
330 	}
331 }
332 
333 static __inline
334 void
335 pmap_page_stats_deleting(vm_page_t m)
336 {
337 	globaldata_t gd = mycpu;
338 
339 	if (TAILQ_EMPTY(&m->md.pv_list)) {
340 		--gd->gd_vmtotal.t_arm;
341 	} else if (TAILQ_FIRST(&m->md.pv_list) ==
342 		   TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) {
343 		--gd->gd_vmtotal.t_armshr;
344 		--gd->gd_vmtotal.t_avmshr;
345 	} else {
346 		--gd->gd_vmtotal.t_avmshr;
347 	}
348 }
349 
350 /*
351  * Move the kernel virtual free pointer to the next
352  * 2MB.  This is used to help improve performance
353  * by using a large (2MB) page for much of the kernel
354  * (.text, .data, .bss)
355  */
356 static
357 vm_offset_t
358 pmap_kmem_choose(vm_offset_t addr)
359 {
360 	vm_offset_t newaddr = addr;
361 
362 	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
363 	return newaddr;
364 }
365 
366 /*
367  * pmap_pte_quick:
368  *
369  *	Super fast pmap_pte routine best used when scanning the pv lists.
370  *	This eliminates many course-grained invltlb calls.  Note that many of
371  *	the pv list scans are across different pmaps and it is very wasteful
372  *	to do an entire invltlb when checking a single mapping.
373  */
374 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);
375 
376 static
377 pt_entry_t *
378 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
379 {
380 	return pmap_pte(pmap, va);
381 }
382 
383 /*
384  * Returns the pindex of a page table entry (representing a terminal page).
385  * There are NUPTE_TOTAL page table entries possible (a huge number)
386  *
387  * x86-64 has a 48-bit address space, where bit 47 is sign-extended out.
388  * We want to properly translate negative KVAs.
389  */
390 static __inline
391 vm_pindex_t
392 pmap_pte_pindex(vm_offset_t va)
393 {
394 	return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1));
395 }
396 
397 /*
398  * Returns the pindex of a page table.
399  */
400 static __inline
401 vm_pindex_t
402 pmap_pt_pindex(vm_offset_t va)
403 {
404 	return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1)));
405 }
406 
407 /*
408  * Returns the pindex of a page directory.
409  */
410 static __inline
411 vm_pindex_t
412 pmap_pd_pindex(vm_offset_t va)
413 {
414 	return (NUPTE_TOTAL + NUPT_TOTAL +
415 		((va >> PDPSHIFT) & (NUPD_TOTAL - 1)));
416 }
417 
418 static __inline
419 vm_pindex_t
420 pmap_pdp_pindex(vm_offset_t va)
421 {
422 	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
423 		((va >> PML4SHIFT) & (NUPDP_TOTAL - 1)));
424 }
425 
426 static __inline
427 vm_pindex_t
428 pmap_pml4_pindex(void)
429 {
430 	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL);
431 }
432 
433 /*
434  * Return various clipped indexes for a given VA
435  *
436  * Returns the index of a pte in a page table, representing a terminal
437  * page.
438  */
439 static __inline
440 vm_pindex_t
441 pmap_pte_index(vm_offset_t va)
442 {
443 	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
444 }
445 
446 /*
447  * Returns the index of a pt in a page directory, representing a page
448  * table.
449  */
450 static __inline
451 vm_pindex_t
452 pmap_pt_index(vm_offset_t va)
453 {
454 	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
455 }
456 
457 /*
458  * Returns the index of a pd in a page directory page, representing a page
459  * directory.
460  */
461 static __inline
462 vm_pindex_t
463 pmap_pd_index(vm_offset_t va)
464 {
465 	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
466 }
467 
468 /*
469  * Returns the index of a pdp in the pml4 table, representing a page
470  * directory page.
471  */
472 static __inline
473 vm_pindex_t
474 pmap_pdp_index(vm_offset_t va)
475 {
476 	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
477 }
478 
479 /*
480  * Generic procedure to index a pte from a pt, pd, or pdp.
481  *
482  * NOTE: Normally passed pindex as pmap_xx_index().  pmap_xx_pindex() is NOT
483  *	 a page table page index but is instead of PV lookup index.
484  */
485 static
486 void *
487 pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex)
488 {
489 	pt_entry_t *pte;
490 
491 	pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m));
492 	return(&pte[pindex]);
493 }
494 
495 /*
496  * Return pointer to PDP slot in the PML4
497  */
498 static __inline
499 pml4_entry_t *
500 pmap_pdp(pmap_t pmap, vm_offset_t va)
501 {
502 	return (&pmap->pm_pml4[pmap_pdp_index(va)]);
503 }
504 
505 /*
506  * Return pointer to PD slot in the PDP given a pointer to the PDP
507  */
508 static __inline
509 pdp_entry_t *
510 pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va)
511 {
512 	pdp_entry_t *pd;
513 
514 	pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME);
515 	return (&pd[pmap_pd_index(va)]);
516 }
517 
518 /*
519  * Return pointer to PD slot in the PDP.
520  */
521 static __inline
522 pdp_entry_t *
523 pmap_pd(pmap_t pmap, vm_offset_t va)
524 {
525 	pml4_entry_t *pdp;
526 
527 	pdp = pmap_pdp(pmap, va);
528 	if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0)
529 		return NULL;
530 	return (pmap_pdp_to_pd(*pdp, va));
531 }
532 
533 /*
534  * Return pointer to PT slot in the PD given a pointer to the PD
535  */
536 static __inline
537 pd_entry_t *
538 pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va)
539 {
540 	pd_entry_t *pt;
541 
542 	pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME);
543 	return (&pt[pmap_pt_index(va)]);
544 }
545 
546 /*
547  * Return pointer to PT slot in the PD
548  *
549  * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs,
550  *		     so we cannot lookup the PD via the PDP.  Instead we
551  *		     must look it up via the pmap.
552  */
553 static __inline
554 pd_entry_t *
555 pmap_pt(pmap_t pmap, vm_offset_t va)
556 {
557 	pdp_entry_t *pd;
558 	pv_entry_t pv;
559 	vm_pindex_t pd_pindex;
560 
561 	if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
562 		pd_pindex = pmap_pd_pindex(va);
563 		spin_lock(&pmap->pm_spin);
564 		pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex);
565 		spin_unlock(&pmap->pm_spin);
566 		if (pv == NULL || pv->pv_m == NULL)
567 			return NULL;
568 		return (pmap_pd_to_pt(VM_PAGE_TO_PHYS(pv->pv_m), va));
569 	} else {
570 		pd = pmap_pd(pmap, va);
571 		if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0)
572 			 return NULL;
573 		return (pmap_pd_to_pt(*pd, va));
574 	}
575 }
576 
577 /*
578  * Return pointer to PTE slot in the PT given a pointer to the PT
579  */
580 static __inline
581 pt_entry_t *
582 pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va)
583 {
584 	pt_entry_t *pte;
585 
586 	pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME);
587 	return (&pte[pmap_pte_index(va)]);
588 }
589 
590 /*
591  * Return pointer to PTE slot in the PT
592  */
593 static __inline
594 pt_entry_t *
595 pmap_pte(pmap_t pmap, vm_offset_t va)
596 {
597 	pd_entry_t *pt;
598 
599 	pt = pmap_pt(pmap, va);
600 	if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0)
601 		 return NULL;
602 	if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0)
603 		return ((pt_entry_t *)pt);
604 	return (pmap_pt_to_pte(*pt, va));
605 }
606 
607 /*
608  * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is
609  * the PT layer.  This will speed up core pmap operations considerably.
610  *
611  * NOTE: The pmap spinlock does not need to be held but the passed-in pv
612  *	 must be in a known associated state (typically by being locked when
613  *	 the pmap spinlock isn't held).  We allow the race for that case.
614  */
615 static __inline
616 void
617 pv_cache(pv_entry_t pv, vm_pindex_t pindex)
618 {
619 	if (pindex >= pmap_pt_pindex(0) && pindex <= pmap_pd_pindex(0))
620 		pv->pv_pmap->pm_pvhint = pv;
621 }
622 
623 
624 /*
625  * Return address of PT slot in PD (KVM only)
626  *
627  * Cannot be used for user page tables because it might interfere with
628  * the shared page-table-page optimization (pmap_mmu_optimize).
629  */
630 static __inline
631 pd_entry_t *
632 vtopt(vm_offset_t va)
633 {
634 	uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
635 				  NPML4EPGSHIFT)) - 1);
636 
637 	return (PDmap + ((va >> PDRSHIFT) & mask));
638 }
639 
640 /*
641  * KVM - return address of PTE slot in PT
642  */
643 static __inline
644 pt_entry_t *
645 vtopte(vm_offset_t va)
646 {
647 	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
648 				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
649 
650 	return (PTmap + ((va >> PAGE_SHIFT) & mask));
651 }
652 
653 static uint64_t
654 allocpages(vm_paddr_t *firstaddr, long n)
655 {
656 	uint64_t ret;
657 
658 	ret = *firstaddr;
659 	bzero((void *)ret, n * PAGE_SIZE);
660 	*firstaddr += n * PAGE_SIZE;
661 	return (ret);
662 }
663 
664 static
665 void
666 create_pagetables(vm_paddr_t *firstaddr)
667 {
668 	long i;		/* must be 64 bits */
669 	long nkpt_base;
670 	long nkpt_phys;
671 	int j;
672 
673 	/*
674 	 * We are running (mostly) V=P at this point
675 	 *
676 	 * Calculate NKPT - number of kernel page tables.  We have to
677 	 * accomodoate prealloction of the vm_page_array, dump bitmap,
678 	 * MSGBUF_SIZE, and other stuff.  Be generous.
679 	 *
680 	 * Maxmem is in pages.
681 	 *
682 	 * ndmpdp is the number of 1GB pages we wish to map.
683 	 */
684 	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
685 	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
686 		ndmpdp = 4;
687 	KKASSERT(ndmpdp <= NKPDPE * NPDEPG);
688 
689 	/*
690 	 * Starting at the beginning of kvm (not KERNBASE).
691 	 */
692 	nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR;
693 	nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR;
694 	nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E +
695 		       ndmpdp) + 511) / 512;
696 	nkpt_phys += 128;
697 
698 	/*
699 	 * Starting at KERNBASE - map 2G worth of page table pages.
700 	 * KERNBASE is offset -2G from the end of kvm.
701 	 */
702 	nkpt_base = (NPDPEPG - KPDPI) * NPTEPG;	/* typically 2 x 512 */
703 
704 	/*
705 	 * Allocate pages
706 	 */
707 	KPTbase = allocpages(firstaddr, nkpt_base);
708 	KPTphys = allocpages(firstaddr, nkpt_phys);
709 	KPML4phys = allocpages(firstaddr, 1);
710 	KPDPphys = allocpages(firstaddr, NKPML4E);
711 	KPDphys = allocpages(firstaddr, NKPDPE);
712 
713 	/*
714 	 * Calculate the page directory base for KERNBASE,
715 	 * that is where we start populating the page table pages.
716 	 * Basically this is the end - 2.
717 	 */
718 	KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT);
719 
720 	DMPDPphys = allocpages(firstaddr, NDMPML4E);
721 	if ((amd_feature & AMDID_PAGE1GB) == 0)
722 		DMPDphys = allocpages(firstaddr, ndmpdp);
723 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
724 
725 	/*
726 	 * Fill in the underlying page table pages for the area around
727 	 * KERNBASE.  This remaps low physical memory to KERNBASE.
728 	 *
729 	 * Read-only from zero to physfree
730 	 * XXX not fully used, underneath 2M pages
731 	 */
732 	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
733 		((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
734 		((pt_entry_t *)KPTbase)[i] |=
735 		    pmap_bits_default[PG_RW_IDX] |
736 		    pmap_bits_default[PG_V_IDX] |
737 		    pmap_bits_default[PG_G_IDX];
738 	}
739 
740 	/*
741 	 * Now map the initial kernel page tables.  One block of page
742 	 * tables is placed at the beginning of kernel virtual memory,
743 	 * and another block is placed at KERNBASE to map the kernel binary,
744 	 * data, bss, and initial pre-allocations.
745 	 */
746 	for (i = 0; i < nkpt_base; i++) {
747 		((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
748 		((pd_entry_t *)KPDbase)[i] |=
749 		    pmap_bits_default[PG_RW_IDX] |
750 		    pmap_bits_default[PG_V_IDX];
751 	}
752 	for (i = 0; i < nkpt_phys; i++) {
753 		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
754 		((pd_entry_t *)KPDphys)[i] |=
755 		    pmap_bits_default[PG_RW_IDX] |
756 		    pmap_bits_default[PG_V_IDX];
757 	}
758 
759 	/*
760 	 * Map from zero to end of allocations using 2M pages as an
761 	 * optimization.  This will bypass some of the KPTBase pages
762 	 * above in the KERNBASE area.
763 	 */
764 	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
765 		((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
766 		((pd_entry_t *)KPDbase)[i] |=
767 		    pmap_bits_default[PG_RW_IDX] |
768 		    pmap_bits_default[PG_V_IDX] |
769 		    pmap_bits_default[PG_PS_IDX] |
770 		    pmap_bits_default[PG_G_IDX];
771 	}
772 
773 	/*
774 	 * And connect up the PD to the PDP.  The kernel pmap is expected
775 	 * to pre-populate all of its PDs.  See NKPDPE in vmparam.h.
776 	 */
777 	for (i = 0; i < NKPDPE; i++) {
778 		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] =
779 				KPDphys + (i << PAGE_SHIFT);
780 		((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |=
781 		    pmap_bits_default[PG_RW_IDX] |
782 		    pmap_bits_default[PG_V_IDX] |
783 		    pmap_bits_default[PG_U_IDX];
784 	}
785 
786 	/*
787 	 * Now set up the direct map space using either 2MB or 1GB pages
788 	 * Preset PG_M and PG_A because demotion expects it.
789 	 *
790 	 * When filling in entries in the PD pages make sure any excess
791 	 * entries are set to zero as we allocated enough PD pages
792 	 */
793 	if ((amd_feature & AMDID_PAGE1GB) == 0) {
794 		for (i = 0; i < NPDEPG * ndmpdp; i++) {
795 			((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
796 			((pd_entry_t *)DMPDphys)[i] |=
797 			    pmap_bits_default[PG_RW_IDX] |
798 			    pmap_bits_default[PG_V_IDX] |
799 			    pmap_bits_default[PG_PS_IDX] |
800 			    pmap_bits_default[PG_G_IDX] |
801 			    pmap_bits_default[PG_M_IDX] |
802 			    pmap_bits_default[PG_A_IDX];
803 		}
804 
805 		/*
806 		 * And the direct map space's PDP
807 		 */
808 		for (i = 0; i < ndmpdp; i++) {
809 			((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
810 							(i << PAGE_SHIFT);
811 			((pdp_entry_t *)DMPDPphys)[i] |=
812 			    pmap_bits_default[PG_RW_IDX] |
813 			    pmap_bits_default[PG_V_IDX] |
814 			    pmap_bits_default[PG_U_IDX];
815 		}
816 	} else {
817 		for (i = 0; i < ndmpdp; i++) {
818 			((pdp_entry_t *)DMPDPphys)[i] =
819 						(vm_paddr_t)i << PDPSHIFT;
820 			((pdp_entry_t *)DMPDPphys)[i] |=
821 			    pmap_bits_default[PG_RW_IDX] |
822 			    pmap_bits_default[PG_V_IDX] |
823 			    pmap_bits_default[PG_PS_IDX] |
824 			    pmap_bits_default[PG_G_IDX] |
825 			    pmap_bits_default[PG_M_IDX] |
826 			    pmap_bits_default[PG_A_IDX];
827 		}
828 	}
829 
830 	/* And recursively map PML4 to itself in order to get PTmap */
831 	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
832 	((pdp_entry_t *)KPML4phys)[PML4PML4I] |=
833 	    pmap_bits_default[PG_RW_IDX] |
834 	    pmap_bits_default[PG_V_IDX] |
835 	    pmap_bits_default[PG_U_IDX];
836 
837 	/*
838 	 * Connect the Direct Map slots up to the PML4
839 	 */
840 	for (j = 0; j < NDMPML4E; ++j) {
841 		((pdp_entry_t *)KPML4phys)[DMPML4I + j] =
842 		    (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
843 		    pmap_bits_default[PG_RW_IDX] |
844 		    pmap_bits_default[PG_V_IDX] |
845 		    pmap_bits_default[PG_U_IDX];
846 	}
847 
848 	/*
849 	 * Connect the KVA slot up to the PML4
850 	 */
851 	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
852 	((pdp_entry_t *)KPML4phys)[KPML4I] |=
853 	    pmap_bits_default[PG_RW_IDX] |
854 	    pmap_bits_default[PG_V_IDX] |
855 	    pmap_bits_default[PG_U_IDX];
856 }
857 
858 /*
859  *	Bootstrap the system enough to run with virtual memory.
860  *
861  *	On the i386 this is called after mapping has already been enabled
862  *	and just syncs the pmap module with what has already been done.
863  *	[We can't call it easily with mapping off since the kernel is not
864  *	mapped with PA == VA, hence we would have to relocate every address
865  *	from the linked base (virtual) address "KERNBASE" to the actual
866  *	(physical) address starting relative to 0]
867  */
868 void
869 pmap_bootstrap(vm_paddr_t *firstaddr)
870 {
871 	vm_offset_t va;
872 	pt_entry_t *pte;
873 
874 	KvaStart = VM_MIN_KERNEL_ADDRESS;
875 	KvaEnd = VM_MAX_KERNEL_ADDRESS;
876 	KvaSize = KvaEnd - KvaStart;
877 
878 	avail_start = *firstaddr;
879 
880 	/*
881 	 * Create an initial set of page tables to run the kernel in.
882 	 */
883 	create_pagetables(firstaddr);
884 
885 	virtual2_start = KvaStart;
886 	virtual2_end = PTOV_OFFSET;
887 
888 	virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr;
889 	virtual_start = pmap_kmem_choose(virtual_start);
890 
891 	virtual_end = VM_MAX_KERNEL_ADDRESS;
892 
893 	/* XXX do %cr0 as well */
894 	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
895 	load_cr3(KPML4phys);
896 
897 	/*
898 	 * Initialize protection array.
899 	 */
900 	i386_protection_init();
901 
902 	/*
903 	 * The kernel's pmap is statically allocated so we don't have to use
904 	 * pmap_create, which is unlikely to work correctly at this part of
905 	 * the boot sequence (XXX and which no longer exists).
906 	 */
907 	kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
908 	kernel_pmap.pm_count = 1;
909 	kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
910 	RB_INIT(&kernel_pmap.pm_pvroot);
911 	spin_init(&kernel_pmap.pm_spin);
912 	lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok");
913 
914 	/*
915 	 * Reserve some special page table entries/VA space for temporary
916 	 * mapping of pages.
917 	 */
918 #define	SYSMAP(c, p, v, n)	\
919 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
920 
921 	va = virtual_start;
922 	pte = vtopte(va);
923 
924 	/*
925 	 * CMAP1/CMAP2 are used for zeroing and copying pages.
926 	 */
927 	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
928 
929 	/*
930 	 * Crashdump maps.
931 	 */
932 	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
933 
934 	/*
935 	 * ptvmmap is used for reading arbitrary physical pages via
936 	 * /dev/mem.
937 	 */
938 	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
939 
940 	/*
941 	 * msgbufp is used to map the system message buffer.
942 	 * XXX msgbufmap is not used.
943 	 */
944 	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
945 	       atop(round_page(MSGBUF_SIZE)))
946 
947 	virtual_start = va;
948 
949 	*CMAP1 = 0;
950 
951 	/*
952 	 * PG_G is terribly broken on SMP because we IPI invltlb's in some
953 	 * cases rather then invl1pg.  Actually, I don't even know why it
954 	 * works under UP because self-referential page table mappings
955 	 */
956 //	pgeflag = 0;
957 
958 /*
959  * Initialize the 4MB page size flag
960  */
961 //	pseflag = 0;
962 /*
963  * The 4MB page version of the initial
964  * kernel page mapping.
965  */
966 	pdir4mb = 0;
967 
968 #if !defined(DISABLE_PSE)
969 	if (cpu_feature & CPUID_PSE) {
970 		pt_entry_t ptditmp;
971 		/*
972 		 * Note that we have enabled PSE mode
973 		 */
974 //		pseflag = kernel_pmap.pmap_bits[PG_PS_IDX];
975 		ptditmp = *(PTmap + x86_64_btop(KERNBASE));
976 		ptditmp &= ~(NBPDR - 1);
977 		ptditmp |= pmap_bits_default[PG_V_IDX] |
978 		    pmap_bits_default[PG_RW_IDX] |
979 		    pmap_bits_default[PG_PS_IDX] |
980 		    pmap_bits_default[PG_U_IDX];
981 //		    pgeflag;
982 		pdir4mb = ptditmp;
983 	}
984 #endif
985 	cpu_invltlb();
986 
987 	/* Initialize the PAT MSR */
988 	pmap_init_pat();
989 
990 	pmap_pinit_defaults(&kernel_pmap);
991 }
992 
993 /*
994  * Setup the PAT MSR.
995  */
996 void
997 pmap_init_pat(void)
998 {
999 	uint64_t pat_msr;
1000 	u_long cr0, cr4;
1001 
1002 	/*
1003 	 * Default values mapping PATi,PCD,PWT bits at system reset.
1004 	 * The default values effectively ignore the PATi bit by
1005 	 * repeating the encodings for 0-3 in 4-7, and map the PCD
1006 	 * and PWT bit combinations to the expected PAT types.
1007 	 */
1008 	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |	/* 000 */
1009 		  PAT_VALUE(1, PAT_WRITE_THROUGH) |	/* 001 */
1010 		  PAT_VALUE(2, PAT_UNCACHED) |		/* 010 */
1011 		  PAT_VALUE(3, PAT_UNCACHEABLE) |	/* 011 */
1012 		  PAT_VALUE(4, PAT_WRITE_BACK) |	/* 100 */
1013 		  PAT_VALUE(5, PAT_WRITE_THROUGH) |	/* 101 */
1014 		  PAT_VALUE(6, PAT_UNCACHED) |		/* 110 */
1015 		  PAT_VALUE(7, PAT_UNCACHEABLE);	/* 111 */
1016 	pat_pte_index[PAT_WRITE_BACK]	= 0;
1017 	pat_pte_index[PAT_WRITE_THROUGH]= 0         | X86_PG_NC_PWT;
1018 	pat_pte_index[PAT_UNCACHED]	= X86_PG_NC_PCD;
1019 	pat_pte_index[PAT_UNCACHEABLE]	= X86_PG_NC_PCD | X86_PG_NC_PWT;
1020 	pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE];
1021 	pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE];
1022 
1023 	if (cpu_feature & CPUID_PAT) {
1024 		/*
1025 		 * If we support the PAT then set-up entries for
1026 		 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns
1027 		 * 4 and 5.
1028 		 */
1029 		pat_msr = (pat_msr & ~PAT_MASK(4)) |
1030 			  PAT_VALUE(4, PAT_WRITE_PROTECTED);
1031 		pat_msr = (pat_msr & ~PAT_MASK(5)) |
1032 			  PAT_VALUE(5, PAT_WRITE_COMBINING);
1033 		pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | 0;
1034 		pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT | X86_PG_NC_PWT;
1035 
1036 		/*
1037 		 * Then enable the PAT
1038 		 */
1039 
1040 		/* Disable PGE. */
1041 		cr4 = rcr4();
1042 		load_cr4(cr4 & ~CR4_PGE);
1043 
1044 		/* Disable caches (CD = 1, NW = 0). */
1045 		cr0 = rcr0();
1046 		load_cr0((cr0 & ~CR0_NW) | CR0_CD);
1047 
1048 		/* Flushes caches and TLBs. */
1049 		wbinvd();
1050 		cpu_invltlb();
1051 
1052 		/* Update PAT and index table. */
1053 		wrmsr(MSR_PAT, pat_msr);
1054 
1055 		/* Flush caches and TLBs again. */
1056 		wbinvd();
1057 		cpu_invltlb();
1058 
1059 		/* Restore caches and PGE. */
1060 		load_cr0(cr0);
1061 		load_cr4(cr4);
1062 		PatMsr = pat_msr;
1063 	}
1064 }
1065 
1066 /*
1067  * Set 4mb pdir for mp startup
1068  */
1069 void
1070 pmap_set_opt(void)
1071 {
1072 	if (cpu_feature & CPUID_PSE) {
1073 		load_cr4(rcr4() | CR4_PSE);
1074 		if (pdir4mb && mycpu->gd_cpuid == 0) {	/* only on BSP */
1075 			cpu_invltlb();
1076 		}
1077 	}
1078 }
1079 
1080 /*
1081  *	Initialize the pmap module.
1082  *	Called by vm_init, to initialize any structures that the pmap
1083  *	system needs to map virtual memory.
1084  *	pmap_init has been enhanced to support in a fairly consistant
1085  *	way, discontiguous physical memory.
1086  */
1087 void
1088 pmap_init(void)
1089 {
1090 	int i;
1091 	int initial_pvs;
1092 
1093 	/*
1094 	 * Allocate memory for random pmap data structures.  Includes the
1095 	 * pv_head_table.
1096 	 */
1097 
1098 	for (i = 0; i < vm_page_array_size; i++) {
1099 		vm_page_t m;
1100 
1101 		m = &vm_page_array[i];
1102 		TAILQ_INIT(&m->md.pv_list);
1103 	}
1104 
1105 	/*
1106 	 * init the pv free list
1107 	 */
1108 	initial_pvs = vm_page_array_size;
1109 	if (initial_pvs < MINPV)
1110 		initial_pvs = MINPV;
1111 	pvzone = &pvzone_store;
1112 	pvinit = (void *)kmem_alloc(&kernel_map,
1113 				    initial_pvs * sizeof (struct pv_entry));
1114 	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry),
1115 		  pvinit, initial_pvs);
1116 
1117 	/*
1118 	 * Now it is safe to enable pv_table recording.
1119 	 */
1120 	pmap_initialized = TRUE;
1121 }
1122 
1123 /*
1124  * Initialize the address space (zone) for the pv_entries.  Set a
1125  * high water mark so that the system can recover from excessive
1126  * numbers of pv entries.
1127  */
1128 void
1129 pmap_init2(void)
1130 {
1131 	int shpgperproc = PMAP_SHPGPERPROC;
1132 	int entry_max;
1133 
1134 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
1135 	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
1136 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
1137 	pv_entry_high_water = 9 * (pv_entry_max / 10);
1138 
1139 	/*
1140 	 * Subtract out pages already installed in the zone (hack)
1141 	 */
1142 	entry_max = pv_entry_max - vm_page_array_size;
1143 	if (entry_max <= 0)
1144 		entry_max = 1;
1145 
1146 	zinitna(pvzone, &pvzone_obj, NULL, 0, entry_max, ZONE_INTERRUPT, 1);
1147 }
1148 
1149 /*
1150  * Typically used to initialize a fictitious page by vm/device_pager.c
1151  */
1152 void
1153 pmap_page_init(struct vm_page *m)
1154 {
1155 	vm_page_init(m);
1156 	TAILQ_INIT(&m->md.pv_list);
1157 }
1158 
1159 /***************************************************
1160  * Low level helper routines.....
1161  ***************************************************/
1162 
1163 /*
1164  * this routine defines the region(s) of memory that should
1165  * not be tested for the modified bit.
1166  */
1167 static __inline
1168 int
1169 pmap_track_modified(vm_pindex_t pindex)
1170 {
1171 	vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT;
1172 	if ((va < clean_sva) || (va >= clean_eva))
1173 		return 1;
1174 	else
1175 		return 0;
1176 }
1177 
1178 /*
1179  * Extract the physical page address associated with the map/VA pair.
1180  * The page must be wired for this to work reliably.
1181  *
1182  * XXX for the moment we're using pv_find() instead of pv_get(), as
1183  *     callers might be expecting non-blocking operation.
1184  */
1185 vm_paddr_t
1186 pmap_extract(pmap_t pmap, vm_offset_t va)
1187 {
1188 	vm_paddr_t rtval;
1189 	pv_entry_t pt_pv;
1190 	pt_entry_t *ptep;
1191 
1192 	rtval = 0;
1193 	if (va >= VM_MAX_USER_ADDRESS) {
1194 		/*
1195 		 * Kernel page directories might be direct-mapped and
1196 		 * there is typically no PV tracking of pte's
1197 		 */
1198 		pd_entry_t *pt;
1199 
1200 		pt = pmap_pt(pmap, va);
1201 		if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) {
1202 			if (*pt & pmap->pmap_bits[PG_PS_IDX]) {
1203 				rtval = *pt & PG_PS_FRAME;
1204 				rtval |= va & PDRMASK;
1205 			} else {
1206 				ptep = pmap_pt_to_pte(*pt, va);
1207 				if (*pt & pmap->pmap_bits[PG_V_IDX]) {
1208 					rtval = *ptep & PG_FRAME;
1209 					rtval |= va & PAGE_MASK;
1210 				}
1211 			}
1212 		}
1213 	} else {
1214 		/*
1215 		 * User pages currently do not direct-map the page directory
1216 		 * and some pages might not used managed PVs.  But all PT's
1217 		 * will have a PV.
1218 		 */
1219 		pt_pv = pv_find(pmap, pmap_pt_pindex(va));
1220 		if (pt_pv) {
1221 			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
1222 			if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
1223 				rtval = *ptep & PG_FRAME;
1224 				rtval |= va & PAGE_MASK;
1225 			}
1226 			pv_drop(pt_pv);
1227 		}
1228 	}
1229 	return rtval;
1230 }
1231 
1232 /*
1233  * Similar to extract but checks protections, SMP-friendly short-cut for
1234  * vm_fault_page[_quick]().  Can return NULL to cause the caller to
1235  * fall-through to the real fault code.
1236  *
1237  * The returned page, if not NULL, is held (and not busied).
1238  */
1239 vm_page_t
1240 pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1241 {
1242 	if (pmap && va < VM_MAX_USER_ADDRESS) {
1243 		pv_entry_t pt_pv;
1244 		pv_entry_t pte_pv;
1245 		pt_entry_t *ptep;
1246 		pt_entry_t req;
1247 		vm_page_t m;
1248 		int error;
1249 
1250 		req = pmap->pmap_bits[PG_V_IDX] |
1251 		      pmap->pmap_bits[PG_U_IDX];
1252 		if (prot & VM_PROT_WRITE)
1253 			req |= pmap->pmap_bits[PG_RW_IDX];
1254 
1255 		pt_pv = pv_find(pmap, pmap_pt_pindex(va));
1256 		if (pt_pv == NULL)
1257 			return (NULL);
1258 		ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
1259 		if ((*ptep & req) != req) {
1260 			pv_drop(pt_pv);
1261 			return (NULL);
1262 		}
1263 		pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), &error);
1264 		if (pte_pv && error == 0) {
1265 			m = pte_pv->pv_m;
1266 			vm_page_hold(m);
1267 			if (prot & VM_PROT_WRITE)
1268 				vm_page_dirty(m);
1269 			pv_put(pte_pv);
1270 		} else if (pte_pv) {
1271 			pv_drop(pte_pv);
1272 			m = NULL;
1273 		} else {
1274 			m = NULL;
1275 		}
1276 		pv_drop(pt_pv);
1277 		return(m);
1278 	} else {
1279 		return(NULL);
1280 	}
1281 }
1282 
1283 /*
1284  * Extract the physical page address associated kernel virtual address.
1285  */
1286 vm_paddr_t
1287 pmap_kextract(vm_offset_t va)
1288 {
1289 	pd_entry_t pt;		/* pt entry in pd */
1290 	vm_paddr_t pa;
1291 
1292 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1293 		pa = DMAP_TO_PHYS(va);
1294 	} else {
1295 		pt = *vtopt(va);
1296 		if (pt & kernel_pmap.pmap_bits[PG_PS_IDX]) {
1297 			pa = (pt & PG_PS_FRAME) | (va & PDRMASK);
1298 		} else {
1299 			/*
1300 			 * Beware of a concurrent promotion that changes the
1301 			 * PDE at this point!  For example, vtopte() must not
1302 			 * be used to access the PTE because it would use the
1303 			 * new PDE.  It is, however, safe to use the old PDE
1304 			 * because the page table page is preserved by the
1305 			 * promotion.
1306 			 */
1307 			pa = *pmap_pt_to_pte(pt, va);
1308 			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1309 		}
1310 	}
1311 	return pa;
1312 }
1313 
1314 /***************************************************
1315  * Low level mapping routines.....
1316  ***************************************************/
1317 
1318 /*
1319  * Routine: pmap_kenter
1320  * Function:
1321  *  	Add a wired page to the KVA
1322  *  	NOTE! note that in order for the mapping to take effect -- you
1323  *  	should do an invltlb after doing the pmap_kenter().
1324  */
1325 void
1326 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1327 {
1328 	pt_entry_t *pte;
1329 	pt_entry_t npte;
1330 	pmap_inval_info info;
1331 
1332 	pmap_inval_init(&info);				/* XXX remove */
1333 	npte = pa |
1334 	    kernel_pmap.pmap_bits[PG_RW_IDX] |
1335 	    kernel_pmap.pmap_bits[PG_V_IDX];
1336 //	    pgeflag;
1337 	pte = vtopte(va);
1338 	pmap_inval_interlock(&info, &kernel_pmap, va);	/* XXX remove */
1339 	*pte = npte;
1340 	pmap_inval_deinterlock(&info, &kernel_pmap);	/* XXX remove */
1341 	pmap_inval_done(&info);				/* XXX remove */
1342 }
1343 
1344 /*
1345  * Routine: pmap_kenter_quick
1346  * Function:
1347  *  	Similar to pmap_kenter(), except we only invalidate the
1348  *  	mapping on the current CPU.
1349  */
1350 void
1351 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
1352 {
1353 	pt_entry_t *pte;
1354 	pt_entry_t npte;
1355 
1356 	npte = pa |
1357 	    kernel_pmap.pmap_bits[PG_RW_IDX] |
1358 	    kernel_pmap.pmap_bits[PG_V_IDX];
1359 //	    pgeflag;
1360 	pte = vtopte(va);
1361 	*pte = npte;
1362 	cpu_invlpg((void *)va);
1363 }
1364 
1365 void
1366 pmap_kenter_sync(vm_offset_t va)
1367 {
1368 	pmap_inval_info info;
1369 
1370 	pmap_inval_init(&info);
1371 	pmap_inval_interlock(&info, &kernel_pmap, va);
1372 	pmap_inval_deinterlock(&info, &kernel_pmap);
1373 	pmap_inval_done(&info);
1374 }
1375 
1376 void
1377 pmap_kenter_sync_quick(vm_offset_t va)
1378 {
1379 	cpu_invlpg((void *)va);
1380 }
1381 
1382 /*
1383  * remove a page from the kernel pagetables
1384  */
1385 void
1386 pmap_kremove(vm_offset_t va)
1387 {
1388 	pt_entry_t *pte;
1389 	pmap_inval_info info;
1390 
1391 	pmap_inval_init(&info);
1392 	pte = vtopte(va);
1393 	pmap_inval_interlock(&info, &kernel_pmap, va);
1394 	(void)pte_load_clear(pte);
1395 	pmap_inval_deinterlock(&info, &kernel_pmap);
1396 	pmap_inval_done(&info);
1397 }
1398 
1399 void
1400 pmap_kremove_quick(vm_offset_t va)
1401 {
1402 	pt_entry_t *pte;
1403 	pte = vtopte(va);
1404 	(void)pte_load_clear(pte);
1405 	cpu_invlpg((void *)va);
1406 }
1407 
1408 /*
1409  * XXX these need to be recoded.  They are not used in any critical path.
1410  */
1411 void
1412 pmap_kmodify_rw(vm_offset_t va)
1413 {
1414 	atomic_set_long(vtopte(va), kernel_pmap.pmap_bits[PG_RW_IDX]);
1415 	cpu_invlpg((void *)va);
1416 }
1417 
1418 /* NOT USED
1419 void
1420 pmap_kmodify_nc(vm_offset_t va)
1421 {
1422 	atomic_set_long(vtopte(va), PG_N);
1423 	cpu_invlpg((void *)va);
1424 }
1425 */
1426 
1427 /*
1428  * Used to map a range of physical addresses into kernel virtual
1429  * address space during the low level boot, typically to map the
1430  * dump bitmap, message buffer, and vm_page_array.
1431  *
1432  * These mappings are typically made at some pointer after the end of the
1433  * kernel text+data.
1434  *
1435  * We could return PHYS_TO_DMAP(start) here and not allocate any
1436  * via (*virtp), but then kmem from userland and kernel dumps won't
1437  * have access to the related pointers.
1438  */
1439 vm_offset_t
1440 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
1441 {
1442 	vm_offset_t va;
1443 	vm_offset_t va_start;
1444 
1445 	/*return PHYS_TO_DMAP(start);*/
1446 
1447 	va_start = *virtp;
1448 	va = va_start;
1449 
1450 	while (start < end) {
1451 		pmap_kenter_quick(va, start);
1452 		va += PAGE_SIZE;
1453 		start += PAGE_SIZE;
1454 	}
1455 	*virtp = va;
1456 	return va_start;
1457 }
1458 
1459 #define PMAP_CLFLUSH_THRESHOLD  (2 * 1024 * 1024)
1460 
1461 /*
1462  * Remove the specified set of pages from the data and instruction caches.
1463  *
1464  * In contrast to pmap_invalidate_cache_range(), this function does not
1465  * rely on the CPU's self-snoop feature, because it is intended for use
1466  * when moving pages into a different cache domain.
1467  */
1468 void
1469 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1470 {
1471 	vm_offset_t daddr, eva;
1472 	int i;
1473 
1474 	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1475 	    (cpu_feature & CPUID_CLFSH) == 0)
1476 		wbinvd();
1477 	else {
1478 		cpu_mfence();
1479 		for (i = 0; i < count; i++) {
1480 			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1481 			eva = daddr + PAGE_SIZE;
1482 			for (; daddr < eva; daddr += cpu_clflush_line_size)
1483 				clflush(daddr);
1484 		}
1485 		cpu_mfence();
1486 	}
1487 }
1488 
1489 void
1490 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1491 {
1492 	KASSERT((sva & PAGE_MASK) == 0,
1493 	    ("pmap_invalidate_cache_range: sva not page-aligned"));
1494 	KASSERT((eva & PAGE_MASK) == 0,
1495 	    ("pmap_invalidate_cache_range: eva not page-aligned"));
1496 
1497 	if (cpu_feature & CPUID_SS) {
1498 		; /* If "Self Snoop" is supported, do nothing. */
1499 	} else {
1500 		/* Globally invalidate caches */
1501 		cpu_wbinvd_on_all_cpus();
1502 	}
1503 }
1504 void
1505 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1506 {
1507 	smp_invlpg_range(pmap->pm_active, sva, eva);
1508 }
1509 
1510 /*
1511  * Add a list of wired pages to the kva
1512  * this routine is only used for temporary
1513  * kernel mappings that do not need to have
1514  * page modification or references recorded.
1515  * Note that old mappings are simply written
1516  * over.  The page *must* be wired.
1517  */
1518 void
1519 pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
1520 {
1521 	vm_offset_t end_va;
1522 
1523 	end_va = va + count * PAGE_SIZE;
1524 
1525 	while (va < end_va) {
1526 		pt_entry_t *pte;
1527 
1528 		pte = vtopte(va);
1529 		*pte = VM_PAGE_TO_PHYS(*m) |
1530 		    kernel_pmap.pmap_bits[PG_RW_IDX] |
1531 		    kernel_pmap.pmap_bits[PG_V_IDX] |
1532 		    kernel_pmap.pmap_cache_bits[(*m)->pat_mode];
1533 //		pgeflag;
1534 		cpu_invlpg((void *)va);
1535 		va += PAGE_SIZE;
1536 		m++;
1537 	}
1538 	smp_invltlb();
1539 }
1540 
1541 /*
1542  * This routine jerks page mappings from the
1543  * kernel -- it is meant only for temporary mappings.
1544  *
1545  * MPSAFE, INTERRUPT SAFE (cluster callback)
1546  */
1547 void
1548 pmap_qremove(vm_offset_t va, int count)
1549 {
1550 	vm_offset_t end_va;
1551 
1552 	end_va = va + count * PAGE_SIZE;
1553 
1554 	while (va < end_va) {
1555 		pt_entry_t *pte;
1556 
1557 		pte = vtopte(va);
1558 		(void)pte_load_clear(pte);
1559 		cpu_invlpg((void *)va);
1560 		va += PAGE_SIZE;
1561 	}
1562 	smp_invltlb();
1563 }
1564 
1565 /*
1566  * Create a new thread and optionally associate it with a (new) process.
1567  * NOTE! the new thread's cpu may not equal the current cpu.
1568  */
1569 void
1570 pmap_init_thread(thread_t td)
1571 {
1572 	/* enforce pcb placement & alignment */
1573 	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
1574 	td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF);
1575 	td->td_savefpu = &td->td_pcb->pcb_save;
1576 	td->td_sp = (char *)td->td_pcb;	/* no -16 */
1577 }
1578 
1579 /*
1580  * This routine directly affects the fork perf for a process.
1581  */
1582 void
1583 pmap_init_proc(struct proc *p)
1584 {
1585 }
1586 
1587 static void
1588 pmap_pinit_defaults(struct pmap *pmap)
1589 {
1590 	bcopy(pmap_bits_default, pmap->pmap_bits,
1591 	      sizeof(pmap_bits_default));
1592 	bcopy(protection_codes, pmap->protection_codes,
1593 	      sizeof(protection_codes));
1594 	bcopy(pat_pte_index, pmap->pmap_cache_bits,
1595 	      sizeof(pat_pte_index));
1596 	pmap->pmap_cache_mask = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT;
1597 	pmap->copyinstr = std_copyinstr;
1598 	pmap->copyin = std_copyin;
1599 	pmap->copyout = std_copyout;
1600 	pmap->fubyte = std_fubyte;
1601 	pmap->subyte = std_subyte;
1602 	pmap->fuword = std_fuword;
1603 	pmap->suword = std_suword;
1604 	pmap->suword32 = std_suword32;
1605 }
1606 /*
1607  * Initialize pmap0/vmspace0.  This pmap is not added to pmap_list because
1608  * it, and IdlePTD, represents the template used to update all other pmaps.
1609  *
1610  * On architectures where the kernel pmap is not integrated into the user
1611  * process pmap, this pmap represents the process pmap, not the kernel pmap.
1612  * kernel_pmap should be used to directly access the kernel_pmap.
1613  */
1614 void
1615 pmap_pinit0(struct pmap *pmap)
1616 {
1617 	pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys);
1618 	pmap->pm_count = 1;
1619 	pmap->pm_active = 0;
1620 	pmap->pm_pvhint = NULL;
1621 	RB_INIT(&pmap->pm_pvroot);
1622 	spin_init(&pmap->pm_spin);
1623 	lwkt_token_init(&pmap->pm_token, "pmap_tok");
1624 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1625 	pmap_pinit_defaults(pmap);
1626 }
1627 
1628 /*
1629  * Initialize a preallocated and zeroed pmap structure,
1630  * such as one in a vmspace structure.
1631  */
1632 static void
1633 pmap_pinit_simple(struct pmap *pmap)
1634 {
1635 	/*
1636 	 * Misc initialization
1637 	 */
1638 	pmap->pm_count = 1;
1639 	pmap->pm_active = 0;
1640 	pmap->pm_pvhint = NULL;
1641 	pmap->pm_flags = PMAP_FLAG_SIMPLE;
1642 
1643 	pmap_pinit_defaults(pmap);
1644 
1645 	/*
1646 	 * Don't blow up locks/tokens on re-use (XXX fix/use drop code
1647 	 * for this).
1648 	 */
1649 	if (pmap->pm_pmlpv == NULL) {
1650 		RB_INIT(&pmap->pm_pvroot);
1651 		bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1652 		spin_init(&pmap->pm_spin);
1653 		lwkt_token_init(&pmap->pm_token, "pmap_tok");
1654 	}
1655 }
1656 
1657 void
1658 pmap_pinit(struct pmap *pmap)
1659 {
1660 	pv_entry_t pv;
1661 	int j;
1662 
1663 	if (pmap->pm_pmlpv) {
1664 		if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) {
1665 			pmap_puninit(pmap);
1666 		}
1667 	}
1668 
1669 	pmap_pinit_simple(pmap);
1670 	pmap->pm_flags &= ~PMAP_FLAG_SIMPLE;
1671 
1672 	/*
1673 	 * No need to allocate page table space yet but we do need a valid
1674 	 * page directory table.
1675 	 */
1676 	if (pmap->pm_pml4 == NULL) {
1677 		pmap->pm_pml4 =
1678 		    (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
1679 	}
1680 
1681 	/*
1682 	 * Allocate the page directory page, which wires it even though
1683 	 * it isn't being entered into some higher level page table (it
1684 	 * being the highest level).  If one is already cached we don't
1685 	 * have to do anything.
1686 	 */
1687 	if ((pv = pmap->pm_pmlpv) == NULL) {
1688 		pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
1689 		pmap->pm_pmlpv = pv;
1690 		pmap_kenter((vm_offset_t)pmap->pm_pml4,
1691 			    VM_PAGE_TO_PHYS(pv->pv_m));
1692 		pv_put(pv);
1693 
1694 		/*
1695 		 * Install DMAP and KMAP.
1696 		 */
1697 		for (j = 0; j < NDMPML4E; ++j) {
1698 			pmap->pm_pml4[DMPML4I + j] =
1699 			    (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
1700 			    pmap->pmap_bits[PG_RW_IDX] |
1701 			    pmap->pmap_bits[PG_V_IDX] |
1702 			    pmap->pmap_bits[PG_U_IDX];
1703 		}
1704 		pmap->pm_pml4[KPML4I] = KPDPphys |
1705 		    pmap->pmap_bits[PG_RW_IDX] |
1706 		    pmap->pmap_bits[PG_V_IDX] |
1707 		    pmap->pmap_bits[PG_U_IDX];
1708 
1709 		/*
1710 		 * install self-referential address mapping entry
1711 		 */
1712 		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) |
1713 		    pmap->pmap_bits[PG_V_IDX] |
1714 		    pmap->pmap_bits[PG_RW_IDX] |
1715 		    pmap->pmap_bits[PG_A_IDX] |
1716 		    pmap->pmap_bits[PG_M_IDX];
1717 	} else {
1718 		KKASSERT(pv->pv_m->flags & PG_MAPPED);
1719 		KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
1720 	}
1721 	KKASSERT(pmap->pm_pml4[255] == 0);
1722 	KKASSERT(RB_ROOT(&pmap->pm_pvroot) == pv);
1723 	KKASSERT(pv->pv_entry.rbe_left == NULL);
1724 	KKASSERT(pv->pv_entry.rbe_right == NULL);
1725 }
1726 
1727 /*
1728  * Clean up a pmap structure so it can be physically freed.  This routine
1729  * is called by the vmspace dtor function.  A great deal of pmap data is
1730  * left passively mapped to improve vmspace management so we have a bit
1731  * of cleanup work to do here.
1732  */
1733 void
1734 pmap_puninit(pmap_t pmap)
1735 {
1736 	pv_entry_t pv;
1737 	vm_page_t p;
1738 
1739 	KKASSERT(pmap->pm_active == 0);
1740 	if ((pv = pmap->pm_pmlpv) != NULL) {
1741 		if (pv_hold_try(pv) == 0)
1742 			pv_lock(pv);
1743 		KKASSERT(pv == pmap->pm_pmlpv);
1744 		p = pmap_remove_pv_page(pv);
1745 		pv_free(pv);
1746 		pmap_kremove((vm_offset_t)pmap->pm_pml4);
1747 		vm_page_busy_wait(p, FALSE, "pgpun");
1748 		KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED));
1749 		vm_page_unwire(p, 0);
1750 		vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
1751 
1752 		/*
1753 		 * XXX eventually clean out PML4 static entries and
1754 		 * use vm_page_free_zero()
1755 		 */
1756 		vm_page_free(p);
1757 		pmap->pm_pmlpv = NULL;
1758 	}
1759 	if (pmap->pm_pml4) {
1760 		KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
1761 		kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE);
1762 		pmap->pm_pml4 = NULL;
1763 	}
1764 	KKASSERT(pmap->pm_stats.resident_count == 0);
1765 	KKASSERT(pmap->pm_stats.wired_count == 0);
1766 }
1767 
1768 /*
1769  * Wire in kernel global address entries.  To avoid a race condition
1770  * between pmap initialization and pmap_growkernel, this procedure
1771  * adds the pmap to the master list (which growkernel scans to update),
1772  * then copies the template.
1773  */
1774 void
1775 pmap_pinit2(struct pmap *pmap)
1776 {
1777 	spin_lock(&pmap_spin);
1778 	TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
1779 	spin_unlock(&pmap_spin);
1780 }
1781 
1782 /*
1783  * This routine is called when various levels in the page table need to
1784  * be populated.  This routine cannot fail.
1785  *
1786  * This function returns two locked pv_entry's, one representing the
1787  * requested pv and one representing the requested pv's parent pv.  If
1788  * the pv did not previously exist it will be mapped into its parent
1789  * and wired, otherwise no additional wire count will be added.
1790  */
1791 static
1792 pv_entry_t
1793 pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
1794 {
1795 	pt_entry_t *ptep;
1796 	pv_entry_t pv;
1797 	pv_entry_t pvp;
1798 	vm_pindex_t pt_pindex;
1799 	vm_page_t m;
1800 	int isnew;
1801 	int ispt;
1802 
1803 	/*
1804 	 * If the pv already exists and we aren't being asked for the
1805 	 * parent page table page we can just return it.  A locked+held pv
1806 	 * is returned.  The pv will also have a second hold related to the
1807 	 * pmap association that we don't have to worry about.
1808 	 */
1809 	ispt = 0;
1810 	pv = pv_alloc(pmap, ptepindex, &isnew);
1811 	if (isnew == 0 && pvpp == NULL)
1812 		return(pv);
1813 
1814 	/*
1815 	 * Special case terminal PVs.  These are not page table pages so
1816 	 * no vm_page is allocated (the caller supplied the vm_page).  If
1817 	 * pvpp is non-NULL we are being asked to also removed the pt_pv
1818 	 * for this pv.
1819 	 *
1820 	 * Note that pt_pv's are only returned for user VAs. We assert that
1821 	 * a pt_pv is not being requested for kernel VAs.
1822 	 */
1823 	if (ptepindex < pmap_pt_pindex(0)) {
1824 		if (ptepindex >= NUPTE_USER)
1825 			KKASSERT(pvpp == NULL);
1826 		else
1827 			KKASSERT(pvpp != NULL);
1828 		if (pvpp) {
1829 			pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT);
1830 			pvp = pmap_allocpte(pmap, pt_pindex, NULL);
1831 			if (isnew)
1832 				vm_page_wire_quick(pvp->pv_m);
1833 			*pvpp = pvp;
1834 		} else {
1835 			pvp = NULL;
1836 		}
1837 		return(pv);
1838 	}
1839 
1840 	/*
1841 	 * Non-terminal PVs allocate a VM page to represent the page table,
1842 	 * so we have to resolve pvp and calculate ptepindex for the pvp
1843 	 * and then for the page table entry index in the pvp for
1844 	 * fall-through.
1845 	 */
1846 	if (ptepindex < pmap_pd_pindex(0)) {
1847 		/*
1848 		 * pv is PT, pvp is PD
1849 		 */
1850 		ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT;
1851 		ptepindex += NUPTE_TOTAL + NUPT_TOTAL;
1852 		pvp = pmap_allocpte(pmap, ptepindex, NULL);
1853 		if (!isnew)
1854 			goto notnew;
1855 
1856 		/*
1857 		 * PT index in PD
1858 		 */
1859 		ptepindex = pv->pv_pindex - pmap_pt_pindex(0);
1860 		ptepindex &= ((1ul << NPDEPGSHIFT) - 1);
1861 		ispt = 1;
1862 	} else if (ptepindex < pmap_pdp_pindex(0)) {
1863 		/*
1864 		 * pv is PD, pvp is PDP
1865 		 *
1866 		 * SIMPLE PMAP NOTE: Simple pmaps do not allocate above
1867 		 *		     the PD.
1868 		 */
1869 		ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT;
1870 		ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
1871 
1872 		if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
1873 			KKASSERT(pvpp == NULL);
1874 			pvp = NULL;
1875 		} else {
1876 			pvp = pmap_allocpte(pmap, ptepindex, NULL);
1877 		}
1878 		if (!isnew)
1879 			goto notnew;
1880 
1881 		/*
1882 		 * PD index in PDP
1883 		 */
1884 		ptepindex = pv->pv_pindex - pmap_pd_pindex(0);
1885 		ptepindex &= ((1ul << NPDPEPGSHIFT) - 1);
1886 	} else if (ptepindex < pmap_pml4_pindex()) {
1887 		/*
1888 		 * pv is PDP, pvp is the root pml4 table
1889 		 */
1890 		pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
1891 		if (!isnew)
1892 			goto notnew;
1893 
1894 		/*
1895 		 * PDP index in PML4
1896 		 */
1897 		ptepindex = pv->pv_pindex - pmap_pdp_pindex(0);
1898 		ptepindex &= ((1ul << NPML4EPGSHIFT) - 1);
1899 	} else {
1900 		/*
1901 		 * pv represents the top-level PML4, there is no parent.
1902 		 */
1903 		pvp = NULL;
1904 		if (!isnew)
1905 			goto notnew;
1906 	}
1907 
1908 	/*
1909 	 * This code is only reached if isnew is TRUE and this is not a
1910 	 * terminal PV.  We need to allocate a vm_page for the page table
1911 	 * at this level and enter it into the parent page table.
1912 	 *
1913 	 * page table pages are marked PG_WRITEABLE and PG_MAPPED.
1914 	 */
1915 	for (;;) {
1916 		m = vm_page_alloc(NULL, pv->pv_pindex,
1917 				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
1918 				  VM_ALLOC_INTERRUPT);
1919 		if (m)
1920 			break;
1921 		vm_wait(0);
1922 	}
1923 	vm_page_spin_lock(m);
1924 	pmap_page_stats_adding(m);
1925 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1926 	pv->pv_m = m;
1927 	vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1928 	vm_page_spin_unlock(m);
1929 	vm_page_unmanage(m);	/* m must be spinunlocked */
1930 
1931 	if ((m->flags & PG_ZERO) == 0) {
1932 		pmap_zero_page(VM_PAGE_TO_PHYS(m));
1933 	}
1934 #ifdef PMAP_DEBUG
1935 	else {
1936 		pmap_page_assertzero(VM_PAGE_TO_PHYS(m));
1937 	}
1938 #endif
1939 	m->valid = VM_PAGE_BITS_ALL;
1940 	vm_page_flag_clear(m, PG_ZERO);
1941 	vm_page_wire(m);	/* wire for mapping in parent */
1942 
1943 	/*
1944 	 * Wire the page into pvp, bump the wire-count for pvp's page table
1945 	 * page.  Bump the resident_count for the pmap.  There is no pvp
1946 	 * for the top level, address the pm_pml4[] array directly.
1947 	 *
1948 	 * If the caller wants the parent we return it, otherwise
1949 	 * we just put it away.
1950 	 *
1951 	 * No interlock is needed for pte 0 -> non-zero.
1952 	 *
1953 	 * In the situation where *ptep is valid we might have an unmanaged
1954 	 * page table page shared from another page table which we need to
1955 	 * unshare before installing our private page table page.
1956 	 */
1957 	if (pvp) {
1958 		ptep = pv_pte_lookup(pvp, ptepindex);
1959 		if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
1960 			pt_entry_t pte;
1961 			pmap_inval_info info;
1962 
1963 			if (ispt == 0) {
1964 				panic("pmap_allocpte: unexpected pte %p/%d",
1965 				      pvp, (int)ptepindex);
1966 			}
1967 			pmap_inval_init(&info);
1968 			pmap_inval_interlock(&info, pmap, (vm_offset_t)-1);
1969 			pte = pte_load_clear(ptep);
1970 			pmap_inval_deinterlock(&info, pmap);
1971 			pmap_inval_done(&info);
1972 			if (vm_page_unwire_quick(
1973 					PHYS_TO_VM_PAGE(pte & PG_FRAME))) {
1974 				panic("pmap_allocpte: shared pgtable "
1975 				      "pg bad wirecount");
1976 			}
1977 			atomic_add_long(&pmap->pm_stats.resident_count, -1);
1978 		} else {
1979 			vm_page_wire_quick(pvp->pv_m);
1980 		}
1981 		*ptep = VM_PAGE_TO_PHYS(m) |
1982 		    (pmap->pmap_bits[PG_U_IDX] |
1983 		    pmap->pmap_bits[PG_RW_IDX] |
1984 		    pmap->pmap_bits[PG_V_IDX] |
1985 		    pmap->pmap_bits[PG_A_IDX] |
1986 		    pmap->pmap_bits[PG_M_IDX]);
1987 	}
1988 	vm_page_wakeup(m);
1989 notnew:
1990 	if (pvpp)
1991 		*pvpp = pvp;
1992 	else if (pvp)
1993 		pv_put(pvp);
1994 	return (pv);
1995 }
1996 
1997 /*
1998  * This version of pmap_allocpte() checks for possible segment optimizations
1999  * that would allow page-table sharing.  It can be called for terminal
2000  * page or page table page ptepindex's.
2001  *
2002  * The function is called with page table page ptepindex's for fictitious
2003  * and unmanaged terminal pages.  That is, we don't want to allocate a
2004  * terminal pv, we just want the pt_pv.  pvpp is usually passed as NULL
2005  * for this case.
2006  *
2007  * This function can return a pv and *pvpp associated with the passed in pmap
2008  * OR a pv and *pvpp associated with the shared pmap.  In the latter case
2009  * an unmanaged page table page will be entered into the pass in pmap.
2010  */
2011 static
2012 pv_entry_t
2013 pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp,
2014 		  vm_map_entry_t entry, vm_offset_t va)
2015 {
2016 	struct pmap_inval_info info;
2017 	vm_object_t object;
2018 	pmap_t obpmap;
2019 	pmap_t *obpmapp;
2020 	vm_offset_t b;
2021 	pv_entry_t pte_pv;	/* in original or shared pmap */
2022 	pv_entry_t pt_pv;	/* in original or shared pmap */
2023 	pv_entry_t proc_pd_pv;	/* in original pmap */
2024 	pv_entry_t proc_pt_pv;	/* in original pmap */
2025 	pv_entry_t xpv;		/* PT in shared pmap */
2026 	pd_entry_t *pt;		/* PT entry in PD of original pmap */
2027 	pd_entry_t opte;	/* contents of *pt */
2028 	pd_entry_t npte;	/* contents of *pt */
2029 	vm_page_t m;
2030 
2031 retry:
2032 	/*
2033 	 * Basic tests, require a non-NULL vm_map_entry, require proper
2034 	 * alignment and type for the vm_map_entry, require that the
2035 	 * underlying object already be allocated.
2036 	 *
2037 	 * We allow almost any type of object to use this optimization.
2038 	 * The object itself does NOT have to be sized to a multiple of the
2039 	 * segment size, but the memory mapping does.
2040 	 *
2041 	 * XXX don't handle devices currently, because VM_PAGE_TO_PHYS()
2042 	 *     won't work as expected.
2043 	 */
2044 	if (entry == NULL ||
2045 	    pmap_mmu_optimize == 0 ||			/* not enabled */
2046 	    ptepindex >= pmap_pd_pindex(0) ||		/* not terminal or pt */
2047 	    entry->inheritance != VM_INHERIT_SHARE ||	/* not shared */
2048 	    entry->maptype != VM_MAPTYPE_NORMAL ||	/* weird map type */
2049 	    entry->object.vm_object == NULL ||		/* needs VM object */
2050 	    entry->object.vm_object->type == OBJT_DEVICE ||	/* ick */
2051 	    entry->object.vm_object->type == OBJT_MGTDEVICE ||	/* ick */
2052 	    (entry->offset & SEG_MASK) ||		/* must be aligned */
2053 	    (entry->start & SEG_MASK)) {
2054 		return(pmap_allocpte(pmap, ptepindex, pvpp));
2055 	}
2056 
2057 	/*
2058 	 * Make sure the full segment can be represented.
2059 	 */
2060 	b = va & ~(vm_offset_t)SEG_MASK;
2061 	if (b < entry->start || b + SEG_SIZE > entry->end)
2062 		return(pmap_allocpte(pmap, ptepindex, pvpp));
2063 
2064 	/*
2065 	 * If the full segment can be represented dive the VM object's
2066 	 * shared pmap, allocating as required.
2067 	 */
2068 	object = entry->object.vm_object;
2069 
2070 	if (entry->protection & VM_PROT_WRITE)
2071 		obpmapp = &object->md.pmap_rw;
2072 	else
2073 		obpmapp = &object->md.pmap_ro;
2074 
2075 #ifdef PMAP_DEBUG2
2076 	if (pmap_enter_debug > 0) {
2077 		--pmap_enter_debug;
2078 		kprintf("pmap_allocpte_seg: va=%jx prot %08x o=%p "
2079 			"obpmapp %p %p\n",
2080 			va, entry->protection, object,
2081 			obpmapp, *obpmapp);
2082 		kprintf("pmap_allocpte_seg: entry %p %jx-%jx\n",
2083 			entry, entry->start, entry->end);
2084 	}
2085 #endif
2086 
2087 	/*
2088 	 * We allocate what appears to be a normal pmap but because portions
2089 	 * of this pmap are shared with other unrelated pmaps we have to
2090 	 * set pm_active to point to all cpus.
2091 	 *
2092 	 * XXX Currently using pmap_spin to interlock the update, can't use
2093 	 *     vm_object_hold/drop because the token might already be held
2094 	 *     shared OR exclusive and we don't know.
2095 	 */
2096 	while ((obpmap = *obpmapp) == NULL) {
2097 		obpmap = kmalloc(sizeof(*obpmap), M_OBJPMAP, M_WAITOK|M_ZERO);
2098 		pmap_pinit_simple(obpmap);
2099 		pmap_pinit2(obpmap);
2100 		spin_lock(&pmap_spin);
2101 		if (*obpmapp != NULL) {
2102 			/*
2103 			 * Handle race
2104 			 */
2105 			spin_unlock(&pmap_spin);
2106 			pmap_release(obpmap);
2107 			pmap_puninit(obpmap);
2108 			kfree(obpmap, M_OBJPMAP);
2109 			obpmap = *obpmapp; /* safety */
2110 		} else {
2111 			obpmap->pm_active = smp_active_mask;
2112 			*obpmapp = obpmap;
2113 			spin_unlock(&pmap_spin);
2114 		}
2115 	}
2116 
2117 	/*
2118 	 * Layering is: PTE, PT, PD, PDP, PML4.  We have to return the
2119 	 * pte/pt using the shared pmap from the object but also adjust
2120 	 * the process pmap's page table page as a side effect.
2121 	 */
2122 
2123 	/*
2124 	 * Resolve the terminal PTE and PT in the shared pmap.  This is what
2125 	 * we will return.  This is true if ptepindex represents a terminal
2126 	 * page, otherwise pte_pv is actually the PT and pt_pv is actually
2127 	 * the PD.
2128 	 */
2129 	pt_pv = NULL;
2130 	pte_pv = pmap_allocpte(obpmap, ptepindex, &pt_pv);
2131 	if (ptepindex >= pmap_pt_pindex(0))
2132 		xpv = pte_pv;
2133 	else
2134 		xpv = pt_pv;
2135 
2136 	/*
2137 	 * Resolve the PD in the process pmap so we can properly share the
2138 	 * page table page.  Lock order is bottom-up (leaf first)!
2139 	 *
2140 	 * NOTE: proc_pt_pv can be NULL.
2141 	 */
2142 	proc_pt_pv = pv_get(pmap, pmap_pt_pindex(b));
2143 	proc_pd_pv = pmap_allocpte(pmap, pmap_pd_pindex(b), NULL);
2144 #ifdef PMAP_DEBUG2
2145 	if (pmap_enter_debug > 0) {
2146 		--pmap_enter_debug;
2147 		kprintf("proc_pt_pv %p (wc %d) pd_pv %p va=%jx\n",
2148 			proc_pt_pv,
2149 			(proc_pt_pv ? proc_pt_pv->pv_m->wire_count : -1),
2150 			proc_pd_pv,
2151 			va);
2152 	}
2153 #endif
2154 
2155 	/*
2156 	 * xpv is the page table page pv from the shared object
2157 	 * (for convenience), from above.
2158 	 *
2159 	 * Calculate the pte value for the PT to load into the process PD.
2160 	 * If we have to change it we must properly dispose of the previous
2161 	 * entry.
2162 	 */
2163 	pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b));
2164 	npte = VM_PAGE_TO_PHYS(xpv->pv_m) |
2165 	    (pmap->pmap_bits[PG_U_IDX] |
2166 	    pmap->pmap_bits[PG_RW_IDX] |
2167 	    pmap->pmap_bits[PG_V_IDX] |
2168 	    pmap->pmap_bits[PG_A_IDX] |
2169 	    pmap->pmap_bits[PG_M_IDX]);
2170 
2171 	/*
2172 	 * Dispose of previous page table page if it was local to the
2173 	 * process pmap.  If the old pt is not empty we cannot dispose of it
2174 	 * until we clean it out.  This case should not arise very often so
2175 	 * it is not optimized.
2176 	 */
2177 	if (proc_pt_pv) {
2178 		if (proc_pt_pv->pv_m->wire_count != 1) {
2179 			pv_put(proc_pd_pv);
2180 			pv_put(proc_pt_pv);
2181 			pv_put(pt_pv);
2182 			pv_put(pte_pv);
2183 			pmap_remove(pmap,
2184 				    va & ~(vm_offset_t)SEG_MASK,
2185 				    (va + SEG_SIZE) & ~(vm_offset_t)SEG_MASK);
2186 			goto retry;
2187 		}
2188 		pmap_release_pv(proc_pt_pv, proc_pd_pv);
2189 		proc_pt_pv = NULL;
2190 		/* relookup */
2191 		pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b));
2192 	}
2193 
2194 	/*
2195 	 * Handle remaining cases.
2196 	 */
2197 	if (*pt == 0) {
2198 		*pt = npte;
2199 		vm_page_wire_quick(xpv->pv_m);
2200 		vm_page_wire_quick(proc_pd_pv->pv_m);
2201 		atomic_add_long(&pmap->pm_stats.resident_count, 1);
2202 	} else if (*pt != npte) {
2203 		pmap_inval_init(&info);
2204 		pmap_inval_interlock(&info, pmap, (vm_offset_t)-1);
2205 
2206 		opte = pte_load_clear(pt);
2207 		KKASSERT(opte && opte != npte);
2208 
2209 		*pt = npte;
2210 		vm_page_wire_quick(xpv->pv_m);	/* pgtable pg that is npte */
2211 
2212 		/*
2213 		 * Clean up opte, bump the wire_count for the process
2214 		 * PD page representing the new entry if it was
2215 		 * previously empty.
2216 		 *
2217 		 * If the entry was not previously empty and we have
2218 		 * a PT in the proc pmap then opte must match that
2219 		 * pt.  The proc pt must be retired (this is done
2220 		 * later on in this procedure).
2221 		 *
2222 		 * NOTE: replacing valid pte, wire_count on proc_pd_pv
2223 		 * stays the same.
2224 		 */
2225 		KKASSERT(opte & pmap->pmap_bits[PG_V_IDX]);
2226 		m = PHYS_TO_VM_PAGE(opte & PG_FRAME);
2227 		if (vm_page_unwire_quick(m)) {
2228 			panic("pmap_allocpte_seg: "
2229 			      "bad wire count %p",
2230 			      m);
2231 		}
2232 
2233 		pmap_inval_deinterlock(&info, pmap);
2234 		pmap_inval_done(&info);
2235 	}
2236 
2237 	/*
2238 	 * The existing process page table was replaced and must be destroyed
2239 	 * here.
2240 	 */
2241 	if (proc_pd_pv)
2242 		pv_put(proc_pd_pv);
2243 	if (pvpp)
2244 		*pvpp = pt_pv;
2245 	else
2246 		pv_put(pt_pv);
2247 
2248 	return (pte_pv);
2249 }
2250 
2251 /*
2252  * Release any resources held by the given physical map.
2253  *
2254  * Called when a pmap initialized by pmap_pinit is being released.  Should
2255  * only be called if the map contains no valid mappings.
2256  *
2257  * Caller must hold pmap->pm_token
2258  */
2259 struct pmap_release_info {
2260 	pmap_t	pmap;
2261 	int	retry;
2262 };
2263 
2264 static int pmap_release_callback(pv_entry_t pv, void *data);
2265 
2266 void
2267 pmap_release(struct pmap *pmap)
2268 {
2269 	struct pmap_release_info info;
2270 
2271 	KASSERT(pmap->pm_active == 0,
2272 		("pmap still active! %016jx", (uintmax_t)pmap->pm_active));
2273 
2274 	spin_lock(&pmap_spin);
2275 	TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
2276 	spin_unlock(&pmap_spin);
2277 
2278 	/*
2279 	 * Pull pv's off the RB tree in order from low to high and release
2280 	 * each page.
2281 	 */
2282 	info.pmap = pmap;
2283 	do {
2284 		info.retry = 0;
2285 		spin_lock(&pmap->pm_spin);
2286 		RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL,
2287 			pmap_release_callback, &info);
2288 		spin_unlock(&pmap->pm_spin);
2289 	} while (info.retry);
2290 
2291 
2292 	/*
2293 	 * One resident page (the pml4 page) should remain.
2294 	 * No wired pages should remain.
2295 	 */
2296 	KKASSERT(pmap->pm_stats.resident_count ==
2297 		 ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1));
2298 
2299 	KKASSERT(pmap->pm_stats.wired_count == 0);
2300 }
2301 
2302 static int
2303 pmap_release_callback(pv_entry_t pv, void *data)
2304 {
2305 	struct pmap_release_info *info = data;
2306 	pmap_t pmap = info->pmap;
2307 	int r;
2308 
2309 	if (pv_hold_try(pv)) {
2310 		spin_unlock(&pmap->pm_spin);
2311 	} else {
2312 		spin_unlock(&pmap->pm_spin);
2313 		pv_lock(pv);
2314 	}
2315 	if (pv->pv_pmap != pmap) {
2316 		pv_put(pv);
2317 		spin_lock(&pmap->pm_spin);
2318 		info->retry = 1;
2319 		return(-1);
2320 	}
2321 	r = pmap_release_pv(pv, NULL);
2322 	spin_lock(&pmap->pm_spin);
2323 	return(r);
2324 }
2325 
2326 /*
2327  * Called with held (i.e. also locked) pv.  This function will dispose of
2328  * the lock along with the pv.
2329  *
2330  * If the caller already holds the locked parent page table for pv it
2331  * must pass it as pvp, allowing us to avoid a deadlock, else it can
2332  * pass NULL for pvp.
2333  */
2334 static int
2335 pmap_release_pv(pv_entry_t pv, pv_entry_t pvp)
2336 {
2337 	vm_page_t p;
2338 
2339 	/*
2340 	 * The pmap is currently not spinlocked, pv is held+locked.
2341 	 * Remove the pv's page from its parent's page table.  The
2342 	 * parent's page table page's wire_count will be decremented.
2343 	 */
2344 	pmap_remove_pv_pte(pv, pvp, NULL);
2345 
2346 	/*
2347 	 * Terminal pvs are unhooked from their vm_pages.  Because
2348 	 * terminal pages aren't page table pages they aren't wired
2349 	 * by us, so we have to be sure not to unwire them either.
2350 	 */
2351 	if (pv->pv_pindex < pmap_pt_pindex(0)) {
2352 		pmap_remove_pv_page(pv);
2353 		goto skip;
2354 	}
2355 
2356 	/*
2357 	 * We leave the top-level page table page cached, wired, and
2358 	 * mapped in the pmap until the dtor function (pmap_puninit())
2359 	 * gets called.
2360 	 *
2361 	 * Since we are leaving the top-level pv intact we need
2362 	 * to break out of what would otherwise be an infinite loop.
2363 	 */
2364 	if (pv->pv_pindex == pmap_pml4_pindex()) {
2365 		pv_put(pv);
2366 		return(-1);
2367 	}
2368 
2369 	/*
2370 	 * For page table pages (other than the top-level page),
2371 	 * remove and free the vm_page.  The representitive mapping
2372 	 * removed above by pmap_remove_pv_pte() did not undo the
2373 	 * last wire_count so we have to do that as well.
2374 	 */
2375 	p = pmap_remove_pv_page(pv);
2376 	vm_page_busy_wait(p, FALSE, "pmaprl");
2377 	if (p->wire_count != 1) {
2378 		kprintf("p->wire_count was %016lx %d\n",
2379 			pv->pv_pindex, p->wire_count);
2380 	}
2381 	KKASSERT(p->wire_count == 1);
2382 	KKASSERT(p->flags & PG_UNMANAGED);
2383 
2384 	vm_page_unwire(p, 0);
2385 	KKASSERT(p->wire_count == 0);
2386 
2387 	/*
2388 	 * Theoretically this page, if not the pml4 page, should contain
2389 	 * all-zeros.  But its just too dangerous to mark it PG_ZERO.  Free
2390 	 * normally.
2391 	 */
2392 	vm_page_free(p);
2393 skip:
2394 	pv_free(pv);
2395 	return 0;
2396 }
2397 
2398 /*
2399  * This function will remove the pte associated with a pv from its parent.
2400  * Terminal pv's are supported.  The removal will be interlocked if info
2401  * is non-NULL.  The caller must dispose of pv instead of just unlocking
2402  * it.
2403  *
2404  * The wire count will be dropped on the parent page table.  The wire
2405  * count on the page being removed (pv->pv_m) from the parent page table
2406  * is NOT touched.  Note that terminal pages will not have any additional
2407  * wire counts while page table pages will have at least one representing
2408  * the mapping, plus others representing sub-mappings.
2409  *
2410  * NOTE: Cannot be called on kernel page table pages, only KVM terminal
2411  *	 pages and user page table and terminal pages.
2412  *
2413  * The pv must be locked.
2414  *
2415  * XXX must lock parent pv's if they exist to remove pte XXX
2416  */
2417 static
2418 void
2419 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
2420 {
2421 	vm_pindex_t ptepindex = pv->pv_pindex;
2422 	pmap_t pmap = pv->pv_pmap;
2423 	vm_page_t p;
2424 	int gotpvp = 0;
2425 
2426 	KKASSERT(pmap);
2427 
2428 	if (ptepindex == pmap_pml4_pindex()) {
2429 		/*
2430 		 * We are the top level pml4 table, there is no parent.
2431 		 */
2432 		p = pmap->pm_pmlpv->pv_m;
2433 	} else if (ptepindex >= pmap_pdp_pindex(0)) {
2434 		/*
2435 		 * Remove a PDP page from the pml4e.  This can only occur
2436 		 * with user page tables.  We do not have to lock the
2437 		 * pml4 PV so just ignore pvp.
2438 		 */
2439 		vm_pindex_t pml4_pindex;
2440 		vm_pindex_t pdp_index;
2441 		pml4_entry_t *pdp;
2442 
2443 		pdp_index = ptepindex - pmap_pdp_pindex(0);
2444 		if (pvp == NULL) {
2445 			pml4_pindex = pmap_pml4_pindex();
2446 			pvp = pv_get(pv->pv_pmap, pml4_pindex);
2447 			KKASSERT(pvp);
2448 			gotpvp = 1;
2449 		}
2450 		pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
2451 		KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0);
2452 		p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2453 		*pdp = 0;
2454 		KKASSERT(info == NULL);
2455 	} else if (ptepindex >= pmap_pd_pindex(0)) {
2456 		/*
2457 		 * Remove a PD page from the pdp
2458 		 *
2459 		 * SIMPLE PMAP NOTE: Non-existant pvp's are ok in the case
2460 		 *		     of a simple pmap because it stops at
2461 		 *		     the PD page.
2462 		 */
2463 		vm_pindex_t pdp_pindex;
2464 		vm_pindex_t pd_index;
2465 		pdp_entry_t *pd;
2466 
2467 		pd_index = ptepindex - pmap_pd_pindex(0);
2468 
2469 		if (pvp == NULL) {
2470 			pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
2471 				     (pd_index >> NPML4EPGSHIFT);
2472 			pvp = pv_get(pv->pv_pmap, pdp_pindex);
2473 			if (pvp)
2474 				gotpvp = 1;
2475 		}
2476 		if (pvp) {
2477 			pd = pv_pte_lookup(pvp, pd_index &
2478 						((1ul << NPDPEPGSHIFT) - 1));
2479 			KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0);
2480 			p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2481 			*pd = 0;
2482 		} else {
2483 			KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE);
2484 			p = pv->pv_m;		/* degenerate test later */
2485 		}
2486 		KKASSERT(info == NULL);
2487 	} else if (ptepindex >= pmap_pt_pindex(0)) {
2488 		/*
2489 		 *  Remove a PT page from the pd
2490 		 */
2491 		vm_pindex_t pd_pindex;
2492 		vm_pindex_t pt_index;
2493 		pd_entry_t *pt;
2494 
2495 		pt_index = ptepindex - pmap_pt_pindex(0);
2496 
2497 		if (pvp == NULL) {
2498 			pd_pindex = NUPTE_TOTAL + NUPT_TOTAL +
2499 				    (pt_index >> NPDPEPGSHIFT);
2500 			pvp = pv_get(pv->pv_pmap, pd_pindex);
2501 			KKASSERT(pvp);
2502 			gotpvp = 1;
2503 		}
2504 		pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1));
2505 		KKASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0);
2506 		p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
2507 		*pt = 0;
2508 		KKASSERT(info == NULL);
2509 	} else {
2510 		/*
2511 		 * Remove a PTE from the PT page
2512 		 *
2513 		 * NOTE: pv's must be locked bottom-up to avoid deadlocking.
2514 		 *	 pv is a pte_pv so we can safely lock pt_pv.
2515 		 *
2516 		 * NOTE: FICTITIOUS pages may have multiple physical mappings
2517 		 *	 so PHYS_TO_VM_PAGE() will not necessarily work for
2518 		 *	 terminal ptes.
2519 		 */
2520 		vm_pindex_t pt_pindex;
2521 		pt_entry_t *ptep;
2522 		pt_entry_t pte;
2523 		vm_offset_t va;
2524 
2525 		pt_pindex = ptepindex >> NPTEPGSHIFT;
2526 		va = (vm_offset_t)ptepindex << PAGE_SHIFT;
2527 
2528 		if (ptepindex >= NUPTE_USER) {
2529 			ptep = vtopte(ptepindex << PAGE_SHIFT);
2530 			KKASSERT(pvp == NULL);
2531 		} else {
2532 			if (pvp == NULL) {
2533 				pt_pindex = NUPTE_TOTAL +
2534 					    (ptepindex >> NPDPEPGSHIFT);
2535 				pvp = pv_get(pv->pv_pmap, pt_pindex);
2536 				KKASSERT(pvp);
2537 				gotpvp = 1;
2538 			}
2539 			ptep = pv_pte_lookup(pvp, ptepindex &
2540 						  ((1ul << NPDPEPGSHIFT) - 1));
2541 		}
2542 
2543 		if (info)
2544 			pmap_inval_interlock(info, pmap, va);
2545 		pte = pte_load_clear(ptep);
2546 		if (info)
2547 			pmap_inval_deinterlock(info, pmap);
2548 		else
2549 			cpu_invlpg((void *)va);
2550 
2551 		/*
2552 		 * Now update the vm_page_t
2553 		 */
2554 		if ((pte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) !=
2555 		    (pmap->pmap_bits[PG_MANAGED_IDX]|pmap->pmap_bits[PG_V_IDX])) {
2556 			kprintf("remove_pte badpte %016lx %016lx %d\n",
2557 				pte, pv->pv_pindex,
2558 				pv->pv_pindex < pmap_pt_pindex(0));
2559 		}
2560 		/* PHYS_TO_VM_PAGE() will not work for FICTITIOUS pages */
2561 		/*KKASSERT((pte & (PG_MANAGED|PG_V)) == (PG_MANAGED|PG_V));*/
2562 		if (pte & pmap->pmap_bits[PG_DEVICE_IDX])
2563 			p = pv->pv_m;
2564 		else
2565 			p = PHYS_TO_VM_PAGE(pte & PG_FRAME);
2566 		/* p = pv->pv_m; */
2567 
2568 		if (pte & pmap->pmap_bits[PG_M_IDX]) {
2569 			if (pmap_track_modified(ptepindex))
2570 				vm_page_dirty(p);
2571 		}
2572 		if (pte & pmap->pmap_bits[PG_A_IDX]) {
2573 			vm_page_flag_set(p, PG_REFERENCED);
2574 		}
2575 		if (pte & pmap->pmap_bits[PG_W_IDX])
2576 			atomic_add_long(&pmap->pm_stats.wired_count, -1);
2577 		if (pte & pmap->pmap_bits[PG_G_IDX])
2578 			cpu_invlpg((void *)va);
2579 	}
2580 
2581 	/*
2582 	 * Unwire the parent page table page.  The wire_count cannot go below
2583 	 * 1 here because the parent page table page is itself still mapped.
2584 	 *
2585 	 * XXX remove the assertions later.
2586 	 */
2587 	KKASSERT(pv->pv_m == p);
2588 	if (pvp && vm_page_unwire_quick(pvp->pv_m))
2589 		panic("pmap_remove_pv_pte: Insufficient wire_count");
2590 
2591 	if (gotpvp)
2592 		pv_put(pvp);
2593 }
2594 
2595 /*
2596  * Remove the vm_page association to a pv.  The pv must be locked.
2597  */
2598 static
2599 vm_page_t
2600 pmap_remove_pv_page(pv_entry_t pv)
2601 {
2602 	vm_page_t m;
2603 
2604 	m = pv->pv_m;
2605 	KKASSERT(m);
2606 	vm_page_spin_lock(m);
2607 	pv->pv_m = NULL;
2608 	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2609 	pmap_page_stats_deleting(m);
2610 	/*
2611 	if (m->object)
2612 		atomic_add_int(&m->object->agg_pv_list_count, -1);
2613 	*/
2614 	if (TAILQ_EMPTY(&m->md.pv_list))
2615 		vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2616 	vm_page_spin_unlock(m);
2617 	return(m);
2618 }
2619 
2620 /*
2621  * Grow the number of kernel page table entries, if needed.
2622  *
2623  * This routine is always called to validate any address space
2624  * beyond KERNBASE (for kldloads).  kernel_vm_end only governs the address
2625  * space below KERNBASE.
2626  */
2627 void
2628 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
2629 {
2630 	vm_paddr_t paddr;
2631 	vm_offset_t ptppaddr;
2632 	vm_page_t nkpg;
2633 	pd_entry_t *pt, newpt;
2634 	pdp_entry_t newpd;
2635 	int update_kernel_vm_end;
2636 
2637 	/*
2638 	 * bootstrap kernel_vm_end on first real VM use
2639 	 */
2640 	if (kernel_vm_end == 0) {
2641 		kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
2642 		nkpt = 0;
2643 		while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) {
2644 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
2645 					~(PAGE_SIZE * NPTEPG - 1);
2646 			nkpt++;
2647 			if (kernel_vm_end - 1 >= kernel_map.max_offset) {
2648 				kernel_vm_end = kernel_map.max_offset;
2649 				break;
2650 			}
2651 		}
2652 	}
2653 
2654 	/*
2655 	 * Fill in the gaps.  kernel_vm_end is only adjusted for ranges
2656 	 * below KERNBASE.  Ranges above KERNBASE are kldloaded and we
2657 	 * do not want to force-fill 128G worth of page tables.
2658 	 */
2659 	if (kstart < KERNBASE) {
2660 		if (kstart > kernel_vm_end)
2661 			kstart = kernel_vm_end;
2662 		KKASSERT(kend <= KERNBASE);
2663 		update_kernel_vm_end = 1;
2664 	} else {
2665 		update_kernel_vm_end = 0;
2666 	}
2667 
2668 	kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG);
2669 	kend = roundup2(kend, PAGE_SIZE * NPTEPG);
2670 
2671 	if (kend - 1 >= kernel_map.max_offset)
2672 		kend = kernel_map.max_offset;
2673 
2674 	while (kstart < kend) {
2675 		pt = pmap_pt(&kernel_pmap, kstart);
2676 		if (pt == NULL) {
2677 			/* We need a new PDP entry */
2678 			nkpg = vm_page_alloc(NULL, nkpt,
2679 			                     VM_ALLOC_NORMAL |
2680 					     VM_ALLOC_SYSTEM |
2681 					     VM_ALLOC_INTERRUPT);
2682 			if (nkpg == NULL) {
2683 				panic("pmap_growkernel: no memory to grow "
2684 				      "kernel");
2685 			}
2686 			paddr = VM_PAGE_TO_PHYS(nkpg);
2687 			if ((nkpg->flags & PG_ZERO) == 0)
2688 				pmap_zero_page(paddr);
2689 			vm_page_flag_clear(nkpg, PG_ZERO);
2690 			newpd = (pdp_entry_t)
2691 			    (paddr |
2692 			    kernel_pmap.pmap_bits[PG_V_IDX] |
2693 			    kernel_pmap.pmap_bits[PG_RW_IDX] |
2694 			    kernel_pmap.pmap_bits[PG_A_IDX] |
2695 			    kernel_pmap.pmap_bits[PG_M_IDX]);
2696 			*pmap_pd(&kernel_pmap, kstart) = newpd;
2697 			nkpt++;
2698 			continue; /* try again */
2699 		}
2700 		if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) {
2701 			kstart = (kstart + PAGE_SIZE * NPTEPG) &
2702 				 ~(PAGE_SIZE * NPTEPG - 1);
2703 			if (kstart - 1 >= kernel_map.max_offset) {
2704 				kstart = kernel_map.max_offset;
2705 				break;
2706 			}
2707 			continue;
2708 		}
2709 
2710 		/*
2711 		 * This index is bogus, but out of the way
2712 		 */
2713 		nkpg = vm_page_alloc(NULL, nkpt,
2714 				     VM_ALLOC_NORMAL |
2715 				     VM_ALLOC_SYSTEM |
2716 				     VM_ALLOC_INTERRUPT);
2717 		if (nkpg == NULL)
2718 			panic("pmap_growkernel: no memory to grow kernel");
2719 
2720 		vm_page_wire(nkpg);
2721 		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2722 		pmap_zero_page(ptppaddr);
2723 		vm_page_flag_clear(nkpg, PG_ZERO);
2724 		newpt = (pd_entry_t) (ptppaddr |
2725 		    kernel_pmap.pmap_bits[PG_V_IDX] |
2726 		    kernel_pmap.pmap_bits[PG_RW_IDX] |
2727 		    kernel_pmap.pmap_bits[PG_A_IDX] |
2728 		    kernel_pmap.pmap_bits[PG_M_IDX]);
2729 		*pmap_pt(&kernel_pmap, kstart) = newpt;
2730 		nkpt++;
2731 
2732 		kstart = (kstart + PAGE_SIZE * NPTEPG) &
2733 			  ~(PAGE_SIZE * NPTEPG - 1);
2734 
2735 		if (kstart - 1 >= kernel_map.max_offset) {
2736 			kstart = kernel_map.max_offset;
2737 			break;
2738 		}
2739 	}
2740 
2741 	/*
2742 	 * Only update kernel_vm_end for areas below KERNBASE.
2743 	 */
2744 	if (update_kernel_vm_end && kernel_vm_end < kstart)
2745 		kernel_vm_end = kstart;
2746 }
2747 
2748 /*
2749  *	Add a reference to the specified pmap.
2750  */
2751 void
2752 pmap_reference(pmap_t pmap)
2753 {
2754 	if (pmap != NULL) {
2755 		lwkt_gettoken(&pmap->pm_token);
2756 		++pmap->pm_count;
2757 		lwkt_reltoken(&pmap->pm_token);
2758 	}
2759 }
2760 
2761 /***************************************************
2762  * page management routines.
2763  ***************************************************/
2764 
2765 /*
2766  * Hold a pv without locking it
2767  */
2768 static void
2769 pv_hold(pv_entry_t pv)
2770 {
2771 	atomic_add_int(&pv->pv_hold, 1);
2772 }
2773 
2774 /*
2775  * Hold a pv_entry, preventing its destruction.  TRUE is returned if the pv
2776  * was successfully locked, FALSE if it wasn't.  The caller must dispose of
2777  * the pv properly.
2778  *
2779  * Either the pmap->pm_spin or the related vm_page_spin (if traversing a
2780  * pv list via its page) must be held by the caller.
2781  */
2782 static int
2783 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL)
2784 {
2785 	u_int count;
2786 
2787 	/*
2788 	 * Critical path shortcut expects pv to already have one ref
2789 	 * (for the pv->pv_pmap).
2790 	 */
2791 	if (atomic_cmpset_int(&pv->pv_hold, 1, PV_HOLD_LOCKED | 2)) {
2792 #ifdef PMAP_DEBUG
2793 		pv->pv_func = func;
2794 		pv->pv_line = lineno;
2795 #endif
2796 		return TRUE;
2797 	}
2798 
2799 	for (;;) {
2800 		count = pv->pv_hold;
2801 		cpu_ccfence();
2802 		if ((count & PV_HOLD_LOCKED) == 0) {
2803 			if (atomic_cmpset_int(&pv->pv_hold, count,
2804 					      (count + 1) | PV_HOLD_LOCKED)) {
2805 #ifdef PMAP_DEBUG
2806 				pv->pv_func = func;
2807 				pv->pv_line = lineno;
2808 #endif
2809 				return TRUE;
2810 			}
2811 		} else {
2812 			if (atomic_cmpset_int(&pv->pv_hold, count, count + 1))
2813 				return FALSE;
2814 		}
2815 		/* retry */
2816 	}
2817 }
2818 
2819 /*
2820  * Drop a previously held pv_entry which could not be locked, allowing its
2821  * destruction.
2822  *
2823  * Must not be called with a spinlock held as we might zfree() the pv if it
2824  * is no longer associated with a pmap and this was the last hold count.
2825  */
2826 static void
2827 pv_drop(pv_entry_t pv)
2828 {
2829 	u_int count;
2830 
2831 	for (;;) {
2832 		count = pv->pv_hold;
2833 		cpu_ccfence();
2834 		KKASSERT((count & PV_HOLD_MASK) > 0);
2835 		KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) !=
2836 			 (PV_HOLD_LOCKED | 1));
2837 		if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) {
2838 			if ((count & PV_HOLD_MASK) == 1) {
2839 #ifdef PMAP_DEBUG2
2840 				if (pmap_enter_debug > 0) {
2841 					--pmap_enter_debug;
2842 					kprintf("pv_drop: free pv %p\n", pv);
2843 				}
2844 #endif
2845 				KKASSERT(count == 1);
2846 				KKASSERT(pv->pv_pmap == NULL);
2847 				zfree(pvzone, pv);
2848 			}
2849 			return;
2850 		}
2851 		/* retry */
2852 	}
2853 }
2854 
2855 /*
2856  * Find or allocate the requested PV entry, returning a locked, held pv.
2857  *
2858  * If (*isnew) is non-zero, the returned pv will have two hold counts, one
2859  * for the caller and one representing the pmap and vm_page association.
2860  *
2861  * If (*isnew) is zero, the returned pv will have only one hold count.
2862  *
2863  * Since both associations can only be adjusted while the pv is locked,
2864  * together they represent just one additional hold.
2865  */
2866 static
2867 pv_entry_t
2868 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL)
2869 {
2870 	pv_entry_t pv;
2871 	pv_entry_t pnew = NULL;
2872 
2873 	spin_lock(&pmap->pm_spin);
2874 	for (;;) {
2875 		if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) {
2876 			pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
2877 							pindex);
2878 		}
2879 		if (pv == NULL) {
2880 			if (pnew == NULL) {
2881 				spin_unlock(&pmap->pm_spin);
2882 				pnew = zalloc(pvzone);
2883 				spin_lock(&pmap->pm_spin);
2884 				continue;
2885 			}
2886 			pnew->pv_pmap = pmap;
2887 			pnew->pv_pindex = pindex;
2888 			pnew->pv_hold = PV_HOLD_LOCKED | 2;
2889 #ifdef PMAP_DEBUG
2890 			pnew->pv_func = func;
2891 			pnew->pv_line = lineno;
2892 #endif
2893 			pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew);
2894 			++pmap->pm_generation;
2895 			atomic_add_long(&pmap->pm_stats.resident_count, 1);
2896 			spin_unlock(&pmap->pm_spin);
2897 			*isnew = 1;
2898 			return(pnew);
2899 		}
2900 		if (pnew) {
2901 			spin_unlock(&pmap->pm_spin);
2902 			zfree(pvzone, pnew);
2903 			pnew = NULL;
2904 			spin_lock(&pmap->pm_spin);
2905 			continue;
2906 		}
2907 		if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
2908 			spin_unlock(&pmap->pm_spin);
2909 		} else {
2910 			spin_unlock(&pmap->pm_spin);
2911 			_pv_lock(pv PMAP_DEBUG_COPY);
2912 		}
2913 		if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) {
2914 			*isnew = 0;
2915 			return(pv);
2916 		}
2917 		pv_put(pv);
2918 		spin_lock(&pmap->pm_spin);
2919 	}
2920 }
2921 
2922 /*
2923  * Find the requested PV entry, returning a locked+held pv or NULL
2924  */
2925 static
2926 pv_entry_t
2927 _pv_get(pmap_t pmap, vm_pindex_t pindex PMAP_DEBUG_DECL)
2928 {
2929 	pv_entry_t pv;
2930 
2931 	spin_lock(&pmap->pm_spin);
2932 	for (;;) {
2933 		/*
2934 		 * Shortcut cache
2935 		 */
2936 		if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) {
2937 			pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
2938 							pindex);
2939 		}
2940 		if (pv == NULL) {
2941 			spin_unlock(&pmap->pm_spin);
2942 			return NULL;
2943 		}
2944 		if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
2945 			spin_unlock(&pmap->pm_spin);
2946 		} else {
2947 			spin_unlock(&pmap->pm_spin);
2948 			_pv_lock(pv PMAP_DEBUG_COPY);
2949 		}
2950 		if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) {
2951 			pv_cache(pv, pindex);
2952 			return(pv);
2953 		}
2954 		pv_put(pv);
2955 		spin_lock(&pmap->pm_spin);
2956 	}
2957 }
2958 
2959 /*
2960  * Lookup, hold, and attempt to lock (pmap,pindex).
2961  *
2962  * If the entry does not exist NULL is returned and *errorp is set to 0
2963  *
2964  * If the entry exists and could be successfully locked it is returned and
2965  * errorp is set to 0.
2966  *
2967  * If the entry exists but could NOT be successfully locked it is returned
2968  * held and *errorp is set to 1.
2969  */
2970 static
2971 pv_entry_t
2972 pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp)
2973 {
2974 	pv_entry_t pv;
2975 
2976 	spin_lock_shared(&pmap->pm_spin);
2977 	if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex)
2978 		pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
2979 	if (pv == NULL) {
2980 		spin_unlock_shared(&pmap->pm_spin);
2981 		*errorp = 0;
2982 		return NULL;
2983 	}
2984 	if (pv_hold_try(pv)) {
2985 		pv_cache(pv, pindex);
2986 		spin_unlock_shared(&pmap->pm_spin);
2987 		*errorp = 0;
2988 		KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex);
2989 		return(pv);	/* lock succeeded */
2990 	}
2991 	spin_unlock_shared(&pmap->pm_spin);
2992 	*errorp = 1;
2993 	return (pv);		/* lock failed */
2994 }
2995 
2996 /*
2997  * Find the requested PV entry, returning a held pv or NULL
2998  */
2999 static
3000 pv_entry_t
3001 pv_find(pmap_t pmap, vm_pindex_t pindex)
3002 {
3003 	pv_entry_t pv;
3004 
3005 	spin_lock_shared(&pmap->pm_spin);
3006 
3007 	if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex)
3008 		pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
3009 	if (pv == NULL) {
3010 		spin_unlock_shared(&pmap->pm_spin);
3011 		return NULL;
3012 	}
3013 	pv_hold(pv);
3014 	pv_cache(pv, pindex);
3015 	spin_unlock_shared(&pmap->pm_spin);
3016 	return(pv);
3017 }
3018 
3019 /*
3020  * Lock a held pv, keeping the hold count
3021  */
3022 static
3023 void
3024 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL)
3025 {
3026 	u_int count;
3027 
3028 	for (;;) {
3029 		count = pv->pv_hold;
3030 		cpu_ccfence();
3031 		if ((count & PV_HOLD_LOCKED) == 0) {
3032 			if (atomic_cmpset_int(&pv->pv_hold, count,
3033 					      count | PV_HOLD_LOCKED)) {
3034 #ifdef PMAP_DEBUG
3035 				pv->pv_func = func;
3036 				pv->pv_line = lineno;
3037 #endif
3038 				return;
3039 			}
3040 			continue;
3041 		}
3042 		tsleep_interlock(pv, 0);
3043 		if (atomic_cmpset_int(&pv->pv_hold, count,
3044 				      count | PV_HOLD_WAITING)) {
3045 #ifdef PMAP_DEBUG
3046 			kprintf("pv waiting on %s:%d\n",
3047 					pv->pv_func, pv->pv_line);
3048 #endif
3049 			tsleep(pv, PINTERLOCKED, "pvwait", hz);
3050 		}
3051 		/* retry */
3052 	}
3053 }
3054 
3055 /*
3056  * Unlock a held and locked pv, keeping the hold count.
3057  */
3058 static
3059 void
3060 pv_unlock(pv_entry_t pv)
3061 {
3062 	u_int count;
3063 
3064 	for (;;) {
3065 		count = pv->pv_hold;
3066 		cpu_ccfence();
3067 		KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >=
3068 			 (PV_HOLD_LOCKED | 1));
3069 		if (atomic_cmpset_int(&pv->pv_hold, count,
3070 				      count &
3071 				      ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) {
3072 			if (count & PV_HOLD_WAITING)
3073 				wakeup(pv);
3074 			break;
3075 		}
3076 	}
3077 }
3078 
3079 /*
3080  * Unlock and drop a pv.  If the pv is no longer associated with a pmap
3081  * and the hold count drops to zero we will free it.
3082  *
3083  * Caller should not hold any spin locks.  We are protected from hold races
3084  * by virtue of holds only occuring only with a pmap_spin or vm_page_spin
3085  * lock held.  A pv cannot be located otherwise.
3086  */
3087 static
3088 void
3089 pv_put(pv_entry_t pv)
3090 {
3091 #ifdef PMAP_DEBUG2
3092 	if (pmap_enter_debug > 0) {
3093 		--pmap_enter_debug;
3094 		kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold);
3095 	}
3096 #endif
3097 
3098 	/*
3099 	 * Fast - shortcut most common condition
3100 	 */
3101 	if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1))
3102 		return;
3103 
3104 	/*
3105 	 * Slow
3106 	 */
3107 	pv_unlock(pv);
3108 	pv_drop(pv);
3109 }
3110 
3111 /*
3112  * Remove the pmap association from a pv, require that pv_m already be removed,
3113  * then unlock and drop the pv.  Any pte operations must have already been
3114  * completed.  This call may result in a last-drop which will physically free
3115  * the pv.
3116  *
3117  * Removing the pmap association entails an additional drop.
3118  *
3119  * pv must be exclusively locked on call and will be disposed of on return.
3120  */
3121 static
3122 void
3123 pv_free(pv_entry_t pv)
3124 {
3125 	pmap_t pmap;
3126 
3127 	KKASSERT(pv->pv_m == NULL);
3128 	KKASSERT((pv->pv_hold & PV_HOLD_MASK) >= 2);
3129 	if ((pmap = pv->pv_pmap) != NULL) {
3130 		spin_lock(&pmap->pm_spin);
3131 		pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
3132 		++pmap->pm_generation;
3133 		if (pmap->pm_pvhint == pv)
3134 			pmap->pm_pvhint = NULL;
3135 		atomic_add_long(&pmap->pm_stats.resident_count, -1);
3136 		pv->pv_pmap = NULL;
3137 		pv->pv_pindex = 0;
3138 		spin_unlock(&pmap->pm_spin);
3139 
3140 		/*
3141 		 * Try to shortcut three atomic ops, otherwise fall through
3142 		 * and do it normally.  Drop two refs and the lock all in
3143 		 * one go.
3144 		 */
3145 		if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) {
3146 #ifdef PMAP_DEBUG2
3147 			if (pmap_enter_debug > 0) {
3148 				--pmap_enter_debug;
3149 				kprintf("pv_free: free pv %p\n", pv);
3150 			}
3151 #endif
3152 			zfree(pvzone, pv);
3153 			return;
3154 		}
3155 		pv_drop(pv);	/* ref for pv_pmap */
3156 	}
3157 	pv_put(pv);
3158 }
3159 
3160 /*
3161  * This routine is very drastic, but can save the system
3162  * in a pinch.
3163  */
3164 void
3165 pmap_collect(void)
3166 {
3167 	int i;
3168 	vm_page_t m;
3169 	static int warningdone=0;
3170 
3171 	if (pmap_pagedaemon_waken == 0)
3172 		return;
3173 	pmap_pagedaemon_waken = 0;
3174 	if (warningdone < 5) {
3175 		kprintf("pmap_collect: collecting pv entries -- "
3176 			"suggest increasing PMAP_SHPGPERPROC\n");
3177 		warningdone++;
3178 	}
3179 
3180 	for (i = 0; i < vm_page_array_size; i++) {
3181 		m = &vm_page_array[i];
3182 		if (m->wire_count || m->hold_count)
3183 			continue;
3184 		if (vm_page_busy_try(m, TRUE) == 0) {
3185 			if (m->wire_count == 0 && m->hold_count == 0) {
3186 				pmap_remove_all(m);
3187 			}
3188 			vm_page_wakeup(m);
3189 		}
3190 	}
3191 }
3192 
3193 /*
3194  * Scan the pmap for active page table entries and issue a callback.
3195  * The callback must dispose of pte_pv, whos PTE entry is at *ptep in
3196  * its parent page table.
3197  *
3198  * pte_pv will be NULL if the page or page table is unmanaged.
3199  * pt_pv will point to the page table page containing the pte for the page.
3200  *
3201  * NOTE! If we come across an unmanaged page TABLE (verses an unmanaged page),
3202  *	 we pass a NULL pte_pv and we pass a pt_pv pointing to the passed
3203  *	 process pmap's PD and page to the callback function.  This can be
3204  *	 confusing because the pt_pv is really a pd_pv, and the target page
3205  *	 table page is simply aliased by the pmap and not owned by it.
3206  *
3207  * It is assumed that the start and end are properly rounded to the page size.
3208  *
3209  * It is assumed that PD pages and above are managed and thus in the RB tree,
3210  * allowing us to use RB_SCAN from the PD pages down for ranged scans.
3211  */
3212 struct pmap_scan_info {
3213 	struct pmap *pmap;
3214 	vm_offset_t sva;
3215 	vm_offset_t eva;
3216 	vm_pindex_t sva_pd_pindex;
3217 	vm_pindex_t eva_pd_pindex;
3218 	void (*func)(pmap_t, struct pmap_scan_info *,
3219 		     pv_entry_t, pv_entry_t, int, vm_offset_t,
3220 		     pt_entry_t *, void *);
3221 	void *arg;
3222 	int doinval;
3223 	struct pmap_inval_info inval;
3224 };
3225 
3226 static int pmap_scan_cmp(pv_entry_t pv, void *data);
3227 static int pmap_scan_callback(pv_entry_t pv, void *data);
3228 
3229 static void
3230 pmap_scan(struct pmap_scan_info *info)
3231 {
3232 	struct pmap *pmap = info->pmap;
3233 	pv_entry_t pd_pv;	/* A page directory PV */
3234 	pv_entry_t pt_pv;	/* A page table PV */
3235 	pv_entry_t pte_pv;	/* A page table entry PV */
3236 	pt_entry_t *ptep;
3237 	pt_entry_t oldpte;
3238 	struct pv_entry dummy_pv;
3239 	int generation;
3240 
3241 	if (pmap == NULL)
3242 		return;
3243 
3244 	/*
3245 	 * Hold the token for stability; if the pmap is empty we have nothing
3246 	 * to do.
3247 	 */
3248 	lwkt_gettoken(&pmap->pm_token);
3249 #if 0
3250 	if (pmap->pm_stats.resident_count == 0) {
3251 		lwkt_reltoken(&pmap->pm_token);
3252 		return;
3253 	}
3254 #endif
3255 
3256 	pmap_inval_init(&info->inval);
3257 
3258 again:
3259 	/*
3260 	 * Special handling for scanning one page, which is a very common
3261 	 * operation (it is?).
3262 	 *
3263 	 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4
3264 	 */
3265 	if (info->sva + PAGE_SIZE == info->eva) {
3266 		generation = pmap->pm_generation;
3267 		if (info->sva >= VM_MAX_USER_ADDRESS) {
3268 			/*
3269 			 * Kernel mappings do not track wire counts on
3270 			 * page table pages and only maintain pd_pv and
3271 			 * pte_pv levels so pmap_scan() works.
3272 			 */
3273 			pt_pv = NULL;
3274 			pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva));
3275 			ptep = vtopte(info->sva);
3276 		} else {
3277 			/*
3278 			 * User pages which are unmanaged will not have a
3279 			 * pte_pv.  User page table pages which are unmanaged
3280 			 * (shared from elsewhere) will also not have a pt_pv.
3281 			 * The func() callback will pass both pte_pv and pt_pv
3282 			 * as NULL in that case.
3283 			 */
3284 			pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva));
3285 			pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva));
3286 			if (pt_pv == NULL) {
3287 				KKASSERT(pte_pv == NULL);
3288 				pd_pv = pv_get(pmap, pmap_pd_pindex(info->sva));
3289 				if (pd_pv) {
3290 					ptep = pv_pte_lookup(pd_pv,
3291 						    pmap_pt_index(info->sva));
3292 					if (*ptep) {
3293 						info->func(pmap, info,
3294 						     NULL, pd_pv, 1,
3295 						     info->sva, ptep,
3296 						     info->arg);
3297 					}
3298 					pv_put(pd_pv);
3299 				}
3300 				goto fast_skip;
3301 			}
3302 			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva));
3303 		}
3304 
3305 		/*
3306 		 * NOTE: *ptep can't be ripped out from under us if we hold
3307 		 *	 pte_pv locked, but bits can change.  However, there is
3308 		 *	 a race where another thread may be inserting pte_pv
3309 		 *	 and setting *ptep just after our pte_pv lookup fails.
3310 		 *
3311 		 *	 In this situation we can end up with a NULL pte_pv
3312 		 *	 but find that we have a managed *ptep.  We explicitly
3313 		 *	 check for this race.
3314 		 */
3315 		oldpte = *ptep;
3316 		cpu_ccfence();
3317 		if (oldpte == 0) {
3318 			/*
3319 			 * Unlike the pv_find() case below we actually
3320 			 * acquired a locked pv in this case so any
3321 			 * race should have been resolved.  It is expected
3322 			 * to not exist.
3323 			 */
3324 			KKASSERT(pte_pv == NULL);
3325 		} else if (pte_pv) {
3326 			KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] |
3327 					   pmap->pmap_bits[PG_V_IDX])) ==
3328 				(pmap->pmap_bits[PG_MANAGED_IDX] |
3329 				 pmap->pmap_bits[PG_V_IDX]),
3330 			    ("badA *ptep %016lx/%016lx sva %016lx pte_pv %p"
3331 			     "generation %d/%d",
3332 			    *ptep, oldpte, info->sva, pte_pv,
3333 			    generation, pmap->pm_generation));
3334 			info->func(pmap, info, pte_pv, pt_pv, 0,
3335 				   info->sva, ptep, info->arg);
3336 		} else {
3337 			/*
3338 			 * Check for insertion race
3339 			 */
3340 			if ((oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) &&
3341 			    pt_pv) {
3342 				pte_pv = pv_find(pmap,
3343 						 pmap_pte_pindex(info->sva));
3344 				if (pte_pv) {
3345 					pv_drop(pte_pv);
3346 					pv_put(pt_pv);
3347 					kprintf("pmap_scan: RACE1 "
3348 						"%016jx, %016lx\n",
3349 						info->sva, oldpte);
3350 					goto again;
3351 				}
3352 			}
3353 
3354 			/*
3355 			 * Didn't race
3356 			 */
3357 			KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] |
3358 					   pmap->pmap_bits[PG_V_IDX])) ==
3359 			    pmap->pmap_bits[PG_V_IDX],
3360 			    ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL"
3361 			     "generation %d/%d",
3362 			    *ptep, oldpte, info->sva,
3363 			    generation, pmap->pm_generation));
3364 			info->func(pmap, info, NULL, pt_pv, 0,
3365 			    info->sva, ptep, info->arg);
3366 		}
3367 		if (pt_pv)
3368 			pv_put(pt_pv);
3369 fast_skip:
3370 		pmap_inval_done(&info->inval);
3371 		lwkt_reltoken(&pmap->pm_token);
3372 		return;
3373 	}
3374 
3375 	/*
3376 	 * Nominal scan case, RB_SCAN() for PD pages and iterate from
3377 	 * there.
3378 	 */
3379 	info->sva_pd_pindex = pmap_pd_pindex(info->sva);
3380 	info->eva_pd_pindex = pmap_pd_pindex(info->eva + NBPDP - 1);
3381 
3382 	if (info->sva >= VM_MAX_USER_ADDRESS) {
3383 		/*
3384 		 * The kernel does not currently maintain any pv_entry's for
3385 		 * higher-level page tables.
3386 		 */
3387 		bzero(&dummy_pv, sizeof(dummy_pv));
3388 		dummy_pv.pv_pindex = info->sva_pd_pindex;
3389 		spin_lock(&pmap->pm_spin);
3390 		while (dummy_pv.pv_pindex < info->eva_pd_pindex) {
3391 			pmap_scan_callback(&dummy_pv, info);
3392 			++dummy_pv.pv_pindex;
3393 		}
3394 		spin_unlock(&pmap->pm_spin);
3395 	} else {
3396 		/*
3397 		 * User page tables maintain local PML4, PDP, and PD
3398 		 * pv_entry's at the very least.  PT pv's might be
3399 		 * unmanaged and thus not exist.  PTE pv's might be
3400 		 * unmanaged and thus not exist.
3401 		 */
3402 		spin_lock(&pmap->pm_spin);
3403 		pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot,
3404 			pmap_scan_cmp, pmap_scan_callback, info);
3405 		spin_unlock(&pmap->pm_spin);
3406 	}
3407 	pmap_inval_done(&info->inval);
3408 	lwkt_reltoken(&pmap->pm_token);
3409 }
3410 
3411 /*
3412  * WARNING! pmap->pm_spin held
3413  */
3414 static int
3415 pmap_scan_cmp(pv_entry_t pv, void *data)
3416 {
3417 	struct pmap_scan_info *info = data;
3418 	if (pv->pv_pindex < info->sva_pd_pindex)
3419 		return(-1);
3420 	if (pv->pv_pindex >= info->eva_pd_pindex)
3421 		return(1);
3422 	return(0);
3423 }
3424 
3425 /*
3426  * WARNING! pmap->pm_spin held
3427  */
3428 static int
3429 pmap_scan_callback(pv_entry_t pv, void *data)
3430 {
3431 	struct pmap_scan_info *info = data;
3432 	struct pmap *pmap = info->pmap;
3433 	pv_entry_t pd_pv;	/* A page directory PV */
3434 	pv_entry_t pt_pv;	/* A page table PV */
3435 	pv_entry_t pte_pv;	/* A page table entry PV */
3436 	pt_entry_t *ptep;
3437 	pt_entry_t oldpte;
3438 	vm_offset_t sva;
3439 	vm_offset_t eva;
3440 	vm_offset_t va_next;
3441 	vm_pindex_t pd_pindex;
3442 	int error;
3443 	int generation;
3444 
3445 	/*
3446 	 * Pull the PD pindex from the pv before releasing the spinlock.
3447 	 *
3448 	 * WARNING: pv is faked for kernel pmap scans.
3449 	 */
3450 	pd_pindex = pv->pv_pindex;
3451 	spin_unlock(&pmap->pm_spin);
3452 	pv = NULL;	/* invalid after spinlock unlocked */
3453 
3454 	/*
3455 	 * Calculate the page range within the PD.  SIMPLE pmaps are
3456 	 * direct-mapped for the entire 2^64 address space.  Normal pmaps
3457 	 * reflect the user and kernel address space which requires
3458 	 * cannonicalization w/regards to converting pd_pindex's back
3459 	 * into addresses.
3460 	 */
3461 	sva = (pd_pindex - NUPTE_TOTAL - NUPT_TOTAL) << PDPSHIFT;
3462 	if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 &&
3463 	    (sva & PML4_SIGNMASK)) {
3464 		sva |= PML4_SIGNMASK;
3465 	}
3466 	eva = sva + NBPDP;	/* can overflow */
3467 	if (sva < info->sva)
3468 		sva = info->sva;
3469 	if (eva < info->sva || eva > info->eva)
3470 		eva = info->eva;
3471 
3472 	/*
3473 	 * NOTE: kernel mappings do not track page table pages, only
3474 	 * 	 terminal pages.
3475 	 *
3476 	 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4.
3477 	 *	 However, for the scan to be efficient we try to
3478 	 *	 cache items top-down.
3479 	 */
3480 	pd_pv = NULL;
3481 	pt_pv = NULL;
3482 
3483 	for (; sva < eva; sva = va_next) {
3484 		if (sva >= VM_MAX_USER_ADDRESS) {
3485 			if (pt_pv) {
3486 				pv_put(pt_pv);
3487 				pt_pv = NULL;
3488 			}
3489 			goto kernel_skip;
3490 		}
3491 
3492 		/*
3493 		 * PD cache (degenerate case if we skip).  It is possible
3494 		 * for the PD to not exist due to races.  This is ok.
3495 		 */
3496 		if (pd_pv == NULL) {
3497 			pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
3498 		} else if (pd_pv->pv_pindex != pmap_pd_pindex(sva)) {
3499 			pv_put(pd_pv);
3500 			pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
3501 		}
3502 		if (pd_pv == NULL) {
3503 			va_next = (sva + NBPDP) & ~PDPMASK;
3504 			if (va_next < sva)
3505 				va_next = eva;
3506 			continue;
3507 		}
3508 
3509 		/*
3510 		 * PT cache
3511 		 */
3512 		if (pt_pv == NULL) {
3513 			if (pd_pv) {
3514 				pv_put(pd_pv);
3515 				pd_pv = NULL;
3516 			}
3517 			pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
3518 		} else if (pt_pv->pv_pindex != pmap_pt_pindex(sva)) {
3519 			if (pd_pv) {
3520 				pv_put(pd_pv);
3521 				pd_pv = NULL;
3522 			}
3523 			pv_put(pt_pv);
3524 			pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
3525 		}
3526 
3527 		/*
3528 		 * If pt_pv is NULL we either have an shared page table
3529 		 * page and must issue a callback specific to that case,
3530 		 * or there is no page table page.
3531 		 *
3532 		 * Either way we can skip the page table page.
3533 		 */
3534 		if (pt_pv == NULL) {
3535 			/*
3536 			 * Possible unmanaged (shared from another pmap)
3537 			 * page table page.
3538 			 */
3539 			if (pd_pv == NULL)
3540 				pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
3541 			KKASSERT(pd_pv != NULL);
3542 			ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva));
3543 			if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
3544 				info->func(pmap, info, NULL, pd_pv, 1,
3545 					   sva, ptep, info->arg);
3546 			}
3547 
3548 			/*
3549 			 * Done, move to next page table page.
3550 			 */
3551 			va_next = (sva + NBPDR) & ~PDRMASK;
3552 			if (va_next < sva)
3553 				va_next = eva;
3554 			continue;
3555 		}
3556 
3557 		/*
3558 		 * From this point in the loop testing pt_pv for non-NULL
3559 		 * means we are in UVM, else if it is NULL we are in KVM.
3560 		 *
3561 		 * Limit our scan to either the end of the va represented
3562 		 * by the current page table page, or to the end of the
3563 		 * range being removed.
3564 		 */
3565 kernel_skip:
3566 		va_next = (sva + NBPDR) & ~PDRMASK;
3567 		if (va_next < sva)
3568 			va_next = eva;
3569 		if (va_next > eva)
3570 			va_next = eva;
3571 
3572 		/*
3573 		 * Scan the page table for pages.  Some pages may not be
3574 		 * managed (might not have a pv_entry).
3575 		 *
3576 		 * There is no page table management for kernel pages so
3577 		 * pt_pv will be NULL in that case, but otherwise pt_pv
3578 		 * is non-NULL, locked, and referenced.
3579 		 */
3580 
3581 		/*
3582 		 * At this point a non-NULL pt_pv means a UVA, and a NULL
3583 		 * pt_pv means a KVA.
3584 		 */
3585 		if (pt_pv)
3586 			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
3587 		else
3588 			ptep = vtopte(sva);
3589 
3590 		while (sva < va_next) {
3591 			/*
3592 			 * Acquire the related pte_pv, if any.  If *ptep == 0
3593 			 * the related pte_pv should not exist, but if *ptep
3594 			 * is not zero the pte_pv may or may not exist (e.g.
3595 			 * will not exist for an unmanaged page).
3596 			 *
3597 			 * However a multitude of races are possible here.
3598 			 *
3599 			 * In addition, the (pt_pv, pte_pv) lock order is
3600 			 * backwards, so we have to be careful in aquiring
3601 			 * a properly locked pte_pv.
3602 			 */
3603 			generation = pmap->pm_generation;
3604 			if (pt_pv) {
3605 				pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva),
3606 						    &error);
3607 				if (error) {
3608 					if (pd_pv) {
3609 						pv_put(pd_pv);
3610 						pd_pv = NULL;
3611 					}
3612 					pv_put(pt_pv);	 /* must be non-NULL */
3613 					pt_pv = NULL;
3614 					pv_lock(pte_pv); /* safe to block now */
3615 					pv_put(pte_pv);
3616 					pte_pv = NULL;
3617 					pt_pv = pv_get(pmap,
3618 						       pmap_pt_pindex(sva));
3619 					/*
3620 					 * pt_pv reloaded, need new ptep
3621 					 */
3622 					KKASSERT(pt_pv != NULL);
3623 					ptep = pv_pte_lookup(pt_pv,
3624 							pmap_pte_index(sva));
3625 					continue;
3626 				}
3627 			} else {
3628 				pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
3629 			}
3630 
3631 			/*
3632 			 * Ok, if *ptep == 0 we had better NOT have a pte_pv.
3633 			 */
3634 			oldpte = *ptep;
3635 			if (oldpte == 0) {
3636 				if (pte_pv) {
3637 					kprintf("Unexpected non-NULL pte_pv "
3638 						"%p pt_pv %p "
3639 						"*ptep = %016lx/%016lx\n",
3640 						pte_pv, pt_pv, *ptep, oldpte);
3641 					panic("Unexpected non-NULL pte_pv");
3642 				}
3643 				sva += PAGE_SIZE;
3644 				++ptep;
3645 				continue;
3646 			}
3647 
3648 			/*
3649 			 * Ready for the callback.  The locked pte_pv (if any)
3650 			 * is consumed by the callback.  pte_pv will exist if
3651 			 *  the page is managed, and will not exist if it
3652 			 * isn't.
3653 			 */
3654 			if (pte_pv) {
3655 				KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) ==
3656 				    (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX]),
3657 				    ("badC *ptep %016lx/%016lx sva %016lx "
3658 				    "pte_pv %p pm_generation %d/%d",
3659 				    *ptep, oldpte, sva, pte_pv,
3660 				    generation, pmap->pm_generation));
3661 				info->func(pmap, info, pte_pv, pt_pv, 0,
3662 				    sva, ptep, info->arg);
3663 			} else {
3664 				/*
3665 				 * Check for insertion race.  Since there is no
3666 				 * pte_pv to guard us it is possible for us
3667 				 * to race another thread doing an insertion.
3668 				 * Our lookup misses the pte_pv but our *ptep
3669 				 * check sees the inserted pte.
3670 				 *
3671 				 * XXX panic case seems to occur within a
3672 				 * vm_fork() of /bin/sh, which frankly
3673 				 * shouldn't happen since no other threads
3674 				 * should be inserting to our pmap in that
3675 				 * situation.  Removing, possibly.  Inserting,
3676 				 * shouldn't happen.
3677 				 */
3678 				if ((oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) &&
3679 				    pt_pv) {
3680 					pte_pv = pv_find(pmap,
3681 							 pmap_pte_pindex(sva));
3682 					if (pte_pv) {
3683 						pv_drop(pte_pv);
3684 						kprintf("pmap_scan: RACE2 "
3685 							"%016jx, %016lx\n",
3686 							sva, oldpte);
3687 						continue;
3688 					}
3689 				}
3690 
3691 				/*
3692 				 * Didn't race
3693 				 */
3694 				KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) ==
3695 				    pmap->pmap_bits[PG_V_IDX],
3696 				    ("badD *ptep %016lx/%016lx sva %016lx "
3697 				    "pte_pv NULL pm_generation %d/%d",
3698 				     *ptep, oldpte, sva,
3699 				     generation, pmap->pm_generation));
3700 				info->func(pmap, info, NULL, pt_pv, 0,
3701 				    sva, ptep, info->arg);
3702 			}
3703 			pte_pv = NULL;
3704 			sva += PAGE_SIZE;
3705 			++ptep;
3706 		}
3707 		lwkt_yield();
3708 	}
3709 	if (pd_pv) {
3710 		pv_put(pd_pv);
3711 		pd_pv = NULL;
3712 	}
3713 	if (pt_pv) {
3714 		pv_put(pt_pv);
3715 		pt_pv = NULL;
3716 	}
3717 	lwkt_yield();
3718 
3719 	/*
3720 	 * Relock before returning.
3721 	 */
3722 	spin_lock(&pmap->pm_spin);
3723 	return (0);
3724 }
3725 
3726 void
3727 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
3728 {
3729 	struct pmap_scan_info info;
3730 
3731 	info.pmap = pmap;
3732 	info.sva = sva;
3733 	info.eva = eva;
3734 	info.func = pmap_remove_callback;
3735 	info.arg = NULL;
3736 	info.doinval = 1;	/* normal remove requires pmap inval */
3737 	pmap_scan(&info);
3738 }
3739 
3740 static void
3741 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
3742 {
3743 	struct pmap_scan_info info;
3744 
3745 	info.pmap = pmap;
3746 	info.sva = sva;
3747 	info.eva = eva;
3748 	info.func = pmap_remove_callback;
3749 	info.arg = NULL;
3750 	info.doinval = 0;	/* normal remove requires pmap inval */
3751 	pmap_scan(&info);
3752 }
3753 
3754 static void
3755 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
3756 		     pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
3757 		     vm_offset_t va, pt_entry_t *ptep, void *arg __unused)
3758 {
3759 	pt_entry_t pte;
3760 
3761 	if (pte_pv) {
3762 		/*
3763 		 * This will also drop pt_pv's wire_count. Note that
3764 		 * terminal pages are not wired based on mmu presence.
3765 		 */
3766 		if (info->doinval)
3767 			pmap_remove_pv_pte(pte_pv, pt_pv, &info->inval);
3768 		else
3769 			pmap_remove_pv_pte(pte_pv, pt_pv, NULL);
3770 		pmap_remove_pv_page(pte_pv);
3771 		pv_free(pte_pv);
3772 	} else if (sharept == 0) {
3773 		/*
3774 		 * Unmanaged page
3775 		 *
3776 		 * pt_pv's wire_count is still bumped by unmanaged pages
3777 		 * so we must decrement it manually.
3778 		 */
3779 		if (info->doinval)
3780 			pmap_inval_interlock(&info->inval, pmap, va);
3781 		pte = pte_load_clear(ptep);
3782 		if (info->doinval)
3783 			pmap_inval_deinterlock(&info->inval, pmap);
3784 		if (pte & pmap->pmap_bits[PG_W_IDX])
3785 			atomic_add_long(&pmap->pm_stats.wired_count, -1);
3786 		atomic_add_long(&pmap->pm_stats.resident_count, -1);
3787 		if (vm_page_unwire_quick(pt_pv->pv_m))
3788 			panic("pmap_remove: insufficient wirecount");
3789 	} else {
3790 		/*
3791 		 * Unmanaged page table, pt_pv is actually the pd_pv
3792 		 * for our pmap (not the share object pmap).
3793 		 *
3794 		 * We have to unwire the target page table page and we
3795 		 * have to unwire our page directory page.
3796 		 */
3797 		if (info->doinval)
3798 			pmap_inval_interlock(&info->inval, pmap, va);
3799 		pte = pte_load_clear(ptep);
3800 		if (info->doinval)
3801 			pmap_inval_deinterlock(&info->inval, pmap);
3802 		atomic_add_long(&pmap->pm_stats.resident_count, -1);
3803 		KKASSERT((pte & pmap->pmap_bits[PG_DEVICE_IDX]) == 0);
3804 		if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME)))
3805 			panic("pmap_remove: shared pgtable1 bad wirecount");
3806 		if (vm_page_unwire_quick(pt_pv->pv_m))
3807 			panic("pmap_remove: shared pgtable2 bad wirecount");
3808 	}
3809 }
3810 
3811 /*
3812  * Removes this physical page from all physical maps in which it resides.
3813  * Reflects back modify bits to the pager.
3814  *
3815  * This routine may not be called from an interrupt.
3816  */
3817 static
3818 void
3819 pmap_remove_all(vm_page_t m)
3820 {
3821 	struct pmap_inval_info info;
3822 	pv_entry_t pv;
3823 
3824 	if (!pmap_initialized /* || (m->flags & PG_FICTITIOUS)*/)
3825 		return;
3826 
3827 	pmap_inval_init(&info);
3828 	vm_page_spin_lock(m);
3829 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3830 		KKASSERT(pv->pv_m == m);
3831 		if (pv_hold_try(pv)) {
3832 			vm_page_spin_unlock(m);
3833 		} else {
3834 			vm_page_spin_unlock(m);
3835 			pv_lock(pv);
3836 		}
3837 		if (pv->pv_m != m) {
3838 			pv_put(pv);
3839 			vm_page_spin_lock(m);
3840 			continue;
3841 		}
3842 		/*
3843 		 * Holding no spinlocks, pv is locked.
3844 		 */
3845 		pmap_remove_pv_pte(pv, NULL, &info);
3846 		pmap_remove_pv_page(pv);
3847 		pv_free(pv);
3848 		vm_page_spin_lock(m);
3849 	}
3850 	KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
3851 	vm_page_spin_unlock(m);
3852 	pmap_inval_done(&info);
3853 }
3854 
3855 /*
3856  * Set the physical protection on the specified range of this map
3857  * as requested.  This function is typically only used for debug watchpoints
3858  * and COW pages.
3859  *
3860  * This function may not be called from an interrupt if the map is
3861  * not the kernel_pmap.
3862  *
3863  * NOTE!  For shared page table pages we just unmap the page.
3864  */
3865 void
3866 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3867 {
3868 	struct pmap_scan_info info;
3869 	/* JG review for NX */
3870 
3871 	if (pmap == NULL)
3872 		return;
3873 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3874 		pmap_remove(pmap, sva, eva);
3875 		return;
3876 	}
3877 	if (prot & VM_PROT_WRITE)
3878 		return;
3879 	info.pmap = pmap;
3880 	info.sva = sva;
3881 	info.eva = eva;
3882 	info.func = pmap_protect_callback;
3883 	info.arg = &prot;
3884 	info.doinval = 1;
3885 	pmap_scan(&info);
3886 }
3887 
3888 static
3889 void
3890 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info,
3891 		      pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
3892 		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused)
3893 {
3894 	pt_entry_t pbits;
3895 	pt_entry_t cbits;
3896 	pt_entry_t pte;
3897 	vm_page_t m;
3898 
3899 	/*
3900 	 * XXX non-optimal.
3901 	 */
3902 	pmap_inval_interlock(&info->inval, pmap, va);
3903 again:
3904 	pbits = *ptep;
3905 	cbits = pbits;
3906 	if (pte_pv) {
3907 		m = NULL;
3908 		if (pbits & pmap->pmap_bits[PG_A_IDX]) {
3909 			if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) {
3910 				m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3911 				KKASSERT(m == pte_pv->pv_m);
3912 				vm_page_flag_set(m, PG_REFERENCED);
3913 			}
3914 			cbits &= ~pmap->pmap_bits[PG_A_IDX];
3915 		}
3916 		if (pbits & pmap->pmap_bits[PG_M_IDX]) {
3917 			if (pmap_track_modified(pte_pv->pv_pindex)) {
3918 				if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) {
3919 					if (m == NULL) {
3920 						m = PHYS_TO_VM_PAGE(pbits &
3921 								    PG_FRAME);
3922 					}
3923 					vm_page_dirty(m);
3924 				}
3925 				cbits &= ~pmap->pmap_bits[PG_M_IDX];
3926 			}
3927 		}
3928 	} else if (sharept) {
3929 		/*
3930 		 * Unmanaged page table, pt_pv is actually the pd_pv
3931 		 * for our pmap (not the object's shared pmap).
3932 		 *
3933 		 * When asked to protect something in a shared page table
3934 		 * page we just unmap the page table page.  We have to
3935 		 * invalidate the tlb in this situation.
3936 		 *
3937 		 * XXX Warning, shared page tables will not be used for
3938 		 * OBJT_DEVICE or OBJT_MGTDEVICE (PG_FICTITIOUS) mappings
3939 		 * so PHYS_TO_VM_PAGE() should be safe here.
3940 		 */
3941 		pte = pte_load_clear(ptep);
3942 		pmap_inval_invltlb(&info->inval);
3943 		if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME)))
3944 			panic("pmap_protect: pgtable1 pg bad wirecount");
3945 		if (vm_page_unwire_quick(pt_pv->pv_m))
3946 			panic("pmap_protect: pgtable2 pg bad wirecount");
3947 		ptep = NULL;
3948 	}
3949 	/* else unmanaged page, adjust bits, no wire changes */
3950 
3951 	if (ptep) {
3952 		cbits &= ~pmap->pmap_bits[PG_RW_IDX];
3953 #ifdef PMAP_DEBUG2
3954 		if (pmap_enter_debug > 0) {
3955 			--pmap_enter_debug;
3956 			kprintf("pmap_protect va=%lx ptep=%p pte_pv=%p "
3957 				"pt_pv=%p cbits=%08lx\n",
3958 				va, ptep, pte_pv,
3959 				pt_pv, cbits
3960 			);
3961 		}
3962 #endif
3963 		if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) {
3964 			goto again;
3965 		}
3966 	}
3967 	pmap_inval_deinterlock(&info->inval, pmap);
3968 	if (pte_pv)
3969 		pv_put(pte_pv);
3970 }
3971 
3972 /*
3973  * Insert the vm_page (m) at the virtual address (va), replacing any prior
3974  * mapping at that address.  Set protection and wiring as requested.
3975  *
3976  * If entry is non-NULL we check to see if the SEG_SIZE optimization is
3977  * possible.  If it is we enter the page into the appropriate shared pmap
3978  * hanging off the related VM object instead of the passed pmap, then we
3979  * share the page table page from the VM object's pmap into the current pmap.
3980  *
3981  * NOTE: This routine MUST insert the page into the pmap now, it cannot
3982  *	 lazy-evaluate.
3983  */
3984 void
3985 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3986 	   boolean_t wired, vm_map_entry_t entry)
3987 {
3988 	pmap_inval_info info;
3989 	pv_entry_t pt_pv;	/* page table */
3990 	pv_entry_t pte_pv;	/* page table entry */
3991 	pt_entry_t *ptep;
3992 	vm_paddr_t opa;
3993 	pt_entry_t origpte, newpte;
3994 	vm_paddr_t pa;
3995 
3996 	if (pmap == NULL)
3997 		return;
3998 	va = trunc_page(va);
3999 #ifdef PMAP_DIAGNOSTIC
4000 	if (va >= KvaEnd)
4001 		panic("pmap_enter: toobig");
4002 	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
4003 		panic("pmap_enter: invalid to pmap_enter page table "
4004 		      "pages (va: 0x%lx)", va);
4005 #endif
4006 	if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
4007 		kprintf("Warning: pmap_enter called on UVA with "
4008 			"kernel_pmap\n");
4009 #ifdef DDB
4010 		db_print_backtrace();
4011 #endif
4012 	}
4013 	if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
4014 		kprintf("Warning: pmap_enter called on KVA without"
4015 			"kernel_pmap\n");
4016 #ifdef DDB
4017 		db_print_backtrace();
4018 #endif
4019 	}
4020 
4021 	/*
4022 	 * Get locked PV entries for our new page table entry (pte_pv)
4023 	 * and for its parent page table (pt_pv).  We need the parent
4024 	 * so we can resolve the location of the ptep.
4025 	 *
4026 	 * Only hardware MMU actions can modify the ptep out from
4027 	 * under us.
4028 	 *
4029 	 * if (m) is fictitious or unmanaged we do not create a managing
4030 	 * pte_pv for it.  Any pre-existing page's management state must
4031 	 * match (avoiding code complexity).
4032 	 *
4033 	 * If the pmap is still being initialized we assume existing
4034 	 * page tables.
4035 	 *
4036 	 * Kernel mapppings do not track page table pages (i.e. pt_pv).
4037 	 */
4038 	if (pmap_initialized == FALSE) {
4039 		pte_pv = NULL;
4040 		pt_pv = NULL;
4041 		ptep = vtopte(va);
4042 		origpte = *ptep;
4043 	} else if (m->flags & (/*PG_FICTITIOUS |*/ PG_UNMANAGED)) { /* XXX */
4044 		pte_pv = NULL;
4045 		if (va >= VM_MAX_USER_ADDRESS) {
4046 			pt_pv = NULL;
4047 			ptep = vtopte(va);
4048 		} else {
4049 			pt_pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va),
4050 						  NULL, entry, va);
4051 			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
4052 		}
4053 		origpte = *ptep;
4054 		cpu_ccfence();
4055 		KKASSERT(origpte == 0 ||
4056 			 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0);
4057 	} else {
4058 		if (va >= VM_MAX_USER_ADDRESS) {
4059 			/*
4060 			 * Kernel map, pv_entry-tracked.
4061 			 */
4062 			pt_pv = NULL;
4063 			pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL);
4064 			ptep = vtopte(va);
4065 		} else {
4066 			/*
4067 			 * User map
4068 			 */
4069 			pte_pv = pmap_allocpte_seg(pmap, pmap_pte_pindex(va),
4070 						   &pt_pv, entry, va);
4071 			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
4072 		}
4073 		origpte = *ptep;
4074 		cpu_ccfence();
4075 		KKASSERT(origpte == 0 ||
4076 			 (origpte & pmap->pmap_bits[PG_MANAGED_IDX]));
4077 	}
4078 
4079 	pa = VM_PAGE_TO_PHYS(m);
4080 	opa = origpte & PG_FRAME;
4081 
4082 	newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) |
4083 		 pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]);
4084 	if (wired)
4085 		newpte |= pmap->pmap_bits[PG_W_IDX];
4086 	if (va < VM_MAX_USER_ADDRESS)
4087 		newpte |= pmap->pmap_bits[PG_U_IDX];
4088 	if (pte_pv)
4089 		newpte |= pmap->pmap_bits[PG_MANAGED_IDX];
4090 //	if (pmap == &kernel_pmap)
4091 //		newpte |= pgeflag;
4092 	newpte |= pmap->pmap_cache_bits[m->pat_mode];
4093 	if (m->flags & PG_FICTITIOUS)
4094 		newpte |= pmap->pmap_bits[PG_DEVICE_IDX];
4095 
4096 	/*
4097 	 * It is possible for multiple faults to occur in threaded
4098 	 * environments, the existing pte might be correct.
4099 	 */
4100 	if (((origpte ^ newpte) & ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] |
4101 	    pmap->pmap_bits[PG_A_IDX])) == 0)
4102 		goto done;
4103 
4104 	if ((prot & VM_PROT_NOSYNC) == 0)
4105 		pmap_inval_init(&info);
4106 
4107 	/*
4108 	 * Ok, either the address changed or the protection or wiring
4109 	 * changed.
4110 	 *
4111 	 * Clear the current entry, interlocking the removal.  For managed
4112 	 * pte's this will also flush the modified state to the vm_page.
4113 	 * Atomic ops are mandatory in order to ensure that PG_M events are
4114 	 * not lost during any transition.
4115 	 *
4116 	 * WARNING: The caller has busied the new page but not the original
4117 	 *	    vm_page which we are trying to replace.  Because we hold
4118 	 *	    the pte_pv lock, but have not busied the page, PG bits
4119 	 *	    can be cleared out from under us.
4120 	 */
4121 	if (opa) {
4122 		if (pte_pv) {
4123 			/*
4124 			 * pmap_remove_pv_pte() unwires pt_pv and assumes
4125 			 * we will free pte_pv, but since we are reusing
4126 			 * pte_pv we want to retain the wire count.
4127 			 *
4128 			 * pt_pv won't exist for a kernel page (managed or
4129 			 * otherwise).
4130 			 */
4131 			if (pt_pv)
4132 				vm_page_wire_quick(pt_pv->pv_m);
4133 			if (prot & VM_PROT_NOSYNC)
4134 				pmap_remove_pv_pte(pte_pv, pt_pv, NULL);
4135 			else
4136 				pmap_remove_pv_pte(pte_pv, pt_pv, &info);
4137 			if (pte_pv->pv_m)
4138 				pmap_remove_pv_page(pte_pv);
4139 		} else if (prot & VM_PROT_NOSYNC) {
4140 			/*
4141 			 * Unmanaged page, NOSYNC (no mmu sync) requested.
4142 			 *
4143 			 * Leave wire count on PT page intact.
4144 			 */
4145 			(void)pte_load_clear(ptep);
4146 			cpu_invlpg((void *)va);
4147 			atomic_add_long(&pmap->pm_stats.resident_count, -1);
4148 		} else {
4149 			/*
4150 			 * Unmanaged page, normal enter.
4151 			 *
4152 			 * Leave wire count on PT page intact.
4153 			 */
4154 			pmap_inval_interlock(&info, pmap, va);
4155 			(void)pte_load_clear(ptep);
4156 			pmap_inval_deinterlock(&info, pmap);
4157 			atomic_add_long(&pmap->pm_stats.resident_count, -1);
4158 		}
4159 		KKASSERT(*ptep == 0);
4160 	}
4161 
4162 #ifdef PMAP_DEBUG2
4163 	if (pmap_enter_debug > 0) {
4164 		--pmap_enter_debug;
4165 		kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p"
4166 			" pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n",
4167 			va, m,
4168 			origpte, newpte, ptep,
4169 			pte_pv, pt_pv, opa, prot);
4170 	}
4171 #endif
4172 
4173 	if (pte_pv) {
4174 		/*
4175 		 * Enter on the PV list if part of our managed memory.
4176 		 * Wiring of the PT page is already handled.
4177 		 */
4178 		KKASSERT(pte_pv->pv_m == NULL);
4179 		vm_page_spin_lock(m);
4180 		pte_pv->pv_m = m;
4181 		pmap_page_stats_adding(m);
4182 		TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list);
4183 		vm_page_flag_set(m, PG_MAPPED);
4184 		vm_page_spin_unlock(m);
4185 	} else if (pt_pv && opa == 0) {
4186 		/*
4187 		 * We have to adjust the wire count on the PT page ourselves
4188 		 * for unmanaged entries.  If opa was non-zero we retained
4189 		 * the existing wire count from the removal.
4190 		 */
4191 		vm_page_wire_quick(pt_pv->pv_m);
4192 	}
4193 
4194 	/*
4195 	 * Kernel VMAs (pt_pv == NULL) require pmap invalidation interlocks.
4196 	 *
4197 	 * User VMAs do not because those will be zero->non-zero, so no
4198 	 * stale entries to worry about at this point.
4199 	 *
4200 	 * For KVM there appear to still be issues.  Theoretically we
4201 	 * should be able to scrap the interlocks entirely but we
4202 	 * get crashes.
4203 	 */
4204 	if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL)
4205 		pmap_inval_interlock(&info, pmap, va);
4206 
4207 	/*
4208 	 * Set the pte
4209 	 */
4210 	*(volatile pt_entry_t *)ptep = newpte;
4211 
4212 	if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL)
4213 		pmap_inval_deinterlock(&info, pmap);
4214 	else if (pt_pv == NULL)
4215 		cpu_invlpg((void *)va);
4216 
4217 	if (wired) {
4218 		if (pte_pv) {
4219 			atomic_add_long(&pte_pv->pv_pmap->pm_stats.wired_count,
4220 					1);
4221 		} else {
4222 			atomic_add_long(&pmap->pm_stats.wired_count, 1);
4223 		}
4224 	}
4225 	if (newpte & pmap->pmap_bits[PG_RW_IDX])
4226 		vm_page_flag_set(m, PG_WRITEABLE);
4227 
4228 	/*
4229 	 * Unmanaged pages need manual resident_count tracking.
4230 	 */
4231 	if (pte_pv == NULL && pt_pv)
4232 		atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1);
4233 
4234 	/*
4235 	 * Cleanup
4236 	 */
4237 	if ((prot & VM_PROT_NOSYNC) == 0 || pte_pv == NULL)
4238 		pmap_inval_done(&info);
4239 done:
4240 	KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 ||
4241 		 (m->flags & PG_MAPPED));
4242 
4243 	/*
4244 	 * Cleanup the pv entry, allowing other accessors.
4245 	 */
4246 	if (pte_pv)
4247 		pv_put(pte_pv);
4248 	if (pt_pv)
4249 		pv_put(pt_pv);
4250 }
4251 
4252 /*
4253  * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
4254  * This code also assumes that the pmap has no pre-existing entry for this
4255  * VA.
4256  *
4257  * This code currently may only be used on user pmaps, not kernel_pmap.
4258  */
4259 void
4260 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
4261 {
4262 	pmap_enter(pmap, va, m, VM_PROT_READ, FALSE, NULL);
4263 }
4264 
4265 /*
4266  * Make a temporary mapping for a physical address.  This is only intended
4267  * to be used for panic dumps.
4268  *
4269  * The caller is responsible for calling smp_invltlb().
4270  */
4271 void *
4272 pmap_kenter_temporary(vm_paddr_t pa, long i)
4273 {
4274 	pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
4275 	return ((void *)crashdumpmap);
4276 }
4277 
4278 #define MAX_INIT_PT (96)
4279 
4280 /*
4281  * This routine preloads the ptes for a given object into the specified pmap.
4282  * This eliminates the blast of soft faults on process startup and
4283  * immediately after an mmap.
4284  */
4285 static int pmap_object_init_pt_callback(vm_page_t p, void *data);
4286 
4287 void
4288 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
4289 		    vm_object_t object, vm_pindex_t pindex,
4290 		    vm_size_t size, int limit)
4291 {
4292 	struct rb_vm_page_scan_info info;
4293 	struct lwp *lp;
4294 	vm_size_t psize;
4295 
4296 	/*
4297 	 * We can't preinit if read access isn't set or there is no pmap
4298 	 * or object.
4299 	 */
4300 	if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
4301 		return;
4302 
4303 	/*
4304 	 * We can't preinit if the pmap is not the current pmap
4305 	 */
4306 	lp = curthread->td_lwp;
4307 	if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
4308 		return;
4309 
4310 	/*
4311 	 * Misc additional checks
4312 	 */
4313 	psize = x86_64_btop(size);
4314 
4315 	if ((object->type != OBJT_VNODE) ||
4316 		((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
4317 			(object->resident_page_count > MAX_INIT_PT))) {
4318 		return;
4319 	}
4320 
4321 	if (pindex + psize > object->size) {
4322 		if (object->size < pindex)
4323 			return;
4324 		psize = object->size - pindex;
4325 	}
4326 
4327 	if (psize == 0)
4328 		return;
4329 
4330 	/*
4331 	 * If everything is segment-aligned do not pre-init here.  Instead
4332 	 * allow the normal vm_fault path to pass a segment hint to
4333 	 * pmap_enter() which will then use an object-referenced shared
4334 	 * page table page.
4335 	 */
4336 	if ((addr & SEG_MASK) == 0 &&
4337 	    (ctob(psize) & SEG_MASK) == 0 &&
4338 	    (ctob(pindex) & SEG_MASK) == 0) {
4339 		return;
4340 	}
4341 
4342 	/*
4343 	 * Use a red-black scan to traverse the requested range and load
4344 	 * any valid pages found into the pmap.
4345 	 *
4346 	 * We cannot safely scan the object's memq without holding the
4347 	 * object token.
4348 	 */
4349 	info.start_pindex = pindex;
4350 	info.end_pindex = pindex + psize - 1;
4351 	info.limit = limit;
4352 	info.mpte = NULL;
4353 	info.addr = addr;
4354 	info.pmap = pmap;
4355 
4356 	vm_object_hold_shared(object);
4357 	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
4358 				pmap_object_init_pt_callback, &info);
4359 	vm_object_drop(object);
4360 }
4361 
4362 static
4363 int
4364 pmap_object_init_pt_callback(vm_page_t p, void *data)
4365 {
4366 	struct rb_vm_page_scan_info *info = data;
4367 	vm_pindex_t rel_index;
4368 
4369 	/*
4370 	 * don't allow an madvise to blow away our really
4371 	 * free pages allocating pv entries.
4372 	 */
4373 	if ((info->limit & MAP_PREFAULT_MADVISE) &&
4374 		vmstats.v_free_count < vmstats.v_free_reserved) {
4375 		    return(-1);
4376 	}
4377 
4378 	/*
4379 	 * Ignore list markers and ignore pages we cannot instantly
4380 	 * busy (while holding the object token).
4381 	 */
4382 	if (p->flags & PG_MARKER)
4383 		return 0;
4384 	if (vm_page_busy_try(p, TRUE))
4385 		return 0;
4386 	if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
4387 	    (p->flags & PG_FICTITIOUS) == 0) {
4388 		if ((p->queue - p->pc) == PQ_CACHE)
4389 			vm_page_deactivate(p);
4390 		rel_index = p->pindex - info->start_pindex;
4391 		pmap_enter_quick(info->pmap,
4392 				 info->addr + x86_64_ptob(rel_index), p);
4393 	}
4394 	vm_page_wakeup(p);
4395 	lwkt_yield();
4396 	return(0);
4397 }
4398 
4399 /*
4400  * Return TRUE if the pmap is in shape to trivially pre-fault the specified
4401  * address.
4402  *
4403  * Returns FALSE if it would be non-trivial or if a pte is already loaded
4404  * into the slot.
4405  *
4406  * XXX This is safe only because page table pages are not freed.
4407  */
4408 int
4409 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
4410 {
4411 	pt_entry_t *pte;
4412 
4413 	/*spin_lock(&pmap->pm_spin);*/
4414 	if ((pte = pmap_pte(pmap, addr)) != NULL) {
4415 		if (*pte & pmap->pmap_bits[PG_V_IDX]) {
4416 			/*spin_unlock(&pmap->pm_spin);*/
4417 			return FALSE;
4418 		}
4419 	}
4420 	/*spin_unlock(&pmap->pm_spin);*/
4421 	return TRUE;
4422 }
4423 
4424 /*
4425  * Change the wiring attribute for a pmap/va pair.  The mapping must already
4426  * exist in the pmap.  The mapping may or may not be managed.
4427  */
4428 void
4429 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired,
4430 		   vm_map_entry_t entry)
4431 {
4432 	pt_entry_t *ptep;
4433 	pv_entry_t pv;
4434 
4435 	if (pmap == NULL)
4436 		return;
4437 	lwkt_gettoken(&pmap->pm_token);
4438 	pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), NULL, entry, va);
4439 	ptep = pv_pte_lookup(pv, pmap_pte_index(va));
4440 
4441 	if (wired && !pmap_pte_w(pmap, ptep))
4442 		atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, 1);
4443 	else if (!wired && pmap_pte_w(pmap, ptep))
4444 		atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, -1);
4445 
4446 	/*
4447 	 * Wiring is not a hardware characteristic so there is no need to
4448 	 * invalidate TLB.  However, in an SMP environment we must use
4449 	 * a locked bus cycle to update the pte (if we are not using
4450 	 * the pmap_inval_*() API that is)... it's ok to do this for simple
4451 	 * wiring changes.
4452 	 */
4453 	if (wired)
4454 		atomic_set_long(ptep, pmap->pmap_bits[PG_W_IDX]);
4455 	else
4456 		atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]);
4457 	pv_put(pv);
4458 	lwkt_reltoken(&pmap->pm_token);
4459 }
4460 
4461 
4462 
4463 /*
4464  * Copy the range specified by src_addr/len from the source map to
4465  * the range dst_addr/len in the destination map.
4466  *
4467  * This routine is only advisory and need not do anything.
4468  */
4469 void
4470 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
4471 	  vm_size_t len, vm_offset_t src_addr)
4472 {
4473 }
4474 
4475 /*
4476  * pmap_zero_page:
4477  *
4478  *	Zero the specified physical page.
4479  *
4480  *	This function may be called from an interrupt and no locking is
4481  *	required.
4482  */
4483 void
4484 pmap_zero_page(vm_paddr_t phys)
4485 {
4486 	vm_offset_t va = PHYS_TO_DMAP(phys);
4487 
4488 	pagezero((void *)va);
4489 }
4490 
4491 /*
4492  * pmap_page_assertzero:
4493  *
4494  *	Assert that a page is empty, panic if it isn't.
4495  */
4496 void
4497 pmap_page_assertzero(vm_paddr_t phys)
4498 {
4499 	vm_offset_t va = PHYS_TO_DMAP(phys);
4500 	size_t i;
4501 
4502 	for (i = 0; i < PAGE_SIZE; i += sizeof(long)) {
4503 		if (*(long *)((char *)va + i) != 0) {
4504 			panic("pmap_page_assertzero() @ %p not zero!",
4505 			      (void *)(intptr_t)va);
4506 		}
4507 	}
4508 }
4509 
4510 /*
4511  * pmap_zero_page:
4512  *
4513  *	Zero part of a physical page by mapping it into memory and clearing
4514  *	its contents with bzero.
4515  *
4516  *	off and size may not cover an area beyond a single hardware page.
4517  */
4518 void
4519 pmap_zero_page_area(vm_paddr_t phys, int off, int size)
4520 {
4521 	vm_offset_t virt = PHYS_TO_DMAP(phys);
4522 
4523 	bzero((char *)virt + off, size);
4524 }
4525 
4526 /*
4527  * pmap_copy_page:
4528  *
4529  *	Copy the physical page from the source PA to the target PA.
4530  *	This function may be called from an interrupt.  No locking
4531  *	is required.
4532  */
4533 void
4534 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
4535 {
4536 	vm_offset_t src_virt, dst_virt;
4537 
4538 	src_virt = PHYS_TO_DMAP(src);
4539 	dst_virt = PHYS_TO_DMAP(dst);
4540 	bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE);
4541 }
4542 
4543 /*
4544  * pmap_copy_page_frag:
4545  *
4546  *	Copy the physical page from the source PA to the target PA.
4547  *	This function may be called from an interrupt.  No locking
4548  *	is required.
4549  */
4550 void
4551 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
4552 {
4553 	vm_offset_t src_virt, dst_virt;
4554 
4555 	src_virt = PHYS_TO_DMAP(src);
4556 	dst_virt = PHYS_TO_DMAP(dst);
4557 
4558 	bcopy((char *)src_virt + (src & PAGE_MASK),
4559 	      (char *)dst_virt + (dst & PAGE_MASK),
4560 	      bytes);
4561 }
4562 
4563 /*
4564  * Returns true if the pmap's pv is one of the first 16 pvs linked to from
4565  * this page.  This count may be changed upwards or downwards in the future;
4566  * it is only necessary that true be returned for a small subset of pmaps
4567  * for proper page aging.
4568  */
4569 boolean_t
4570 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4571 {
4572 	pv_entry_t pv;
4573 	int loops = 0;
4574 
4575 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
4576 		return FALSE;
4577 
4578 	vm_page_spin_lock(m);
4579 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4580 		if (pv->pv_pmap == pmap) {
4581 			vm_page_spin_unlock(m);
4582 			return TRUE;
4583 		}
4584 		loops++;
4585 		if (loops >= 16)
4586 			break;
4587 	}
4588 	vm_page_spin_unlock(m);
4589 	return (FALSE);
4590 }
4591 
4592 /*
4593  * Remove all pages from specified address space this aids process exit
4594  * speeds.  Also, this code may be special cased for the current process
4595  * only.
4596  */
4597 void
4598 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4599 {
4600 	pmap_remove_noinval(pmap, sva, eva);
4601 	cpu_invltlb();
4602 }
4603 
4604 /*
4605  * pmap_testbit tests bits in pte's note that the testbit/clearbit
4606  * routines are inline, and a lot of things compile-time evaluate.
4607  */
4608 static
4609 boolean_t
4610 pmap_testbit(vm_page_t m, int bit)
4611 {
4612 	pv_entry_t pv;
4613 	pt_entry_t *pte;
4614 	pmap_t pmap;
4615 
4616 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
4617 		return FALSE;
4618 
4619 	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
4620 		return FALSE;
4621 	vm_page_spin_lock(m);
4622 	if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
4623 		vm_page_spin_unlock(m);
4624 		return FALSE;
4625 	}
4626 
4627 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4628 
4629 #if defined(PMAP_DIAGNOSTIC)
4630 		if (pv->pv_pmap == NULL) {
4631 			kprintf("Null pmap (tb) at pindex: %"PRIu64"\n",
4632 			    pv->pv_pindex);
4633 			continue;
4634 		}
4635 #endif
4636 		pmap = pv->pv_pmap;
4637 
4638 		/*
4639 		 * If the bit being tested is the modified bit, then
4640 		 * mark clean_map and ptes as never
4641 		 * modified.
4642 		 *
4643 		 * WARNING!  Because we do not lock the pv, *pte can be in a
4644 		 *	     state of flux.  Despite this the value of *pte
4645 		 *	     will still be related to the vm_page in some way
4646 		 *	     because the pv cannot be destroyed as long as we
4647 		 *	     hold the vm_page spin lock.
4648 		 */
4649 		if (bit == PG_A_IDX || bit == PG_M_IDX) {
4650 				//& (pmap->pmap_bits[PG_A_IDX] | pmap->pmap_bits[PG_M_IDX])) {
4651 			if (!pmap_track_modified(pv->pv_pindex))
4652 				continue;
4653 		}
4654 
4655 		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
4656 		if (*pte & pmap->pmap_bits[bit]) {
4657 			vm_page_spin_unlock(m);
4658 			return TRUE;
4659 		}
4660 	}
4661 	vm_page_spin_unlock(m);
4662 	return (FALSE);
4663 }
4664 
4665 /*
4666  * This routine is used to modify bits in ptes.  Only one bit should be
4667  * specified.  PG_RW requires special handling.
4668  *
4669  * Caller must NOT hold any spin locks
4670  */
4671 static __inline
4672 void
4673 pmap_clearbit(vm_page_t m, int bit_index)
4674 {
4675 	struct pmap_inval_info info;
4676 	pv_entry_t pv;
4677 	pt_entry_t *pte;
4678 	pt_entry_t pbits;
4679 	pmap_t pmap;
4680 
4681 	if (bit_index == PG_RW_IDX)
4682 		vm_page_flag_clear(m, PG_WRITEABLE);
4683 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
4684 		return;
4685 	}
4686 
4687 	/*
4688 	 * PG_M or PG_A case
4689 	 *
4690 	 * Loop over all current mappings setting/clearing as appropos If
4691 	 * setting RO do we need to clear the VAC?
4692 	 *
4693 	 * NOTE: When clearing PG_M we could also (not implemented) drop
4694 	 *	 through to the PG_RW code and clear PG_RW too, forcing
4695 	 *	 a fault on write to redetect PG_M for virtual kernels, but
4696 	 *	 it isn't necessary since virtual kernels invalidate the
4697 	 *	 pte when they clear the VPTE_M bit in their virtual page
4698 	 *	 tables.
4699 	 *
4700 	 * NOTE: Does not re-dirty the page when clearing only PG_M.
4701 	 *
4702 	 * NOTE: Because we do not lock the pv, *pte can be in a state of
4703 	 *	 flux.  Despite this the value of *pte is still somewhat
4704 	 *	 related while we hold the vm_page spin lock.
4705 	 *
4706 	 *	 *pte can be zero due to this race.  Since we are clearing
4707 	 *	 bits we basically do no harm when this race  ccurs.
4708 	 */
4709 	if (bit_index != PG_RW_IDX) {
4710 		vm_page_spin_lock(m);
4711 		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4712 #if defined(PMAP_DIAGNOSTIC)
4713 			if (pv->pv_pmap == NULL) {
4714 				kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
4715 				    pv->pv_pindex);
4716 				continue;
4717 			}
4718 #endif
4719 			pmap = pv->pv_pmap;
4720 			pte = pmap_pte_quick(pv->pv_pmap,
4721 					     pv->pv_pindex << PAGE_SHIFT);
4722 			pbits = *pte;
4723 			if (pbits & pmap->pmap_bits[bit_index])
4724 				atomic_clear_long(pte, pmap->pmap_bits[bit_index]);
4725 		}
4726 		vm_page_spin_unlock(m);
4727 		return;
4728 	}
4729 
4730 	/*
4731 	 * Clear PG_RW.  Also clears PG_M and marks the page dirty if PG_M
4732 	 * was set.
4733 	 */
4734 	pmap_inval_init(&info);
4735 
4736 restart:
4737 	vm_page_spin_lock(m);
4738 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4739 		/*
4740 		 * don't write protect pager mappings
4741 		 */
4742 		if (!pmap_track_modified(pv->pv_pindex))
4743 			continue;
4744 
4745 #if defined(PMAP_DIAGNOSTIC)
4746 		if (pv->pv_pmap == NULL) {
4747 			kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
4748 			    pv->pv_pindex);
4749 			continue;
4750 		}
4751 #endif
4752 		pmap = pv->pv_pmap;
4753 		/*
4754 		 * Skip pages which do not have PG_RW set.
4755 		 */
4756 		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
4757 		if ((*pte & pmap->pmap_bits[PG_RW_IDX]) == 0)
4758 			continue;
4759 
4760 		/*
4761 		 * Lock the PV
4762 		 */
4763 		if (pv_hold_try(pv)) {
4764 			vm_page_spin_unlock(m);
4765 		} else {
4766 			vm_page_spin_unlock(m);
4767 			pv_lock(pv);	/* held, now do a blocking lock */
4768 		}
4769 		if (pv->pv_pmap != pmap || pv->pv_m != m) {
4770 			pv_put(pv);	/* and release */
4771 			goto restart;	/* anything could have happened */
4772 		}
4773 		pmap_inval_interlock(&info, pmap,
4774 				     (vm_offset_t)pv->pv_pindex << PAGE_SHIFT);
4775 		KKASSERT(pv->pv_pmap == pmap);
4776 		for (;;) {
4777 			pbits = *pte;
4778 			cpu_ccfence();
4779 			if (atomic_cmpset_long(pte, pbits, pbits &
4780 			    ~(pmap->pmap_bits[PG_RW_IDX] |
4781 			    pmap->pmap_bits[PG_M_IDX]))) {
4782 				break;
4783 			}
4784 		}
4785 		pmap_inval_deinterlock(&info, pmap);
4786 		vm_page_spin_lock(m);
4787 
4788 		/*
4789 		 * If PG_M was found to be set while we were clearing PG_RW
4790 		 * we also clear PG_M (done above) and mark the page dirty.
4791 		 * Callers expect this behavior.
4792 		 */
4793 		if (pbits & pmap->pmap_bits[PG_M_IDX])
4794 			vm_page_dirty(m);
4795 		pv_put(pv);
4796 	}
4797 	vm_page_spin_unlock(m);
4798 	pmap_inval_done(&info);
4799 }
4800 
4801 /*
4802  * Lower the permission for all mappings to a given page.
4803  *
4804  * Page must be busied by caller.  Because page is busied by caller this
4805  * should not be able to race a pmap_enter().
4806  */
4807 void
4808 pmap_page_protect(vm_page_t m, vm_prot_t prot)
4809 {
4810 	/* JG NX support? */
4811 	if ((prot & VM_PROT_WRITE) == 0) {
4812 		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
4813 			/*
4814 			 * NOTE: pmap_clearbit(.. PG_RW) also clears
4815 			 *	 the PG_WRITEABLE flag in (m).
4816 			 */
4817 			pmap_clearbit(m, PG_RW_IDX);
4818 		} else {
4819 			pmap_remove_all(m);
4820 		}
4821 	}
4822 }
4823 
4824 vm_paddr_t
4825 pmap_phys_address(vm_pindex_t ppn)
4826 {
4827 	return (x86_64_ptob(ppn));
4828 }
4829 
4830 /*
4831  * Return a count of reference bits for a page, clearing those bits.
4832  * It is not necessary for every reference bit to be cleared, but it
4833  * is necessary that 0 only be returned when there are truly no
4834  * reference bits set.
4835  *
4836  * XXX: The exact number of bits to check and clear is a matter that
4837  * should be tested and standardized at some point in the future for
4838  * optimal aging of shared pages.
4839  *
4840  * This routine may not block.
4841  */
4842 int
4843 pmap_ts_referenced(vm_page_t m)
4844 {
4845 	pv_entry_t pv;
4846 	pt_entry_t *pte;
4847 	pmap_t pmap;
4848 	int rtval = 0;
4849 
4850 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
4851 		return (rtval);
4852 
4853 	vm_page_spin_lock(m);
4854 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4855 		if (!pmap_track_modified(pv->pv_pindex))
4856 			continue;
4857 		pmap = pv->pv_pmap;
4858 		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
4859 		if (pte && (*pte & pmap->pmap_bits[PG_A_IDX])) {
4860 			atomic_clear_long(pte, pmap->pmap_bits[PG_A_IDX]);
4861 			rtval++;
4862 			if (rtval > 4)
4863 				break;
4864 		}
4865 	}
4866 	vm_page_spin_unlock(m);
4867 	return (rtval);
4868 }
4869 
4870 /*
4871  *	pmap_is_modified:
4872  *
4873  *	Return whether or not the specified physical page was modified
4874  *	in any physical maps.
4875  */
4876 boolean_t
4877 pmap_is_modified(vm_page_t m)
4878 {
4879 	boolean_t res;
4880 
4881 	res = pmap_testbit(m, PG_M_IDX);
4882 	return (res);
4883 }
4884 
4885 /*
4886  *	Clear the modify bits on the specified physical page.
4887  */
4888 void
4889 pmap_clear_modify(vm_page_t m)
4890 {
4891 	pmap_clearbit(m, PG_M_IDX);
4892 }
4893 
4894 /*
4895  *	pmap_clear_reference:
4896  *
4897  *	Clear the reference bit on the specified physical page.
4898  */
4899 void
4900 pmap_clear_reference(vm_page_t m)
4901 {
4902 	pmap_clearbit(m, PG_A_IDX);
4903 }
4904 
4905 /*
4906  * Miscellaneous support routines follow
4907  */
4908 
4909 static
4910 void
4911 i386_protection_init(void)
4912 {
4913 	int *kp, prot;
4914 
4915 	/* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit  */
4916 	kp = protection_codes;
4917 	for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) {
4918 		switch (prot) {
4919 		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
4920 			/*
4921 			 * Read access is also 0. There isn't any execute bit,
4922 			 * so just make it readable.
4923 			 */
4924 		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
4925 		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
4926 		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
4927 			*kp++ = 0;
4928 			break;
4929 		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
4930 		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
4931 		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
4932 		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
4933 			*kp++ = pmap_bits_default[PG_RW_IDX];
4934 			break;
4935 		}
4936 	}
4937 }
4938 
4939 /*
4940  * Map a set of physical memory pages into the kernel virtual
4941  * address space. Return a pointer to where it is mapped. This
4942  * routine is intended to be used for mapping device memory,
4943  * NOT real memory.
4944  *
4945  * NOTE: We can't use pgeflag unless we invalidate the pages one at
4946  *	 a time.
4947  *
4948  * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE}
4949  *	 work whether the cpu supports PAT or not.  The remaining PAT
4950  *	 attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu
4951  *	 supports PAT.
4952  */
4953 void *
4954 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4955 {
4956 	return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4957 }
4958 
4959 void *
4960 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size)
4961 {
4962 	return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4963 }
4964 
4965 void *
4966 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4967 {
4968 	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4969 }
4970 
4971 /*
4972  * Map a set of physical memory pages into the kernel virtual
4973  * address space. Return a pointer to where it is mapped. This
4974  * routine is intended to be used for mapping device memory,
4975  * NOT real memory.
4976  */
4977 void *
4978 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4979 {
4980 	vm_offset_t va, tmpva, offset;
4981 	pt_entry_t *pte;
4982 	vm_size_t tmpsize;
4983 
4984 	offset = pa & PAGE_MASK;
4985 	size = roundup(offset + size, PAGE_SIZE);
4986 
4987 	va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE);
4988 	if (va == 0)
4989 		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4990 
4991 	pa = pa & ~PAGE_MASK;
4992 	for (tmpva = va, tmpsize = size; tmpsize > 0;) {
4993 		pte = vtopte(tmpva);
4994 		*pte = pa |
4995 		    kernel_pmap.pmap_bits[PG_RW_IDX] |
4996 		    kernel_pmap.pmap_bits[PG_V_IDX] | /* pgeflag | */
4997 		    kernel_pmap.pmap_cache_bits[mode];
4998 		tmpsize -= PAGE_SIZE;
4999 		tmpva += PAGE_SIZE;
5000 		pa += PAGE_SIZE;
5001 	}
5002 	pmap_invalidate_range(&kernel_pmap, va, va + size);
5003 	pmap_invalidate_cache_range(va, va + size);
5004 
5005 	return ((void *)(va + offset));
5006 }
5007 
5008 void
5009 pmap_unmapdev(vm_offset_t va, vm_size_t size)
5010 {
5011 	vm_offset_t base, offset;
5012 
5013 	base = va & ~PAGE_MASK;
5014 	offset = va & PAGE_MASK;
5015 	size = roundup(offset + size, PAGE_SIZE);
5016 	pmap_qremove(va, size >> PAGE_SHIFT);
5017 	kmem_free(&kernel_map, base, size);
5018 }
5019 
5020 /*
5021  * Sets the memory attribute for the specified page.
5022  */
5023 void
5024 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5025 {
5026 
5027     m->pat_mode = ma;
5028 
5029     /*
5030      * If "m" is a normal page, update its direct mapping.  This update
5031      * can be relied upon to perform any cache operations that are
5032      * required for data coherence.
5033      */
5034     if ((m->flags & PG_FICTITIOUS) == 0)
5035         pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
5036         m->pat_mode);
5037 }
5038 
5039 /*
5040  * Change the PAT attribute on an existing kernel memory map.  Caller
5041  * must ensure that the virtual memory in question is not accessed
5042  * during the adjustment.
5043  */
5044 void
5045 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode)
5046 {
5047 	pt_entry_t *pte;
5048 	vm_offset_t base;
5049 	int changed = 0;
5050 
5051 	if (va == 0)
5052 		panic("pmap_change_attr: va is NULL");
5053 	base = trunc_page(va);
5054 
5055 	while (count) {
5056 		pte = vtopte(va);
5057 		*pte = (*pte & ~(pt_entry_t)(kernel_pmap.pmap_cache_mask)) |
5058 		       kernel_pmap.pmap_cache_bits[mode];
5059 		--count;
5060 		va += PAGE_SIZE;
5061 	}
5062 
5063 	changed = 1;	/* XXX: not optimal */
5064 
5065 	/*
5066 	 * Flush CPU caches if required to make sure any data isn't cached that
5067 	 * shouldn't be, etc.
5068 	 */
5069 	if (changed) {
5070 		pmap_invalidate_range(&kernel_pmap, base, va);
5071 		pmap_invalidate_cache_range(base, va);
5072 	}
5073 }
5074 
5075 /*
5076  * perform the pmap work for mincore
5077  */
5078 int
5079 pmap_mincore(pmap_t pmap, vm_offset_t addr)
5080 {
5081 	pt_entry_t *ptep, pte;
5082 	vm_page_t m;
5083 	int val = 0;
5084 
5085 	lwkt_gettoken(&pmap->pm_token);
5086 	ptep = pmap_pte(pmap, addr);
5087 
5088 	if (ptep && (pte = *ptep) != 0) {
5089 		vm_offset_t pa;
5090 
5091 		val = MINCORE_INCORE;
5092 		if ((pte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0)
5093 			goto done;
5094 
5095 		pa = pte & PG_FRAME;
5096 
5097 		if (pte & pmap->pmap_bits[PG_DEVICE_IDX])
5098 			m = NULL;
5099 		else
5100 			m = PHYS_TO_VM_PAGE(pa);
5101 
5102 		/*
5103 		 * Modified by us
5104 		 */
5105 		if (pte & pmap->pmap_bits[PG_M_IDX])
5106 			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
5107 		/*
5108 		 * Modified by someone
5109 		 */
5110 		else if (m && (m->dirty || pmap_is_modified(m)))
5111 			val |= MINCORE_MODIFIED_OTHER;
5112 		/*
5113 		 * Referenced by us
5114 		 */
5115 		if (pte & pmap->pmap_bits[PG_A_IDX])
5116 			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
5117 
5118 		/*
5119 		 * Referenced by someone
5120 		 */
5121 		else if (m && ((m->flags & PG_REFERENCED) ||
5122 				pmap_ts_referenced(m))) {
5123 			val |= MINCORE_REFERENCED_OTHER;
5124 			vm_page_flag_set(m, PG_REFERENCED);
5125 		}
5126 	}
5127 done:
5128 	lwkt_reltoken(&pmap->pm_token);
5129 
5130 	return val;
5131 }
5132 
5133 /*
5134  * Replace p->p_vmspace with a new one.  If adjrefs is non-zero the new
5135  * vmspace will be ref'd and the old one will be deref'd.
5136  *
5137  * The vmspace for all lwps associated with the process will be adjusted
5138  * and cr3 will be reloaded if any lwp is the current lwp.
5139  *
5140  * The process must hold the vmspace->vm_map.token for oldvm and newvm
5141  */
5142 void
5143 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
5144 {
5145 	struct vmspace *oldvm;
5146 	struct lwp *lp;
5147 
5148 	oldvm = p->p_vmspace;
5149 	if (oldvm != newvm) {
5150 		if (adjrefs)
5151 			sysref_get(&newvm->vm_sysref);
5152 		p->p_vmspace = newvm;
5153 		KKASSERT(p->p_nthreads == 1);
5154 		lp = RB_ROOT(&p->p_lwp_tree);
5155 		pmap_setlwpvm(lp, newvm);
5156 		if (adjrefs)
5157 			sysref_put(&oldvm->vm_sysref);
5158 	}
5159 }
5160 
5161 /*
5162  * Set the vmspace for a LWP.  The vmspace is almost universally set the
5163  * same as the process vmspace, but virtual kernels need to swap out contexts
5164  * on a per-lwp basis.
5165  *
5166  * Caller does not necessarily hold any vmspace tokens.  Caller must control
5167  * the lwp (typically be in the context of the lwp).  We use a critical
5168  * section to protect against statclock and hardclock (statistics collection).
5169  */
5170 void
5171 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
5172 {
5173 	struct vmspace *oldvm;
5174 	struct pmap *pmap;
5175 
5176 	oldvm = lp->lwp_vmspace;
5177 
5178 	if (oldvm != newvm) {
5179 		crit_enter();
5180 		lp->lwp_vmspace = newvm;
5181 		if (curthread->td_lwp == lp) {
5182 			pmap = vmspace_pmap(newvm);
5183 			atomic_set_cpumask(&pmap->pm_active, mycpu->gd_cpumask);
5184 			if (pmap->pm_active & CPUMASK_LOCK)
5185 				pmap_interlock_wait(newvm);
5186 #if defined(SWTCH_OPTIM_STATS)
5187 			tlb_flush_count++;
5188 #endif
5189 			if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) {
5190 				curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
5191 			} else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) {
5192 				curthread->td_pcb->pcb_cr3 = KPML4phys;
5193 			} else {
5194 				panic("pmap_setlwpvm: unknown pmap type\n");
5195 			}
5196 			load_cr3(curthread->td_pcb->pcb_cr3);
5197 			pmap = vmspace_pmap(oldvm);
5198 			atomic_clear_cpumask(&pmap->pm_active, mycpu->gd_cpumask);
5199 		}
5200 		crit_exit();
5201 	}
5202 }
5203 
5204 /*
5205  * Called when switching to a locked pmap, used to interlock against pmaps
5206  * undergoing modifications to prevent us from activating the MMU for the
5207  * target pmap until all such modifications have completed.  We have to do
5208  * this because the thread making the modifications has already set up its
5209  * SMP synchronization mask.
5210  *
5211  * This function cannot sleep!
5212  *
5213  * No requirements.
5214  */
5215 void
5216 pmap_interlock_wait(struct vmspace *vm)
5217 {
5218 	struct pmap *pmap = &vm->vm_pmap;
5219 
5220 	if (pmap->pm_active & CPUMASK_LOCK) {
5221 		crit_enter();
5222 		KKASSERT(curthread->td_critcount >= 2);
5223 		DEBUG_PUSH_INFO("pmap_interlock_wait");
5224 		while (pmap->pm_active & CPUMASK_LOCK) {
5225 			cpu_ccfence();
5226 			lwkt_process_ipiq();
5227 		}
5228 		DEBUG_POP_INFO();
5229 		crit_exit();
5230 	}
5231 }
5232 
5233 vm_offset_t
5234 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
5235 {
5236 
5237 	if ((obj == NULL) || (size < NBPDR) ||
5238 	    ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) {
5239 		return addr;
5240 	}
5241 
5242 	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
5243 	return addr;
5244 }
5245 
5246 /*
5247  * Used by kmalloc/kfree, page already exists at va
5248  */
5249 vm_page_t
5250 pmap_kvtom(vm_offset_t va)
5251 {
5252 	pt_entry_t *ptep = vtopte(va);
5253 
5254 	KKASSERT((*ptep & kernel_pmap.pmap_bits[PG_DEVICE_IDX]) == 0);
5255 	return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
5256 }
5257 
5258 /*
5259  * Initialize machine-specific shared page directory support.  This
5260  * is executed when a VM object is created.
5261  */
5262 void
5263 pmap_object_init(vm_object_t object)
5264 {
5265 	object->md.pmap_rw = NULL;
5266 	object->md.pmap_ro = NULL;
5267 }
5268 
5269 /*
5270  * Clean up machine-specific shared page directory support.  This
5271  * is executed when a VM object is destroyed.
5272  */
5273 void
5274 pmap_object_free(vm_object_t object)
5275 {
5276 	pmap_t pmap;
5277 
5278 	if ((pmap = object->md.pmap_rw) != NULL) {
5279 		object->md.pmap_rw = NULL;
5280 		pmap_remove_noinval(pmap,
5281 				  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
5282 		pmap->pm_active = 0;
5283 		pmap_release(pmap);
5284 		pmap_puninit(pmap);
5285 		kfree(pmap, M_OBJPMAP);
5286 	}
5287 	if ((pmap = object->md.pmap_ro) != NULL) {
5288 		object->md.pmap_ro = NULL;
5289 		pmap_remove_noinval(pmap,
5290 				  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
5291 		pmap->pm_active = 0;
5292 		pmap_release(pmap);
5293 		pmap_puninit(pmap);
5294 		kfree(pmap, M_OBJPMAP);
5295 	}
5296 }
5297