1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * Copyright (c) 1994 John S. Dyson
4  * Copyright (c) 1994 David Greenman
5  * Copyright (c) 2003 Peter Wemm
6  * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
7  * Copyright (c) 2008, 2009 The DragonFly Project.
8  * Copyright (c) 2008, 2009 Jordan Gordeev.
9  * All rights reserved.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * the Systems Programming Group of the University of Utah Computer
13  * Science Department and William Jolitz of UUNET Technologies Inc.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 3. All advertising materials mentioning features or use of this software
24  *    must display the following acknowledgement:
25  *	This product includes software developed by the University of
26  *	California, Berkeley and its contributors.
27  * 4. Neither the name of the University nor the names of its contributors
28  *    may be used to endorse or promote products derived from this software
29  *    without specific prior written permission.
30  *
31  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41  * SUCH DAMAGE.
42  *
43  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44  * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
45  */
46 
47 /*
48  * Manages physical address maps.
49  */
50 
51 #include "opt_msgbuf.h"
52 
53 #include <sys/param.h>
54 #include <sys/systm.h>
55 #include <sys/kernel.h>
56 #include <sys/proc.h>
57 #include <sys/msgbuf.h>
58 #include <sys/vmmeter.h>
59 #include <sys/mman.h>
60 #include <sys/vmspace.h>
61 
62 #include <vm/vm.h>
63 #include <vm/vm_param.h>
64 #include <sys/sysctl.h>
65 #include <sys/lock.h>
66 #include <vm/vm_kern.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_map.h>
69 #include <vm/vm_object.h>
70 #include <vm/vm_extern.h>
71 #include <vm/vm_pageout.h>
72 #include <vm/vm_pager.h>
73 #include <vm/vm_zone.h>
74 
75 #include <sys/user.h>
76 #include <sys/thread2.h>
77 #include <sys/sysref2.h>
78 #include <sys/spinlock2.h>
79 #include <vm/vm_page2.h>
80 
81 #include <machine/cputypes.h>
82 #include <machine/md_var.h>
83 #include <machine/specialreg.h>
84 #include <machine/smp.h>
85 #include <machine/globaldata.h>
86 #include <machine/pmap.h>
87 #include <machine/pmap_inval.h>
88 
89 #include <ddb/ddb.h>
90 
91 #include <stdio.h>
92 #include <assert.h>
93 #include <stdlib.h>
94 #include <pthread.h>
95 
96 #define PMAP_KEEP_PDIRS
97 #ifndef PMAP_SHPGPERPROC
98 #define PMAP_SHPGPERPROC 1000
99 #endif
100 
101 #if defined(DIAGNOSTIC)
102 #define PMAP_DIAGNOSTIC
103 #endif
104 
105 #define MINPV 2048
106 
107 #if !defined(PMAP_DIAGNOSTIC)
108 #define PMAP_INLINE __inline
109 #else
110 #define PMAP_INLINE
111 #endif
112 
113 /*
114  * Get PDEs and PTEs for user/kernel address space
115  */
116 static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va);
117 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
118 
119 #define pmap_pde_v(pte)		((*(pd_entry_t *)pte & VPTE_V) != 0)
120 #define pmap_pte_w(pte)		((*(pt_entry_t *)pte & VPTE_WIRED) != 0)
121 #define pmap_pte_m(pte)		((*(pt_entry_t *)pte & VPTE_M) != 0)
122 #define pmap_pte_u(pte)		((*(pt_entry_t *)pte & VPTE_A) != 0)
123 #define pmap_pte_v(pte)		((*(pt_entry_t *)pte & VPTE_V) != 0)
124 
125 /*
126  * Given a map and a machine independent protection code,
 * convert to the corresponding machine dependent protection bits.
128  */
129 #define pte_prot(m, p)		\
130 	(protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
131 static int protection_codes[8];
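/* Filled in by i386_protection_init(), called from pmap_bootstrap(). */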
132 
133 struct pmap kernel_pmap;
134 
135 static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
136 
137 static struct vm_object kptobj;
138 static int nkpt;
139 
140 static uint64_t	KPDphys;	/* phys addr of kernel level 2 */
141 uint64_t		KPDPphys;	/* phys addr of kernel level 3 */
142 uint64_t		KPML4phys;	/* phys addr of kernel level 4 */
143 
144 extern int vmm_enabled;
145 extern void *vkernel_stack;
146 
147 /*
148  * Data for the pv entry allocation mechanism
149  */
150 static vm_zone_t pvzone;
151 static struct vm_zone pvzone_store;
152 static struct vm_object pvzone_obj;
153 static int pv_entry_count = 0;
154 static int pv_entry_max = 0;
155 static int pv_entry_high_water = 0;
156 static int pmap_pagedaemon_waken = 0;
157 static struct pv_entry *pvinit;
158 
159 /*
160  * All those kernel PT submaps that BSD is so fond of
161  */
162 pt_entry_t *CMAP1 = NULL, *ptmmap;
163 caddr_t CADDR1 = NULL;
164 static pt_entry_t *msgbufmap;
165 
166 uint64_t KPTphys;
167 
168 static PMAP_INLINE void	free_pv_entry (pv_entry_t pv);
169 static pv_entry_t get_pv_entry (void);
170 static void	i386_protection_init (void);
171 static __inline void	pmap_clearbit (vm_page_t m, int bit);
172 
173 static void	pmap_remove_all (vm_page_t m);
174 static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq,
175 				pt_entry_t oldpte, vm_offset_t sva);
176 static void pmap_remove_page (struct pmap *pmap, vm_offset_t va);
177 static int pmap_remove_entry (struct pmap *pmap, vm_page_t m,
178 				vm_offset_t va);
179 static boolean_t pmap_testbit (vm_page_t m, int bit);
180 static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
181 				vm_page_t mpte, vm_page_t m, pv_entry_t);
182 
183 static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);
184 
185 static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
186 static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
187 static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
188 static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t);
189 
190 static int
191 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
192 {
193 	if (pv1->pv_va < pv2->pv_va)
194 		return(-1);
195 	if (pv1->pv_va > pv2->pv_va)
196 		return(1);
197 	return(0);
198 }
199 
200 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
201 	    pv_entry_compare, vm_offset_t, pv_va);
202 
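/*
 * Index helpers: extract the page table index for each level from a
 * virtual address.  pmap_pt_pindex() is also used as the page table
 * page index within pm_pteobj.
 */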
203 static __inline vm_pindex_t
204 pmap_pt_pindex(vm_offset_t va)
205 {
206 	return va >> PDRSHIFT;
207 }
208 
209 static __inline vm_pindex_t
210 pmap_pte_index(vm_offset_t va)
211 {
212 	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
213 }
214 
215 static __inline vm_pindex_t
216 pmap_pde_index(vm_offset_t va)
217 {
218 	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
219 }
220 
221 static __inline vm_pindex_t
222 pmap_pdpe_index(vm_offset_t va)
223 {
224 	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
225 }
226 
227 static __inline vm_pindex_t
228 pmap_pml4e_index(vm_offset_t va)
229 {
230 	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
231 }
232 
233 /* Return a pointer to the PML4 slot that corresponds to a VA */
234 static __inline pml4_entry_t *
235 pmap_pml4e(pmap_t pmap, vm_offset_t va)
236 {
237 	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
238 }
239 
240 /* Return a pointer to the PDP slot that corresponds to a VA */
241 static __inline pdp_entry_t *
242 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
243 {
244 	pdp_entry_t *pdpe;
245 
246 	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & VPTE_FRAME);
247 	return (&pdpe[pmap_pdpe_index(va)]);
248 }
249 
250 /* Return a pointer to the PDP slot that corresponds to a VA */
251 static __inline pdp_entry_t *
252 pmap_pdpe(pmap_t pmap, vm_offset_t va)
253 {
254 	pml4_entry_t *pml4e;
255 
256 	pml4e = pmap_pml4e(pmap, va);
257 	if ((*pml4e & VPTE_V) == 0)
258 		return NULL;
259 	return (pmap_pml4e_to_pdpe(pml4e, va));
260 }
261 
262 /* Return a pointer to the PD slot that corresponds to a VA */
263 static __inline pd_entry_t *
264 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
265 {
266 	pd_entry_t *pde;
267 
268 	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & VPTE_FRAME);
269 	return (&pde[pmap_pde_index(va)]);
270 }
271 
272 /* Return a pointer to the PD slot that corresponds to a VA */
273 static __inline pd_entry_t *
274 pmap_pde(pmap_t pmap, vm_offset_t va)
275 {
276 	pdp_entry_t *pdpe;
277 
278 	pdpe = pmap_pdpe(pmap, va);
279 	if (pdpe == NULL || (*pdpe & VPTE_V) == 0)
280 		 return NULL;
281 	return (pmap_pdpe_to_pde(pdpe, va));
282 }
283 
284 /* Return a pointer to the PT slot that corresponds to a VA */
285 static __inline pt_entry_t *
286 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
287 {
288 	pt_entry_t *pte;
289 
290 	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & VPTE_FRAME);
291 	return (&pte[pmap_pte_index(va)]);
292 }
293 
294 /*
295  * Hold pt_m for page table scans to prevent it from getting reused out
296  * from under us across blocking conditions in the body of the loop.
297  */
298 static __inline
299 vm_page_t
300 pmap_hold_pt_page(pd_entry_t *pde, vm_offset_t va)
301 {
302 	pt_entry_t pte;
303 	vm_page_t pt_m;
304 
305 	pte = (pt_entry_t)*pde;
306 	KKASSERT(pte != 0);
307 	pt_m = PHYS_TO_VM_PAGE(pte & VPTE_FRAME);
308 	vm_page_hold(pt_m);
309 
310 	return pt_m;
311 }
312 
313 /* Return a pointer to the PT slot that corresponds to a VA */
314 static __inline pt_entry_t *
315 pmap_pte(pmap_t pmap, vm_offset_t va)
316 {
317 	pd_entry_t *pde;
318 
319 	pde = pmap_pde(pmap, va);
320 	if (pde == NULL || (*pde & VPTE_V) == 0)
321 		return NULL;
322 	if ((*pde & VPTE_PS) != 0)	/* compat with i386 pmap_pte() */
323 		return ((pt_entry_t *)pde);
324 	return (pmap_pde_to_pte(pde, va));
325 }
326 
327 static PMAP_INLINE pt_entry_t *
328 vtopte(vm_offset_t va)
329 {
330 	pt_entry_t *x;
331 	x = pmap_pte(&kernel_pmap, va);
332 	assert(x != NULL);
333 	return x;
334 }
335 
336 static __inline pd_entry_t *
337 vtopde(vm_offset_t va)
338 {
339 	pd_entry_t *x;
340 	x = pmap_pde(&kernel_pmap, va);
341 	assert(x != NULL);
342 	return x;
343 }
344 
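/*
 * Bump-allocate (n) physical pages starting at *firstaddr.  The pages
 * are not zeroed here (the callers bzero them through the DMAP).
 */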
345 static uint64_t
346 allocpages(vm_paddr_t *firstaddr, int n)
347 {
348 	uint64_t ret;
349 
350 	ret = *firstaddr;
351 	/*bzero((void *)ret, n * PAGE_SIZE); not mapped yet */
352 	*firstaddr += n * PAGE_SIZE;
353 	return (ret);
354 }
355 
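/*
 * Build the page table layers needed when running under VMM: a direct
 * map (DMAP) of the first 512GB, using 1GB pages if the cpu advertises
 * AMDID_PAGE1GB and 2MB pages otherwise, plus a single 2MB mapping for
 * the vkernel stack.
 */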
356 static void
357 create_dmap_vmm(vm_paddr_t *firstaddr)
358 {
359 	void *stack_addr;
360 	int pml4_stack_index;
361 	int pdp_stack_index;
362 	int pd_stack_index;
363 	long i,j;
364 	int regs[4];
365 	int amd_feature;
366 
367 	uint64_t KPDP_DMAP_phys = allocpages(firstaddr, NDMPML4E);
368 	uint64_t KPDP_VSTACK_phys = allocpages(firstaddr, 1);
369 	uint64_t KPD_VSTACK_phys = allocpages(firstaddr, 1);
370 
371 	pml4_entry_t *KPML4virt = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
372 	pdp_entry_t *KPDP_DMAP_virt = (pdp_entry_t *)PHYS_TO_DMAP(KPDP_DMAP_phys);
373 	pdp_entry_t *KPDP_VSTACK_virt = (pdp_entry_t *)PHYS_TO_DMAP(KPDP_VSTACK_phys);
374 	pd_entry_t *KPD_VSTACK_virt = (pd_entry_t *)PHYS_TO_DMAP(KPD_VSTACK_phys);
375 
376 	bzero(KPDP_DMAP_virt, NDMPML4E * PAGE_SIZE);
377 	bzero(KPDP_VSTACK_virt, 1 * PAGE_SIZE);
378 	bzero(KPD_VSTACK_virt, 1 * PAGE_SIZE);
379 
380 	do_cpuid(0x80000001, regs);
381 	amd_feature = regs[3];
382 
383 	/* Build the mappings for the first 512GB */
384 	if (amd_feature & AMDID_PAGE1GB) {
385 		/* In pages of 1 GB, if supported */
386 		for (i = 0; i < NPDPEPG; i++) {
387 			KPDP_DMAP_virt[i] = ((uint64_t)i << PDPSHIFT);
388 			KPDP_DMAP_virt[i] |= VPTE_RW | VPTE_V | VPTE_PS | VPTE_U;
389 		}
390 	} else {
		/* Otherwise, in 2MB pages */
392 		for (i = 0; i < NPDPEPG; i++) {
393 			uint64_t KPD_DMAP_phys;
394 			pd_entry_t *KPD_DMAP_virt;
395 
396 			KPD_DMAP_phys = allocpages(firstaddr, 1);
397 			KPD_DMAP_virt =
398 				(pd_entry_t *)PHYS_TO_DMAP(KPD_DMAP_phys);
399 
400 			bzero(KPD_DMAP_virt, PAGE_SIZE);
401 
402 			KPDP_DMAP_virt[i] = KPD_DMAP_phys;
403 			KPDP_DMAP_virt[i] |= VPTE_RW | VPTE_V | VPTE_U;
404 
			/* Fill each PD with NPTEPG 2MB (VPTE_PS) mappings */
406 			for (j = 0; j < NPTEPG; j++) {
407 				KPD_DMAP_virt[j] = (i << PDPSHIFT) |
408 						   (j << PDRSHIFT);
409 				KPD_DMAP_virt[j] |= VPTE_RW | VPTE_V |
410 						    VPTE_PS | VPTE_U;
411 			}
412 		}
413 	}
414 
415 	/* DMAP for the first 512G */
416 	KPML4virt[0] = KPDP_DMAP_phys;
417 	KPML4virt[0] |= VPTE_RW | VPTE_V | VPTE_U;
418 
419 	/* create a 2 MB map of the new stack */
420 	pml4_stack_index = (uint64_t)&stack_addr >> PML4SHIFT;
421 	KPML4virt[pml4_stack_index] = KPDP_VSTACK_phys;
422 	KPML4virt[pml4_stack_index] |= VPTE_RW | VPTE_V | VPTE_U;
423 
424 	pdp_stack_index = ((uint64_t)&stack_addr & PML4MASK) >> PDPSHIFT;
425 	KPDP_VSTACK_virt[pdp_stack_index] = KPD_VSTACK_phys;
426 	KPDP_VSTACK_virt[pdp_stack_index] |= VPTE_RW | VPTE_V | VPTE_U;
427 
428 	pd_stack_index = ((uint64_t)&stack_addr & PDPMASK) >> PDRSHIFT;
429 	KPD_VSTACK_virt[pd_stack_index] = (uint64_t) vkernel_stack;
430 	KPD_VSTACK_virt[pd_stack_index] |= VPTE_RW | VPTE_V | VPTE_U | VPTE_PS;
431 }
432 
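/*
 * Build the initial kernel page tables: one PML4 page, the kernel PDP
 * and PD pages, and nkpt page table pages.  Hook the levels together,
 * install the recursive PML4 self-mapping used for PTmap, and connect
 * the KVA slot to the PML4.
 */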
433 static void
434 create_pagetables(vm_paddr_t *firstaddr, int64_t ptov_offset)
435 {
436 	int i;
437 	pml4_entry_t *KPML4virt;
438 	pdp_entry_t *KPDPvirt;
439 	pd_entry_t *KPDvirt;
440 	pt_entry_t *KPTvirt;
441 	int kpml4i = pmap_pml4e_index(ptov_offset);
442 	int kpdpi = pmap_pdpe_index(ptov_offset);
443 	int kpdi = pmap_pde_index(ptov_offset);
444 
	/*
	 * Calculate NKPT - number of kernel page tables.  We have to
	 * accommodate preallocation of the vm_page_array, dump bitmap,
	 * MSGBUF_SIZE, and other stuff.  Be generous.
	 *
	 * Maxmem is in pages.
	 */
	nkpt = (Maxmem * (sizeof(struct vm_page) * 2) + MSGBUF_SIZE) / NBPDR;
453 	/*
454 	 * Allocate pages
455 	 */
456 	KPML4phys = allocpages(firstaddr, 1);
457 	KPDPphys = allocpages(firstaddr, NKPML4E);
458 	KPDphys = allocpages(firstaddr, NKPDPE);
459 	KPTphys = allocpages(firstaddr, nkpt);
460 
461 	KPML4virt = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
462 	KPDPvirt = (pdp_entry_t *)PHYS_TO_DMAP(KPDPphys);
463 	KPDvirt = (pd_entry_t *)PHYS_TO_DMAP(KPDphys);
464 	KPTvirt = (pt_entry_t *)PHYS_TO_DMAP(KPTphys);
465 
466 	bzero(KPML4virt, 1 * PAGE_SIZE);
467 	bzero(KPDPvirt, NKPML4E * PAGE_SIZE);
468 	bzero(KPDvirt, NKPDPE * PAGE_SIZE);
469 	bzero(KPTvirt, nkpt * PAGE_SIZE);
470 
471 	/* Now map the page tables at their location within PTmap */
472 	for (i = 0; i < nkpt; i++) {
473 		KPDvirt[i + kpdi] = KPTphys + (i << PAGE_SHIFT);
474 		KPDvirt[i + kpdi] |= VPTE_RW | VPTE_V | VPTE_U;
475 	}
476 
477 	/* And connect up the PD to the PDP */
478 	for (i = 0; i < NKPDPE; i++) {
479 		KPDPvirt[i + kpdpi] = KPDphys + (i << PAGE_SHIFT);
480 		KPDPvirt[i + kpdpi] |= VPTE_RW | VPTE_V | VPTE_U;
481 	}
482 
483 	/* And recursively map PML4 to itself in order to get PTmap */
484 	KPML4virt[PML4PML4I] = KPML4phys;
485 	KPML4virt[PML4PML4I] |= VPTE_RW | VPTE_V | VPTE_U;
486 
487 	/* Connect the KVA slot up to the PML4 */
488 	KPML4virt[kpml4i] = KPDPphys;
489 	KPML4virt[kpml4i] |= VPTE_RW | VPTE_V | VPTE_U;
490 }
491 
492 /*
 * Typically used by vm/device_pager.c to initialize a fictitious page.
494  */
495 void
496 pmap_page_init(struct vm_page *m)
497 {
498 	vm_page_init(m);
499 	TAILQ_INIT(&m->md.pv_list);
500 }
501 
502 /*
503  *	Bootstrap the system enough to run with virtual memory.
504  *
505  *	On the i386 this is called after mapping has already been enabled
506  *	and just syncs the pmap module with what has already been done.
507  *	[We can't call it easily with mapping off since the kernel is not
508  *	mapped with PA == VA, hence we would have to relocate every address
509  *	from the linked base (virtual) address "KERNBASE" to the actual
510  *	(physical) address starting relative to 0]
511  */
512 void
513 pmap_bootstrap(vm_paddr_t *firstaddr, int64_t ptov_offset)
514 {
515 	vm_offset_t va;
516 	pt_entry_t *pte;
517 
518 	/*
519 	 * Create an initial set of page tables to run the kernel in.
520 	 */
521 	create_pagetables(firstaddr, ptov_offset);
522 
523 	/* Create the DMAP for the VMM */
524 	if (vmm_enabled) {
525 		create_dmap_vmm(firstaddr);
526 	}
527 
528 	virtual_start = KvaStart;
529 	virtual_end = KvaEnd;
530 
531 	/*
532 	 * Initialize protection array.
533 	 */
534 	i386_protection_init();
535 
536 	/*
537 	 * The kernel's pmap is statically allocated so we don't have to use
538 	 * pmap_create, which is unlikely to work correctly at this part of
539 	 * the boot sequence (XXX and which no longer exists).
540 	 *
541 	 * The kernel_pmap's pm_pteobj is used only for locking and not
542 	 * for mmu pages.
543 	 */
544 	kernel_pmap.pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
545 	kernel_pmap.pm_count = 1;
546 	/* don't allow deactivation */
547 	CPUMASK_ASSALLONES(kernel_pmap.pm_active);
548 	kernel_pmap.pm_pteobj = NULL;	/* see pmap_init */
549 	RB_INIT(&kernel_pmap.pm_pvroot);
550 	spin_init(&kernel_pmap.pm_spin, "pmapbootstrap");
551 
552 	/*
553 	 * Reserve some special page table entries/VA space for temporary
554 	 * mapping of pages.
555 	 */
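	/*
	 * SYSMAP(c, p, v, n) records the current KVA cursor in (v) and the
	 * current pte cursor in (p), then advances both cursors by (n)
	 * pages/entries.
	 */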
556 #define	SYSMAP(c, p, v, n)	\
557 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
558 
559 	va = virtual_start;
560 	pte = pmap_pte(&kernel_pmap, va);
561 	/*
562 	 * CMAP1/CMAP2 are used for zeroing and copying pages.
563 	 */
564 	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
565 
566 #if JGV
567 	/*
568 	 * Crashdump maps.
569 	 */
570 	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
571 #endif
572 
573 	/*
574 	 * ptvmmap is used for reading arbitrary physical pages via
575 	 * /dev/mem.
576 	 */
577 	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
578 
579 	/*
580 	 * msgbufp is used to map the system message buffer.
581 	 * XXX msgbufmap is not used.
582 	 */
583 	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
584 	       atop(round_page(MSGBUF_SIZE)))
585 
586 	virtual_start = va;
587 
588 	*CMAP1 = 0;
	/* Not ready to do an invltlb yet for VMM */
	if (!vmm_enabled)
		cpu_invltlb();
}
594 
595 /*
596  *	Initialize the pmap module.
 *	Called by vm_init to initialize any structures that the pmap
 *	system needs to map virtual memory.
 *	pmap_init has been enhanced to support discontiguous physical
 *	memory in a fairly consistent way.
601  */
602 void
603 pmap_init(void)
604 {
605 	int i;
606 	int initial_pvs;
607 
608 	/*
609 	 * object for kernel page table pages
610 	 */
611 	/* JG I think the number can be arbitrary */
612 	vm_object_init(&kptobj, 5);
613 	kernel_pmap.pm_pteobj = &kptobj;
614 
615 	/*
616 	 * Allocate memory for random pmap data structures.  Includes the
617 	 * pv_head_table.
618 	 */
619 	for(i = 0; i < vm_page_array_size; i++) {
620 		vm_page_t m;
621 
622 		m = &vm_page_array[i];
623 		TAILQ_INIT(&m->md.pv_list);
624 		m->md.pv_list_count = 0;
625 	}
626 
627 	/*
628 	 * init the pv free list
629 	 */
630 	initial_pvs = vm_page_array_size;
631 	if (initial_pvs < MINPV)
632 		initial_pvs = MINPV;
633 	pvzone = &pvzone_store;
634 	pvinit = (struct pv_entry *)
635 		kmem_alloc(&kernel_map,
636 			   initial_pvs * sizeof (struct pv_entry),
637 			   VM_SUBSYS_PVENTRY);
638 	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit,
639 		initial_pvs);
640 
641 	/*
642 	 * Now it is safe to enable pv_table recording.
643 	 */
644 	pmap_initialized = TRUE;
645 }
646 
647 /*
648  * Initialize the address space (zone) for the pv_entries.  Set a
649  * high water mark so that the system can recover from excessive
650  * numbers of pv entries.
651  */
652 void
653 pmap_init2(void)
654 {
655 	int shpgperproc = PMAP_SHPGPERPROC;
656 
657 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
658 	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
659 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
660 	pv_entry_high_water = 9 * (pv_entry_max / 10);
661 	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT);
662 }
663 
664 
665 /***************************************************
666  * Low level helper routines.....
667  ***************************************************/
668 
669 /*
 * The modification bit is not tracked for any pages in this range.  XXX
 * such pages in this map should always be accessed via the pmap_k*()
 * functions and not be managed anyhow.
 *
 * XXX User and kernel address spaces are independent for virtual kernels,
 * so this function only applies to the kernel pmap.
676  */
677 int
678 pmap_track_modified(pmap_t pmap, vm_offset_t va)
679 {
680 	if (pmap != &kernel_pmap)
681 		return 1;
682 	if ((va < clean_sva) || (va >= clean_eva))
683 		return 1;
684 	else
685 		return 0;
686 }
687 
688 /*
689  * Extract the physical page address associated with the map/VA pair.
690  *
691  * No requirements.
692  */
693 vm_paddr_t
694 pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep)
695 {
696 	vm_paddr_t rtval;
697 	pt_entry_t *pte;
698 	pd_entry_t pde, *pdep;
699 
700 	vm_object_hold(pmap->pm_pteobj);
701 	rtval = 0;
702 	pdep = pmap_pde(pmap, va);
703 	if (pdep != NULL) {
704 		pde = *pdep;
705 		if (pde) {
706 			if ((pde & VPTE_PS) != 0) {
707 				/* JGV */
708 				rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
709 			} else {
710 				pte = pmap_pde_to_pte(pdep, va);
711 				rtval = (*pte & VPTE_FRAME) | (va & PAGE_MASK);
712 			}
713 		}
714 	}
715 	if (handlep)
716 		*handlep = NULL;	/* XXX */
717 	vm_object_drop(pmap->pm_pteobj);
718 
719 	return rtval;
720 }
721 
722 void
723 pmap_extract_done(void *handle)
724 {
725 	pmap_t pmap;
726 
727 	if (handle) {
728 		pmap = handle;
729 		vm_object_drop(pmap->pm_pteobj);
730 	}
731 }
732 
733 /*
734  * Similar to extract but checks protections, SMP-friendly short-cut for
735  * vm_fault_page[_quick]().
736  *
737  * WARNING! THE RETURNED PAGE IS ONLY HELD AND NEITHER IT NOR ITS TARGET
738  *	    DATA IS SUITABLE FOR WRITING.  Writing can interfere with
 *	    pageout flushes, msync, etc.  The hold_count is not enough
 *	    to avoid races against pageouts, and other flush code doesn't
741  *	    care about hold_count.
742  */
743 vm_page_t
744 pmap_fault_page_quick(pmap_t pmap __unused, vm_offset_t vaddr __unused,
745 		      vm_prot_t prot __unused, int *busyp __unused)
746 {
747 	return(NULL);
748 }
749 
750 /*
751  *	Routine:	pmap_kextract
752  *	Function:
 *		Extract the physical page address associated with the
 *		given kernel virtual address.
755  */
756 vm_paddr_t
757 pmap_kextract(vm_offset_t va)
758 {
759 	pd_entry_t pde;
760 	vm_paddr_t pa;
761 
762 	KKASSERT(va >= KvaStart && va < KvaEnd);
763 
764 	/*
765 	 * The DMAP region is not included in [KvaStart, KvaEnd)
766 	 */
767 #if 0
768 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
769 		pa = DMAP_TO_PHYS(va);
770 	} else {
771 #endif
772 		pde = *vtopde(va);
773 		if (pde & VPTE_PS) {
774 			/* JGV */
775 			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
776 		} else {
777 			/*
778 			 * Beware of a concurrent promotion that changes the
779 			 * PDE at this point!  For example, vtopte() must not
780 			 * be used to access the PTE because it would use the
781 			 * new PDE.  It is, however, safe to use the old PDE
782 			 * because the page table page is preserved by the
783 			 * promotion.
784 			 */
785 			pa = *pmap_pde_to_pte(&pde, va);
786 			pa = (pa & VPTE_FRAME) | (va & PAGE_MASK);
787 		}
788 #if 0
789 	}
790 #endif
791 	return pa;
792 }
793 
794 /***************************************************
795  * Low level mapping routines.....
796  ***************************************************/
797 
798 /*
799  * Enter a mapping into kernel_pmap.  Mappings created in this fashion
800  * are not managed.  Mappings must be immediately accessible on all cpus.
801  *
802  * Call pmap_inval_pte() to invalidate the virtual pte and clean out the
803  * real pmap and handle related races before storing the new vpte.  The
 * new semantics for kenter require us to do an UNCONDITIONAL invalidation,
805  * because the entry may have previously been cleared without an invalidation.
806  */
807 void
808 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
809 {
810 	pt_entry_t *ptep;
811 	pt_entry_t npte;
812 
813 	KKASSERT(va >= KvaStart && va < KvaEnd);
814 	npte = pa | VPTE_RW | VPTE_V | VPTE_U;
815 	ptep = vtopte(va);
816 
817 #if 1
818 	pmap_inval_pte(ptep, &kernel_pmap, va);
819 #else
	if (*ptep & VPTE_V)
821 		pmap_inval_pte(ptep, &kernel_pmap, va);
822 #endif
823 	atomic_swap_long(ptep, npte);
824 }
825 
826 /*
827  * Enter an unmanaged KVA mapping for the private use of the current
828  * cpu only.
829  *
830  * It is illegal for the mapping to be accessed by other cpus without
831  * proper invalidation.
832  */
833 int
834 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
835 {
836 	pt_entry_t *ptep;
837 	pt_entry_t npte;
838 	int res;
839 
840 	KKASSERT(va >= KvaStart && va < KvaEnd);
841 
842 	npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U;
843 	ptep = vtopte(va);
844 
845 #if 1
846 	pmap_inval_pte_quick(ptep, &kernel_pmap, va);
847 	res = 1;
848 #else
849 	/* FUTURE */
850 	res = (*ptep != 0);
	if (*ptep & VPTE_V)
		pmap_inval_pte(ptep, &kernel_pmap, va);
853 #endif
854 	atomic_swap_long(ptep, npte);
855 
856 	return res;
857 }
858 
859 /*
860  * Invalidation will occur later, ok to be lazy here.
861  */
862 int
863 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa)
864 {
865 	pt_entry_t *ptep;
866 	pt_entry_t npte;
867 	int res;
868 
869 	KKASSERT(va >= KvaStart && va < KvaEnd);
870 
871 	npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U;
872 	ptep = vtopte(va);
873 #if 1
874 	res = 1;
875 #else
876 	/* FUTURE */
877 	res = (*ptep != 0);
878 #endif
879 	atomic_swap_long(ptep, npte);
880 
881 	return res;
882 }
883 
884 /*
885  * Remove an unmanaged mapping created with pmap_kenter*().
886  */
887 void
888 pmap_kremove(vm_offset_t va)
889 {
890 	pt_entry_t *ptep;
891 
892 	KKASSERT(va >= KvaStart && va < KvaEnd);
893 
894 	ptep = vtopte(va);
895 	atomic_swap_long(ptep, 0);
896 	pmap_inval_pte(ptep, &kernel_pmap, va);
897 }
898 
899 /*
900  * Remove an unmanaged mapping created with pmap_kenter*() but synchronize
901  * only with this cpu.
902  *
903  * Unfortunately because we optimize new entries by testing VPTE_V later
904  * on, we actually still have to synchronize with all the cpus.  XXX maybe
905  * store a junk value and test against 0 in the other places instead?
906  */
907 void
908 pmap_kremove_quick(vm_offset_t va)
909 {
910 	pt_entry_t *ptep;
911 
912 	KKASSERT(va >= KvaStart && va < KvaEnd);
913 
914 	ptep = vtopte(va);
915 	atomic_swap_long(ptep, 0);
916 	pmap_inval_pte(ptep, &kernel_pmap, va); /* NOT _quick */
917 }
918 
919 /*
920  * Invalidation will occur later, ok to be lazy here.
921  */
922 void
923 pmap_kremove_noinval(vm_offset_t va)
924 {
925 	pt_entry_t *ptep;
926 
927 	KKASSERT(va >= KvaStart && va < KvaEnd);
928 
929 	ptep = vtopte(va);
930 	atomic_swap_long(ptep, 0);
931 }
932 
933 /*
934  *	Used to map a range of physical addresses into kernel
935  *	virtual address space.
936  *
 *	For now, VM is already on; we only need to map the
938  *	specified memory.
939  */
940 vm_offset_t
941 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
942 {
943 	return PHYS_TO_DMAP(start);
944 }
945 
946 /*
947  * Map a set of unmanaged VM pages into KVM.
948  */
949 void
950 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count)
951 {
952 	vm_offset_t end_va;
953 	vm_offset_t va;
954 
955 	end_va = beg_va + count * PAGE_SIZE;
956 	KKASSERT(beg_va >= KvaStart && end_va <= KvaEnd);
957 
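	/*
	 * Store the new VPTEs first, then invalidate the whole range with
	 * a single call at the end.
	 */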
958 	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
959 		pt_entry_t *ptep;
960 
961 		ptep = vtopte(va);
962 		atomic_swap_long(ptep, VM_PAGE_TO_PHYS(*m) |
963 				       VPTE_RW | VPTE_V | VPTE_U);
964 		++m;
965 	}
966 	pmap_invalidate_range(&kernel_pmap, beg_va, end_va);
967 	/* pmap_inval_pte(pte, &kernel_pmap, va); */
968 }
969 
970 /*
971  * Undo the effects of pmap_qenter*().
972  */
973 void
974 pmap_qremove(vm_offset_t beg_va, int count)
975 {
976 	vm_offset_t end_va;
977 	vm_offset_t va;
978 
979 	end_va = beg_va + count * PAGE_SIZE;
980 	KKASSERT(beg_va >= KvaStart && end_va < KvaEnd);
981 
982 	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
983 		pt_entry_t *ptep;
984 
985 		ptep = vtopte(va);
986 		atomic_swap_long(ptep, 0);
987 	}
988 	pmap_invalidate_range(&kernel_pmap, beg_va, end_va);
989 }
990 
991 /*
992  * Unlike the real pmap code, we can't avoid calling the real-kernel.
993  */
994 void
995 pmap_qremove_quick(vm_offset_t va, int count)
996 {
997 	pmap_qremove(va, count);
998 }
999 
1000 void
1001 pmap_qremove_noinval(vm_offset_t va, int count)
1002 {
1003 	pmap_qremove(va, count);
1004 }
1005 
1006 /*
1007  * This routine works like vm_page_lookup() but also blocks as long as the
1008  * page is busy.  This routine does not busy the page it returns.
1009  *
 * Unless the caller is managing objects whose pages are in a known state,
1011  * the call should be made with a critical section held so the page's object
1012  * association remains valid on return.
1013  */
1014 static vm_page_t
1015 pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
1016 {
1017 	vm_page_t m;
1018 
1019 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1020 	m = vm_page_lookup_busy_wait(object, pindex, TRUE, "pplookp");
1021 
1022 	return(m);
1023 }
1024 
1025 /*
1026  * Create a new thread and optionally associate it with a (new) process.
1027  * NOTE! the new thread's cpu may not equal the current cpu.
1028  */
1029 void
1030 pmap_init_thread(thread_t td)
1031 {
1032 	/* enforce pcb placement */
1033 	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
1034 	td->td_savefpu = &td->td_pcb->pcb_save;
1035 	td->td_sp = (char *)td->td_pcb - 16; /* JG is -16 needed on x86_64? */
1036 }
1037 
1038 /*
1039  * This routine directly affects the fork perf for a process.
1040  */
1041 void
1042 pmap_init_proc(struct proc *p)
1043 {
1044 }
1045 
1046 /*
1047  * Unwire a page table which has been removed from the pmap.  We own the
1048  * wire_count, so the page cannot go away.  The page representing the page
1049  * table is passed in unbusied and must be busied if we cannot trivially
1050  * unwire it.
1051  *
1052  * XXX NOTE!  This code is not usually run because we do not currently
 *	      implement dynamic page table page removal.  The page's
 *	      presence in its parent accounts for at least 1 wire count,
 *	      so no call to this function ever sees a wire count less than 2.
1056  */
1057 static int
1058 pmap_unwire_pgtable(pmap_t pmap, vm_offset_t va, vm_page_t m)
1059 {
1060 	/*
1061 	 * Try to unwire optimally.  If non-zero is returned the wire_count
1062 	 * is 1 and we must busy the page to unwire it.
1063 	 */
1064 	if (vm_page_unwire_quick(m) == 0)
1065 		return 0;
1066 
1067 	vm_page_busy_wait(m, TRUE, "pmuwpt");
1068 	KASSERT(m->queue == PQ_NONE,
1069 		("_pmap_unwire_pgtable: %p->queue != PQ_NONE", m));
1070 
1071 	if (m->wire_count == 1) {
1072 		/*
1073 		 * Unmap the page table page.
1074 		 */
1075 		/* pmap_inval_add(info, pmap, -1); */
1076 
1077 		if (m->pindex >= (NUPT_TOTAL + NUPD_TOTAL)) {
1078 			/* PDP page */
1079 			pml4_entry_t *pml4;
1080 			pml4 = pmap_pml4e(pmap, va);
1081 			*pml4 = 0;
1082 		} else if (m->pindex >= NUPT_TOTAL) {
1083 			/* PD page */
1084 			pdp_entry_t *pdp;
1085 			pdp = pmap_pdpe(pmap, va);
1086 			*pdp = 0;
1087 		} else {
1088 			/* PT page */
1089 			pd_entry_t *pd;
1090 			pd = pmap_pde(pmap, va);
1091 			*pd = 0;
1092 		}
1093 
1094 		KKASSERT(pmap->pm_stats.resident_count > 0);
1095 		atomic_add_long(&pmap->pm_stats.resident_count, -1);
1096 
1097 		if (pmap->pm_ptphint == m)
1098 			pmap->pm_ptphint = NULL;
1099 
1100 		if (m->pindex < NUPT_TOTAL) {
1101 			/* We just released a PT, unhold the matching PD */
1102 			vm_page_t pdpg;
1103 
1104 			pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) &
1105 					       VPTE_FRAME);
1106 			pmap_unwire_pgtable(pmap, va, pdpg);
1107 		}
1108 		if (m->pindex >= NUPT_TOTAL &&
1109 		    m->pindex < (NUPT_TOTAL + NUPD_TOTAL)) {
1110 			/* We just released a PD, unhold the matching PDP */
1111 			vm_page_t pdppg;
1112 
1113 			pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) &
1114 						VPTE_FRAME);
1115 			pmap_unwire_pgtable(pmap, va, pdppg);
1116 		}
1117 
1118 		/*
1119 		 * This was our last wire, the page had better be unwired
1120 		 * after we decrement wire_count.
1121 		 *
1122 		 * FUTURE NOTE: shared page directory page could result in
1123 		 * multiple wire counts.
1124 		 */
1125 		vm_page_unwire(m, 0);
1126 		KKASSERT(m->wire_count == 0);
1127 		vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1128 		vm_page_flash(m);
1129 		vm_page_free(m);
1130 		return 1;
1131 	} else {
1132 		/* XXX SMP race to 1 if not holding vmobj */
1133 		vm_page_unwire(m, 0);
1134 		vm_page_wakeup(m);
1135 		return 0;
1136 	}
1137 }
1138 
1139 /*
1140  * After removing a page table entry, this routine is used to
1141  * conditionally free the page, and manage the hold/wire counts.
1142  *
 * If mpte is not NULL the caller owns a wire_count on it, so it can't
 * disappear.  If mpte is NULL the caller owns a wire_count on what would
 * be the mpte and we must look it up.
1146  */
1147 static int
1148 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1149 {
1150 	vm_pindex_t ptepindex;
1151 
1152 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj));
1153 
1154 	if (mpte == NULL) {
1155 		/*
1156 		 * page table pages in the kernel_pmap are not managed.
1157 		 */
1158 		if (pmap == &kernel_pmap)
1159 			return(0);
1160 		ptepindex = pmap_pt_pindex(va);
1161 		if (pmap->pm_ptphint &&
1162 		    (pmap->pm_ptphint->pindex == ptepindex)) {
1163 			mpte = pmap->pm_ptphint;
1164 		} else {
1165 			mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1166 			pmap->pm_ptphint = mpte;
1167 			vm_page_wakeup(mpte);
1168 		}
1169 	}
1170 	return pmap_unwire_pgtable(pmap, va, mpte);
1171 }
1172 
1173 /*
 * Initialize pmap0/vmspace0.  Since process 0 never enters user mode we
1175  * just dummy it up so it works well enough for fork().
1176  *
1177  * In DragonFly, process pmaps may only be used to manipulate user address
1178  * space, never kernel address space.
1179  */
1180 void
1181 pmap_pinit0(struct pmap *pmap)
1182 {
1183 	pmap_pinit(pmap);
1184 }
1185 
1186 /*
1187  * Initialize a preallocated and zeroed pmap structure,
1188  * such as one in a vmspace structure.
1189  */
1190 void
1191 pmap_pinit(struct pmap *pmap)
1192 {
1193 	vm_page_t ptdpg;
1194 
1195 	/*
1196 	 * No need to allocate page table space yet but we do need a valid
1197 	 * page directory table.
1198 	 */
1199 	if (pmap->pm_pml4 == NULL) {
1200 		pmap->pm_pml4 = (pml4_entry_t *)
1201 			kmem_alloc_pageable(&kernel_map, PAGE_SIZE,
1202 					    VM_SUBSYS_PML4);
1203 	}
1204 
1205 	/*
1206 	 * Allocate an object for the ptes
1207 	 */
1208 	if (pmap->pm_pteobj == NULL)
1209 		pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL + 1);
1210 
1211 	/*
1212 	 * Allocate the page directory page, unless we already have
1213 	 * one cached.  If we used the cached page the wire_count will
1214 	 * already be set appropriately.
1215 	 */
1216 	if ((ptdpg = pmap->pm_pdirm) == NULL) {
1217 		ptdpg = vm_page_grab(pmap->pm_pteobj,
1218 				     NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL,
1219 				     VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
1220 				     VM_ALLOC_ZERO);
1221 		pmap->pm_pdirm = ptdpg;
1222 		vm_page_flag_clear(ptdpg, PG_MAPPED | PG_WRITEABLE);
1223 		vm_page_wire(ptdpg);
1224 		vm_page_wakeup(ptdpg);
1225 		pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg));
1226 	}
1227 	pmap->pm_count = 1;
1228 	CPUMASK_ASSZERO(pmap->pm_active);
1229 	pmap->pm_ptphint = NULL;
1230 	RB_INIT(&pmap->pm_pvroot);
1231 	spin_init(&pmap->pm_spin, "pmapinit");
1232 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1233 	pmap->pm_stats.resident_count = 1;
1234 	pmap->pm_stats.wired_count = 1;
1235 }
1236 
1237 /*
1238  * Clean up a pmap structure so it can be physically freed.  This routine
1239  * is called by the vmspace dtor function.  A great deal of pmap data is
1240  * left passively mapped to improve vmspace management so we have a bit
1241  * of cleanup work to do here.
1242  *
1243  * No requirements.
1244  */
1245 void
1246 pmap_puninit(pmap_t pmap)
1247 {
1248 	vm_page_t p;
1249 
1250 	KKASSERT(CPUMASK_TESTZERO(pmap->pm_active));
1251 	if ((p = pmap->pm_pdirm) != NULL) {
1252 		KKASSERT(pmap->pm_pml4 != NULL);
1253 		pmap_kremove((vm_offset_t)pmap->pm_pml4);
1254 		vm_page_busy_wait(p, TRUE, "pgpun");
1255 		vm_page_unwire(p, 0);
1256 		vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
1257 		vm_page_free(p);
1258 		pmap->pm_pdirm = NULL;
1259 		atomic_add_long(&pmap->pm_stats.wired_count, -1);
1260 		KKASSERT(pmap->pm_stats.wired_count == 0);
1261 	}
1262 	if (pmap->pm_pml4) {
1263 		kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE);
1264 		pmap->pm_pml4 = NULL;
1265 	}
1266 	if (pmap->pm_pteobj) {
1267 		vm_object_deallocate(pmap->pm_pteobj);
1268 		pmap->pm_pteobj = NULL;
1269 	}
1270 }
1271 
1272 /*
1273  * This function is now unused (used to add the pmap to the pmap_list)
1274  */
1275 void
1276 pmap_pinit2(struct pmap *pmap)
1277 {
1278 }
1279 
1280 /*
 * Attempt to release and free a vm_page in a pmap.  Returns 0 on success,
 * 1 if the routine had to sleep (the caller should retry in that case).
1283  *
1284  * When asked to remove the page directory page itself, we actually just
1285  * leave it cached so we do not have to incur the SMP inval overhead of
1286  * removing the kernel mapping.  pmap_puninit() will take care of it.
1287  */
1288 static int
1289 pmap_release_free_page(struct pmap *pmap, vm_page_t p)
1290 {
1291 	/*
1292 	 * This code optimizes the case of freeing non-busy
1293 	 * page-table pages.  Those pages are zero now, and
1294 	 * might as well be placed directly into the zero queue.
1295 	 */
1296 	if (vm_page_busy_try(p, TRUE)) {
1297 		vm_page_sleep_busy(p, TRUE, "pmaprl");
1298 		return 1;
1299 	}
1300 
1301 	/*
1302 	 * Remove the page table page from the processes address space.
1303 	 */
1304 	if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) {
1305 		/*
1306 		 * We are the pml4 table itself.
1307 		 */
1308 		/* XXX anything to do here? */
1309 	} else if (p->pindex >= (NUPT_TOTAL + NUPD_TOTAL)) {
1310 		/*
1311 		 * We are a PDP page.
1312 		 * We look for the PML4 entry that points to us.
1313 		 */
1314 		vm_page_t m4;
1315 		pml4_entry_t *pml4;
1316 		int idx;
1317 
1318 		m4 = vm_page_lookup(pmap->pm_pteobj,
1319 				    NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL);
1320 		KKASSERT(m4 != NULL);
1321 		pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4));
1322 		idx = (p->pindex - (NUPT_TOTAL + NUPD_TOTAL)) % NPML4EPG;
1323 		KKASSERT(pml4[idx] != 0);
1324 		if (pml4[idx] == 0)
1325 			kprintf("pmap_release: Unmapped PML4\n");
1326 		pml4[idx] = 0;
1327 		vm_page_unwire_quick(m4);
1328 	} else if (p->pindex >= NUPT_TOTAL) {
1329 		/*
1330 		 * We are a PD page.
1331 		 * We look for the PDP entry that points to us.
1332 		 */
1333 		vm_page_t m3;
1334 		pdp_entry_t *pdp;
1335 		int idx;
1336 
1337 		m3 = vm_page_lookup(pmap->pm_pteobj,
1338 				    NUPT_TOTAL + NUPD_TOTAL +
1339 				     (p->pindex - NUPT_TOTAL) / NPDPEPG);
1340 		KKASSERT(m3 != NULL);
1341 		pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3));
1342 		idx = (p->pindex - NUPT_TOTAL) % NPDPEPG;
1343 		KKASSERT(pdp[idx] != 0);
1344 		if (pdp[idx] == 0)
1345 			kprintf("pmap_release: Unmapped PDP %d\n", idx);
1346 		pdp[idx] = 0;
1347 		vm_page_unwire_quick(m3);
1348 	} else {
1349 		/* We are a PT page.
1350 		 * We look for the PD entry that points to us.
1351 		 */
1352 		vm_page_t m2;
1353 		pd_entry_t *pd;
1354 		int idx;
1355 
1356 		m2 = vm_page_lookup(pmap->pm_pteobj,
1357 				    NUPT_TOTAL + p->pindex / NPDEPG);
1358 		KKASSERT(m2 != NULL);
1359 		pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2));
1360 		idx = p->pindex % NPDEPG;
1361 		if (pd[idx] == 0)
1362 			kprintf("pmap_release: Unmapped PD %d\n", idx);
1363 		pd[idx] = 0;
1364 		vm_page_unwire_quick(m2);
1365 	}
1366 	KKASSERT(pmap->pm_stats.resident_count > 0);
1367 	atomic_add_long(&pmap->pm_stats.resident_count, -1);
1368 
1369 	if (p->wire_count > 1)  {
1370 		panic("pmap_release: freeing held pt page "
1371 		      "pmap=%p pg=%p dmap=%p pi=%ld {%ld,%ld,%ld}",
1372 		      pmap, p, (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(p)),
1373 		      p->pindex, NUPT_TOTAL, NUPD_TOTAL, NUPDP_TOTAL);
1374 	}
1375 
1376 	if (pmap->pm_ptphint == p)
1377 		pmap->pm_ptphint = NULL;
1378 
1379 	/*
1380 	 * We leave the top-level page table page cached, wired, and mapped in
1381 	 * the pmap until the dtor function (pmap_puninit()) gets called.
1382 	 * However, still clean it up.
1383 	 */
1384 	if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) {
1385 		bzero(pmap->pm_pml4, PAGE_SIZE);
1386 		vm_page_wakeup(p);
1387 	} else {
1388 		vm_page_unwire(p, 0);
1389 		vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
1390 		vm_page_free(p);
1391 		atomic_add_long(&pmap->pm_stats.wired_count, -1);
1392 	}
1393 	return 0;
1394 }
1395 
1396 /*
1397  * Locate the requested PT, PD, or PDP page table page.
1398  *
1399  * Returns a busied page, caller must vm_page_wakeup() when done.
1400  */
1401 static vm_page_t
1402 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
1403 {
1404 	vm_page_t m;
1405 	vm_page_t pm;
1406 	vm_pindex_t pindex;
1407 	pt_entry_t *ptep;
1408 	pt_entry_t data;
1409 
1410 	/*
1411 	 * Find or fabricate a new pagetable page.  A non-zero wire_count
1412 	 * indicates that the page has already been mapped into its parent.
1413 	 */
1414 	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1415 			 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1416 	if (m->wire_count != 0)
1417 		return m;
1418 
1419 	/*
1420 	 * Map the page table page into its parent, giving it 1 wire count.
1421 	 */
1422 	vm_page_wire(m);
1423 	vm_page_unmanage(m);
1424 	atomic_add_long(&pmap->pm_stats.resident_count, 1);
1425 	vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1426 
1427 	data = VM_PAGE_TO_PHYS(m) |
1428 	       VPTE_RW | VPTE_V | VPTE_U | VPTE_A | VPTE_M | VPTE_WIRED;
1429 	atomic_add_long(&pmap->pm_stats.wired_count, 1);
1430 
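	/*
	 * The pte object pindex space is partitioned: [0, NUPT_TOTAL) are
	 * PT pages, [NUPT_TOTAL, NUPT_TOTAL + NUPD_TOTAL) are PD pages, and
	 * higher indexes are PDP pages.  For PD and PT pages, recurse to
	 * make sure the parent page exists before installing our entry in
	 * it; PDP entries go directly into the PML4.
	 */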
1431 	if (ptepindex >= (NUPT_TOTAL + NUPD_TOTAL)) {
1432 		/*
1433 		 * Map PDP into the PML4
1434 		 */
1435 		pindex = ptepindex - (NUPT_TOTAL + NUPD_TOTAL);
1436 		pindex &= (NUPDP_TOTAL - 1);
1437 		ptep = (pt_entry_t *)pmap->pm_pml4;
1438 		pm = NULL;
1439 	} else if (ptepindex >= NUPT_TOTAL) {
1440 		/*
1441 		 * Map PD into its PDP
1442 		 */
1443 		pindex = (ptepindex - NUPT_TOTAL) >> NPDPEPGSHIFT;
1444 		pindex += NUPT_TOTAL + NUPD_TOTAL;
1445 		pm = _pmap_allocpte(pmap, pindex);
1446 		pindex = (ptepindex - NUPT_TOTAL) & (NPDPEPG - 1);
1447 		ptep = (void *)PHYS_TO_DMAP(pm->phys_addr);
1448 	} else {
1449 		/*
1450 		 * Map PT into its PD
1451 		 */
1452 		pindex = ptepindex >> NPDPEPGSHIFT;
1453 		pindex += NUPT_TOTAL;
1454 		pm = _pmap_allocpte(pmap, pindex);
1455 		pindex = ptepindex & (NPTEPG - 1);
1456 		ptep = (void *)PHYS_TO_DMAP(pm->phys_addr);
1457 	}
1458 
1459 	/*
1460 	 * Install the pte in (pm).  (m) prevents races.
1461 	 */
1462 	ptep += pindex;
1463 	data = atomic_swap_long(ptep, data);
1464 	if (pm) {
1465 		vm_page_wire_quick(pm);
1466 		vm_page_wakeup(pm);
1467 	}
1468 	pmap->pm_ptphint = pm;
1469 
1470 	return m;
1471 }
1472 
1473 /*
1474  * Determine the page table page required to access the VA in the pmap
1475  * and allocate it if necessary.  Return a held vm_page_t for the page.
1476  *
1477  * Only used with user pmaps.
1478  */
1479 static vm_page_t
1480 pmap_allocpte(pmap_t pmap, vm_offset_t va)
1481 {
1482 	vm_pindex_t ptepindex;
1483 	vm_page_t m;
1484 
1485 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj));
1486 
1487 	/*
1488 	 * Calculate pagetable page index, and return the PT page to
1489 	 * the caller.
1490 	 */
1491 	ptepindex = pmap_pt_pindex(va);
1492 	m = _pmap_allocpte(pmap, ptepindex);
1493 
1494 	return m;
1495 }
1496 
1497 /***************************************************
1498  * Pmap allocation/deallocation routines.
1499  ***************************************************/
1500 
1501 /*
1502  * Release any resources held by the given physical map.
1503  * Called when a pmap initialized by pmap_pinit is being released.
1504  * Should only be called if the map contains no valid mappings.
1505  */
1506 static int pmap_release_callback(struct vm_page *p, void *data);
1507 
1508 void
1509 pmap_release(struct pmap *pmap)
1510 {
1511 	vm_object_t object = pmap->pm_pteobj;
1512 	struct rb_vm_page_scan_info info;
1513 
1514 	KKASSERT(pmap != &kernel_pmap);
1515 
1516 #if defined(DIAGNOSTIC)
1517 	if (object->ref_count != 1)
1518 		panic("pmap_release: pteobj reference count != 1");
1519 #endif
1520 
1521 	info.pmap = pmap;
1522 	info.object = object;
1523 
1524 	KASSERT(CPUMASK_TESTZERO(pmap->pm_active),
1525 		("pmap %p still active! %016jx",
1526 		pmap,
1527 		(uintmax_t)CPUMASK_LOWMASK(pmap->pm_active)));
1528 
1529 	vm_object_hold(object);
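	/*
	 * Free the page table pages, restarting the scan whenever we had
	 * to block or the object generation changed.  The top-level PML4
	 * page is saved in info.mpte by the callback and processed last.
	 */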
1530 	do {
1531 		info.error = 0;
1532 		info.mpte = NULL;
1533 		info.limit = object->generation;
1534 
1535 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
1536 				        pmap_release_callback, &info);
1537 		if (info.error == 0 && info.mpte) {
1538 			if (pmap_release_free_page(pmap, info.mpte))
1539 				info.error = 1;
1540 		}
1541 	} while (info.error);
1542 
1543 	pmap->pm_ptphint = NULL;
1544 
1545 	KASSERT((pmap->pm_stats.wired_count == (pmap->pm_pdirm != NULL)),
1546 		("pmap_release: dangling count %p %ld",
1547 		pmap, pmap->pm_stats.wired_count));
1548 
1549 	vm_object_drop(object);
1550 }
1551 
1552 static int
1553 pmap_release_callback(struct vm_page *p, void *data)
1554 {
1555 	struct rb_vm_page_scan_info *info = data;
1556 
1557 	if (p->pindex == NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL) {
1558 		info->mpte = p;
1559 		return(0);
1560 	}
1561 	if (pmap_release_free_page(info->pmap, p)) {
1562 		info->error = 1;
1563 		return(-1);
1564 	}
1565 	if (info->object->generation != info->limit) {
1566 		info->error = 1;
1567 		return(-1);
1568 	}
1569 	return(0);
1570 }
1571 
1572 /*
1573  * Grow the number of kernel page table entries, if needed.
1574  *
1575  * kernel_map must be locked exclusively by the caller.
1576  */
1577 void
1578 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
1579 {
1580 	vm_offset_t addr;
1581 	vm_paddr_t paddr;
1582 	vm_offset_t ptppaddr;
1583 	vm_page_t nkpg;
1584 	pd_entry_t *pde, newpdir;
1585 	pdp_entry_t newpdp;
1586 
1587 	addr = kend;
1588 
1589 	vm_object_hold(&kptobj);
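	/*
	 * On the first call, determine how far the bootstrap page tables
	 * already extend by scanning for valid PDEs starting at KvaStart.
	 */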
1590 	if (kernel_vm_end == 0) {
1591 		kernel_vm_end = KvaStart;
1592 		nkpt = 0;
1593 		while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & VPTE_V) != 0) {
1594 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1595 			nkpt++;
1596 			if (kernel_vm_end - 1 >= kernel_map.max_offset) {
1597 				kernel_vm_end = kernel_map.max_offset;
1598 				break;
1599 			}
1600 		}
1601 	}
1602 	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1603 	if (addr - 1 >= kernel_map.max_offset)
1604 		addr = kernel_map.max_offset;
1605 	while (kernel_vm_end < addr) {
1606 		pde = pmap_pde(&kernel_pmap, kernel_vm_end);
1607 		if (pde == NULL) {
1608 			/* We need a new PDP entry */
1609 			nkpg = vm_page_alloc(&kptobj, nkpt,
1610 			                     VM_ALLOC_NORMAL |
1611 					     VM_ALLOC_SYSTEM |
1612 					     VM_ALLOC_INTERRUPT);
1613 			if (nkpg == NULL) {
1614 				panic("pmap_growkernel: no memory to "
1615 				      "grow kernel");
1616 			}
1617 			paddr = VM_PAGE_TO_PHYS(nkpg);
1618 			pmap_zero_page(paddr);
1619 			newpdp = (pdp_entry_t)(paddr |
1620 					       VPTE_V | VPTE_RW | VPTE_U |
1621 					       VPTE_A | VPTE_M | VPTE_WIRED);
1622 			*pmap_pdpe(&kernel_pmap, kernel_vm_end) = newpdp;
1623 			atomic_add_long(&kernel_pmap.pm_stats.wired_count, 1);
1624 			nkpt++;
1625 			continue; /* try again */
1626 		}
1627 		if ((*pde & VPTE_V) != 0) {
1628 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
1629 					~(PAGE_SIZE * NPTEPG - 1);
1630 			if (kernel_vm_end - 1 >= kernel_map.max_offset) {
1631 				kernel_vm_end = kernel_map.max_offset;
1632 				break;
1633 			}
1634 			continue;
1635 		}
1636 
1637 		/*
1638 		 * This index is bogus, but out of the way
1639 		 */
1640 		nkpg = vm_page_alloc(&kptobj, nkpt,
1641 				     VM_ALLOC_NORMAL |
1642 				     VM_ALLOC_SYSTEM |
1643 				     VM_ALLOC_INTERRUPT);
1644 		if (nkpg == NULL)
1645 			panic("pmap_growkernel: no memory to grow kernel");
1646 
1647 		vm_page_wire(nkpg);
1648 		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1649 		pmap_zero_page(ptppaddr);
1650 		newpdir = (pd_entry_t)(ptppaddr |
1651 				       VPTE_V | VPTE_RW | VPTE_U |
1652 				       VPTE_A | VPTE_M | VPTE_WIRED);
1653 		*pmap_pde(&kernel_pmap, kernel_vm_end) = newpdir;
1654 		atomic_add_long(&kernel_pmap.pm_stats.wired_count, 1);
1655 		nkpt++;
1656 
1657 		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
1658 				~(PAGE_SIZE * NPTEPG - 1);
1659 		if (kernel_vm_end - 1 >= kernel_map.max_offset) {
1660 			kernel_vm_end = kernel_map.max_offset;
1661 			break;
1662 		}
1663 	}
1664 	vm_object_drop(&kptobj);
1665 }
1666 
1667 /*
1668  * Add a reference to the specified pmap.
1669  *
1670  * No requirements.
1671  */
1672 void
1673 pmap_reference(pmap_t pmap)
1674 {
1675 	if (pmap)
1676 		atomic_add_int(&pmap->pm_count, 1);
1677 }
1678 
1679 /************************************************************************
1680  *	   		VMSPACE MANAGEMENT				*
1681  ************************************************************************
1682  *
1683  * The VMSPACE management we do in our virtual kernel must be reflected
1684  * in the real kernel.  This is accomplished by making vmspace system
1685  * calls to the real kernel.
1686  */
1687 void
1688 cpu_vmspace_alloc(struct vmspace *vm)
1689 {
1690 	int r;
1691 	void *rp;
1692 	vpte_t vpte;
1693 
	/*
	 * If VMM is enabled, do nothing; we are able to use
	 * real page tables.
	 */
1698 	if (vmm_enabled)
1699 		return;
1700 
1701 #define USER_SIZE	(VM_MAX_USER_ADDRESS - VM_MIN_USER_ADDRESS)
1702 
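	/*
	 * Create the real-kernel vmspace, map the entire user address
	 * range through the vkernel's memory image (MemImageFd) with
	 * MAP_VPAGETABLE, and point the virtual page table at this pmap's
	 * PML4 page via MADV_SETMAP.
	 */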
1703 	if (vmspace_create(&vm->vm_pmap, 0, NULL) < 0)
1704 		panic("vmspace_create() failed");
1705 
1706 	rp = vmspace_mmap(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE,
1707 			  PROT_READ|PROT_WRITE,
1708 			  MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED,
1709 			  MemImageFd, 0);
1710 	if (rp == MAP_FAILED)
1711 		panic("vmspace_mmap: failed");
1712 	vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE,
1713 			 MADV_NOSYNC, 0);
1714 	vpte = VM_PAGE_TO_PHYS(vmspace_pmap(vm)->pm_pdirm) |
1715 			       VPTE_RW | VPTE_V | VPTE_U;
1716 	r = vmspace_mcontrol(&vm->vm_pmap, VM_MIN_USER_ADDRESS, USER_SIZE,
1717 			     MADV_SETMAP, vpte);
1718 	if (r < 0)
1719 		panic("vmspace_mcontrol: failed");
1720 }
1721 
1722 void
1723 cpu_vmspace_free(struct vmspace *vm)
1724 {
	/*
	 * If VMM is enabled, do nothing; we are able to use
	 * real page tables.
	 */
1729 	if (vmm_enabled)
1730 		return;
1731 
1732 	if (vmspace_destroy(&vm->vm_pmap) < 0)
1733 		panic("vmspace_destroy() failed");
1734 }
1735 
1736 /***************************************************
 * Page management routines.
1738  ***************************************************/
1739 
1740 /*
1741  * free the pv_entry back to the free list.  This function may be
1742  * called from an interrupt.
1743  */
1744 static __inline void
1745 free_pv_entry(pv_entry_t pv)
1746 {
1747 	atomic_add_int(&pv_entry_count, -1);
1748 	KKASSERT(pv_entry_count >= 0);
1749 	zfree(pvzone, pv);
1750 }
1751 
1752 /*
1753  * get a new pv_entry, allocating a block from the system
1754  * when needed.  This function may be called from an interrupt.
1755  */
1756 static pv_entry_t
1757 get_pv_entry(void)
1758 {
1759 	atomic_add_int(&pv_entry_count, 1);
1760 	if (pv_entry_high_water &&
1761 	    (pv_entry_count > pv_entry_high_water) &&
1762 	    atomic_swap_int(&pmap_pagedaemon_waken, 1) == 0) {
1763 		wakeup(&vm_pages_needed);
1764 	}
1765 	return zalloc(pvzone);
1766 }
1767 
1768 /*
1769  * This routine is very drastic, but can save the system
1770  * in a pinch.
1771  *
1772  * No requirements.
1773  */
1774 void
1775 pmap_collect(void)
1776 {
1777 	int i;
1778 	vm_page_t m;
1779 	static int warningdone=0;
1780 
1781 	if (pmap_pagedaemon_waken == 0)
1782 		return;
1783 	pmap_pagedaemon_waken = 0;
1784 
1785 	if (warningdone < 5) {
1786 		kprintf("pmap_collect: collecting pv entries -- "
1787 			"suggest increasing PMAP_SHPGPERPROC\n");
1788 		warningdone++;
1789 	}
1790 
1791 	for (i = 0; i < vm_page_array_size; i++) {
1792 		m = &vm_page_array[i];
1793 		if (m->wire_count || m->hold_count)
1794 			continue;
1795 		if (vm_page_busy_try(m, TRUE) == 0) {
1796 			if (m->wire_count == 0 && m->hold_count == 0) {
1797 				pmap_remove_all(m);
1798 			}
1799 			vm_page_wakeup(m);
1800 		}
1801 	}
1802 }
1803 
1804 
/*
 * Remove the pv_entry for (pmap, va) from the page's pv list and from
 * the pmap's pv RB tree, then free the now-unused entry.
 *
 * The caller must hold pmap->pm_pteobj.  (m) is spin-locked by this
 * routine, not by the caller.
 */
1813 static int
1814 pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va)
1815 {
1816 	pv_entry_t pv;
1817 	int rtval;
1818 
1819 	vm_page_spin_lock(m);
1820 	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, va);
1821 
1822 	/*
1823 	 * Note that pv_ptem is NULL if the page table page itself is not
1824 	 * managed, even if the page being removed IS managed.
1825 	 */
1826 	rtval = 0;
1827 	if (pv) {
1828 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1829 		if (TAILQ_EMPTY(&m->md.pv_list))
1830 			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1831 		m->md.pv_list_count--;
1832 		KKASSERT(m->md.pv_list_count >= 0);
1833 		pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
1834 		atomic_add_int(&pmap->pm_generation, 1);
1835 		vm_page_spin_unlock(m);
1836 		rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
1837 		free_pv_entry(pv);
1838 	} else {
1839 		vm_page_spin_unlock(m);
1840 		kprintf("pmap_remove_entry: could not find "
1841 			"pmap=%p m=%p va=%016jx\n",
1842 			pmap, m, va);
1843 	}
1844 	return rtval;
1845 }
1846 
1847 /*
1848  * Create a pv entry for page at pa for (pmap, va).  If the page table page
1849  * holding the VA is managed, mpte will be non-NULL.
1850  *
1851  * pmap->pm_pteobj must be held and (m) must be spin-locked by the caller.
1852  */
1853 static void
1854 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m,
1855 		  pv_entry_t pv)
1856 {
1857 	pv->pv_va = va;
1858 	pv->pv_pmap = pmap;
1859 	pv->pv_ptem = mpte;
1860 
1861 	m->md.pv_list_count++;
1862 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1863 	pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pv);
1864 	vm_page_flag_set(m, PG_MAPPED);
1865 	KKASSERT(pv == NULL);
1866 }
1867 
1868 /*
1869  * pmap_remove_pte: do the things to unmap a page in a process
1870  *
1871  * Caller holds pmap->pm_pteobj and holds the associated page table
1872  * page busy to prevent races.
1873  */
1874 static int
1875 pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, pt_entry_t oldpte,
1876 		vm_offset_t va)
1877 {
1878 	vm_page_t m;
1879 	int error;
1880 
1881 	if (ptq)
1882 		oldpte = pmap_inval_loadandclear(ptq, pmap, va);
1883 
1884 	if (oldpte & VPTE_WIRED)
1885 		atomic_add_long(&pmap->pm_stats.wired_count, -1);
1886 	KKASSERT(pmap->pm_stats.wired_count >= 0);
1887 
1888 #if 0
1889 	/*
1890 	 * Machines that don't support invlpg, also don't support
1891 	 * PG_G.  XXX PG_G is disabled for SMP so don't worry about
1892 	 * the SMP case.
1893 	 */
1894 	if (oldpte & PG_G)
1895 		cpu_invlpg((void *)va);
1896 #endif
1897 	KKASSERT(pmap->pm_stats.resident_count > 0);
1898 	atomic_add_long(&pmap->pm_stats.resident_count, -1);
1899 	if (oldpte & VPTE_MANAGED) {
1900 		m = PHYS_TO_VM_PAGE(oldpte);
1901 
1902 		/*
1903 		 * NOTE: pmap_remove_entry() will spin-lock the page
1904 		 */
1905 		if (oldpte & VPTE_M) {
1906 #if defined(PMAP_DIAGNOSTIC)
1907 			if (pmap_nw_modified(oldpte)) {
1908 				kprintf("pmap_remove: modified page not "
1909 					"writable: va: 0x%lx, pte: 0x%lx\n",
1910 					va, oldpte);
1911 			}
1912 #endif
1913 			if (pmap_track_modified(pmap, va))
1914 				vm_page_dirty(m);
1915 		}
1916 		if (oldpte & VPTE_A)
1917 			vm_page_flag_set(m, PG_REFERENCED);
1918 		error = pmap_remove_entry(pmap, m, va);
1919 	} else {
1920 		error = pmap_unuse_pt(pmap, va, NULL);
1921 	}
1922 	return error;
1923 }
1924 
1925 /*
1926  * pmap_remove_page:
1927  *
1928  * Remove a single page from a process address space.
1929  *
1930  * This function may not be called from an interrupt if the pmap is
1931  * not kernel_pmap.
1932  *
1933  * Caller holds pmap->pm_pteobj
1934  */
1935 static void
1936 pmap_remove_page(struct pmap *pmap, vm_offset_t va)
1937 {
1938 	pt_entry_t *pte;
1939 
1940 	pte = pmap_pte(pmap, va);
1941 	if (pte == NULL)
1942 		return;
1943 	if ((*pte & VPTE_V) == 0)
1944 		return;
1945 	pmap_remove_pte(pmap, pte, 0, va);
1946 }
1947 
1948 /*
1949  * Remove the given range of addresses from the specified map.
1950  *
1951  * It is assumed that the start and end are properly rounded to
1952  * the page size.
1953  *
1954  * This function may not be called from an interrupt if the pmap is
1955  * not kernel_pmap.
1956  *
1957  * No requirements.
1958  */
1959 void
1960 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
1961 {
1962 	vm_offset_t va_next;
1963 	pml4_entry_t *pml4e;
1964 	pdp_entry_t *pdpe;
1965 	pd_entry_t ptpaddr, *pde;
1966 	pt_entry_t *pte;
1967 	vm_page_t pt_m;
1968 
1969 	if (pmap == NULL)
1970 		return;
1971 
1972 	vm_object_hold(pmap->pm_pteobj);
1973 	KKASSERT(pmap->pm_stats.resident_count >= 0);
1974 	if (pmap->pm_stats.resident_count == 0) {
1975 		vm_object_drop(pmap->pm_pteobj);
1976 		return;
1977 	}
1978 
	/*
	 * Special handling for removing a single page.  This is a very
	 * common operation and it lets us short-circuit the general
	 * scan below.
	 */
1984 	if (sva + PAGE_SIZE == eva) {
1985 		pde = pmap_pde(pmap, sva);
1986 		if (pde && (*pde & VPTE_PS) == 0) {
1987 			pmap_remove_page(pmap, sva);
1988 			vm_object_drop(pmap->pm_pteobj);
1989 			return;
1990 		}
1991 	}
1992 
1993 	for (; sva < eva; sva = va_next) {
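		/*
		 * Walk the PML4, PDP, and PD levels, skipping ahead to the
		 * next upper-level boundary whenever an upper-level entry
		 * is not valid.
		 */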
1994 		pml4e = pmap_pml4e(pmap, sva);
1995 		if ((*pml4e & VPTE_V) == 0) {
1996 			va_next = (sva + NBPML4) & ~PML4MASK;
1997 			if (va_next < sva)
1998 				va_next = eva;
1999 			continue;
2000 		}
2001 
2002 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2003 		if ((*pdpe & VPTE_V) == 0) {
2004 			va_next = (sva + NBPDP) & ~PDPMASK;
2005 			if (va_next < sva)
2006 				va_next = eva;
2007 			continue;
2008 		}
2009 
		/*
		 * Calculate the virtual address of the next page table
		 * page boundary.
		 */
2013 		va_next = (sva + NBPDR) & ~PDRMASK;
2014 		if (va_next < sva)
2015 			va_next = eva;
2016 
2017 		pde = pmap_pdpe_to_pde(pdpe, sva);
2018 		ptpaddr = *pde;
2019 
2020 		/*
2021 		 * Weed out invalid mappings.
2022 		 */
2023 		if (ptpaddr == 0)
2024 			continue;
2025 
2026 		/*
2027 		 * Check for large page.
2028 		 */
2029 		if ((ptpaddr & VPTE_PS) != 0) {
2030 			/* JG FreeBSD has more complex treatment here */
2031 			KKASSERT(*pde != 0);
2032 			pmap_inval_pde(pde, pmap, sva);
2033 			atomic_add_long(&pmap->pm_stats.resident_count,
2034 				       -NBPDR / PAGE_SIZE);
2035 			continue;
2036 		}
2037 
2038 		/*
2039 		 * Limit our scan to either the end of the va represented
2040 		 * by the current page table page, or to the end of the
2041 		 * range being removed.
2042 		 */
2043 		if (va_next > eva)
2044 			va_next = eva;
2045 
2046 		/*
2047 		 * NOTE: pmap_remove_pte() can block.
2048 		 */
2049 		pt_m = pmap_hold_pt_page(pde, sva);
2050 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2051 		     sva += PAGE_SIZE) {
2052 			if (*pte) {
2053 				if (pmap_remove_pte(pmap, pte, 0, sva))
2054 					break;
2055 			}
2056 		}
2057 		vm_page_unhold(pt_m);
2058 	}
2059 	vm_object_drop(pmap->pm_pteobj);
2060 }
2061 
2062 /*
2063  * Removes this physical page from all physical maps in which it resides.
2064  * Reflects back modify bits to the pager.
2065  *
2066  * This routine may not be called from an interrupt.
2067  *
2068  * No requirements.
2069  */
2070 static void
2071 pmap_remove_all(vm_page_t m)
2072 {
2073 	pt_entry_t *pte, tpte;
2074 	pv_entry_t pv;
2075 	vm_object_t pmobj;
2076 	pmap_t pmap;
2077 
2078 #if defined(PMAP_DIAGNOSTIC)
2079 	/*
2080 	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
2081 	 * pages!
2082 	 */
2083 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
2084 		panic("pmap_page_protect: illegal for unmanaged page, va: 0x%08llx", (long long)VM_PAGE_TO_PHYS(m));
2085 	}
2086 #endif
2087 
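	/*
	 * Strip pv entries from the page one at a time.  If the lock
	 * ordering forces us to temporarily drop the page spin lock we
	 * re-check the pv and restart if anything changed.
	 */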
2088 restart:
2089 	vm_page_spin_lock(m);
2090 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2091 		pmap = pv->pv_pmap;
2092 		pmobj = pmap->pm_pteobj;
2093 
2094 		/*
2095 		 * Handle reversed lock ordering
2096 		 */
2097 		if (vm_object_hold_try(pmobj) == 0) {
2098 			refcount_acquire(&pmobj->hold_count);
2099 			vm_page_spin_unlock(m);
2100 			vm_object_lock(pmobj);
2101 			vm_page_spin_lock(m);
2102 			if (pv != TAILQ_FIRST(&m->md.pv_list) ||
2103 			    pmap != pv->pv_pmap ||
2104 			    pmobj != pmap->pm_pteobj) {
2105 				vm_page_spin_unlock(m);
2106 				vm_object_drop(pmobj);
2107 				goto restart;
2108 			}
2109 		}
2110 
2111 		KKASSERT(pmap->pm_stats.resident_count > 0);
2112 		atomic_add_long(&pmap->pm_stats.resident_count, -1);
2113 
2114 		pte = pmap_pte(pmap, pv->pv_va);
2115 		KKASSERT(pte != NULL);
2116 
2117 		tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va);
2118 		if (tpte & VPTE_WIRED)
2119 			atomic_add_long(&pmap->pm_stats.wired_count, -1);
2120 		KKASSERT(pmap->pm_stats.wired_count >= 0);
2121 
2122 		if (tpte & VPTE_A)
2123 			vm_page_flag_set(m, PG_REFERENCED);
2124 
2125 		/*
2126 		 * Update the vm_page_t clean and reference bits.
2127 		 */
2128 		if (tpte & VPTE_M) {
2129 #if defined(PMAP_DIAGNOSTIC)
2130 			if (pmap_nw_modified(tpte)) {
2131 				kprintf(
2132 	"pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
2133 				    pv->pv_va, tpte);
2134 			}
2135 #endif
2136 			if (pmap_track_modified(pmap, pv->pv_va))
2137 				vm_page_dirty(m);
2138 		}
2139 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2140 		if (TAILQ_EMPTY(&m->md.pv_list))
2141 			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2142 		m->md.pv_list_count--;
2143 		KKASSERT(m->md.pv_list_count >= 0);
2144 		pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
2145 		atomic_add_int(&pmap->pm_generation, 1);
2146 		vm_page_spin_unlock(m);
2147 		pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem);
2148 		free_pv_entry(pv);
2149 
2150 		vm_object_drop(pmobj);
2151 		vm_page_spin_lock(m);
2152 	}
2153 	KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
2154 	vm_page_spin_unlock(m);
2155 }
2156 
2157 /*
2158  * Removes the page from a particular pmap
2159  */
2160 void
2161 pmap_remove_specific(pmap_t pmap, vm_page_t m)
2162 {
2163 	pt_entry_t *pte, tpte;
2164 	pv_entry_t pv;
2165 
2166 	vm_object_hold(pmap->pm_pteobj);
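	/*
	 * The scan restarts after each removal because the pv list is
	 * modified and the page spin lock is dropped in the loop body.
	 */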
2167 again:
2168 	vm_page_spin_lock(m);
2169 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2170 		if (pv->pv_pmap != pmap)
2171 			continue;
2172 
2173 		KKASSERT(pmap->pm_stats.resident_count > 0);
2174 		atomic_add_long(&pmap->pm_stats.resident_count, -1);
2175 
2176 		pte = pmap_pte(pmap, pv->pv_va);
2177 		KKASSERT(pte != NULL);
2178 
2179 		tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va);
2180 		if (tpte & VPTE_WIRED)
2181 			atomic_add_long(&pmap->pm_stats.wired_count, -1);
2182 		KKASSERT(pmap->pm_stats.wired_count >= 0);
2183 
2184 		if (tpte & VPTE_A)
2185 			vm_page_flag_set(m, PG_REFERENCED);
2186 
2187 		/*
2188 		 * Update the vm_page_t clean and reference bits.
2189 		 */
2190 		if (tpte & VPTE_M) {
2191 			if (pmap_track_modified(pmap, pv->pv_va))
2192 				vm_page_dirty(m);
2193 		}
2194 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2195 		pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
2196 		atomic_add_int(&pmap->pm_generation, 1);
2197 		m->md.pv_list_count--;
2198 		KKASSERT(m->md.pv_list_count >= 0);
2199 		if (TAILQ_EMPTY(&m->md.pv_list))
2200 			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2201 		pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem);
2202 		vm_page_spin_unlock(m);
2203 		free_pv_entry(pv);
2204 		goto again;
2205 	}
2206 	vm_page_spin_unlock(m);
2207 	vm_object_drop(pmap->pm_pteobj);
2208 }
2209 
2210 /*
2211  * Set the physical protection on the specified range of this map
2212  * as requested.
2213  *
2214  * This function may not be called from an interrupt if the map is
2215  * not the kernel_pmap.
2216  *
2217  * No requirements.
2218  */
2219 void
2220 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2221 {
2222 	vm_offset_t va_next;
2223 	pml4_entry_t *pml4e;
2224 	pdp_entry_t *pdpe;
2225 	pd_entry_t ptpaddr, *pde;
2226 	pt_entry_t *pte;
2227 	vm_page_t pt_m;
2228 
2229 	if (pmap == NULL)
2230 		return;
2231 
2232 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2233 		pmap_remove(pmap, sva, eva);
2234 		return;
2235 	}
2236 
2237 	if (prot & VM_PROT_WRITE)
2238 		return;
2239 
2240 	vm_object_hold(pmap->pm_pteobj);
2241 
2242 	for (; sva < eva; sva = va_next) {
2243 		pml4e = pmap_pml4e(pmap, sva);
2244 		if ((*pml4e & VPTE_V) == 0) {
2245 			va_next = (sva + NBPML4) & ~PML4MASK;
2246 			if (va_next < sva)
2247 				va_next = eva;
2248 			continue;
2249 		}
2250 
2251 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
2252 		if ((*pdpe & VPTE_V) == 0) {
2253 			va_next = (sva + NBPDP) & ~PDPMASK;
2254 			if (va_next < sva)
2255 				va_next = eva;
2256 			continue;
2257 		}
2258 
2259 		va_next = (sva + NBPDR) & ~PDRMASK;
2260 		if (va_next < sva)
2261 			va_next = eva;
2262 
2263 		pde = pmap_pdpe_to_pde(pdpe, sva);
2264 		ptpaddr = *pde;
2265 
2266 #if 0
2267 		/*
2268 		 * Check for large page.
2269 		 */
2270 		if ((ptpaddr & VPTE_PS) != 0) {
2271 			/* JG correct? */
2272 			pmap_clean_pde(pde, pmap, sva);
2273 			atomic_add_long(&pmap->pm_stats.resident_count,
2274 					-NBPDR / PAGE_SIZE);
2275 			continue;
2276 		}
2277 #endif
2278 
2279 		/*
2280 		 * Weed out invalid mappings. Note: we assume that the page
2281 		 * directory table is always allocated, and in kernel virtual.
2282 		 */
2283 		if (ptpaddr == 0)
2284 			continue;
2285 
2286 		if (va_next > eva)
2287 			va_next = eva;
2288 
2289 		pt_m = pmap_hold_pt_page(pde, sva);
2290 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
2291 		    sva += PAGE_SIZE) {
2292 			/*
2293 			 * Clean managed pages and also check the accessed
2294 			 * bit.  Just remove write perms for unmanaged
			 * pages.  Be careful of races: turning off write
			 * access will force a fault rather than setting
2297 			 * the modified bit at an unexpected time.
2298 			 */
2299 			pmap_clean_pte(pte, pmap, sva, NULL);
2300 		}
2301 		vm_page_unhold(pt_m);
2302 	}
2303 	vm_object_drop(pmap->pm_pteobj);
2304 }
2305 
2306 /*
 * Enter a managed page into a pmap.  If the page is not wired, the related
 * pmap data can be destroyed at any time and re-established later on demand.
2309  *
2310  * Insert the vm_page (m) at virtual address (v) in (pmap), with the
2311  * specified protection, and wire the mapping if requested.
2312  *
2313  * NOTE: This routine may not lazy-evaluate or lose information.  The
2314  *	 page must actually be inserted into the given map NOW.
2315  *
2316  * NOTE: When entering a page at a KVA address, the pmap must be the
2317  *	 kernel_pmap.
2318  *
2319  * No requirements.
2320  */
2321 void
2322 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2323 	   boolean_t wired, vm_map_entry_t entry __unused)
2324 {
2325 	vm_paddr_t pa;
2326 	pv_entry_t pv;
2327 	pt_entry_t *pte;
2328 	pt_entry_t origpte, newpte;
2329 	vm_paddr_t opa;
2330 	vm_page_t mpte;
2331 
2332 	if (pmap == NULL)
2333 		return;
2334 
2335 	va = trunc_page(va);
2336 
2337 	vm_object_hold(pmap->pm_pteobj);
2338 
2339 	/*
2340 	 * Get the page table page.   The kernel_pmap's page table pages
2341 	 * are preallocated and have no associated vm_page_t.
2342 	 *
2343 	 * If not NULL, mpte will be busied and we must vm_page_wakeup()
2344 	 * to cleanup.  There will already be at least one wire count from
2345 	 * it being mapped into its parent.
2346 	 */
2347 	if (pmap == &kernel_pmap) {
2348 		mpte = NULL;
2349 		pte = vtopte(va);
2350 	} else {
2351 		mpte = pmap_allocpte(pmap, va);
2352 		pte = (void *)PHYS_TO_DMAP(mpte->phys_addr);
2353 		pte += pmap_pte_index(va);
2354 	}
2355 
2356 	/*
2357 	 * Deal with races against the kernel's real MMU by cleaning the
2358 	 * page, even if we are re-entering the same page.
2359 	 */
2360 	pa = VM_PAGE_TO_PHYS(m);
2361 	origpte = pmap_inval_loadandclear(pte, pmap, va);
2362 	/*origpte = pmap_clean_pte(pte, pmap, va, NULL);*/
2363 	opa = origpte & VPTE_FRAME;
2364 
2365 	if (origpte & VPTE_PS)
2366 		panic("pmap_enter: attempted pmap_enter on 2MB page");
2367 
2368 	if ((origpte & (VPTE_MANAGED|VPTE_M)) == (VPTE_MANAGED|VPTE_M)) {
2369 		if (pmap_track_modified(pmap, va)) {
2370 			vm_page_t om = PHYS_TO_VM_PAGE(opa);
2371 			vm_page_dirty(om);
2372 		}
2373 	}
2374 
2375 	/*
2376 	 * Mapping has not changed, must be protection or wiring change.
2377 	 */
2378 	if (origpte && (opa == pa)) {
2379 		/*
2380 		 * Wiring change, just update stats. We don't worry about
2381 		 * wiring PT pages as they remain resident as long as there
2382 		 * are valid mappings in them. Hence, if a user page is wired,
2383 		 * the PT page will be also.
2384 		 */
2385 		if (wired && ((origpte & VPTE_WIRED) == 0))
2386 			atomic_add_long(&pmap->pm_stats.wired_count, 1);
2387 		else if (!wired && (origpte & VPTE_WIRED))
2388 			atomic_add_long(&pmap->pm_stats.wired_count, -1);
2389 
2390 		if (origpte & VPTE_MANAGED) {
2391 			pa |= VPTE_MANAGED;
2392 			KKASSERT(m->flags & PG_MAPPED);
2393 			KKASSERT(!(m->flags & (PG_FICTITIOUS|PG_UNMANAGED)));
2394 		} else {
2395 			KKASSERT((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)));
2396 		}
2397 		vm_page_spin_lock(m);
2398 		goto validate;
2399 	}
2400 
2401 	/*
2402 	 * Bump the wire_count for the page table page.
2403 	 */
2404 	if (mpte)
2405 		vm_page_wire_quick(mpte);
2406 
2407 	/*
2408 	 * Mapping has changed, invalidate old range and fall through to
2409 	 * handle validating new mapping.  Don't inherit anything from
2410 	 * oldpte.
2411 	 */
2412 	if (opa) {
2413 		int err;
2414 		err = pmap_remove_pte(pmap, NULL, origpte, va);
2415 		origpte = 0;
2416 		if (err)
2417 			panic("pmap_enter: pte vanished, va: 0x%lx", va);
2418 	}
2419 
2420 	/*
	 * Enter on the PV list if part of our managed memory.  The page
	 * spin lock is used to interlock pv list manipulation (see the
	 * warning below).
2424 	 */
2425 	if (pmap_initialized) {
2426 		if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2427 			/*
			 * WARNING!  We are using m's spin-lock as a poor
			 *	     man's pte lock to interlock against
2430 			 *	     pmap_page_protect() operations.
2431 			 *
2432 			 *	     This is a bad hack (obviously).
2433 			 */
2434 			pv = get_pv_entry();
2435 			vm_page_spin_lock(m);
2436 			pmap_insert_entry(pmap, va, mpte, m, pv);
2437 			pa |= VPTE_MANAGED;
2438 			/* vm_page_spin_unlock(m); */
2439 		} else {
2440 			vm_page_spin_lock(m);
2441 		}
2442 	} else {
2443 		vm_page_spin_lock(m);
2444 	}
2445 
2446 	/*
2447 	 * Increment counters
2448 	 */
2449 	atomic_add_long(&pmap->pm_stats.resident_count, 1);
2450 	if (wired)
2451 		atomic_add_long(&pmap->pm_stats.wired_count, 1);
2452 
2453 validate:
2454 	/*
2455 	 * Now validate mapping with desired protection/wiring.
2456 	 */
2457 	newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | VPTE_V | VPTE_U);
2458 	newpte |= VPTE_A;
2459 
2460 	if (wired)
2461 		newpte |= VPTE_WIRED;
2462 //	if (pmap != &kernel_pmap)
2463 		newpte |= VPTE_U;
2464 	if (newpte & VPTE_RW)
2465 		vm_page_flag_set(m, PG_WRITEABLE);
2466 	KKASSERT((newpte & VPTE_MANAGED) == 0 || (m->flags & PG_MAPPED));
2467 
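	/*
	 * Install the new pte.  If VPTE_M appeared on the pte between the
	 * loadandclear above and now (a race against the real kernel's
	 * MMU), re-set VPTE_M so the modification is not lost.
	 */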
2468 	origpte = atomic_swap_long(pte, newpte);
2469 	if (origpte & VPTE_M) {
2470 		kprintf("pmap [M] race @ %016jx\n", va);
2471 		atomic_set_long(pte, VPTE_M);
2472 	}
2473 	vm_page_spin_unlock(m);
2474 
2475 	if (mpte)
2476 		vm_page_wakeup(mpte);
2477 	vm_object_drop(pmap->pm_pteobj);
2478 }
2479 
2480 /*
2481  * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
2482  *
2483  * Currently this routine may only be used on user pmaps, not kernel_pmap.
2484  *
2485  * No requirements.
2486  */
2487 void
2488 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
2489 {
2490 	pmap_enter(pmap, va, m, VM_PROT_READ, 0, NULL);
2491 }
2492 
2493 /*
2494  * Make a temporary mapping for a physical address.  This is only intended
2495  * to be used for panic dumps.
2496  *
2497  * The caller is responsible for calling smp_invltlb().
2498  */
2499 void *
2500 pmap_kenter_temporary(vm_paddr_t pa, long i)
2501 {
2502 	pmap_kenter_quick(crashdumpmap + (i * PAGE_SIZE), pa);
2503 	return ((void *)crashdumpmap);
2504 }
2505 
2506 #define MAX_INIT_PT (96)
2507 
2508 /*
2509  * This routine preloads the ptes for a given object into the specified pmap.
2510  * This eliminates the blast of soft faults on process startup and
2511  * immediately after an mmap.
2512  *
2513  * No requirements.
2514  */
2515 static int pmap_object_init_pt_callback(vm_page_t p, void *data);
2516 
2517 void
2518 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
2519 		    vm_object_t object, vm_pindex_t pindex,
2520 		    vm_size_t size, int limit)
2521 {
2522 	struct rb_vm_page_scan_info info;
2523 	struct lwp *lp;
2524 	vm_size_t psize;
2525 
2526 	/*
2527 	 * We can't preinit if read access isn't set or there is no pmap
2528 	 * or object.
2529 	 */
2530 	if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
2531 		return;
2532 
2533 	/*
2534 	 * We can't preinit if the pmap is not the current pmap
2535 	 */
2536 	lp = curthread->td_lwp;
2537 	if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
2538 		return;
2539 
2540 	/*
2541 	 * Misc additional checks
2542 	 */
2543 	psize = x86_64_btop(size);
2544 
2545 	if ((object->type != OBJT_VNODE) ||
2546 		((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
2547 			(object->resident_page_count > MAX_INIT_PT))) {
2548 		return;
2549 	}
2550 
2551 	if (psize + pindex > object->size) {
2552 		if (object->size < pindex)
2553 			return;
2554 		psize = object->size - pindex;
2555 	}
2556 
2557 	if (psize == 0)
2558 		return;
2559 
2560 	/*
2561 	 * Use a red-black scan to traverse the requested range and load
2562 	 * any valid pages found into the pmap.
2563 	 *
2564 	 * We cannot safely scan the object's memq unless we are in a
2565 	 * critical section since interrupts can remove pages from objects.
2566 	 */
2567 	info.start_pindex = pindex;
2568 	info.end_pindex = pindex + psize - 1;
2569 	info.limit = limit;
2570 	info.mpte = NULL;
2571 	info.addr = addr;
2572 	info.pmap = pmap;
2573 
2574 	vm_object_hold_shared(object);
2575 	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
2576 				pmap_object_init_pt_callback, &info);
2577 	vm_object_drop(object);
2578 }
2579 
2580 static
2581 int
2582 pmap_object_init_pt_callback(vm_page_t p, void *data)
2583 {
2584 	struct rb_vm_page_scan_info *info = data;
2585 	vm_pindex_t rel_index;

	/*
	 * Don't allow an madvise-triggered prefault to blow away our
	 * already-low free page reserve by allocating pv entries.
	 */
	if ((info->limit & MAP_PREFAULT_MADVISE) &&
	    vmstats.v_free_count < vmstats.v_free_reserved) {
		return(-1);
	}
2594 
2595 	/*
2596 	 * Ignore list markers and ignore pages we cannot instantly
2597 	 * busy (while holding the object token).
2598 	 */
2599 	if (p->flags & PG_MARKER)
2600 		return 0;
2601 	if (vm_page_busy_try(p, TRUE))
2602 		return 0;
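	/*
	 * Only map pages that are fully valid and not fictitious.  Pages
	 * sitting on the cache queue are deactivated first.
	 */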
2603 	if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2604 	    (p->flags & PG_FICTITIOUS) == 0) {
2605 		if ((p->queue - p->pc) == PQ_CACHE)
2606 			vm_page_deactivate(p);
2607 		rel_index = p->pindex - info->start_pindex;
2608 		pmap_enter_quick(info->pmap,
2609 				 info->addr + x86_64_ptob(rel_index), p);
2610 	}
2611 	vm_page_wakeup(p);
2612 	return(0);
2613 }
2614 
2615 /*
2616  * Return TRUE if the pmap is in shape to trivially
2617  * pre-fault the specified address.
2618  *
2619  * Returns FALSE if it would be non-trivial or if a
2620  * pte is already loaded into the slot.
2621  *
2622  * No requirements.
2623  */
2624 int
2625 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
2626 {
2627 	pt_entry_t *pte;
2628 	pd_entry_t *pde;
2629 	int ret;
2630 
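	/*
	 * Prefaulting is trivial only if the page table page backing the
	 * address already exists and the pte slot itself is still empty.
	 */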
2631 	vm_object_hold(pmap->pm_pteobj);
2632 	pde = pmap_pde(pmap, addr);
2633 	if (pde == NULL || *pde == 0) {
2634 		ret = 0;
2635 	} else {
2636 		pte = pmap_pde_to_pte(pde, addr);
2637 		ret = (*pte) ? 0 : 1;
2638 	}
2639 	vm_object_drop(pmap->pm_pteobj);
2640 
2641 	return (ret);
2642 }
2643 
2644 /*
2645  * Change the wiring attribute for a map/virtual-address pair.
2646  *
2647  * The mapping must already exist in the pmap.
2648  * No other requirements.
2649  */
2650 vm_page_t
2651 pmap_unwire(pmap_t pmap, vm_offset_t va)
2652 {
2653 	pt_entry_t *pte;
2654 	vm_paddr_t pa;
2655 	vm_page_t m;
2656 
2657 	if (pmap == NULL)
2658 		return NULL;
2659 
2660 	vm_object_hold(pmap->pm_pteobj);
2661 	pte = pmap_pte(pmap, va);
2662 
2663 	if (pte == NULL || (*pte & VPTE_V) == 0) {
2664 		vm_object_drop(pmap->pm_pteobj);
2665 		return NULL;
2666 	}
2667 
2668 	/*
2669 	 * Wiring is not a hardware characteristic so there is no need to
2670 	 * invalidate TLB.  However, in an SMP environment we must use
2671 	 * a locked bus cycle to update the pte (if we are not using
2672 	 * the pmap_inval_*() API that is)... it's ok to do this for simple
2673 	 * wiring changes.
2674 	 */
2675 	if (pmap_pte_w(pte))
2676 		atomic_add_long(&pmap->pm_stats.wired_count, -1);
2677 	/* XXX else return NULL so caller doesn't unwire m ? */
2678 	atomic_clear_long(pte, VPTE_WIRED);
2679 
2680 	pa = *pte & VPTE_FRAME;
2681 	m = PHYS_TO_VM_PAGE(pa);	/* held by wired count */
2682 
2683 	vm_object_drop(pmap->pm_pteobj);
2684 
2685 	return m;
2686 }
2687 
2688 /*
2689  *	Copy the range specified by src_addr/len
2690  *	from the source map to the range dst_addr/len
2691  *	in the destination map.
2692  *
2693  *	This routine is only advisory and need not do anything.
2694  */
2695 void
2696 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
2697 	vm_size_t len, vm_offset_t src_addr)
2698 {
2699 	/*
	 * XXX BUGGY.  Among other things srcmpte is assumed to remain
2701 	 * valid through blocking calls, and that's just not going to
2702 	 * be the case.
2703 	 *
2704 	 * FIXME!
2705 	 */
2706 	return;
2707 }
2708 
2709 /*
2710  * pmap_zero_page:
2711  *
2712  *	Zero the specified physical page.
2713  *
2714  *	This function may be called from an interrupt and no locking is
2715  *	required.
2716  */
2717 void
2718 pmap_zero_page(vm_paddr_t phys)
2719 {
2720 	vm_offset_t va = PHYS_TO_DMAP(phys);
2721 
2722 	bzero((void *)va, PAGE_SIZE);
2723 }
2724 
2725 /*
 * pmap_zero_page_area:
2727  *
2728  *	Zero part of a physical page by mapping it into memory and clearing
2729  *	its contents with bzero.
2730  *
2731  *	off and size may not cover an area beyond a single hardware page.
2732  */
2733 void
2734 pmap_zero_page_area(vm_paddr_t phys, int off, int size)
2735 {
2736 	vm_offset_t virt = PHYS_TO_DMAP(phys);
2737 
2738 	bzero((char *)virt + off, size);
2739 }
2740 
2741 /*
2742  * pmap_copy_page:
2743  *
2744  *	Copy the physical page from the source PA to the target PA.
2745  *	This function may be called from an interrupt.  No locking
2746  *	is required.
2747  */
2748 void
2749 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
2750 {
2751 	vm_offset_t src_virt, dst_virt;
2752 
2753 	src_virt = PHYS_TO_DMAP(src);
2754 	dst_virt = PHYS_TO_DMAP(dst);
2755 	bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE);
2756 }
2757 
2758 /*
2759  * pmap_copy_page_frag:
2760  *
 *	Copy part of a physical page from the source PA to the target PA.
2762  *	This function may be called from an interrupt.  No locking
2763  *	is required.
2764  */
2765 void
2766 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
2767 {
2768 	vm_offset_t src_virt, dst_virt;
2769 
2770 	src_virt = PHYS_TO_DMAP(src);
2771 	dst_virt = PHYS_TO_DMAP(dst);
2772 	bcopy((char *)src_virt + (src & PAGE_MASK),
2773 	      (char *)dst_virt + (dst & PAGE_MASK),
2774 	      bytes);
2775 }
2776 
2777 /*
2778  * Returns true if the pmap's pv is one of the first 16 pvs linked to
2779  * from this page.  This count may be changed upwards or downwards
2780  * in the future; it is only necessary that true be returned for a small
2781  * subset of pmaps for proper page aging.
2782  *
2783  * No other requirements.
2784  */
2785 boolean_t
2786 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2787 {
2788 	pv_entry_t pv;
2789 	int loops = 0;
2790 
2791 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2792 		return FALSE;
2793 
2794 	vm_page_spin_lock(m);
2795 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2796 		if (pv->pv_pmap == pmap) {
2797 			vm_page_spin_unlock(m);
2798 			return TRUE;
2799 		}
2800 		loops++;
2801 		if (loops >= 16)
2802 			break;
2803 	}
2804 	vm_page_spin_unlock(m);
2805 
2806 	return (FALSE);
2807 }
2808 
2809 /*
 * Remove all pages from the specified address space; this aids process
 * exit speed.  Also, this code is special-cased for the current
2812  * process only, but can have the more generic (and slightly slower)
2813  * mode enabled.  This is much faster than pmap_remove in the case
2814  * of running down an entire address space.
2815  *
2816  * No other requirements.
2817  */
2818 void
2819 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2820 {
2821 	pmap_remove(pmap, sva, eva);
2822 #if 0
2823 	pt_entry_t *pte, tpte;
2824 	pv_entry_t pv, npv;
2825 	vm_page_t m;
2826 	int save_generation;
2827 
2828 	if (pmap->pm_pteobj)
2829 		vm_object_hold(pmap->pm_pteobj);
2830 
2831 	pmap_invalidate_range(pmap, sva, eva);
2832 
2833 	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2834 		if (pv->pv_va >= eva || pv->pv_va < sva) {
2835 			npv = TAILQ_NEXT(pv, pv_plist);
2836 			continue;
2837 		}
2838 
2839 		KKASSERT(pmap == pv->pv_pmap);
2840 
2841 		pte = pmap_pte(pmap, pv->pv_va);
2842 
2843 		/*
2844 		 * We cannot remove wired pages from a process' mapping
2845 		 * at this time
2846 		 */
2847 		if (*pte & VPTE_WIRED) {
2848 			npv = TAILQ_NEXT(pv, pv_plist);
2849 			continue;
2850 		}
2851 		tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va);
2852 
2853 		m = PHYS_TO_VM_PAGE(tpte & VPTE_FRAME);
2854 		vm_page_spin_lock(m);
2855 
2856 		KASSERT(m < &vm_page_array[vm_page_array_size],
2857 			("pmap_remove_pages: bad tpte %lx", tpte));
2858 
2859 		KKASSERT(pmap->pm_stats.resident_count > 0);
2860 		atomic_add_long(&pmap->pm_stats.resident_count, -1);
2861 
2862 		/*
2863 		 * Update the vm_page_t clean and reference bits.
2864 		 */
2865 		if (tpte & VPTE_M) {
2866 			vm_page_dirty(m);
2867 		}
2868 
2869 		npv = TAILQ_NEXT(pv, pv_plist);
2870 		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2871 		atomic_add_int(&pmap->pm_generation, 1);
2872 		save_generation = pmap->pm_generation;
2873 		m->md.pv_list_count--;
2874 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2875 		if (TAILQ_EMPTY(&m->md.pv_list))
2876 			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2877 		vm_page_spin_unlock(m);
2878 
2879 		pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem);
2880 		free_pv_entry(pv);
2881 
2882 		/*
2883 		 * Restart the scan if we blocked during the unuse or free
2884 		 * calls and other removals were made.
2885 		 */
2886 		if (save_generation != pmap->pm_generation) {
2887 			kprintf("Warning: pmap_remove_pages race-A avoided\n");
2888 			npv = TAILQ_FIRST(&pmap->pm_pvlist);
2889 		}
2890 	}
2891 	if (pmap->pm_pteobj)
2892 		vm_object_drop(pmap->pm_pteobj);
2893 	pmap_remove(pmap, sva, eva);
2894 #endif
2895 }
2896 
2897 /*
2898  * pmap_testbit tests bits in active mappings of a VM page.
2899  */
2900 static boolean_t
2901 pmap_testbit(vm_page_t m, int bit)
2902 {
2903 	pv_entry_t pv;
2904 	pt_entry_t *pte;
2905 
2906 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2907 		return FALSE;
2908 
2909 	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
2910 		return FALSE;
2911 
2912 	vm_page_spin_lock(m);
2913 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		/*
		 * If the bit being tested is the accessed or modified
		 * bit, skip mappings whose modified state we do not
		 * track.
		 */
2919 		if (bit & (VPTE_A|VPTE_M)) {
2920 			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
2921 				continue;
2922 		}
2923 
2924 #if defined(PMAP_DIAGNOSTIC)
2925 		if (pv->pv_pmap == NULL) {
2926 			kprintf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va);
2927 			continue;
2928 		}
2929 #endif
2930 		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2931 		if (*pte & bit) {
2932 			vm_page_spin_unlock(m);
2933 			return TRUE;
2934 		}
2935 	}
2936 	vm_page_spin_unlock(m);
2937 	return (FALSE);
2938 }
2939 
2940 /*
2941  * This routine is used to clear bits in ptes.  Certain bits require special
2942  * handling, in particular (on virtual kernels) the VPTE_M (modify) bit.
2943  *
2944  * This routine is only called with certain VPTE_* bit combinations.
2945  */
2946 static __inline void
2947 pmap_clearbit(vm_page_t m, int bit)
2948 {
2949 	pv_entry_t pv;
2950 	pt_entry_t *pte;
2951 	pt_entry_t pbits;
2952 	vm_object_t pmobj;
2953 	pmap_t pmap;
2954 
2955 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
2956 		if (bit == VPTE_RW)
2957 			vm_page_flag_clear(m, PG_WRITEABLE);
2958 		return;
2959 	}
2960 
	/*
	 * Loop over all current mappings, setting/clearing as appropriate.
	 * If setting RO, do we need to clear the VAC?
	 */
2965 restart:
2966 	vm_page_spin_lock(m);
2967 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2968 		/*
2969 		 * Need the pmap object lock(?)
2970 		 */
2971 		pmap = pv->pv_pmap;
2972 		pmobj = pmap->pm_pteobj;
2973 
2974 		if (vm_object_hold_try(pmobj) == 0) {
2975 			refcount_acquire(&pmobj->hold_count);
2976 			vm_page_spin_unlock(m);
2977 			vm_object_lock(pmobj);
2978 			vm_object_drop(pmobj);
2979 			goto restart;
2980 		}
2981 
2982 		/*
2983 		 * don't write protect pager mappings
2984 		 */
2985 		if (bit == VPTE_RW) {
2986 			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va)) {
2987 				vm_object_drop(pmobj);
2988 				continue;
2989 			}
2990 		}
2991 
2992 #if defined(PMAP_DIAGNOSTIC)
2993 		if (pv->pv_pmap == NULL) {
2994 			kprintf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va);
2995 			vm_object_drop(pmobj);
2996 			continue;
2997 		}
2998 #endif
2999 
3000 		/*
3001 		 * Careful here.  We can use a locked bus instruction to
3002 		 * clear VPTE_A or VPTE_M safely but we need to synchronize
3003 		 * with the target cpus when we mess with VPTE_RW.
3004 		 *
3005 		 * On virtual kernels we must force a new fault-on-write
3006 		 * in the real kernel if we clear the Modify bit ourselves,
3007 		 * otherwise the real kernel will not get a new fault and
3008 		 * will never set our Modify bit again.
3009 		 */
3010 		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
3011 		if (*pte & bit) {
3012 			if (bit == VPTE_RW) {
3013 				/*
3014 				 * We must also clear VPTE_M when clearing
3015 				 * VPTE_RW and synchronize its state to
3016 				 * the page.
3017 				 */
3018 				pbits = pmap_clean_pte(pte, pv->pv_pmap,
3019 						       pv->pv_va, m);
3020 			} else if (bit == VPTE_M) {
3021 				/*
3022 				 * We must invalidate the real-kernel pte
3023 				 * when clearing VPTE_M bit to force the
3024 				 * real-kernel to take a new fault to re-set
3025 				 * VPTE_M.
3026 				 */
3027 				atomic_clear_long(pte, VPTE_M);
3028 				if (*pte & VPTE_RW) {
3029 					pmap_invalidate_range(pv->pv_pmap,
3030 						      pv->pv_va,
3031 						      pv->pv_va + PAGE_SIZE);
3032 				}
3033 			} else if ((bit & (VPTE_RW|VPTE_M)) ==
3034 				   (VPTE_RW|VPTE_M)) {
3035 				/*
3036 				 * We've been asked to clear W & M, I guess
3037 				 * the caller doesn't want us to update
3038 				 * the dirty status of the VM page.
3039 				 */
3040 				pmap_clean_pte(pte, pv->pv_pmap, pv->pv_va, m);
3041 				panic("shouldn't be called");
3042 			} else {
3043 				/*
3044 				 * We've been asked to clear bits that do
3045 				 * not interact with hardware.
3046 				 */
3047 				atomic_clear_long(pte, bit);
3048 			}
3049 		}
3050 		vm_object_drop(pmobj);
3051 	}
3052 	if (bit == VPTE_RW)
3053 		vm_page_flag_clear(m, PG_WRITEABLE);
3054 	vm_page_spin_unlock(m);
3055 }
3056 
3057 /*
3058  * Lower the permission for all mappings to a given page.
3059  *
3060  * No other requirements.
3061  */
3062 void
3063 pmap_page_protect(vm_page_t m, vm_prot_t prot)
3064 {
3065 	/* JG NX support? */
3066 	if ((prot & VM_PROT_WRITE) == 0) {
3067 		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3068 			pmap_clearbit(m, VPTE_RW);
3069 		} else {
3070 			pmap_remove_all(m);
3071 		}
3072 	}
3073 }
3074 
3075 vm_paddr_t
3076 pmap_phys_address(vm_pindex_t ppn)
3077 {
3078 	return (x86_64_ptob(ppn));
3079 }
3080 
3081 /*
3082  * Return a count of reference bits for a page, clearing those bits.
3083  * It is not necessary for every reference bit to be cleared, but it
3084  * is necessary that 0 only be returned when there are truly no
3085  * reference bits set.
3086  *
3087  * XXX: The exact number of bits to check and clear is a matter that
3088  * should be tested and standardized at some point in the future for
3089  * optimal aging of shared pages.
3090  *
3091  * No other requirements.
3092  */
3093 int
3094 pmap_ts_referenced(vm_page_t m)
3095 {
3096 	pv_entry_t pv, pvf, pvn;
3097 	pt_entry_t *pte;
3098 	int rtval = 0;
3099 
3100 	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3101 		return (rtval);
3102 
3103 	vm_page_spin_lock(m);
3104 	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3105 		pvf = pv;
3106 		do {
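			/*
			 * Rotate the pv to the tail of the list so that
			 * repeated calls age mappings round-robin, then
			 * test and clear VPTE_A.  Stop once a handful of
			 * referenced mappings have been found.
			 */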
3107 			pvn = TAILQ_NEXT(pv, pv_list);
3108 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3109 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3110 
3111 			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
3112 				continue;
3113 
3114 			pte = pmap_pte(pv->pv_pmap, pv->pv_va);
3115 
3116 			if (pte && (*pte & VPTE_A)) {
3117 				atomic_clear_long(pte, VPTE_A);
3118 				rtval++;
3119 				if (rtval > 4) {
3120 					break;
3121 				}
3122 			}
3123 		} while ((pv = pvn) != NULL && pv != pvf);
3124 	}
3125 	vm_page_spin_unlock(m);
3126 
3127 	return (rtval);
3128 }
3129 
3130 /*
3131  * Return whether or not the specified physical page was modified
3132  * in any physical maps.
3133  *
3134  * No other requirements.
3135  */
3136 boolean_t
3137 pmap_is_modified(vm_page_t m)
3138 {
3139 	boolean_t res;
3140 
3141 	res = pmap_testbit(m, VPTE_M);
3142 
3143 	return (res);
3144 }
3145 
3146 /*
3147  * Clear the modify bits on the specified physical page.  For the vkernel
3148  * we really need to clean the page, which clears VPTE_RW and VPTE_M, in
3149  * order to ensure that we take a fault on the next write to the page.
3150  * Otherwise the page may become dirty without us knowing it.
3151  *
3152  * No other requirements.
3153  */
3154 void
3155 pmap_clear_modify(vm_page_t m)
3156 {
3157 	pmap_clearbit(m, VPTE_RW);
3158 }
3159 
3160 /*
3161  * Clear the reference bit on the specified physical page.
3162  *
3163  * No other requirements.
3164  */
3165 void
3166 pmap_clear_reference(vm_page_t m)
3167 {
3168 	pmap_clearbit(m, VPTE_A);
3169 }
3170 
3171 /*
3172  * Miscellaneous support routines follow
3173  */
3174 
3175 static void
3176 i386_protection_init(void)
3177 {
3178 	int *kp, prot;
3179 
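	/*
	 * Build protection_codes[], indexed by a VM_PROT_* bit combination
	 * (0-7).  Only VM_PROT_WRITE translates to a VPTE bit; read and
	 * execute are implied by a valid pte.
	 */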
3180 	kp = protection_codes;
3181 	for (prot = 0; prot < 8; prot++) {
		if (prot & VM_PROT_READ)
			*kp |= 0; /* if it's VALID it's readable */
		if (prot & VM_PROT_WRITE)
			*kp |= VPTE_RW;
		if (prot & VM_PROT_EXECUTE)
			*kp |= 0; /* if it's VALID it's executable */
3188 		++kp;
3189 	}
3190 }
3191 
3192 /*
3193  * Sets the memory attribute for the specified page.
3194  */
3195 void
3196 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
3197 {
3198 	/* This is a vkernel, do nothing */
3199 }
3200 
3201 /*
3202  * Change the PAT attribute on an existing kernel memory map.  Caller
3203  * must ensure that the virtual memory in question is not accessed
3204  * during the adjustment.
3205  */
3206 void
3207 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode)
3208 {
3209 	/* This is a vkernel, do nothing */
3210 }
3211 
3212 /*
3213  * Perform the pmap work for mincore
3214  *
3215  * No other requirements.
3216  */
3217 int
3218 pmap_mincore(pmap_t pmap, vm_offset_t addr)
3219 {
3220 	pt_entry_t *ptep, pte;
3221 	vm_page_t m;
3222 	int val = 0;
3223 
3224 	vm_object_hold(pmap->pm_pteobj);
3225 	ptep = pmap_pte(pmap, addr);
3226 
3227 	if (ptep && (pte = *ptep) != 0) {
3228 		vm_paddr_t pa;
3229 
3230 		val = MINCORE_INCORE;
3231 		if ((pte & VPTE_MANAGED) == 0)
3232 			goto done;
3233 
3234 		pa = pte & VPTE_FRAME;
3235 
3236 		m = PHYS_TO_VM_PAGE(pa);
3237 
3238 		/*
3239 		 * Modified by us
3240 		 */
3241 		if (pte & VPTE_M)
3242 			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3243 		/*
3244 		 * Modified by someone
3245 		 */
3246 		else if (m->dirty || pmap_is_modified(m))
3247 			val |= MINCORE_MODIFIED_OTHER;
3248 		/*
3249 		 * Referenced by us
3250 		 */
3251 		if (pte & VPTE_A)
3252 			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3253 
3254 		/*
3255 		 * Referenced by someone
3256 		 */
3257 		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
3258 			val |= MINCORE_REFERENCED_OTHER;
3259 			vm_page_flag_set(m, PG_REFERENCED);
3260 		}
3261 	}
3262 done:
3263 	vm_object_drop(pmap->pm_pteobj);
3264 
3265 	return val;
3266 }
3267 
3268 /*
3269  * Replace p->p_vmspace with a new one.  If adjrefs is non-zero the new
3270  * vmspace will be ref'd and the old one will be deref'd.
3271  *
3272  * Caller must hold vmspace->vm_map.token for oldvm and newvm
3273  */
3274 void
3275 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
3276 {
3277 	struct vmspace *oldvm;
3278 	struct lwp *lp;
3279 
3280 	oldvm = p->p_vmspace;
3281 	if (oldvm != newvm) {
3282 		if (adjrefs)
3283 			vmspace_ref(newvm);
3284 		KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0);
3285 		p->p_vmspace = newvm;
3286 		KKASSERT(p->p_nthreads == 1);
3287 		lp = RB_ROOT(&p->p_lwp_tree);
3288 		pmap_setlwpvm(lp, newvm);
3289 		if (adjrefs)
3290 			vmspace_rel(oldvm);
3291 	}
3292 }
3293 
3294 /*
3295  * Set the vmspace for a LWP.  The vmspace is almost universally set the
3296  * same as the process vmspace, but virtual kernels need to swap out contexts
3297  * on a per-lwp basis.
3298  */
3299 void
3300 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
3301 {
3302 	struct vmspace *oldvm;
3303 	struct pmap *pmap;
3304 
3305 	oldvm = lp->lwp_vmspace;
3306 	if (oldvm != newvm) {
3307 		crit_enter();
3308 		KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0);
3309 		lp->lwp_vmspace = newvm;
3310 		if (curthread->td_lwp == lp) {
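			/*
			 * Mark this cpu active in the new pmap, waiting out
			 * any exclusive lock holder, then remove this cpu
			 * from the old pmap's active set.
			 */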
3311 			pmap = vmspace_pmap(newvm);
3312 			ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid);
3313 			if (pmap->pm_active_lock & CPULOCK_EXCL)
3314 				pmap_interlock_wait(newvm);
3315 #if defined(SWTCH_OPTIM_STATS)
3316 			tlb_flush_count++;
3317 #endif
3318 			pmap = vmspace_pmap(oldvm);
3319 			ATOMIC_CPUMASK_NANDBIT(pmap->pm_active,
3320 					       mycpu->gd_cpuid);
3321 		}
3322 		crit_exit();
3323 	}
3324 }
3325 
3326 /*
 * The swtch code tried to switch in a heavyweight process whose pmap
3328  * is locked by another cpu.  We have to wait for the lock to clear before
3329  * the pmap can be used.
3330  */
3331 void
3332 pmap_interlock_wait (struct vmspace *vm)
3333 {
3334 	pmap_t pmap = vmspace_pmap(vm);
3335 
3336 	if (pmap->pm_active_lock & CPULOCK_EXCL) {
3337 		crit_enter();
3338 		while (pmap->pm_active_lock & CPULOCK_EXCL) {
3339 			cpu_ccfence();
3340 			pthread_yield();
3341 		}
3342 		crit_exit();
3343 	}
3344 }
3345 
3346 vm_offset_t
3347 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3348 {
3349 
3350 	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3351 		return addr;
3352 	}
3353 
3354 	addr = roundup2(addr, NBPDR);
3355 	return addr;
3356 }
3357 
3358 /*
3359  * Used by kmalloc/kfree, page already exists at va
3360  */
3361 vm_page_t
3362 pmap_kvtom(vm_offset_t va)
3363 {
3364 	vpte_t *ptep;
3365 
3366 	KKASSERT(va >= KvaStart && va < KvaEnd);
3367 	ptep = vtopte(va);
3368 	return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
3369 }
3370 
3371 void
3372 pmap_object_init(vm_object_t object)
3373 {
3374 	/* empty */
3375 }
3376 
3377 void
3378 pmap_object_free(vm_object_t object)
3379 {
3380 	/* empty */
3381 }
3382 
3383 void
3384 pmap_pgscan(struct pmap_pgscan_info *pginfo)
3385 {
3386 	pmap_t pmap = pginfo->pmap;
3387 	vm_offset_t sva = pginfo->beg_addr;
3388 	vm_offset_t eva = pginfo->end_addr;
3389 	vm_offset_t va_next;
3390 	pml4_entry_t *pml4e;
3391 	pdp_entry_t *pdpe;
3392 	pd_entry_t ptpaddr, *pde;
3393 	pt_entry_t *pte;
3394 	vm_page_t pt_m;
3395 	int stop = 0;
3396 
3397 	vm_object_hold(pmap->pm_pteobj);
3398 
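	/*
	 * Walk the page tables from beg_addr to end_addr, invoking the
	 * callback on each managed page that can be busied.  A negative
	 * callback return terminates the scan.
	 */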
3399 	for (; sva < eva; sva = va_next) {
3400 		if (stop)
3401 			break;
3402 
3403 		pml4e = pmap_pml4e(pmap, sva);
3404 		if ((*pml4e & VPTE_V) == 0) {
3405 			va_next = (sva + NBPML4) & ~PML4MASK;
3406 			if (va_next < sva)
3407 				va_next = eva;
3408 			continue;
3409 		}
3410 
3411 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
3412 		if ((*pdpe & VPTE_V) == 0) {
3413 			va_next = (sva + NBPDP) & ~PDPMASK;
3414 			if (va_next < sva)
3415 				va_next = eva;
3416 			continue;
3417 		}
3418 
3419 		va_next = (sva + NBPDR) & ~PDRMASK;
3420 		if (va_next < sva)
3421 			va_next = eva;
3422 
3423 		pde = pmap_pdpe_to_pde(pdpe, sva);
3424 		ptpaddr = *pde;
3425 
3426 #if 0
3427 		/*
3428 		 * Check for large page (ignore).
3429 		 */
3430 		if ((ptpaddr & VPTE_PS) != 0) {
3431 #if 0
3432 			pmap_clean_pde(pde, pmap, sva);
3433 			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
3434 #endif
3435 			continue;
3436 		}
3437 #endif
3438 
3439 		/*
3440 		 * Weed out invalid mappings. Note: we assume that the page
3441 		 * directory table is always allocated, and in kernel virtual.
3442 		 */
3443 		if (ptpaddr == 0)
3444 			continue;
3445 
3446 		if (va_next > eva)
3447 			va_next = eva;
3448 
3449 		pt_m = pmap_hold_pt_page(pde, sva);
3450 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
3451 		    sva += PAGE_SIZE) {
3452 			vm_page_t m;
3453 
3454 			if (stop)
3455 				break;
3456 			if ((*pte & VPTE_MANAGED) == 0)
3457 				continue;
3458 
3459 			m = PHYS_TO_VM_PAGE(*pte & VPTE_FRAME);
3460 			if (vm_page_busy_try(m, TRUE) == 0) {
3461 				if (pginfo->callback(pginfo, sva, m) < 0)
3462 					stop = 1;
3463 			}
3464 		}
3465 		vm_page_unhold(pt_m);
3466 	}
3467 	vm_object_drop(pmap->pm_pteobj);
3468 }
3469