xref: /dragonfly/sys/platform/pc64/x86_64/pmap.c (revision 634ba020)
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * Copyright (c) 1994 John S. Dyson
4  * Copyright (c) 1994 David Greenman
5  * Copyright (c) 2003 Peter Wemm
6  * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
7  * Copyright (c) 2008, 2009 The DragonFly Project.
8  * Copyright (c) 2008, 2009 Jordan Gordeev.
9  * Copyright (c) 2011-2019 Matthew Dillon
10  * All rights reserved.
11  *
12  * This code is derived from software contributed to Berkeley by
13  * the Systems Programming Group of the University of Utah Computer
14  * Science Department and William Jolitz of UUNET Technologies Inc.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  *    notice, this list of conditions and the following disclaimer.
21  * 2. Redistributions in binary form must reproduce the above copyright
22  *    notice, this list of conditions and the following disclaimer in the
23  *    documentation and/or other materials provided with the distribution.
24  * 3. All advertising materials mentioning features or use of this software
25  *    must display the following acknowledgement:
26  *	This product includes software developed by the University of
27  *	California, Berkeley and its contributors.
28  * 4. Neither the name of the University nor the names of its contributors
29  *    may be used to endorse or promote products derived from this software
30  *    without specific prior written permission.
31  *
32  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
33  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
36  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
38  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
39  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
41  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
42  * SUCH DAMAGE.
43  */
44 /*
45  * Manage physical address maps for x86-64 systems.
46  *
47  * Some notes:
48  *	- The 'M'odified bit is only applicable to terminal PTEs.
49  *
50  *	- The 'U'ser access bit can be set for higher-level PTEs as
51  *	  long as it isn't set for terminal PTEs for pages we don't
52  *	  want user access to.
53  */
54 
55 #if 0 /* JG */
56 #include "opt_pmap.h"
57 #endif
58 #include "opt_msgbuf.h"
59 
60 #include <sys/param.h>
61 #include <sys/kernel.h>
62 #include <sys/proc.h>
63 #include <sys/msgbuf.h>
64 #include <sys/vmmeter.h>
65 #include <sys/mman.h>
66 #include <sys/systm.h>
67 
68 #include <vm/vm.h>
69 #include <vm/vm_param.h>
70 #include <sys/sysctl.h>
71 #include <sys/lock.h>
72 #include <vm/vm_kern.h>
73 #include <vm/vm_page.h>
74 #include <vm/vm_map.h>
75 #include <vm/vm_object.h>
76 #include <vm/vm_extern.h>
77 #include <vm/vm_pageout.h>
78 #include <vm/vm_pager.h>
79 #include <vm/vm_zone.h>
80 
81 #include <sys/thread2.h>
82 #include <sys/spinlock2.h>
83 #include <vm/vm_page2.h>
84 
85 #include <machine/cputypes.h>
86 #include <machine/cpu.h>
87 #include <machine/md_var.h>
88 #include <machine/specialreg.h>
89 #include <machine/smp.h>
90 #include <machine_base/apic/apicreg.h>
91 #include <machine/globaldata.h>
92 #include <machine/pmap.h>
93 #include <machine/pmap_inval.h>
94 
95 #include <ddb/ddb.h>
96 
97 #define PMAP_KEEP_PDIRS
98 
99 #if defined(DIAGNOSTIC)
100 #define PMAP_DIAGNOSTIC
101 #endif
102 
103 #define MINPV 2048
104 
105 /*
106  * pmap debugging will report who owns a pv lock when blocking.
107  */
108 #ifdef PMAP_DEBUG
109 
110 #define PMAP_DEBUG_DECL		, const char *func, int lineno
111 #define PMAP_DEBUG_ARGS		, __func__, __LINE__
112 #define PMAP_DEBUG_COPY		, func, lineno
113 
114 #define pv_get(pmap, pindex, pmarkp)	_pv_get(pmap, pindex, pmarkp	\
115 							PMAP_DEBUG_ARGS)
116 #define pv_lock(pv)			_pv_lock(pv			\
117 							PMAP_DEBUG_ARGS)
118 #define pv_hold_try(pv)			_pv_hold_try(pv			\
119 							PMAP_DEBUG_ARGS)
120 #define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp	\
121 							PMAP_DEBUG_ARGS)
122 
123 #define pv_free(pv, pvp)		_pv_free(pv, pvp PMAP_DEBUG_ARGS)
124 
125 #else
126 
127 #define PMAP_DEBUG_DECL
128 #define PMAP_DEBUG_ARGS
129 #define PMAP_DEBUG_COPY
130 
131 #define pv_get(pmap, pindex, pmarkp)	_pv_get(pmap, pindex, pmarkp)
132 #define pv_lock(pv)			_pv_lock(pv)
133 #define pv_hold_try(pv)			_pv_hold_try(pv)
134 #define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp)
135 #define pv_free(pv, pvp)		_pv_free(pv, pvp)
136 
137 #endif
138 
139 /*
140  * Get PDEs and PTEs for user/kernel address space
141  */
142 #define pdir_pde(m, v)		(m[(vm_offset_t)(v) >> PDRSHIFT])
143 
144 #define pmap_pde_v(pmap, pde)	\
145 		((*(pd_entry_t *)pde & pmap->pmap_bits[PG_V_IDX]) != 0)
146 #define pmap_pte_w(pmap, pte)	\
147 		((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0)
148 #define pmap_pte_m(pmap, pte)	\
149 		((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0)
150 #define pmap_pte_u(pmap, pte)	\
151 		((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0)
152 #define pmap_pte_v(pmap, pte)	\
153 		((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
154 
155 /*
156  * Given a map and a machine-independent protection code,
157  * convert to the machine-dependent (x86) protection code.
158  */
159 #define pte_prot(m, p)		\
160 	(m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
161 static uint64_t protection_codes[PROTECTION_CODES_SIZE];
162 
163 /*
164  * Backing scan macros.  Note that in this use case 'ipte' is only a tentative
165  * value and must be validated by a pmap_inval_smp_cmpset*() or equivalent
166  * function.
167  *
168  * NOTE: cpu_ccfence() is required to prevent excessive optimization
169  *	 of the (ipte) variable.
170  *
171  * NOTE: We don't bother locking the backing object if it isn't mapped
172  *	 to anything (backing_list is empty).
173  *
174  * NOTE: For now guarantee an interlock via iobj->backing_lk if the
175  *	 object exists and do not shortcut the lock by checking to see
176  *	 if the list is empty first.
177  */
178 #define PMAP_PAGE_BACKING_SCAN(m, match_pmap, ipmap, iptep, ipte, iva)	\
179 	if (m->object) {						\
180 		vm_object_t iobj = m->object;				\
181 		vm_map_backing_t iba, next_ba;				\
182 		struct pmap *ipmap;					\
183 		pt_entry_t ipte;					\
184 		pt_entry_t *iptep;					\
185 		vm_offset_t iva;					\
186 		vm_pindex_t ipindex_start;				\
187 		vm_pindex_t ipindex_end;				\
188 									\
189 		lockmgr(&iobj->backing_lk, LK_SHARED);			\
190 		next_ba = TAILQ_FIRST(&iobj->backing_list);		\
191 		while ((iba = next_ba) != NULL) {			\
192 			next_ba = TAILQ_NEXT(iba, entry);		\
193 			ipmap = iba->pmap;				\
194 			if (match_pmap && ipmap != match_pmap)		\
195 				continue;				\
196 			ipindex_start = iba->offset >> PAGE_SHIFT;	\
197 			ipindex_end = ipindex_start +			\
198 				  ((iba->end - iba->start) >> PAGE_SHIFT); \
199 			if (m->pindex < ipindex_start ||		\
200 			    m->pindex >= ipindex_end) {			\
201 				continue;				\
202 			}						\
203 			iva = iba->start +				\
204 			      ((m->pindex - ipindex_start) << PAGE_SHIFT); \
205 			iptep = pmap_pte(ipmap, iva);			\
206 			if (iptep == NULL)				\
207 				continue;				\
208 			ipte = *iptep;					\
209 			cpu_ccfence();					\
210 			if (m->phys_addr != (ipte & PG_FRAME))		\
211 				continue;				\
212 
213 #define PMAP_PAGE_BACKING_RETRY						\
214 			{						\
215 				next_ba = iba;				\
216 				continue;				\
217 			}						\
218 
219 #define PMAP_PAGE_BACKING_DONE						\
220 		}							\
221 		lockmgr(&iobj->backing_lk, LK_RELEASE);			\
222 	}								\
223 
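/*
 * Illustrative usage sketch (not verbatim from this file) for the backing
 * scan macros above.  The caller's body runs once per matching pte and must
 * re-validate 'ipte' before acting on it, e.g. via one of the
 * pmap_inval_smp_cmpset*() functions, retrying on failure.  'npte' here
 * stands for whatever new pte value the caller wants to install:
 *
 *	PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
 *		if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, npte))
 *			PMAP_PAGE_BACKING_RETRY;
 *		... operate on the now-validated pte ...
 *	} PMAP_PAGE_BACKING_DONE;
 */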
224 static struct pmap iso_pmap;
225 static struct pmap kernel_pmap_store;
226 struct pmap *kernel_pmap = &kernel_pmap_store;
227 
228 vm_paddr_t avail_start;		/* PA of first available physical page */
229 vm_paddr_t avail_end;		/* PA of last available physical page */
230 vm_offset_t virtual2_start;	/* cutout free area prior to kernel start */
231 vm_offset_t virtual2_end;
232 vm_offset_t virtual_start;	/* VA of first avail page (after kernel BSS) */
233 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
234 vm_offset_t KvaStart;		/* VA start of KVA space */
235 vm_offset_t KvaEnd;		/* VA end of KVA space (non-inclusive) */
236 vm_offset_t KvaSize;		/* max size of KVA space */
237 vm_offset_t DMapMaxAddress;
238 
239 /* Has pmap_init completed? */
240 __read_frequently static boolean_t pmap_initialized = FALSE;
241 //static int pgeflag;		/* PG_G or-in */
242 static uint64_t PatMsr;		/* value of MSR_PAT */
243 
244 static int ndmpdp;
245 static vm_paddr_t dmaplimit;
246 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
247 
248 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE];	/* PAT -> PG_ bits */
249 static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];	/* PAT -> PG_ bits */
250 
251 static uint64_t KPTbase;
252 static uint64_t KPTphys;
253 static uint64_t KPDphys;	/* phys addr of kernel level 2 */
254 static uint64_t KPDbase;	/* phys addr of kernel level 2 @ KERNBASE */
255 uint64_t KPDPphys;		/* phys addr of kernel level 3 */
256 uint64_t KPML4phys;		/* phys addr of kernel level 4 */
257 
258 static uint64_t DMPDphys;	/* phys addr of direct mapped level 2 */
259 static uint64_t DMPDPphys;	/* phys addr of direct mapped level 3 */
260 
261 /*
262  * Data for the pv entry allocation mechanism
263  */
264 __read_mostly static vm_zone_t pvzone;
265 __read_mostly static int pmap_pagedaemon_waken = 0;
266 static struct vm_zone pvzone_store;
267 static struct pv_entry *pvinit;
268 
269 /*
270  * All those kernel PT submaps that BSD is so fond of
271  */
272 pt_entry_t *CMAP1 = NULL;
273 caddr_t CADDR1 = NULL, ptvmmap = NULL;
274 static pt_entry_t *msgbufmap, *ptmmap;
275 struct msgbuf *msgbufp = NULL;
276 
277 /*
278  * PG_* bits for regular (x86) pmap.
279  */
280 __read_frequently static uint64_t pmap_bits_default[PG_BITS_SIZE] = {
281 	[TYPE_IDX]	= REGULAR_PMAP,
282 	[PG_V_IDX]	= X86_PG_V,
283 	[PG_RW_IDX]	= X86_PG_RW,
284 	[PG_U_IDX]	= X86_PG_U,
285 	[PG_A_IDX]	= X86_PG_A,
286 	[PG_M_IDX]	= X86_PG_M,
287 	[PG_PS_IDX]	= X86_PG_PS,
288 	[PG_G_IDX]	= X86_PG_G,
289 	[PG_W_IDX]	= X86_PG_AVAIL1,
290 	[PG_MANAGED_IDX] = X86_PG_AVAIL2,
291 	[PG_N_IDX]	= X86_PG_NC_PWT | X86_PG_NC_PCD,
292 	[PG_NX_IDX]	= X86_PG_NX,
293 };
294 
295 /*
296  * Crashdump maps.
297  */
298 static pt_entry_t *pt_crashdumpmap;
299 static caddr_t crashdumpmap;
300 
301 static int pmap_debug = 0;
302 SYSCTL_INT(_machdep, OID_AUTO, pmap_debug, CTLFLAG_RW,
303     &pmap_debug, 0, "Debug pmap's");
304 #ifdef PMAP_DEBUG2
305 static int pmap_enter_debug = 0;
306 SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW,
307     &pmap_enter_debug, 0, "Debug pmap_enter's");
308 #endif
309 static int pmap_yield_count = 64;
310 SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
311     &pmap_yield_count, 0, "Yield during init_pt/release");
312 static int pmap_fast_kernel_cpusync = 0;
313 SYSCTL_INT(_machdep, OID_AUTO, pmap_fast_kernel_cpusync, CTLFLAG_RW,
314     &pmap_fast_kernel_cpusync, 0, "Share page table pages when possible");
315 static int pmap_dynamic_delete = 0;
316 SYSCTL_INT(_machdep, OID_AUTO, pmap_dynamic_delete, CTLFLAG_RW,
317     &pmap_dynamic_delete, 0, "Dynamically delete PT/PD/PDPs");
318 static int pmap_lock_delay = 100;
319 SYSCTL_INT(_machdep, OID_AUTO, pmap_lock_delay, CTLFLAG_RW,
320     &pmap_lock_delay, 0, "Spin loops");
321 static int meltdown_mitigation = -1;
322 TUNABLE_INT("machdep.meltdown_mitigation", &meltdown_mitigation);
323 SYSCTL_INT(_machdep, OID_AUTO, meltdown_mitigation, CTLFLAG_RW,
324     &meltdown_mitigation, 0, "Userland pmap isolation");
325 
326 static int pmap_nx_enable = -1;		/* -1 = auto */
327 /* needs manual TUNABLE in early probe, see below */
328 SYSCTL_INT(_machdep, OID_AUTO, pmap_nx_enable, CTLFLAG_RD,
329     &pmap_nx_enable, 0,
330     "no-execute support (0=disabled, 1=w/READ, 2=w/READ & WRITE)");
331 
332 static int pmap_pv_debug = 50;
333 SYSCTL_INT(_machdep, OID_AUTO, pmap_pv_debug, CTLFLAG_RW,
334     &pmap_pv_debug, 0, "");
335 
336 static long vm_pmap_pv_entries;
337 SYSCTL_LONG(_vm, OID_AUTO, pmap_pv_entries, CTLFLAG_RD,
338     &vm_pmap_pv_entries, 0, "");
339 
340 /* Standard user access functions */
341 extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
342     size_t *lencopied);
343 extern int std_copyin (const void *udaddr, void *kaddr, size_t len);
344 extern int std_copyout (const void *kaddr, void *udaddr, size_t len);
345 extern int std_fubyte (const uint8_t *base);
346 extern int std_subyte (uint8_t *base, uint8_t byte);
347 extern int32_t std_fuword32 (const uint32_t *base);
348 extern int64_t std_fuword64 (const uint64_t *base);
349 extern int std_suword64 (uint64_t *base, uint64_t word);
350 extern int std_suword32 (uint32_t *base, int word);
351 extern uint32_t std_swapu32 (volatile uint32_t *base, uint32_t v);
352 extern uint64_t std_swapu64 (volatile uint64_t *base, uint64_t v);
353 extern uint32_t std_fuwordadd32 (volatile uint32_t *base, uint32_t v);
354 extern uint64_t std_fuwordadd64 (volatile uint64_t *base, uint64_t v);
355 
356 #if 0
357 static void pv_hold(pv_entry_t pv);
358 #endif
359 static int _pv_hold_try(pv_entry_t pv
360 				PMAP_DEBUG_DECL);
361 static void pv_drop(pv_entry_t pv);
362 static void _pv_lock(pv_entry_t pv
363 				PMAP_DEBUG_DECL);
364 static void pv_unlock(pv_entry_t pv);
365 static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
366 				PMAP_DEBUG_DECL);
367 static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp
368 				PMAP_DEBUG_DECL);
369 static void _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL);
370 static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex,
371 				vm_pindex_t **pmarkp, int *errorp);
372 static void pv_put(pv_entry_t pv);
373 static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
374 static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
375 		      pv_entry_t *pvpp);
376 static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
377 			pmap_inval_bulk_t *bulk, int destroy);
378 static vm_page_t pmap_remove_pv_page(pv_entry_t pv, int clrpgbits);
379 static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp,
380 			pmap_inval_bulk_t *bulk);
381 
382 struct pmap_scan_info;
383 static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
384 		      vm_pindex_t *pte_placemark, pv_entry_t pt_pv,
385 		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
386 static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info,
387 		      vm_pindex_t *pte_placemark, pv_entry_t pt_pv,
388 		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
389 
390 static void x86_64_protection_init (void);
391 static void create_pagetables(vm_paddr_t *firstaddr);
392 static void pmap_remove_all (vm_page_t m);
393 static boolean_t pmap_testbit (vm_page_t m, int bit);
394 
395 static pt_entry_t *pmap_pte_quick (pmap_t pmap, vm_offset_t va);
396 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
397 
398 static void pmap_pinit_defaults(struct pmap *pmap);
399 static void pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark);
400 static void pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark);
401 
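/*
 * Red-black tree comparison function for pv_entry's, ordered by
 * pv_pindex.
 */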
402 static int
403 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
404 {
405 	if (pv1->pv_pindex < pv2->pv_pindex)
406 		return(-1);
407 	if (pv1->pv_pindex > pv2->pv_pindex)
408 		return(1);
409 	return(0);
410 }
411 
412 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
413              pv_entry_compare, vm_pindex_t, pv_pindex);
414 
415 /*
416  * We have removed a managed pte.  The page might not be hard or soft-busied
417  * at this point so we have to be careful.
418  *
419  * If advanced mode is enabled we can clear PG_MAPPED/WRITEABLE only if
420  * MAPPEDMULTI is not set.  This must be done atomically against possible
421  * concurrent pmap_enter()s occurring at the same time.  If MULTI is set
422  * then the kernel may have to call vm_page_protect() later on to clean
423  * the bits up.  This is particularly important for kernel_map/kernel_object
424  * mappings due to the expense of scanning the kernel_object's vm_backing's.
425  *
426  * If advanced mode is not enabled we update our tracking counts and
427  * synchronize PG_MAPPED/WRITEABLE later on in pmap_mapped_sync().
428  */
429 static __inline
430 void
431 pmap_removed_pte(vm_page_t m, pt_entry_t pte)
432 {
433 	int flags;
434 	int nflags;
435 
436 	flags = m->flags;
437 	cpu_ccfence();
438 	while ((flags & PG_MAPPEDMULTI) == 0) {
439 		nflags = flags & ~(PG_MAPPED | PG_WRITEABLE);
440 		if (atomic_fcmpset_int(&m->flags, &flags, nflags))
441 			break;
442 	}
443 }
444 
445 /*
446  * Move the kernel virtual free pointer to the next
447  * 2MB.  This is used to help improve performance
448  * by using a large (2MB) page for much of the kernel
449  * (.text, .data, .bss)
450  */
451 static
452 vm_offset_t
453 pmap_kmem_choose(vm_offset_t addr)
454 {
455 	vm_offset_t newaddr = addr;
456 
457 	newaddr = roundup2(addr, NBPDR);
458 	return newaddr;
459 }
460 
461 /*
462  * Returns the pindex of a page table entry (representing a terminal page).
463  * There are NUPTE_TOTAL page table entries possible (a huge number)
464  *
465  * x86-64 has a 48-bit address space, where bit 47 is sign-extended out.
466  * We want to properly translate negative KVAs.
467  */
468 static __inline
469 vm_pindex_t
470 pmap_pte_pindex(vm_offset_t va)
471 {
472 	return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1));
473 }
474 
475 /*
476  * Returns the pindex of a page table.
477  */
478 static __inline
479 vm_pindex_t
480 pmap_pt_pindex(vm_offset_t va)
481 {
482 	return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1)));
483 }
484 
485 /*
486  * Returns the pindex of a page directory.
487  */
488 static __inline
489 vm_pindex_t
490 pmap_pd_pindex(vm_offset_t va)
491 {
492 	return (NUPTE_TOTAL + NUPT_TOTAL +
493 		((va >> PDPSHIFT) & (NUPD_TOTAL - 1)));
494 }
495 
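/*
 * Returns the pindex of a page directory pointer (PDP) table.
 */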
496 static __inline
497 vm_pindex_t
498 pmap_pdp_pindex(vm_offset_t va)
499 {
500 	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
501 		((va >> PML4SHIFT) & (NUPDP_TOTAL - 1)));
502 }
503 
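/*
 * Returns the pindex of the PML4 table itself (the single index at the
 * top of the pindex space).
 */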
504 static __inline
505 vm_pindex_t
506 pmap_pml4_pindex(void)
507 {
508 	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL);
509 }
510 
511 /*
512  * Return various *clipped* indexes for a given VA.
513  *
514  * Returns the index of a PTE in a page table (PT), representing
515  * a terminal page.
516  */
517 static __inline
518 vm_pindex_t
519 pmap_pte_index(vm_offset_t va)
520 {
521 	return ((va >> PAGE_SHIFT) & ((1UL << NPTEPGSHIFT) - 1));
522 }
523 
524 /*
525  * Returns the index of a PDE in a page directory (PD) table, representing
526  * a page table (PT).
527  */
528 static __inline
529 vm_pindex_t
530 pmap_pt_index(vm_offset_t va)
531 {
532 	return ((va >> PDRSHIFT) & ((1UL << NPDEPGSHIFT) - 1));
533 }
534 
535 /*
536  * Returns the index of a PDPE in a page directory pointer (PDP) table,
537  * representing a page directory (PD) table.
538  */
539 static __inline
540 vm_pindex_t
541 pmap_pd_index(vm_offset_t va)
542 {
543 	return ((va >> PDPSHIFT) & ((1UL << NPDPEPGSHIFT) - 1));
544 }
545 
546 /*
547  * Returns the index of a PML4E in the PML4 table, representing a page
548  * directory pointer (PDP) table.
549  */
550 static __inline
551 vm_pindex_t
552 pmap_pdp_index(vm_offset_t va)
553 {
554 	return ((va >> PML4SHIFT) & ((1UL << NPML4EPGSHIFT) - 1));
555 }
556 
557 /*
558  * Of all the layers (PT, PD, PDP, PML4) the best one to cache is
559  * the PT layer.  This will speed up core pmap operations considerably.
560  *
561  * NOTE: The pmap spinlock does not need to be held but the passed-in pv
562  *	 must be in a known associated state (typically by being locked when
563  *	 the pmap spinlock isn't held).  We allow the race for that case.
564  *
565  * NOTE: pm_pvhint* is only accessed (read) with the spin-lock held, using
566  *	 cpu_ccfence() to prevent compiler optimizations from reloading the
567  *	 field.
568  */
569 static __inline
570 void
571 pv_cache(pmap_t pmap, pv_entry_t pv, vm_pindex_t pindex)
572 {
573 	if (pindex < pmap_pt_pindex(0)) {
574 		;
575 	} else if (pindex < pmap_pd_pindex(0)) {
576 		pmap->pm_pvhint_pt = pv;
577 	}
578 }
579 
580 /*
581  * Locate the requested pt_entry
582  */
583 static __inline
584 pv_entry_t
585 pv_entry_lookup(pmap_t pmap, vm_pindex_t pindex)
586 {
587 	pv_entry_t pv;
588 
589 	if (pindex < pmap_pt_pindex(0))
590 		return NULL;
591 #if 1
592 	if (pindex < pmap_pd_pindex(0))
593 		pv = pmap->pm_pvhint_pt;
594 	else
595 		pv = NULL;
596 	cpu_ccfence();
597 	if (pv == NULL || pv->pv_pmap != pmap) {
598 		pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
599 		if (pv)
600 			pv_cache(pmap, pv, pindex);
601 	} else if (pv->pv_pindex != pindex) {
602 		pv = pv_entry_rb_tree_RB_LOOKUP_REL(&pmap->pm_pvroot,
603 						    pindex, pv);
604 		if (pv)
605 			pv_cache(pmap, pv, pindex);
606 	}
607 #else
608 	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
609 #endif
610 	return pv;
611 }
612 
613 /*
614  * pmap_pte_quick:
615  *
616  *	Super fast pmap_pte routine best used when scanning the pv lists.
617  *	This eliminates many coarse-grained invltlb calls.  Note that many of
618  *	the pv list scans are across different pmaps and it is very wasteful
619  *	to do an entire invltlb when checking a single mapping.
620  */
621 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);
622 
623 static
624 pt_entry_t *
625 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
626 {
627 	return pmap_pte(pmap, va);
628 }
629 
630 /*
631  * The placemarker hash must be broken up into four zones so lock
632  * ordering semantics continue to work (e.g. pte, pt, pd, then pdp).
633  *
634  * Placemarkers are used to 'lock' page table indices that do not have
635  * a pv_entry.  This allows the pmap to support managed and unmanaged
636  * pages and shared page tables.
637  */
638 #define PM_PLACE_BASE	(PM_PLACEMARKS >> 2)
639 
640 static __inline
641 vm_pindex_t *
642 pmap_placemarker_hash(pmap_t pmap, vm_pindex_t pindex)
643 {
644 	int hi;
645 
646 	if (pindex < pmap_pt_pindex(0))		/* zone 0 - PTE */
647 		hi = 0;
648 	else if (pindex < pmap_pd_pindex(0))	/* zone 1 - PT */
649 		hi = PM_PLACE_BASE;
650 	else if (pindex < pmap_pdp_pindex(0))	/* zone 2 - PD */
651 		hi = PM_PLACE_BASE << 1;
652 	else					/* zone 3 - PDP (and PML4E) */
653 		hi = PM_PLACE_BASE | (PM_PLACE_BASE << 1);
654 	hi += pindex & (PM_PLACE_BASE - 1);
655 
656 	return (&pmap->pm_placemarks[hi]);
657 }
658 
659 
660 /*
661  * Generic procedure to index a pte from a pt, pd, or pdp.
662  *
663  * NOTE: Normally passed a pindex from pmap_xx_index().  pmap_xx_pindex() is
664  *	 NOT a page table page index but is instead a PV lookup index.
665  */
666 static
667 void *
668 pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex)
669 {
670 	pt_entry_t *pte;
671 
672 	pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m));
673 	return(&pte[pindex]);
674 }
675 
676 /*
677  * Return pointer to PDP slot in the PML4
678  */
679 static __inline
680 pml4_entry_t *
681 pmap_pdp(pmap_t pmap, vm_offset_t va)
682 {
683 	return (&pmap->pm_pml4[pmap_pdp_index(va)]);
684 }
685 
686 /*
687  * Return pointer to PD slot in the PDP given a pointer to the PDP
688  */
689 static __inline
690 pdp_entry_t *
691 pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va)
692 {
693 	pdp_entry_t *pd;
694 
695 	pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME);
696 	return (&pd[pmap_pd_index(va)]);
697 }
698 
699 /*
700  * Return pointer to PD slot in the PDP.
701  */
702 static __inline
703 pdp_entry_t *
704 pmap_pd(pmap_t pmap, vm_offset_t va)
705 {
706 	pml4_entry_t *pdp;
707 
708 	pdp = pmap_pdp(pmap, va);
709 	if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0)
710 		return NULL;
711 	return (pmap_pdp_to_pd(*pdp, va));
712 }
713 
714 /*
715  * Return pointer to PT slot in the PD given a pointer to the PD
716  */
717 static __inline
718 pd_entry_t *
719 pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va)
720 {
721 	pd_entry_t *pt;
722 
723 	pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME);
724 	return (&pt[pmap_pt_index(va)]);
725 }
726 
727 /*
728  * Return pointer to PT slot in the PD
729  *
730  * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs,
731  *		     so we cannot lookup the PD via the PDP.  Instead we
732  *		     must look it up via the pmap.
733  */
734 static __inline
735 pd_entry_t *
736 pmap_pt(pmap_t pmap, vm_offset_t va)
737 {
738 	pdp_entry_t *pd;
739 	pv_entry_t pv;
740 	vm_pindex_t pd_pindex;
741 	vm_paddr_t phys;
742 
743 	if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
744 		pd_pindex = pmap_pd_pindex(va);
745 		spin_lock_shared(&pmap->pm_spin);
746 		pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex);
747 		if (pv == NULL || pv->pv_m == NULL) {
748 			spin_unlock_shared(&pmap->pm_spin);
749 			return NULL;
750 		}
751 		phys = VM_PAGE_TO_PHYS(pv->pv_m);
752 		spin_unlock_shared(&pmap->pm_spin);
753 		return (pmap_pd_to_pt(phys, va));
754 	} else {
755 		pd = pmap_pd(pmap, va);
756 		if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0)
757 			 return NULL;
758 		return (pmap_pd_to_pt(*pd, va));
759 	}
760 }
761 
762 /*
763  * Return pointer to PTE slot in the PT given a pointer to the PT
764  */
765 static __inline
766 pt_entry_t *
767 pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va)
768 {
769 	pt_entry_t *pte;
770 
771 	pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME);
772 	return (&pte[pmap_pte_index(va)]);
773 }
774 
775 /*
776  * Return pointer to PTE slot in the PT
777  */
778 static __inline
779 pt_entry_t *
780 pmap_pte(pmap_t pmap, vm_offset_t va)
781 {
782 	pd_entry_t *pt;
783 
784 	pt = pmap_pt(pmap, va);
785 	if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0)
786 		 return NULL;
787 	if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0)
788 		return ((pt_entry_t *)pt);
789 	return (pmap_pt_to_pte(*pt, va));
790 }
791 
792 /*
793  * Return address of PT slot in PD (KVM only)
794  *
795  * Cannot be used for user page tables because it might interfere with
796  * the shared page-table-page optimization (pmap_mmu_optimize).
797  */
798 static __inline
799 pd_entry_t *
800 vtopt(vm_offset_t va)
801 {
802 	uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
803 				  NPML4EPGSHIFT)) - 1);
804 
805 	return (PDmap + ((va >> PDRSHIFT) & mask));
806 }
807 
808 /*
809  * KVM - return address of PTE slot in PT
810  */
811 static __inline
812 pt_entry_t *
813 vtopte(vm_offset_t va)
814 {
815 	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
816 				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
817 
818 	return (PTmap + ((va >> PAGE_SHIFT) & mask));
819 }
820 
821 /*
822  * Returns the physical address translation from va for a user address.
823  * (vm_paddr_t)-1 is returned on failure.
824  */
825 vm_paddr_t
826 uservtophys(vm_offset_t va)
827 {
828 	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
829 				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
830 	vm_paddr_t pa;
831 	pt_entry_t pte;
832 	pmap_t pmap;
833 
834 	pmap = vmspace_pmap(mycpu->gd_curthread->td_lwp->lwp_vmspace);
835 	pa = (vm_paddr_t)-1;
836 	if (va < VM_MAX_USER_ADDRESS) {
837 		pte = kreadmem64(PTmap + ((va >> PAGE_SHIFT) & mask));
838 		if (pte & pmap->pmap_bits[PG_V_IDX])
839 			pa = (pte & PG_FRAME) | (va & PAGE_MASK);
840 	}
841 	return pa;
842 }
843 
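/*
 * Boot-time physical page allocator: carves 'n' pages off *firstaddr,
 * zeroes them, advances *firstaddr, and returns the starting physical
 * address.  Only usable while we are still running (mostly) V=P during
 * early bootstrap, since the physical address is dereferenced directly.
 */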
844 static uint64_t
845 allocpages(vm_paddr_t *firstaddr, long n)
846 {
847 	uint64_t ret;
848 
849 	ret = *firstaddr;
850 	bzero((void *)ret, n * PAGE_SIZE);
851 	*firstaddr += n * PAGE_SIZE;
852 	return (ret);
853 }
854 
855 static
856 void
857 create_pagetables(vm_paddr_t *firstaddr)
858 {
859 	long i;		/* must be 64 bits */
860 	long nkpt_base;
861 	long nkpt_phys;
862 	long nkpd_phys;
863 	int j;
864 
865 	/*
866 	 * We are running (mostly) V=P at this point
867 	 *
868 	 * Calculate how many 1GB PD entries in our PDP pages are needed
869 	 * for the DMAP.  This is only allocated if the system does not
870 	 * support 1GB pages.  Otherwise ndmpdp is simply a count of
871 	 * the number of 1G terminal entries in our PDP pages are needed.
872 	 * the number of 1GB terminal entries needed in our PDP pages.
873 	 * NOTE: Maxmem is in pages
874 	 */
875 	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
876 	if (ndmpdp < 4)		/* Minimum 4GB of DMAP */
877 		ndmpdp = 4;
878 
879 #if 0
880 	/*
881 	 * HACK XXX fix me - Some laptops map the EFI framebuffer in
882 	 * very high physical addresses and the DMAP winds up being too
883 	 * small.  The EFI framebuffer has to be mapped for the console
884 	 * very early and the DMAP is how it does it.
885 	 */
886 	if (ndmpdp < 512)	/* Minimum 512GB of DMAP */
887 		ndmpdp = 512;
888 #endif
889 
890 	KKASSERT(ndmpdp <= NDMPML4E * NPML4EPG);
891 	DMapMaxAddress = DMAP_MIN_ADDRESS +
892 			 ((ndmpdp * NPDEPG) << PDRSHIFT);
893 
894 	/*
895 	 * Starting at KERNBASE - map all 2G worth of page table pages.
896 	 * KERNBASE is offset -2G from the end of kvm.  This will accommodate
897 	 * all KVM allocations above KERNBASE, including the SYSMAPs below.
898 	 *
899 	 * We do this by allocating 2*512 PT pages.  Each PT page can map
900 	 * 2MB, for 2GB total.
901 	 */
902 	nkpt_base = (NPDPEPG - KPDPI) * NPTEPG;	/* typically 2 x 512 */
903 
904 	/*
905 	 * Starting at the beginning of kvm (VM_MIN_KERNEL_ADDRESS),
906 	 * calculate how many page table pages we need to preallocate
907 	 * for early vm_map allocations.
908 	 *
909 	 * A few extra won't hurt, they will get used up in the running
910 	 * system.
911 	 *
912 	 * vm_page array
913 	 * initial pventry's
914 	 */
915 	nkpt_phys = howmany(Maxmem * sizeof(struct vm_page), NBPDR);
916 	nkpt_phys += howmany(Maxmem * sizeof(struct pv_entry), NBPDR);
917 	nkpt_phys += 128;	/* a few extra */
918 
919 	/*
920 	 * The highest value nkpd_phys can be set to is
921 	 * NKPDPE - (NPDPEPG - KPDPI) (i.e. NKPDPE - 2).
922 	 *
923 	 * Doing so would cause all PD pages to be pre-populated for
924 	 * a maximal KVM space (approximately 16*512 pages, or 32MB).
925 	 * We can save memory by not doing this.
926 	 */
927 	nkpd_phys = (nkpt_phys + NPDPEPG - 1) / NPDPEPG;
928 
929 	/*
930 	 * Allocate pages
931 	 *
932 	 * Normally NKPML4E=1-16 (1-16 kernel PDP pages)
933 	 * Normally NKPDPE= NKPML4E*512-1 (511 min kernel PD pages)
934 	 *
935 	 * Only allocate enough PD pages
936 	 * NOTE: We allocate all kernel PD pages up-front, typically
937 	 *	 ~511G of KVM, requiring 511 PD pages.
938 	 */
939 	KPTbase = allocpages(firstaddr, nkpt_base);	/* KERNBASE to end */
940 	KPTphys = allocpages(firstaddr, nkpt_phys);	/* KVA start */
941 	KPML4phys = allocpages(firstaddr, 1);		/* recursive PML4 map */
942 	KPDPphys = allocpages(firstaddr, NKPML4E);	/* kernel PDP pages */
943 	KPDphys = allocpages(firstaddr, nkpd_phys);	/* kernel PD pages */
944 
945 	/*
946 	 * Alloc PD pages for the area starting at KERNBASE.
947 	 */
948 	KPDbase = allocpages(firstaddr, NPDPEPG - KPDPI);
949 
950 	/*
951 	 * Stuff for our DMAP.  Use 2MB pages even when 1GB pages
952 	 * are available in order to allow APU code to adjust page
953 	 * attributes on a fixed grain (see pmap_change_attr()).
954 	 */
955 	DMPDPphys = allocpages(firstaddr, NDMPML4E);
956 #if 1
957 	DMPDphys = allocpages(firstaddr, ndmpdp);
958 #else
959 	if ((amd_feature & AMDID_PAGE1GB) == 0)
960 		DMPDphys = allocpages(firstaddr, ndmpdp);
961 #endif
962 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
963 
964 	/*
965 	 * Fill in the underlying page table pages for the area around
966 	 * KERNBASE.  This remaps low physical memory to KERNBASE.
967 	 *
968 	 * Read-only from zero to physfree
969 	 * XXX not fully used, underneath 2M pages
970 	 */
971 	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
972 		((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
973 		((pt_entry_t *)KPTbase)[i] |=
974 		    pmap_bits_default[PG_RW_IDX] |
975 		    pmap_bits_default[PG_V_IDX] |
976 		    pmap_bits_default[PG_G_IDX];
977 	}
978 
979 	/*
980 	 * Now map the initial kernel page tables.  One block of page
981 	 * tables is placed at the beginning of kernel virtual memory,
982 	 * and another block is placed at KERNBASE to map the kernel binary,
983 	 * data, bss, and initial pre-allocations.
984 	 */
985 	for (i = 0; i < nkpt_base; i++) {
986 		((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
987 		((pd_entry_t *)KPDbase)[i] |=
988 		    pmap_bits_default[PG_RW_IDX] |
989 		    pmap_bits_default[PG_V_IDX];
990 	}
991 	for (i = 0; i < nkpt_phys; i++) {
992 		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
993 		((pd_entry_t *)KPDphys)[i] |=
994 		    pmap_bits_default[PG_RW_IDX] |
995 		    pmap_bits_default[PG_V_IDX];
996 	}
997 
998 	/*
999 	 * Map from zero to end of allocations using 2M pages as an
1000 	 * optimization.  This will bypass some of the KPTbase pages
1001 	 * above in the KERNBASE area.
1002 	 */
1003 	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
1004 		((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
1005 		((pd_entry_t *)KPDbase)[i] |=
1006 		    pmap_bits_default[PG_RW_IDX] |
1007 		    pmap_bits_default[PG_V_IDX] |
1008 		    pmap_bits_default[PG_PS_IDX] |
1009 		    pmap_bits_default[PG_G_IDX];
1010 	}
1011 
1012 	/*
1013 	 * Load PD addresses into the PDP pages for primary KVA space to
1014 	 * cover existing page tables.  PD's for KERNBASE are handled in
1015 	 * the next loop.
1016 	 *
1017 	 * expected to pre-populate all of its PDs.  See NKPDPE in vmparam.h.
1018 	 */
1019 	for (i = 0; i < nkpd_phys; i++) {
1020 		((pdp_entry_t *)KPDPphys)[NKPML4E * NPDPEPG - NKPDPE + i] =
1021 				KPDphys + (i << PAGE_SHIFT);
1022 		((pdp_entry_t *)KPDPphys)[NKPML4E * NPDPEPG - NKPDPE + i] |=
1023 		    pmap_bits_default[PG_RW_IDX] |
1024 		    pmap_bits_default[PG_V_IDX] |
1025 		    pmap_bits_default[PG_A_IDX];
1026 	}
1027 
1028 	/*
1029 	 * Load PDs for KERNBASE to the end
1030 	 */
1031 	i = (NKPML4E - 1) * NPDPEPG + KPDPI;
1032 	for (j = 0; j < NPDPEPG - KPDPI; ++j) {
1033 		((pdp_entry_t *)KPDPphys)[i + j] =
1034 				KPDbase + (j << PAGE_SHIFT);
1035 		((pdp_entry_t *)KPDPphys)[i + j] |=
1036 		    pmap_bits_default[PG_RW_IDX] |
1037 		    pmap_bits_default[PG_V_IDX] |
1038 		    pmap_bits_default[PG_A_IDX];
1039 	}
1040 
1041 	/*
1042 	 * Now set up the direct map space using either 2MB or 1GB pages
1043 	 * Preset PG_M and PG_A because demotion expects it.
1044 	 *
1045 	 * When filling in entries in the PD pages make sure any excess
1046 	 * entries are set to zero as we allocated enough PD pages
1047 	 *
1048 	 * Stuff for our DMAP.  Use 2MB pages even when 1GB pages
1049 	 * are available in order to allow APU code to adjust page
1050 	 * attributes on a fixed grain (see pmap_change_attr()).
1051 	 */
1052 #if 0
1053 	if ((amd_feature & AMDID_PAGE1GB) == 0)
1054 #endif
1055 	{
1056 		/*
1057 		 * Use 2MB pages
1058 		 */
1059 		for (i = 0; i < NPDEPG * ndmpdp; i++) {
1060 			((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
1061 			((pd_entry_t *)DMPDphys)[i] |=
1062 			    pmap_bits_default[PG_RW_IDX] |
1063 			    pmap_bits_default[PG_V_IDX] |
1064 			    pmap_bits_default[PG_PS_IDX] |
1065 			    pmap_bits_default[PG_G_IDX] |
1066 			    pmap_bits_default[PG_M_IDX] |
1067 			    pmap_bits_default[PG_A_IDX];
1068 		}
1069 
1070 		/*
1071 		 * And the direct map space's PDP
1072 		 */
1073 		for (i = 0; i < ndmpdp; i++) {
1074 			((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
1075 							(i << PAGE_SHIFT);
1076 			((pdp_entry_t *)DMPDPphys)[i] |=
1077 			    pmap_bits_default[PG_RW_IDX] |
1078 			    pmap_bits_default[PG_V_IDX] |
1079 			    pmap_bits_default[PG_A_IDX];
1080 		}
1081 	}
1082 #if 0
1083 	else {
1084 		/*
1085 		 * 1GB pages
1086 		 */
1087 		for (i = 0; i < ndmpdp; i++) {
1088 			((pdp_entry_t *)DMPDPphys)[i] =
1089 						(vm_paddr_t)i << PDPSHIFT;
1090 			((pdp_entry_t *)DMPDPphys)[i] |=
1091 			    pmap_bits_default[PG_RW_IDX] |
1092 			    pmap_bits_default[PG_V_IDX] |
1093 			    pmap_bits_default[PG_PS_IDX] |
1094 			    pmap_bits_default[PG_G_IDX] |
1095 			    pmap_bits_default[PG_M_IDX] |
1096 			    pmap_bits_default[PG_A_IDX];
1097 		}
1098 	}
1099 #endif
1100 
1101 	/* And recursively map PML4 to itself in order to get PTmap */
1102 	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
1103 	((pdp_entry_t *)KPML4phys)[PML4PML4I] |=
1104 	    pmap_bits_default[PG_RW_IDX] |
1105 	    pmap_bits_default[PG_V_IDX] |
1106 	    pmap_bits_default[PG_A_IDX];
1107 
1108 	/*
1109 	 * Connect the Direct Map slots up to the PML4
1110 	 */
1111 	for (j = 0; j < NDMPML4E; ++j) {
1112 		((pdp_entry_t *)KPML4phys)[DMPML4I + j] =
1113 		    (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) |
1114 		    pmap_bits_default[PG_RW_IDX] |
1115 		    pmap_bits_default[PG_V_IDX] |
1116 		    pmap_bits_default[PG_A_IDX];
1117 	}
1118 
1119 	/*
1120 	 * Connect the KVA slot up to the PML4
1121 	 */
1122 	for (j = 0; j < NKPML4E; ++j) {
1123 		((pdp_entry_t *)KPML4phys)[KPML4I + j] =
1124 		    KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT);
1125 		((pdp_entry_t *)KPML4phys)[KPML4I + j] |=
1126 		    pmap_bits_default[PG_RW_IDX] |
1127 		    pmap_bits_default[PG_V_IDX] |
1128 		    pmap_bits_default[PG_A_IDX];
1129 	}
1130 	cpu_mfence();
1131 	cpu_invltlb();
1132 }
1133 
1134 /*
1135  *	Bootstrap the system enough to run with virtual memory.
1136  *
1137  *	On x86_64 this is called after mapping has already been enabled
1138  *	and just syncs the pmap module with what has already been done.
1139  *	[We can't call it easily with mapping off since the kernel is not
1140  *	mapped with PA == VA, hence we would have to relocate every address
1141  *	from the linked base (virtual) address "KERNBASE" to the actual
1142  *	(physical) address starting relative to 0]
1143  */
1144 void
1145 pmap_bootstrap(vm_paddr_t *firstaddr)
1146 {
1147 	vm_offset_t va;
1148 	pt_entry_t *pte;
1149 	int i;
1150 
1151 	KvaStart = VM_MIN_KERNEL_ADDRESS;
1152 	KvaEnd = VM_MAX_KERNEL_ADDRESS;
1153 	KvaSize = KvaEnd - KvaStart;
1154 
1155 	avail_start = *firstaddr;
1156 
1157 	/*
1158 	 * Create an initial set of page tables to run the kernel in.
1159 	 */
1160 	create_pagetables(firstaddr);
1161 
1162 	virtual2_start = KvaStart;
1163 	virtual2_end = PTOV_OFFSET;
1164 
1165 	virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr;
1166 	virtual_start = pmap_kmem_choose(virtual_start);
1167 
1168 	virtual_end = VM_MAX_KERNEL_ADDRESS;
1169 
1170 	/* XXX do %cr0 as well */
1171 	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
1172 	load_cr3(KPML4phys);
1173 
1174 	/*
1175 	 * Initialize protection array.
1176 	 */
1177 	x86_64_protection_init();
1178 
1179 	/*
1180 	 * The kernel's pmap is statically allocated so we don't have to use
1181 	 * pmap_create, which is unlikely to work correctly at this part of
1182 	 * the boot sequence (XXX and which no longer exists).
1183 	 */
1184 	kernel_pmap->pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
1185 	kernel_pmap->pm_count = 1;
1186 	CPUMASK_ASSALLONES(kernel_pmap->pm_active);
1187 	RB_INIT(&kernel_pmap->pm_pvroot);
1188 	spin_init(&kernel_pmap->pm_spin, "pmapbootstrap");
1189 	for (i = 0; i < PM_PLACEMARKS; ++i)
1190 		kernel_pmap->pm_placemarks[i] = PM_NOPLACEMARK;
1191 
1192 	/*
1193 	 * Reserve some special page table entries/VA space for temporary
1194 	 * mapping of pages.
1195 	 */
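	/*
	 * SYSMAP(c, p, v, n): assign the current 'va' (cast to type 'c') to
	 * 'v', record the current 'pte' pointer in 'p', and advance both by
	 * 'n' pages/entries.
	 */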
1196 #define	SYSMAP(c, p, v, n)	\
1197 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
1198 
1199 	va = virtual_start;
1200 	pte = vtopte(va);
1201 
1202 	/*
1203 	 * CMAP1 is used for zeroing and copying pages.
1204 	 */
1205 	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
1206 
1207 	/*
1208 	 * Crashdump maps.
1209 	 */
1210 	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
1211 
1212 	/*
1213 	 * ptvmmap is used for reading arbitrary physical pages via
1214 	 * /dev/mem.
1215 	 */
1216 	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
1217 
1218 	/*
1219 	 * msgbufp is used to map the system message buffer.
1220 	 * XXX msgbufmap is not used.
1221 	 */
1222 	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
1223 	       atop(round_page(MSGBUF_SIZE)))
1224 
1225 	virtual_start = va;
1226 	virtual_start = pmap_kmem_choose(virtual_start);
1227 
1228 	*CMAP1 = 0;
1229 
1230 	/*
1231 	 * PG_G is terribly broken on SMP because we IPI invltlb's in some
1232 	 * cases rather than invlpg.  Actually, I don't even know why it
1233 	 * works under UP because self-referential page table mappings
1234 	 */
1235 //	pgeflag = 0;
1236 
1237 	cpu_invltlb();
1238 
1239 	/* Initialize the PAT MSR */
1240 	pmap_init_pat();
1241 	pmap_pinit_defaults(kernel_pmap);
1242 
1243 	TUNABLE_INT_FETCH("machdep.pmap_fast_kernel_cpusync",
1244 			  &pmap_fast_kernel_cpusync);
1245 
1246 }
1247 
1248 /*
1249  * Setup the PAT MSR.
1250  */
1251 void
1252 pmap_init_pat(void)
1253 {
1254 	uint64_t pat_msr;
1255 	u_long cr0, cr4;
1256 	int i;
1257 
1258 	/*
1259 	 * Default values mapping PATi,PCD,PWT bits at system reset.
1260 	 * The default values effectively ignore the PATi bit by
1261 	 * repeating the encodings for 0-3 in 4-7, and map the PCD
1262 	 * and PWT bit combinations to the expected PAT types.
1263 	 */
1264 	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |	/* 000 */
1265 		  PAT_VALUE(1, PAT_WRITE_THROUGH) |	/* 001 */
1266 		  PAT_VALUE(2, PAT_UNCACHED) |		/* 010 */
1267 		  PAT_VALUE(3, PAT_UNCACHEABLE) |	/* 011 */
1268 		  PAT_VALUE(4, PAT_WRITE_BACK) |	/* 100 */
1269 		  PAT_VALUE(5, PAT_WRITE_THROUGH) |	/* 101 */
1270 		  PAT_VALUE(6, PAT_UNCACHED) |		/* 110 */
1271 		  PAT_VALUE(7, PAT_UNCACHEABLE);	/* 111 */
1272 	pat_pte_index[PAT_WRITE_BACK]	= 0;
1273 	pat_pte_index[PAT_WRITE_THROUGH]= 0         | X86_PG_NC_PWT;
1274 	pat_pte_index[PAT_UNCACHED]	= X86_PG_NC_PCD;
1275 	pat_pte_index[PAT_UNCACHEABLE]	= X86_PG_NC_PCD | X86_PG_NC_PWT;
1276 	pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE];
1277 	pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE];
1278 
1279 	if (cpu_feature & CPUID_PAT) {
1280 		/*
1281 		 * If we support the PAT then set-up entries for
1282 		 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns
1283 		 * 5 and 6.
1284 		 */
1285 		pat_msr = (pat_msr & ~PAT_MASK(5)) |
1286 			  PAT_VALUE(5, PAT_WRITE_PROTECTED);
1287 		pat_msr = (pat_msr & ~PAT_MASK(6)) |
1288 			  PAT_VALUE(6, PAT_WRITE_COMBINING);
1289 		pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | X86_PG_NC_PWT;
1290 		pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT | X86_PG_NC_PCD;
1291 
1292 		/*
1293 		 * Then enable the PAT
1294 		 */
1295 
1296 		/* Disable PGE. */
1297 		cr4 = rcr4();
1298 		load_cr4(cr4 & ~CR4_PGE);
1299 
1300 		/* Disable caches (CD = 1, NW = 0). */
1301 		cr0 = rcr0();
1302 		load_cr0((cr0 & ~CR0_NW) | CR0_CD);
1303 
1304 		/* Flushes caches and TLBs. */
1305 		wbinvd();
1306 		cpu_invltlb();
1307 
1308 		/* Update PAT and index table. */
1309 		wrmsr(MSR_PAT, pat_msr);
1310 
1311 		/* Flush caches and TLBs again. */
1312 		wbinvd();
1313 		cpu_invltlb();
1314 
1315 		/* Restore caches and PGE. */
1316 		load_cr0(cr0);
1317 		load_cr4(cr4);
1318 		PatMsr = pat_msr;
1319 	}
1320 
1321 	for (i = 0; i < 8; ++i) {
1322 		pt_entry_t pte;
1323 
1324 		pte = pat_pte_index[i];
1325 		if (pte & X86_PG_PTE_PAT) {
1326 			pte &= ~X86_PG_PTE_PAT;
1327 			pte |= X86_PG_PDE_PAT;
1328 		}
1329 		pat_pde_index[i] = pte;
1330 	}
1331 }
1332 
1333 /*
1334  * Set 4mb pdir for mp startup
1335  */
1336 void
1337 pmap_set_opt(void)
1338 {
1339 	if (cpu_feature & CPUID_PSE) {
1340 		load_cr4(rcr4() | CR4_PSE);
1341 		if (mycpu->gd_cpuid == 0) 	/* only on BSP */
1342 			cpu_invltlb();
1343 	}
1344 
1345 	/*
1346 	 * Check for SMAP and SMEP support and enable if available.  Must be done
1347 	 * after cr3 is loaded, and on all cores.
1348 	 */
1349 	if (cpu_stdext_feature & CPUID_STDEXT_SMAP) {
1350 		load_cr4(rcr4() | CR4_SMAP);
1351 	}
1352 	if (cpu_stdext_feature & CPUID_STDEXT_SMEP) {
1353 		load_cr4(rcr4() | CR4_SMEP);
1354 	}
1355 }
1356 
1357 /*
1358  * SMAP is just a processor flag, but SMEP can only be enabled
1359  * and disabled via CR4.  We still use the processor flag to
1360  * disable SMAP because the page-fault/trap code checks it, in
1361  * order to allow a page-fault to actually occur.
1362  */
1363 void
1364 smap_smep_disable(void)
1365 {
1366 	/*
1367 	 * disable SMAP.  This also bypasses a software failsafe check
1368 	 * in the trap() code.
1369 	 */
1370 	smap_open();
1371 
1372 	/*
1373 	 * Also needed to bypass a software failsafe check in the trap()
1374 	 * code and allow the userspace address fault from kernel mode
1375 	 * to proceed.
1376 	 *
1377 	 * Note that this will not reload %rip because pcb_onfault_rsp will
1378 	 * not match.  Just setting it to non-NULL is sufficient to bypass
1379 	 * the checks.
1380 	 */
1381 	curthread->td_pcb->pcb_onfault = (void *)1;
1382 
1383 	/*
1384 	 * Disable SMEP (requires modifying cr4)
1385 	 */
1386 	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
1387 		load_cr4(rcr4() & ~CR4_SMEP);
1388 }
1389 
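/*
 * Re-enable SMEP (via CR4) and SMAP, and clear the pcb_onfault bypass
 * installed by smap_smep_disable().
 */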
1390 void
1391 smap_smep_enable(void)
1392 {
1393 	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
1394 		load_cr4(rcr4() | CR4_SMEP);
1395 	curthread->td_pcb->pcb_onfault = NULL;
1396 	smap_close();
1397 }
1398 
1399 /*
1400  * Early initialization of the pmap module.
1401  *
1402  * Called by vm_init, to initialize any structures that the pmap
1403  * system needs to map virtual memory.  pmap_init has been enhanced to
1404  * support in a fairly consistant way, discontiguous physical memory.
1405  */
1406 void
1407 pmap_init(void)
1408 {
1409 	vm_pindex_t initial_pvs;
1410 	vm_pindex_t i;
1411 
1412 	/*
1413 	 * Allocate memory for random pmap data structures.  Includes the
1414 	 * pv_head_table.
1415 	 */
1416 	for (i = 0; i < vm_page_array_size; i++) {
1417 		vm_page_t m;
1418 
1419 		m = &vm_page_array[i];
1420 		m->md.interlock_count = 0;
1421 	}
1422 
1423 	/*
1424 	 * init the pv free list
1425 	 */
1426 	initial_pvs = vm_page_array_size;
1427 	if (initial_pvs < MINPV)
1428 		initial_pvs = MINPV;
1429 	pvzone = &pvzone_store;
1430 	pvinit = (void *)kmem_alloc(kernel_map,
1431 				    initial_pvs * sizeof (struct pv_entry),
1432 				    VM_SUBSYS_PVENTRY);
1433 	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry),
1434 		  pvinit, initial_pvs);
1435 
1436 	/*
1437 	 * Now it is safe to enable pv_table recording.
1438 	 */
1439 	pmap_initialized = TRUE;
1440 }
1441 
1442 /*
1443  * Initialize the address space (zone) for the pv_entries.  Set a
1444  * high water mark so that the system can recover from excessive
1445  * numbers of pv entries.
1446  *
1447  * Also create the kernel page table template for isolated user
1448  * pmaps.
1449  */
1450 static void pmap_init_iso_range(vm_offset_t base, size_t bytes);
1451 static void pmap_init2_iso_pmap(void);
1452 #if 0
1453 static void dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base);
1454 #endif
1455 
1456 void
1457 pmap_init2(void)
1458 {
1459 	vm_pindex_t entry_max;
1460 
1461 	/*
1462 	 * We can significantly reduce pv_entry_max from historical
1463 	 * levels because pv_entry's are no longer used for PTEs at the
1464 	 * leaves.  This prevents excessive pcpu caching on many-core
1465 	 * boxes (even with the further '/ 16' done in zinitna()).
1466 	 *
1467 	 * Remember, however, that processes can share physical pages
1468 	 * with each process still needing the pdp/pd/pt infrastructure
1469 	 * (which still use pv_entry's).  And don't just assume that
1470 	 * every PT will be completely filled up.  So don't make it
1471 	 * too small.
1472 	 */
1473 	entry_max = maxproc * 32 + vm_page_array_size / 16;
1474 	TUNABLE_LONG_FETCH("vm.pmap.pv_entries", &entry_max);
1475 	vm_pmap_pv_entries = entry_max;
1476 
1477 	/*
1478 	 * Subtract out pages already installed in the zone (hack)
1479 	 */
1480 	if (entry_max <= MINPV)
1481 		entry_max = MINPV;
1482 
1483 	zinitna(pvzone, NULL, 0, entry_max, ZONE_INTERRUPT);
1484 
1485 	/*
1486 	 * Enable dynamic deletion of empty higher-level page table pages
1487 	 * by default only if system memory is < 8GB (use 7GB for slop).
1488 	 * This can save a little memory, but imposes significant
1489 	 * performance overhead for things like bulk builds, and for programs
1490 	 * which do a lot of memory mapping and memory unmapping.
1491 	 */
1492 #if 0
1493 	if (pmap_dynamic_delete < 0) {
1494 		if (vmstats.v_page_count < 7LL * 1024 * 1024 * 1024 / PAGE_SIZE)
1495 			pmap_dynamic_delete = 1;
1496 		else
1497 			pmap_dynamic_delete = 0;
1498 	}
1499 #endif
1500 	/*
1501 	 * Disable so vm_map_backing iterations do not race
1502 	 */
1503 	pmap_dynamic_delete = 0;
1504 
1505 	/*
1506 	 * Automatic detection of Intel meltdown bug requiring user/kernel
1507 	 * mmap isolation.
1508 	 *
1509 	 * Currently there are so many Intel cpu's impacted that it's better
1510 	 * to whitelist future Intel CPUs.  Most? AMD cpus are not impacted
1511 	 * so the default is off for AMD.
1512 	 */
1513 	if (meltdown_mitigation < 0) {
1514 		if (cpu_vendor_id == CPU_VENDOR_INTEL) {
1515 			meltdown_mitigation = 1;
1516 			if (cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO)
1517 				meltdown_mitigation = 0;
1518 		} else {
1519 			meltdown_mitigation = 0;
1520 		}
1521 	}
1522 	if (meltdown_mitigation) {
1523 		kprintf("machdep.meltdown_mitigation enabled to "
1524 			"protect against (mostly Intel) meltdown bug\n");
1525 		kprintf("system call performance will be impacted\n");
1526 	}
1527 
1528 	pmap_init2_iso_pmap();
1529 }
1530 
1531 /*
1532  * Create the isolation pmap template.  Once created, the template
1533  * is static and its PML4e entries are used to populate the
1534  * kernel portion of any isolated user pmaps.
1535  *
1536  * Our isolation pmap must contain:
1537  * (1) trampoline area for all cpus
1538  * (2) common_tss area for all cpus (it's part of the trampoline area now)
1539  * (3) IDT for all cpus
1540  * (4) GDT for all cpus
1541  */
1542 static void
1543 pmap_init2_iso_pmap(void)
1544 {
1545 	int n;
1546 
1547 	if (bootverbose)
1548 		kprintf("Initialize isolation pmap\n");
1549 
1550 	/*
1551 	 * Try to use our normal API calls to make this easier.  We have
1552 	 * to scrap the shadowed kernel PDPs pmap_pinit() creates for our
1553 	 * iso_pmap.
1554 	 */
1555 	pmap_pinit(&iso_pmap);
1556 	bzero(iso_pmap.pm_pml4, PAGE_SIZE);
1557 
1558 	/*
1559 	 * Install areas needed by the cpu and trampoline.
1560 	 */
1561 	for (n = 0; n < ncpus; ++n) {
1562 		struct privatespace *ps;
1563 
1564 		ps = CPU_prvspace[n];
1565 		pmap_init_iso_range((vm_offset_t)&ps->trampoline,
1566 				    sizeof(ps->trampoline));
1567 		pmap_init_iso_range((vm_offset_t)&ps->dblstack,
1568 				    sizeof(ps->dblstack));
1569 		pmap_init_iso_range((vm_offset_t)&ps->dbgstack,
1570 				    sizeof(ps->dbgstack));
1571 		pmap_init_iso_range((vm_offset_t)&ps->common_tss,
1572 				    sizeof(ps->common_tss));
1573 		pmap_init_iso_range(r_idt_arr[n].rd_base,
1574 				    r_idt_arr[n].rd_limit + 1);
1575 	}
1576 	pmap_init_iso_range((register_t)gdt, sizeof(gdt));
1577 	pmap_init_iso_range((vm_offset_t)(int *)btext,
1578 			    (vm_offset_t)(int *)etext -
1579 			     (vm_offset_t)(int *)btext);
1580 
1581 #if 0
1582 	kprintf("Dump iso_pmap:\n");
1583 	dump_pmap(&iso_pmap, vtophys(iso_pmap.pm_pml4), 0, 0);
1584 	kprintf("\nDump kernel_pmap:\n");
1585 	dump_pmap(kernel_pmap, vtophys(kernel_pmap->pm_pml4), 0, 0);
1586 #endif
1587 }
1588 
1589 /*
1590  * This adds a kernel virtual address range to the isolation pmap.
1591  */
1592 static void
1593 pmap_init_iso_range(vm_offset_t base, size_t bytes)
1594 {
1595 	pv_entry_t pv;
1596 	pv_entry_t pvp;
1597 	pt_entry_t *ptep;
1598 	pt_entry_t pte;
1599 	vm_offset_t va;
1600 
1601 	if (bootverbose) {
1602 		kprintf("isolate %016jx-%016jx (%zd)\n",
1603 			base, base + bytes, bytes);
1604 	}
1605 	va = base & ~(vm_offset_t)PAGE_MASK;
1606 	while (va < base + bytes) {
1607 		if ((va & PDRMASK) == 0 && va + NBPDR <= base + bytes &&
1608 		    (ptep = pmap_pt(kernel_pmap, va)) != NULL &&
1609 		    (*ptep & kernel_pmap->pmap_bits[PG_V_IDX]) &&
1610 		    (*ptep & kernel_pmap->pmap_bits[PG_PS_IDX])) {
1611 			/*
1612 			 * Use 2MB pages if possible
1613 			 */
1614 			pte = *ptep;
1615 			pv = pmap_allocpte(&iso_pmap, pmap_pd_pindex(va), &pvp);
1616 			ptep = pv_pte_lookup(pv, (va >> PDRSHIFT) & 511);
1617 			*ptep = pte;
1618 			va += NBPDR;
1619 		} else {
1620 			/*
1621 			 * Otherwise use 4KB pages
1622 			 */
1623 			pv = pmap_allocpte(&iso_pmap, pmap_pt_pindex(va), &pvp);
1624 			ptep = pv_pte_lookup(pv, (va >> PAGE_SHIFT) & 511);
1625 			*ptep = vtophys(va) | kernel_pmap->pmap_bits[PG_RW_IDX] |
1626 					      kernel_pmap->pmap_bits[PG_V_IDX] |
1627 					      kernel_pmap->pmap_bits[PG_A_IDX] |
1628 					      kernel_pmap->pmap_bits[PG_M_IDX];
1629 
1630 			va += PAGE_SIZE;
1631 		}
1632 		pv_put(pv);
1633 		pv_put(pvp);
1634 	}
1635 }
1636 
1637 #if 0
1638 /*
1639  * Useful debugging pmap dumper, do not remove (#if 0 when not in use)
1640  */
1641 static
1642 void
1643 dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base)
1644 {
1645 	pt_entry_t *ptp;
1646 	vm_offset_t incr;
1647 	int i;
1648 
1649 	switch(level) {
1650 	case 0:					/* PML4e page, 512G entries */
1651 		incr = (1LL << 48) / 512;
1652 		break;
1653 	case 1:					/* PDP page, 1G entries */
1654 		incr = (1LL << 39) / 512;
1655 		break;
1656 	case 2:					/* PD page, 2MB entries */
1657 		incr = (1LL << 30) / 512;
1658 		break;
1659 	case 3:					/* PT page, 4KB entries */
1660 		incr = (1LL << 21) / 512;
1661 		break;
1662 	default:
1663 		incr = 0;
1664 		break;
1665 	}
1666 
1667 	if (level == 0)
1668 		kprintf("cr3 %016jx @ va=%016jx\n", pte, base);
1669 	ptp = (void *)PHYS_TO_DMAP(pte & ~(pt_entry_t)PAGE_MASK);
1670 	for (i = 0; i < 512; ++i) {
1671 		if (level == 0 && i == 128)
1672 			base += 0xFFFF000000000000LLU;
1673 		if (ptp[i]) {
1674 			kprintf("%*.*s ", level * 4, level * 4, "");
1675 			if (level == 1 && (ptp[i] & 0x180) == 0x180) {
1676 				kprintf("va=%016jx %3d term %016jx (1GB)\n",
1677 					base, i, ptp[i]);
1678 			} else if (level == 2 && (ptp[i] & 0x180) == 0x180) {
1679 				kprintf("va=%016jx %3d term %016jx (2MB)\n",
1680 					base, i, ptp[i]);
1681 			} else if (level == 3) {
1682 				kprintf("va=%016jx %3d term %016jx\n",
1683 					base, i, ptp[i]);
1684 			} else {
1685 				kprintf("va=%016jx %3d deep %016jx\n",
1686 					base, i, ptp[i]);
1687 				dump_pmap(pmap, ptp[i], level + 1, base);
1688 			}
1689 		}
1690 		base += incr;
1691 	}
1692 }
1693 
1694 #endif
1695 
1696 /*
1697  * Typically used to initialize a fictitious page by vm/device_pager.c
1698  */
1699 void
1700 pmap_page_init(struct vm_page *m)
1701 {
1702 	vm_page_init(m);
1703 	m->md.interlock_count = 0;
1704 }
1705 
1706 /***************************************************
1707  * Low level helper routines.....
1708  ***************************************************/
1709 
1710 /*
1711  * Extract the physical page address associated with the map/VA pair.
1712  * The page must be wired for this to work reliably.
1713  */
1714 vm_paddr_t
1715 pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep)
1716 {
1717 	vm_paddr_t rtval;
1718 	pv_entry_t pt_pv;
1719 	pt_entry_t *ptep;
1720 
1721 	rtval = 0;
1722 	if (va >= VM_MAX_USER_ADDRESS) {
1723 		/*
1724 		 * Kernel page directories might be direct-mapped and
1725 		 * there is typically no PV tracking of pte's
1726 		 */
1727 		pd_entry_t *pt;
1728 
1729 		pt = pmap_pt(pmap, va);
1730 		if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) {
1731 			if (*pt & pmap->pmap_bits[PG_PS_IDX]) {
1732 				rtval = *pt & PG_PS_FRAME;
1733 				rtval |= va & PDRMASK;
1734 			} else {
1735 				ptep = pmap_pt_to_pte(*pt, va);
1736 				if (*pt & pmap->pmap_bits[PG_V_IDX]) {
1737 					rtval = *ptep & PG_FRAME;
1738 					rtval |= va & PAGE_MASK;
1739 				}
1740 			}
1741 		}
1742 		if (handlep)
1743 			*handlep = NULL;
1744 	} else {
1745 		/*
1746 		 * User pages currently do not direct-map the page directory
1747 		 * and some pages might not use managed PVs.  But all PT's
1748 		 * will have a PV.
1749 		 */
1750 		pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL);
1751 		if (pt_pv) {
1752 			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
1753 			if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
1754 				rtval = *ptep & PG_FRAME;
1755 				rtval |= va & PAGE_MASK;
1756 			}
1757 			if (handlep)
1758 				*handlep = pt_pv;	/* locked until done */
1759 			else
1760 				pv_put(pt_pv);
1761 		} else if (handlep) {
1762 			*handlep = NULL;
1763 		}
1764 	}
1765 	return rtval;
1766 }
1767 
1768 void
1769 pmap_extract_done(void *handle)
1770 {
1771 	if (handle)
1772 		pv_put((pv_entry_t)handle);
1773 }
1774 
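/*
 * Illustrative usage sketch (kept under #if 0, not compiled): a caller of
 * pmap_extract() using the optional handle, which for user addresses keeps
 * the backing pt_pv locked until the caller is done with the physical
 * address.  The function and variable names below are hypothetical.
 */
#if 0
static void
pmap_extract_example(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t pa;
	void *handle;

	pa = pmap_extract(pmap, va, &handle);
	if (pa) {
		/* pa stays stable while the handle is outstanding */
	}
	pmap_extract_done(handle);	/* a NULL handle is a no-op */
}
#endif
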
1775 /*
1776  * Similar to pmap_extract() but also checks protections.  This is an
1777  * SMP-friendly short-cut for vm_fault_page[_quick]().  Can return NULL to
1778  * cause the caller to fall through to the real fault code.  Does not work
1779  * with HVM page tables.
1780  *
1781  * If busyp is NULL the returned page, if not NULL, is held (and not busied).
1782  *
1783  * If busyp is not NULL and this function sets *busyp non-zero, the returned
1784  * page is busied (and not held).
1785  *
1786  * If busyp is not NULL and this function sets *busyp to zero, the returned
1787  * page is held (and not busied).
1788  *
1789  * If VM_PROT_WRITE is set in prot, and the pte is already writable, the
1790  * returned page will be dirtied.  If the pte is not already writable NULL
1791  * is returned.  In other words, if the bit is set and a vm_page_t is returned,
1792  * any COW will already have happened and that page can be written by the
1793  * caller.
1794  *
1795  * WARNING! THE RETURNED PAGE IS ONLY HELD AND NOT SUITABLE FOR READING
1796  *	    OR WRITING AS-IS.
1797  */
1798 vm_page_t
1799 pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot, int *busyp)
1800 {
1801 	if (pmap &&
1802 	    va < VM_MAX_USER_ADDRESS &&
1803 	    (pmap->pm_flags & PMAP_HVM) == 0) {
1804 		pv_entry_t pt_pv;
1805 		pv_entry_t pte_pv;
1806 		pt_entry_t *ptep;
1807 		pt_entry_t req;
1808 		vm_page_t m;
1809 		int error;
1810 
1811 		req = pmap->pmap_bits[PG_V_IDX] |
1812 		      pmap->pmap_bits[PG_U_IDX];
1813 		if (prot & VM_PROT_WRITE)
1814 			req |= pmap->pmap_bits[PG_RW_IDX];
1815 
1816 		pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL);
1817 		if (pt_pv == NULL)
1818 			return (NULL);
1819 		ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
1820 		if ((*ptep & req) != req) {
1821 			pv_put(pt_pv);
1822 			return (NULL);
1823 		}
1824 		pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), NULL, &error);
1825 		if (pte_pv && error == 0) {
1826 			m = pte_pv->pv_m;
1827 			if (prot & VM_PROT_WRITE) {
1828 				/* interlocked by presence of pv_entry */
1829 				vm_page_dirty(m);
1830 			}
1831 			if (busyp) {
1832 				if (prot & VM_PROT_WRITE) {
1833 					if (vm_page_busy_try(m, TRUE))
1834 						m = NULL;
1835 					*busyp = 1;
1836 				} else {
1837 					vm_page_hold(m);
1838 					*busyp = 0;
1839 				}
1840 			} else {
1841 				vm_page_hold(m);
1842 			}
1843 			pv_put(pte_pv);
1844 		} else if (pte_pv) {
1845 			pv_drop(pte_pv);
1846 			m = NULL;
1847 		} else {
1848 			/* error, since we didn't request a placemarker */
1849 			m = NULL;
1850 		}
1851 		pv_put(pt_pv);
1852 		return(m);
1853 	} else {
1854 		return(NULL);
1855 	}
1856 }
1857 
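/*
 * Illustrative usage sketch (not compiled): how a caller might honor the
 * busyp convention documented above.  The release calls shown here are
 * assumptions about the caller side, not something this file mandates.
 */
#if 0
static void
pmap_fault_page_quick_example(pmap_t pmap, vm_offset_t va)
{
	vm_page_t m;
	int busy;

	m = pmap_fault_page_quick(pmap, va, VM_PROT_READ, &busy);
	if (m) {
		/* ... inspect the page, it is not mapped for us ... */
		if (busy)
			vm_page_wakeup(m);	/* page was busied */
		else
			vm_page_unhold(m);	/* page was only held */
	}
}
#endif
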
1858 /*
1859  * Extract the physical page address associated with a kernel virtual address.
1860  */
1861 vm_paddr_t
1862 pmap_kextract(vm_offset_t va)
1863 {
1864 	pd_entry_t pt;		/* pt entry in pd */
1865 	vm_paddr_t pa;
1866 
1867 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1868 		pa = DMAP_TO_PHYS(va);
1869 	} else {
1870 		pt = *vtopt(va);
1871 		if (pt & kernel_pmap->pmap_bits[PG_PS_IDX]) {
1872 			pa = (pt & PG_PS_FRAME) | (va & PDRMASK);
1873 		} else {
1874 			/*
1875 			 * Beware of a concurrent promotion that changes the
1876 			 * PDE at this point!  For example, vtopte() must not
1877 			 * be used to access the PTE because it would use the
1878 			 * new PDE.  It is, however, safe to use the old PDE
1879 			 * because the page table page is preserved by the
1880 			 * promotion.
1881 			 */
1882 			pa = *pmap_pt_to_pte(pt, va);
1883 			pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1884 		}
1885 	}
1886 	return pa;
1887 }
1888 
1889 /***************************************************
1890  * Low level mapping routines.....
1891  ***************************************************/
1892 
1893 /*
1894  * Add a wired page to the KVA and invalidate the mapping on all CPUs.
1895  */
1896 void
1897 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1898 {
1899 	pt_entry_t *ptep;
1900 	pt_entry_t npte;
1901 
1902 	npte = pa |
1903 	       kernel_pmap->pmap_bits[PG_RW_IDX] |
1904 	       kernel_pmap->pmap_bits[PG_V_IDX];
1905 //	       pgeflag;
1906 	ptep = vtopte(va);
1907 #if 1
1908 	pmap_inval_smp(kernel_pmap, va, 1, ptep, npte);
1909 #else
1910 	/* FUTURE */
1911 	if (*ptep)
1912 		pmap_inval_smp(kernel_pmap, va, ptep, npte);
1913 	else
1914 		*ptep = npte;
1915 #endif
1916 }
1917 
1918 /*
1919  * Similar to pmap_kenter(), except we only invalidate the mapping on the
1920  * current CPU.  Returns 0 if the previous pte was 0, 1 if it wasn't
1921  * (caller can conditionalize calling smp_invltlb()).
1922  */
1923 int
1924 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
1925 {
1926 	pt_entry_t *ptep;
1927 	pt_entry_t npte;
1928 	int res;
1929 
1930 	npte = pa | kernel_pmap->pmap_bits[PG_RW_IDX] |
1931 		    kernel_pmap->pmap_bits[PG_V_IDX];
1932 	// npte |= pgeflag;
1933 	ptep = vtopte(va);
1934 #if 1
1935 	res = 1;
1936 #else
1937 	/* FUTURE */
1938 	res = (*ptep != 0);
1939 #endif
1940 	atomic_swap_long(ptep, npte);
1941 	cpu_invlpg((void *)va);
1942 
1943 	return res;
1944 }
1945 
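/*
 * Illustrative sketch (not compiled): per the comment above, the return
 * value can be used to decide whether a global TLB invalidation is
 * required.  'va' and 'pa' are hypothetical.
 */
#if 0
	if (pmap_kenter_quick(va, pa))
		smp_invltlb();
#endif
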
1946 /*
1947  * Enter addresses into the kernel pmap but don't bother
1948  * doing any tlb invalidations.  Caller will do a rollup
1949  * invalidation via pmap_rollup_inval().
1950  */
1951 int
1952 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa)
1953 {
1954 	pt_entry_t *ptep;
1955 	pt_entry_t npte;
1956 	int res;
1957 
1958 	npte = pa |
1959 	    kernel_pmap->pmap_bits[PG_RW_IDX] |
1960 	    kernel_pmap->pmap_bits[PG_V_IDX];
1961 //	    pgeflag;
1962 	ptep = vtopte(va);
1963 #if 1
1964 	res = 1;
1965 #else
1966 	/* FUTURE */
1967 	res = (*ptep != 0);
1968 #endif
1969 	atomic_swap_long(ptep, npte);
1970 	cpu_invlpg((void *)va);
1971 
1972 	return res;
1973 }
1974 
1975 /*
1976  * remove a page from the kernel pagetables
1977  */
1978 void
1979 pmap_kremove(vm_offset_t va)
1980 {
1981 	pt_entry_t *ptep;
1982 
1983 	ptep = vtopte(va);
1984 	pmap_inval_smp(kernel_pmap, va, 1, ptep, 0);
1985 }
1986 
1987 void
1988 pmap_kremove_quick(vm_offset_t va)
1989 {
1990 	pt_entry_t *ptep;
1991 
1992 	ptep = vtopte(va);
1993 	atomic_readandclear_long(ptep);
1994 	cpu_invlpg((void *)va);
1995 }
1996 
1997 /*
1998  * Remove addresses from the kernel pmap but don't bother
1999  * doing any tlb invalidations.  Caller will do a rollup
2000  * invalidation via pmap_rollup_inval().
2001  */
2002 void
2003 pmap_kremove_noinval(vm_offset_t va)
2004 {
2005 	pt_entry_t *ptep;
2006 
2007 	ptep = vtopte(va);
2008 	atomic_readandclear_long(ptep);
2009 }
2010 
2011 /*
2012  * XXX these need to be recoded.  They are not used in any critical path.
2013  */
2014 void
2015 pmap_kmodify_rw(vm_offset_t va)
2016 {
2017 	atomic_set_long(vtopte(va), kernel_pmap->pmap_bits[PG_RW_IDX]);
2018 	cpu_invlpg((void *)va);
2019 }
2020 
2021 /* NOT USED
2022 void
2023 pmap_kmodify_nc(vm_offset_t va)
2024 {
2025 	atomic_set_long(vtopte(va), PG_N);
2026 	cpu_invlpg((void *)va);
2027 }
2028 */
2029 
2030 /*
2031  * Used to map a range of physical addresses into kernel virtual
2032  * address space during the low level boot, typically to map the
2033  * dump bitmap, message buffer, and vm_page_array.
2034  *
2035  * These mappings are typically made at some point after the end of the
2036  * kernel text+data.
2037  *
2038  * We could return PHYS_TO_DMAP(start) here and not allocate any
2039  * We could return PHYS_TO_DMAP(start) here and not allocate any KVA
2040  * via (*virtp), but then userland kmem readers and kernel dumps won't
2041  */
2042 vm_offset_t
2043 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
2044 {
2045 	vm_offset_t va;
2046 	vm_offset_t va_start;
2047 
2048 	/*return PHYS_TO_DMAP(start);*/
2049 
2050 	va_start = *virtp;
2051 	va = va_start;
2052 
2053 	while (start < end) {
2054 		pmap_kenter_quick(va, start);
2055 		va += PAGE_SIZE;
2056 		start += PAGE_SIZE;
2057 	}
2058 	*virtp = va;
2059 	return va_start;
2060 }
2061 
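/*
 * Illustrative sketch (not compiled): boot-time use of pmap_map() with a
 * caller-maintained KVA cursor.  All names below are hypothetical.
 */
#if 0
	vm_offset_t virt = boot_kva_cursor;
	vm_offset_t va;

	va = pmap_map(&virt, phys_start, phys_end,
		      VM_PROT_READ | VM_PROT_WRITE);
	boot_kva_cursor = virt;		/* (*virtp) was advanced past the map */
#endif
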
2062 #define PMAP_CLFLUSH_THRESHOLD  (2 * 1024 * 1024)
2063 
2064 /*
2065  * Remove the specified set of pages from the data and instruction caches.
2066  *
2067  * In contrast to pmap_invalidate_cache_range(), this function does not
2068  * rely on the CPU's self-snoop feature, because it is intended for use
2069  * when moving pages into a different cache domain.
2070  */
2071 void
2072 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
2073 {
2074 	vm_offset_t daddr, eva;
2075 	int i;
2076 
2077 	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
2078 	    (cpu_feature & CPUID_CLFSH) == 0)
2079 		wbinvd();
2080 	else {
2081 		cpu_mfence();
2082 		for (i = 0; i < count; i++) {
2083 			daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
2084 			eva = daddr + PAGE_SIZE;
2085 			for (; daddr < eva; daddr += cpu_clflush_line_size)
2086 				clflush(daddr);
2087 		}
2088 		cpu_mfence();
2089 	}
2090 }
2091 
2092 void
2093 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
2094 {
2095 	KASSERT((sva & PAGE_MASK) == 0,
2096 	    ("pmap_invalidate_cache_range: sva not page-aligned"));
2097 	KASSERT((eva & PAGE_MASK) == 0,
2098 	    ("pmap_invalidate_cache_range: eva not page-aligned"));
2099 
2100 	if (cpu_feature & CPUID_SS) {
2101 		; /* If "Self Snoop" is supported, do nothing. */
2102 	} else {
2103 		/* Globally invalidate caches */
2104 		cpu_wbinvd_on_all_cpus();
2105 	}
2106 }
2107 
2108 /*
2109  * Invalidate the specified range of virtual memory on all cpus associated
2110  * with the pmap.
2111  */
2112 void
2113 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2114 {
2115 	pmap_inval_smp(pmap, sva, (eva - sva) >> PAGE_SHIFT, NULL, 0);
2116 }
2117 
2118 /*
2119  * Add a list of wired pages to the kva.  This routine is used for temporary
2120  * kernel mappings such as those found in buffer cache buffers.  Page
2121  * modifications and accesses are not tracked or recorded.
2122  *
2123  * NOTE! Old mappings are simply overwritten, and we cannot assume relaxed
2124  *	 semantics as previous mappings may have been zeroed without any
2125  *	 invalidation.
2126  *
2127  * The page *must* be wired.
2128  */
2129 static __inline void
2130 _pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count, int doinval)
2131 {
2132 	vm_offset_t end_va;
2133 	vm_offset_t va;
2134 
2135 	end_va = beg_va + count * PAGE_SIZE;
2136 
2137 	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
2138 		pt_entry_t pte;
2139 		pt_entry_t *ptep;
2140 
2141 		ptep = vtopte(va);
2142 		pte = VM_PAGE_TO_PHYS(*m) |
2143 			kernel_pmap->pmap_bits[PG_RW_IDX] |
2144 			kernel_pmap->pmap_bits[PG_V_IDX] |
2145 			kernel_pmap->pmap_cache_bits_pte[(*m)->pat_mode];
2146 //		pgeflag;
2147 		atomic_swap_long(ptep, pte);
2148 		m++;
2149 	}
2150 	if (doinval)
2151 		pmap_invalidate_range(kernel_pmap, beg_va, end_va);
2152 }
2153 
2154 void
2155 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count)
2156 {
2157 	_pmap_qenter(beg_va, m, count, 1);
2158 }
2159 
2160 void
2161 pmap_qenter_noinval(vm_offset_t beg_va, vm_page_t *m, int count)
2162 {
2163 	_pmap_qenter(beg_va, m, count, 0);
2164 }
2165 
2166 /*
2167  * This routine jerks page mappings from the kernel -- it is meant only
2168  * for temporary mappings such as those found in buffer cache buffers.
2169  * No recording of modified or access status occurs.
2170  *
2171  * MPSAFE, INTERRUPT SAFE (cluster callback)
2172  */
2173 void
2174 pmap_qremove(vm_offset_t beg_va, int count)
2175 {
2176 	vm_offset_t end_va;
2177 	vm_offset_t va;
2178 
2179 	end_va = beg_va + count * PAGE_SIZE;
2180 
2181 	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
2182 		pt_entry_t *pte;
2183 
2184 		pte = vtopte(va);
2185 		atomic_readandclear_long(pte);
2186 		cpu_invlpg((void *)va);
2187 	}
2188 	pmap_invalidate_range(kernel_pmap, beg_va, end_va);
2189 }
2190 
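/*
 * Illustrative sketch (not compiled): the usual pairing of pmap_qenter()
 * and pmap_qremove() for a temporary kernel mapping of a wired page
 * array, in the style of the buffer cache.  'kva', 'mpages' and 'npages'
 * are hypothetical.
 */
#if 0
	pmap_qenter(kva, mpages, npages);
	/* ... access the pages through kva ... */
	pmap_qremove(kva, npages);
#endif
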
2191 /*
2192  * This routine removes temporary kernel mappings, only invalidating them
2193  * on the current cpu.  It should only be used under carefully controlled
2194  * conditions.
2195  */
2196 void
2197 pmap_qremove_quick(vm_offset_t beg_va, int count)
2198 {
2199 	vm_offset_t end_va;
2200 	vm_offset_t va;
2201 
2202 	end_va = beg_va + count * PAGE_SIZE;
2203 
2204 	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
2205 		pt_entry_t *pte;
2206 
2207 		pte = vtopte(va);
2208 		atomic_readandclear_long(pte);
2209 		cpu_invlpg((void *)va);
2210 	}
2211 }
2212 
2213 /*
2214  * This routine removes temporary kernel mappings *without* invalidating
2215  * the TLB.  It can only be used on permanent kva reservations such as those
2216  * found in buffer cache buffers, under carefully controlled circumstances.
2217  *
2218  * NOTE: Repopulating these KVAs requires unconditional invalidation.
2219  *	 (pmap_qenter() does unconditional invalidation).
2220  */
2221 void
2222 pmap_qremove_noinval(vm_offset_t beg_va, int count)
2223 {
2224 	vm_offset_t end_va;
2225 	vm_offset_t va;
2226 
2227 	end_va = beg_va + count * PAGE_SIZE;
2228 
2229 	for (va = beg_va; va < end_va; va += PAGE_SIZE) {
2230 		pt_entry_t *pte;
2231 
2232 		pte = vtopte(va);
2233 		atomic_readandclear_long(pte);
2234 	}
2235 }
2236 
2237 /*
2238  * Create a new thread and optionally associate it with a (new) process.
2239  * NOTE! The new thread's cpu may not equal the current cpu.
2240  */
2241 void
2242 pmap_init_thread(thread_t td)
2243 {
2244 	/* enforce pcb placement & alignment */
2245 	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
2246 	td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF);
2247 	td->td_savefpu = &td->td_pcb->pcb_save;
2248 	td->td_sp = (char *)td->td_pcb;	/* no -16 */
2249 }
2250 
2251 /*
2252  * This routine directly affects the fork perf for a process.
2253  */
2254 void
2255 pmap_init_proc(struct proc *p)
2256 {
2257 }
2258 
2259 static void
2260 pmap_pinit_defaults(struct pmap *pmap)
2261 {
2262 	bcopy(pmap_bits_default, pmap->pmap_bits,
2263 	      sizeof(pmap_bits_default));
2264 	bcopy(protection_codes, pmap->protection_codes,
2265 	      sizeof(protection_codes));
2266 	bcopy(pat_pte_index, pmap->pmap_cache_bits_pte,
2267 	      sizeof(pat_pte_index));
2268 	bcopy(pat_pde_index, pmap->pmap_cache_bits_pde,
2269 	      sizeof(pat_pde_index));
2270 	pmap->pmap_cache_mask_pte = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT;
2271 	pmap->pmap_cache_mask_pde = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PDE_PAT;
2272 	pmap->copyinstr = std_copyinstr;
2273 	pmap->copyin = std_copyin;
2274 	pmap->copyout = std_copyout;
2275 	pmap->fubyte = std_fubyte;
2276 	pmap->subyte = std_subyte;
2277 	pmap->fuword32 = std_fuword32;
2278 	pmap->fuword64 = std_fuword64;
2279 	pmap->suword32 = std_suword32;
2280 	pmap->suword64 = std_suword64;
2281 	pmap->swapu32 = std_swapu32;
2282 	pmap->swapu64 = std_swapu64;
2283 	pmap->fuwordadd32 = std_fuwordadd32;
2284 	pmap->fuwordadd64 = std_fuwordadd64;
2285 }
2286 /*
2287  * Initialize pmap0/vmspace0.
2288  *
2289  * On architectures where the kernel pmap is not integrated into the user
2290  * process pmap, this pmap represents the process pmap, not the kernel pmap.
2291  * The kernel_pmap variable should be used to access the kernel pmap directly.
2292  */
2293 void
2294 pmap_pinit0(struct pmap *pmap)
2295 {
2296 	int i;
2297 
2298 	pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys);
2299 	pmap->pm_count = 1;
2300 	CPUMASK_ASSZERO(pmap->pm_active);
2301 	pmap->pm_pvhint_pt = NULL;
2302 	pmap->pm_pvhint_unused = NULL;
2303 	RB_INIT(&pmap->pm_pvroot);
2304 	spin_init(&pmap->pm_spin, "pmapinit0");
2305 	for (i = 0; i < PM_PLACEMARKS; ++i)
2306 		pmap->pm_placemarks[i] = PM_NOPLACEMARK;
2307 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2308 	pmap_pinit_defaults(pmap);
2309 }
2310 
2311 /*
2312  * Initialize a preallocated and zeroed pmap structure,
2313  * such as one in a vmspace structure.
2314  */
2315 static void
2316 pmap_pinit_simple(struct pmap *pmap)
2317 {
2318 	int i;
2319 
2320 	/*
2321 	 * Misc initialization
2322 	 */
2323 	pmap->pm_count = 1;
2324 	CPUMASK_ASSZERO(pmap->pm_active);
2325 	pmap->pm_pvhint_pt = NULL;
2326 	pmap->pm_pvhint_unused = NULL;
2327 	pmap->pm_flags = PMAP_FLAG_SIMPLE;
2328 
2329 	pmap_pinit_defaults(pmap);
2330 
2331 	/*
2332 	 * Don't blow up locks/tokens on re-use (XXX fix/use drop code
2333 	 * for this).
2334 	 */
2335 	if (pmap->pm_pmlpv == NULL) {
2336 		RB_INIT(&pmap->pm_pvroot);
2337 		bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2338 		spin_init(&pmap->pm_spin, "pmapinitsimple");
2339 		for (i = 0; i < PM_PLACEMARKS; ++i)
2340 			pmap->pm_placemarks[i] = PM_NOPLACEMARK;
2341 	}
2342 }
2343 
2344 void
2345 pmap_pinit(struct pmap *pmap)
2346 {
2347 	pv_entry_t pv;
2348 	int j;
2349 
2350 	if (pmap->pm_pmlpv) {
2351 		/* Completely clear the cached pmap if not REGULAR_PMAP. */
2352 		if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) {
2353 			pmap_puninit(pmap);
2354 		}
2355 	}
2356 
2357 	pmap_pinit_simple(pmap);
2358 	pmap->pm_flags &= ~PMAP_FLAG_SIMPLE;
2359 
2360 	/*
2361 	 * No need to allocate page table space yet but we do need a valid
2362 	 * page directory table.
2363 	 */
2364 	if (pmap->pm_pml4 == NULL) {
2365 		pmap->pm_pml4 =
2366 		    (pml4_entry_t *)kmem_alloc_pageable(kernel_map,
2367 							PAGE_SIZE * 2,
2368 							VM_SUBSYS_PML4);
2369 		pmap->pm_pml4_iso = (void *)((char *)pmap->pm_pml4 + PAGE_SIZE);
2370 	}
2371 
2372 	/*
2373 	 * Allocate the PML4e table, which wires it even though it isn't
2374 	 * being entered into some higher level page table (it being the
2375 	 * highest level).  If one is already cached we don't have to do
2376 	 * anything.
2377 	 */
2378 	if ((pv = pmap->pm_pmlpv) == NULL) {
2379 		pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
2380 		pmap->pm_pmlpv = pv;
2381 		pmap_kenter((vm_offset_t)pmap->pm_pml4,
2382 			    VM_PAGE_TO_PHYS(pv->pv_m));
2383 		pv_put(pv);
2384 
2385 		/*
2386 		 * Install DMAP and KMAP.
2387 		 */
2388 		for (j = 0; j < NDMPML4E; ++j) {
2389 			pmap->pm_pml4[DMPML4I + j] =
2390 			    (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) |
2391 			    pmap->pmap_bits[PG_RW_IDX] |
2392 			    pmap->pmap_bits[PG_V_IDX] |
2393 			    pmap->pmap_bits[PG_A_IDX];
2394 		}
2395 		for (j = 0; j < NKPML4E; ++j) {
2396 			pmap->pm_pml4[KPML4I + j] =
2397 			    (KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) |
2398 			    pmap->pmap_bits[PG_RW_IDX] |
2399 			    pmap->pmap_bits[PG_V_IDX] |
2400 			    pmap->pmap_bits[PG_A_IDX];
2401 		}
2402 
2403 		/*
2404 		 * install self-referential address mapping entry
2405 		 */
2406 		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) |
2407 		    pmap->pmap_bits[PG_V_IDX] |
2408 		    pmap->pmap_bits[PG_RW_IDX] |
2409 		    pmap->pmap_bits[PG_A_IDX];
2410 	} else {
2411 		KKASSERT(pv->pv_m->flags & PG_MAPPED);
2412 		KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
2413 	}
2414 	KKASSERT(pmap->pm_pml4[255] == 0);
2415 
2416 	/*
2417 	 * When implementing an isolated userland pmap, a second PML4e table
2418 	 * is needed.  We use pmap_pml4_pindex() + 1 for convenience, but
2419 	 * note that we do not operate on this table using our API functions
2420 	 * so handling of the + 1 case is mostly just to prevent implosions.
2421 	 *
2422 	 * We install an isolated version of the kernel PDPs into this
2423 	 * second PML4e table.  The pmap code will mirror all user PDPs
2424 	 * between the primary and secondary PML4e table.
2425 	 */
2426 	if ((pv = pmap->pm_pmlpv_iso) == NULL && meltdown_mitigation &&
2427 	    pmap != &iso_pmap) {
2428 		pv = pmap_allocpte(pmap, pmap_pml4_pindex() + 1, NULL);
2429 		pmap->pm_pmlpv_iso = pv;
2430 		pmap_kenter((vm_offset_t)pmap->pm_pml4_iso,
2431 			    VM_PAGE_TO_PHYS(pv->pv_m));
2432 		pv_put(pv);
2433 
2434 		/*
2435 		 * Install an isolated version of the kernel pmap for
2436 		 * user consumption, using PDPs constructed in iso_pmap.
2437 		 */
2438 		for (j = 0; j < NKPML4E; ++j) {
2439 			pmap->pm_pml4_iso[KPML4I + j] =
2440 				iso_pmap.pm_pml4[KPML4I + j];
2441 		}
2442 	} else if (pv) {
2443 		KKASSERT(pv->pv_m->flags & PG_MAPPED);
2444 		KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
2445 	}
2446 }
2447 
2448 /*
2449  * Clean up a pmap structure so it can be physically freed.  This routine
2450  * is called by the vmspace dtor function.  A great deal of pmap data is
2451  * left passively mapped to improve vmspace management so we have a bit
2452  * of cleanup work to do here.
2453  */
2454 void
2455 pmap_puninit(pmap_t pmap)
2456 {
2457 	pv_entry_t pv;
2458 	vm_page_t p;
2459 
2460 	KKASSERT(CPUMASK_TESTZERO(pmap->pm_active));
2461 	if ((pv = pmap->pm_pmlpv) != NULL) {
2462 		if (pv_hold_try(pv) == 0)
2463 			pv_lock(pv);
2464 		KKASSERT(pv == pmap->pm_pmlpv);
2465 		p = pmap_remove_pv_page(pv, 1);
2466 		pv_free(pv, NULL);
2467 		pv = NULL;	/* safety */
2468 		pmap_kremove((vm_offset_t)pmap->pm_pml4);
2469 		vm_page_busy_wait(p, FALSE, "pgpun");
2470 		KKASSERT(p->flags & PG_UNQUEUED);
2471 		vm_page_unwire(p, 0);
2472 		vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
2473 		vm_page_free(p);
2474 		pmap->pm_pmlpv = NULL;
2475 	}
2476 	if ((pv = pmap->pm_pmlpv_iso) != NULL) {
2477 		if (pv_hold_try(pv) == 0)
2478 			pv_lock(pv);
2479 		KKASSERT(pv == pmap->pm_pmlpv_iso);
2480 		p = pmap_remove_pv_page(pv, 1);
2481 		pv_free(pv, NULL);
2482 		pv = NULL;	/* safety */
2483 		pmap_kremove((vm_offset_t)pmap->pm_pml4_iso);
2484 		vm_page_busy_wait(p, FALSE, "pgpun");
2485 		KKASSERT(p->flags & PG_UNQUEUED);
2486 		vm_page_unwire(p, 0);
2487 		vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
2488 		vm_page_free(p);
2489 		pmap->pm_pmlpv_iso = NULL;
2490 	}
2491 	if (pmap->pm_pml4) {
2492 		KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
2493 		kmem_free(kernel_map,
2494 			  (vm_offset_t)pmap->pm_pml4, PAGE_SIZE * 2);
2495 		pmap->pm_pml4 = NULL;
2496 		pmap->pm_pml4_iso = NULL;
2497 	}
2498 	KKASSERT(pmap->pm_stats.resident_count == 0);
2499 	KKASSERT(pmap->pm_stats.wired_count == 0);
2500 }
2501 
2502 /*
2503  * This function is now unused (used to add the pmap to the pmap_list)
2504  */
2505 void
2506 pmap_pinit2(struct pmap *pmap)
2507 {
2508 }
2509 
2510 /*
2511  * This routine is called when various levels in the page table need to
2512  * be populated.  This routine cannot fail.
2513  *
2514  * This function returns two locked pv_entry's, one representing the
2515  * requested pv and one representing the requested pv's parent pv.  If
2516  * an intermediate page table does not exist it will be created, mapped,
2517  * wired, and the parent page table will be given an additional hold
2518  * count representing the presence of the child pv_entry.
2519  */
2520 static
2521 pv_entry_t
2522 pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
2523 {
2524 	pt_entry_t *ptep;
2525 	pt_entry_t *ptep_iso;
2526 	pv_entry_t pv;
2527 	pv_entry_t pvp;
2528 	pt_entry_t v;
2529 	vm_page_t m;
2530 	int isnew;
2531 	int ispt;
2532 
2533 	/*
2534 	 * If the pv already exists and we aren't being asked for the
2535 	 * parent page table page we can just return it.  A locked+held pv
2536 	 * is returned.  The pv will also have a second hold related to the
2537 	 * pmap association that we don't have to worry about.
2538 	 */
2539 	ispt = 0;
2540 	pv = pv_alloc(pmap, ptepindex, &isnew);
2541 	if (isnew == 0 && pvpp == NULL)
2542 		return(pv);
2543 
2544 	/*
2545 	 * DragonFly doesn't use PV's to represent terminal PTEs any more.
2546 	 * The index range is still used for placemarkers, but not for
2547 	 * actual pv_entry's.
2548 	 */
2549 	KKASSERT(ptepindex >= pmap_pt_pindex(0));
2550 
2551 	/*
2552 	 * Note that pt_pv's are only returned for user VAs. We assert that
2553 	 * a pt_pv is not being requested for kernel VAs.  The kernel
2554 	 * pre-wires all higher-level page tables so don't overload managed
2555 	 * higher-level page tables on top of it!
2556 	 *
2557 	 * However, it's convenient for us to allow the case when creating
2558 	 * iso_pmap.  This is a bit of a hack but it simplifies iso_pmap
2559 	 * a lot.
2560 	 */
2561 
2562 	/*
2563 	 * The kernel never uses managed PT/PD/PDP pages.
2564 	 */
2565 	KKASSERT(pmap != kernel_pmap);
2566 
2567 	/*
2568 	 * Non-terminal PVs allocate a VM page to represent the page table,
2569 	 * so we have to resolve pvp and calculate the pvp's ptepindex, and
2570 	 * then the page table entry index within the pvp, for the
2571 	 * fall-through code below.
2572 	 */
2573 	if (ptepindex < pmap_pd_pindex(0)) {
2574 		/*
2575 		 * pv is PT, pvp is PD
2576 		 */
2577 		ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT;
2578 		ptepindex += NUPTE_TOTAL + NUPT_TOTAL;
2579 		pvp = pmap_allocpte(pmap, ptepindex, NULL);
2580 
2581 		/*
2582 		 * PT index in PD
2583 		 */
2584 		ptepindex = pv->pv_pindex - pmap_pt_pindex(0);
2585 		ptepindex &= ((1ul << NPDEPGSHIFT) - 1);
2586 		ispt = 1;
2587 	} else if (ptepindex < pmap_pdp_pindex(0)) {
2588 		/*
2589 		 * pv is PD, pvp is PDP
2590 		 *
2591 		 * SIMPLE PMAP NOTE: Simple pmaps do not allocate above
2592 		 *		     the PD.
2593 		 */
2594 		ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT;
2595 		ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
2596 
2597 		if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
2598 			KKASSERT(pvpp == NULL);
2599 			pvp = NULL;
2600 		} else {
2601 			pvp = pmap_allocpte(pmap, ptepindex, NULL);
2602 		}
2603 
2604 		/*
2605 		 * PD index in PDP
2606 		 */
2607 		ptepindex = pv->pv_pindex - pmap_pd_pindex(0);
2608 		ptepindex &= ((1ul << NPDPEPGSHIFT) - 1);
2609 	} else if (ptepindex < pmap_pml4_pindex()) {
2610 		/*
2611 		 * pv is PDP, pvp is the root pml4 table
2612 		 */
2613 		pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
2614 
2615 		/*
2616 		 * PDP index in PML4
2617 		 */
2618 		ptepindex = pv->pv_pindex - pmap_pdp_pindex(0);
2619 		ptepindex &= ((1ul << NPML4EPGSHIFT) - 1);
2620 	} else {
2621 		/*
2622 		 * pv represents the top-level PML4, there is no parent.
2623 		 */
2624 		pvp = NULL;
2625 	}
2626 
2627 	if (isnew == 0)
2628 		goto notnew;
2629 
2630 	/*
2631 	 * (isnew) is TRUE.
2632 	 *
2633 	 * (1) Add a wire count to the parent page table (pvp).
2634 	 * (2) Allocate a VM page for the page table.
2635 	 * (3) Enter the VM page into the parent page table.
2636 	 *
2637 	 * page table pages are marked PG_WRITEABLE and PG_MAPPED.
2638 	 */
2639 	if (pvp)
2640 		vm_page_wire_quick(pvp->pv_m);
2641 
2642 	for (;;) {
2643 		m = vm_page_alloc(NULL, pv->pv_pindex,
2644 				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
2645 				  VM_ALLOC_INTERRUPT);
2646 		if (m)
2647 			break;
2648 		vm_wait(0);
2649 	}
2650 	vm_page_wire(m);	/* wire for mapping in parent */
2651 	pmap_zero_page(VM_PAGE_TO_PHYS(m));
2652 	m->valid = VM_PAGE_BITS_ALL;
2653 	vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE | PG_UNQUEUED);
2654 	KKASSERT(m->queue == PQ_NONE);
2655 
2656 	pv->pv_m = m;
2657 
2658 	/*
2659 	 * (isnew) is TRUE.
2660 	 *
2661 	 * Wire the page into pvp.  Bump the resident_count for the pmap.
2662 	 * There is no pvp for the top level, address the pm_pml4[] array
2663 	 * directly.
2664 	 *
2665 	 * If the caller wants the parent we return it, otherwise
2666 	 * we just put it away.
2667 	 *
2668 	 * No interlock is needed for pte 0 -> non-zero.
2669 	 *
2670 	 * In the situation where *ptep is valid we might have an unmanaged
2671 	 * page table page shared from another page table which we need to
2672 	 * unshare before installing our private page table page.
2673 	 */
2674 	if (pvp) {
2675 		v = VM_PAGE_TO_PHYS(m) |
2676 		    (pmap->pmap_bits[PG_RW_IDX] |
2677 		     pmap->pmap_bits[PG_V_IDX] |
2678 		     pmap->pmap_bits[PG_A_IDX]);
2679 		if (ptepindex < NUPTE_USER)
2680 			v |= pmap->pmap_bits[PG_U_IDX];
2681 		if (ptepindex < pmap_pt_pindex(0))
2682 			v |= pmap->pmap_bits[PG_M_IDX];
2683 
2684 		ptep = pv_pte_lookup(pvp, ptepindex);
2685 		if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso)
2686 			ptep_iso = pv_pte_lookup(pmap->pm_pmlpv_iso, ptepindex);
2687 		else
2688 			ptep_iso = NULL;
2689 		if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
2690 			panic("pmap_allocpte: ptpte present without pv_entry!");
2691 		} else {
2692 			pt_entry_t pte;
2693 
2694 			pte = atomic_swap_long(ptep, v);
2695 			if (ptep_iso)
2696 				atomic_swap_long(ptep_iso, v);
2697 			if (pte != 0) {
2698 				kprintf("install pgtbl mixup 0x%016jx "
2699 					"old/new 0x%016jx/0x%016jx\n",
2700 					(intmax_t)ptepindex, pte, v);
2701 			}
2702 		}
2703 	}
2704 	vm_page_wakeup(m);
2705 
2706 notnew:
2707 	/*
2708 	 * (isnew) may be TRUE or FALSE.
2709 	 */
2710 	if (pvp) {
2711 		KKASSERT(pvp->pv_m != NULL);
2712 		ptep = pv_pte_lookup(pvp, ptepindex);
2713 		v = VM_PAGE_TO_PHYS(pv->pv_m) |
2714 		    (pmap->pmap_bits[PG_RW_IDX] |
2715 		     pmap->pmap_bits[PG_V_IDX] |
2716 		     pmap->pmap_bits[PG_A_IDX]);
2717 		if (ptepindex < NUPTE_USER)
2718 			v |= pmap->pmap_bits[PG_U_IDX];
2719 		if (ptepindex < pmap_pt_pindex(0))
2720 			v |= pmap->pmap_bits[PG_M_IDX];
2721 		if (*ptep != v) {
2722 			kprintf("mismatched upper level pt %016jx/%016jx\n",
2723 				*ptep, v);
2724 		}
2725 	}
2726 	if (pvpp)
2727 		*pvpp = pvp;
2728 	else if (pvp)
2729 		pv_put(pvp);
2730 	return (pv);
2731 }
2732 
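/*
 * Illustrative usage sketch (not compiled): obtaining the page table page
 * backing a user VA via pmap_allocpte() and releasing both returned pv's.
 * The variable names are hypothetical.
 */
#if 0
static void
pmap_allocpte_example(pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pt_pv;
	pv_entry_t pd_pv;
	pt_entry_t *ptep;

	pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), &pd_pv);
	ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
	/* ... operate on *ptep while pt_pv is locked ... */
	if (pd_pv)
		pv_put(pd_pv);
	pv_put(pt_pv);
}
#endif
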
2733 /*
2734  * Release any resources held by the given physical map.
2735  *
2736  * Called when a pmap initialized by pmap_pinit is being released.  Should
2737  * only be called if the map contains no valid mappings.
2738  */
2739 struct pmap_release_info {
2740 	pmap_t	pmap;
2741 	int	retry;
2742 	pv_entry_t pvp;
2743 };
2744 
2745 static int pmap_release_callback(pv_entry_t pv, void *data);
2746 
2747 void
2748 pmap_release(struct pmap *pmap)
2749 {
2750 	struct pmap_release_info info;
2751 
2752 	KASSERT(CPUMASK_TESTZERO(pmap->pm_active),
2753 		("pmap still active! %016jx",
2754 		(uintmax_t)CPUMASK_LOWMASK(pmap->pm_active)));
2755 
2756 	/*
2757 	 * There is no longer a pmap_list, if there were we would remove the
2758 	 * pmap from it here.
2759 	 */
2760 
2761 	/*
2762 	 * Pull pv's off the RB tree in order from low to high and release
2763 	 * each page.
2764 	 */
2765 	info.pmap = pmap;
2766 	do {
2767 		info.retry = 0;
2768 		info.pvp = NULL;
2769 
2770 		spin_lock(&pmap->pm_spin);
2771 		RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL,
2772 			pmap_release_callback, &info);
2773 		spin_unlock(&pmap->pm_spin);
2774 
2775 		if (info.pvp)
2776 			pv_put(info.pvp);
2777 	} while (info.retry);
2778 
2779 
2780 	/*
2781 	 * One resident page (the pml4 page) should remain.  Two if
2782 	 * the pmap has implemented an isolated userland PML4E table.
2783 	 * No wired pages should remain.
2784 	 */
2785 	int expected_res = 0;
2786 
2787 	if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0)
2788 		++expected_res;
2789 	if (pmap->pm_pmlpv_iso)
2790 		++expected_res;
2791 
2792 #if 1
2793 	if (pmap->pm_stats.resident_count != expected_res ||
2794 	    pmap->pm_stats.wired_count != 0) {
2795 		kprintf("fatal pmap problem - pmap %p flags %08x "
2796 			"rescnt=%jd wirecnt=%jd\n",
2797 			pmap,
2798 			pmap->pm_flags,
2799 			pmap->pm_stats.resident_count,
2800 			pmap->pm_stats.wired_count);
2801 		tsleep(pmap, 0, "DEAD", 0);
2802 	}
2803 #else
2804 	KKASSERT(pmap->pm_stats.resident_count == expected_res);
2805 	KKASSERT(pmap->pm_stats.wired_count == 0);
2806 #endif
2807 }
2808 
2809 /*
2810  * Called from low to high.  We must cache the proper parent pv so we
2811  * can adjust its wired count.
2812  */
2813 static int
2814 pmap_release_callback(pv_entry_t pv, void *data)
2815 {
2816 	struct pmap_release_info *info = data;
2817 	pmap_t pmap = info->pmap;
2818 	vm_pindex_t pindex;
2819 	int r;
2820 
2821 	/*
2822 	 * Acquire a held and locked pv, check for release race
2823 	 */
2824 	pindex = pv->pv_pindex;
2825 	if (info->pvp == pv) {
2826 		spin_unlock(&pmap->pm_spin);
2827 		info->pvp = NULL;
2828 	} else if (pv_hold_try(pv)) {
2829 		spin_unlock(&pmap->pm_spin);
2830 	} else {
2831 		spin_unlock(&pmap->pm_spin);
2832 		pv_lock(pv);
2833 		pv_put(pv);
2834 		info->retry = 1;
2835 		spin_lock(&pmap->pm_spin);
2836 
2837 		return -1;
2838 	}
2839 	KKASSERT(pv->pv_pmap == pmap && pindex == pv->pv_pindex);
2840 
2841 	if (pv->pv_pindex < pmap_pt_pindex(0)) {
2842 		/*
2843 		 * I am PTE, parent is PT
2844 		 */
2845 		pindex = pv->pv_pindex >> NPTEPGSHIFT;
2846 		pindex += NUPTE_TOTAL;
2847 	} else if (pv->pv_pindex < pmap_pd_pindex(0)) {
2848 		/*
2849 		 * I am PT, parent is PD
2850 		 */
2851 		pindex = (pv->pv_pindex - NUPTE_TOTAL) >> NPDEPGSHIFT;
2852 		pindex += NUPTE_TOTAL + NUPT_TOTAL;
2853 	} else if (pv->pv_pindex < pmap_pdp_pindex(0)) {
2854 		/*
2855 		 * I am PD, parent is PDP
2856 		 */
2857 		pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL) >>
2858 			 NPDPEPGSHIFT;
2859 		pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
2860 	} else if (pv->pv_pindex < pmap_pml4_pindex()) {
2861 		/*
2862 		 * I am PDP, parent is PML4.  We always calculate the
2863 		 * normal PML4 here, not the isolated PML4.
2864 		 */
2865 		pindex = pmap_pml4_pindex();
2866 	} else {
2867 		/*
2868 		 * parent is NULL
2869 		 */
2870 		if (info->pvp) {
2871 			pv_put(info->pvp);
2872 			info->pvp = NULL;
2873 		}
2874 		pindex = 0;
2875 	}
2876 	if (pindex) {
2877 		if (info->pvp && info->pvp->pv_pindex != pindex) {
2878 			pv_put(info->pvp);
2879 			info->pvp = NULL;
2880 		}
2881 		if (info->pvp == NULL)
2882 			info->pvp = pv_get(pmap, pindex, NULL);
2883 	} else {
2884 		if (info->pvp) {
2885 			pv_put(info->pvp);
2886 			info->pvp = NULL;
2887 		}
2888 	}
2889 	r = pmap_release_pv(pv, info->pvp, NULL);
2890 	spin_lock(&pmap->pm_spin);
2891 
2892 	return(r);
2893 }
2894 
2895 /*
2896  * Called with held (i.e. also locked) pv.  This function will dispose of
2897  * the lock along with the pv.
2898  *
2899  * If the caller already holds the locked parent page table for pv it
2900  * must pass it as pvp, allowing us to avoid a deadlock, else it can
2901  * pass NULL for pvp.
2902  */
2903 static int
2904 pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk)
2905 {
2906 	vm_page_t p;
2907 
2908 	/*
2909 	 * The pmap is currently not spinlocked, pv is held+locked.
2910 	 * Remove the pv's page from its parent's page table.  The
2911 	 * parent's page table page's wire_count will be decremented.
2912 	 *
2913 	 * This will clean out the pte at any level of the page table.
2914 	 * If smp != 0 all cpus are affected.
2915 	 *
2916 	 * Do not tear-down recursively, its faster to just let the
2917 	 * Do not tear down recursively, it's faster to just let the
2918 	 */
2919 	pmap_remove_pv_pte(pv, pvp, bulk, 0);
2920 
2921 	/*
2922 	 * Terminal pvs are unhooked from their vm_pages.  Because
2923 	 * terminal pages aren't page table pages they aren't wired
2924 	 * by us, so we have to be sure not to unwire them either.
2925 	 *
2926 	 * XXX It is unclear if this code ever gets called because we
2927 	 *     no longer use pv's to track terminal pages.
2928 	 */
2929 	if (pv->pv_pindex < pmap_pt_pindex(0)) {
2930 		pmap_remove_pv_page(pv, 0);
2931 		goto skip;
2932 	}
2933 
2934 	/*
2935 	 * We leave the top-level page table page cached, wired, and
2936 	 * mapped in the pmap until the dtor function (pmap_puninit())
2937 	 * gets called.
2938 	 *
2939 	 * Since we are leaving the top-level pv intact we need
2940 	 * to break out of what would otherwise be an infinite loop.
2941 	 *
2942 	 * This covers both the normal and the isolated PML4 page.
2943 	 */
2944 	if (pv->pv_pindex >= pmap_pml4_pindex()) {
2945 		pv_put(pv);
2946 		return(-1);
2947 	}
2948 
2949 	/*
2950 	 * For page table pages (other than the top-level page),
2951  * remove and free the vm_page.  The representative mapping
2952 	 * removed above by pmap_remove_pv_pte() did not undo the
2953 	 * last wire_count so we have to do that as well.
2954 	 */
2955 	p = pmap_remove_pv_page(pv, 1);
2956 	vm_page_busy_wait(p, FALSE, "pmaprl");
2957 	if (p->wire_count != 1) {
2958 		const char *tstr;
2959 
2960 		if (pv->pv_pindex >= pmap_pdp_pindex(0))
2961 			tstr = "PDP";
2962 		else if (pv->pv_pindex >= pmap_pd_pindex(0))
2963 			tstr = "PD";
2964 		else if (pv->pv_pindex >= pmap_pt_pindex(0))
2965 			tstr = "PT";
2966 		else
2967 			tstr = "PTE";
2968 
2969 		kprintf("p(%s) p->wire_count was %016lx %d\n",
2970 			tstr, pv->pv_pindex, p->wire_count);
2971 	}
2972 	KKASSERT(p->wire_count == 1);
2973 	KKASSERT(p->flags & PG_UNQUEUED);
2974 
2975 	vm_page_unwire(p, 0);
2976 	KKASSERT(p->wire_count == 0);
2977 
2978 	vm_page_free(p);
2979 skip:
2980 	pv_free(pv, pvp);
2981 
2982 	return 0;
2983 }
2984 
2985 /*
2986  * This function will remove the pte associated with a pv from its parent.
2987  * Terminal pv's are supported.  All cpus specified by (bulk) are properly
2988  * invalidated.
2989  *
2990  * The wire count will be dropped on the parent page table.  The wire
2991  * count on the page being removed (pv->pv_m) from the parent page table
2992  * is NOT touched.  Note that terminal pages will not have any additional
2993  * wire counts while page table pages will have at least one representing
2994  * the mapping, plus others representing sub-mappings.
2995  *
2996  * NOTE: Cannot be called on kernel page table pages; only on KVM terminal
2997  *	 pages, user page table pages, and user terminal pages.
2998  *
2999  * NOTE: The pte being removed might be unmanaged, and the pv supplied might
3000  *	 be freshly allocated and not imply that the pte is managed.  In this
3001  *	 case pv->pv_m should be NULL.
3002  *
3003  * The pv must be locked.  The pvp, if supplied, must be locked.  All
3004  * supplied pv's will remain locked on return.
3005  *
3006  * XXX must lock parent pv's if they exist to remove pte XXX
3007  */
3008 static
3009 void
3010 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk,
3011 		   int destroy)
3012 {
3013 	vm_pindex_t ptepindex = pv->pv_pindex;
3014 	pmap_t pmap = pv->pv_pmap;
3015 	vm_page_t p;
3016 	int gotpvp = 0;
3017 
3018 	KKASSERT(pmap);
3019 
3020 	if (ptepindex >= pmap_pml4_pindex()) {
3021 		/*
3022 		 * We are the top level PML4E table, there is no parent.
3023 		 *
3024 		 * This is either the normal or isolated PML4E table.
3025 		 * Only the normal is used in regular operation, the isolated
3026 		 * is only passed in when breaking down the whole pmap.
3027 		 */
3028 		p = pmap->pm_pmlpv->pv_m;
3029 		KKASSERT(pv->pv_m == p);	/* debugging */
3030 	} else if (ptepindex >= pmap_pdp_pindex(0)) {
3031 		/*
3032 		 * Remove a PDP page from the PML4E.  This can only occur
3033 		 * with user page tables.  We do not have to lock the
3034 		 * pml4 PV so just ignore pvp.
3035 		 */
3036 		vm_pindex_t pml4_pindex;
3037 		vm_pindex_t pdp_index;
3038 		pml4_entry_t *pdp;
3039 		pml4_entry_t *pdp_iso;
3040 
3041 		pdp_index = ptepindex - pmap_pdp_pindex(0);
3042 		if (pvp == NULL) {
3043 			pml4_pindex = pmap_pml4_pindex();
3044 			pvp = pv_get(pv->pv_pmap, pml4_pindex, NULL);
3045 			KKASSERT(pvp);
3046 			gotpvp = 1;
3047 		}
3048 
3049 		pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
3050 		KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0);
3051 		p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
3052 		pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp, 0);
3053 
3054 		/*
3055 		 * Also remove the PDP from the isolated PML4E if the
3056 		 * process uses one.
3057 		 */
3058 		if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso) {
3059 			pdp_iso = &pmap->pm_pml4_iso[pdp_index &
3060 						((1ul << NPML4EPGSHIFT) - 1)];
3061 			pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp_iso, 0);
3062 		}
3063 		KKASSERT(pv->pv_m == p);	/* debugging */
3064 	} else if (ptepindex >= pmap_pd_pindex(0)) {
3065 		/*
3066 		 * Remove a PD page from the PDP
3067 		 *
3068 		 * SIMPLE PMAP NOTE: Non-existent pvp's are ok in the case
3069 		 *		     of a simple pmap because it stops at
3070 		 *		     the PD page.
3071 		 */
3072 		vm_pindex_t pdp_pindex;
3073 		vm_pindex_t pd_index;
3074 		pdp_entry_t *pd;
3075 
3076 		pd_index = ptepindex - pmap_pd_pindex(0);
3077 
3078 		if (pvp == NULL) {
3079 			pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
3080 				     (pd_index >> NPML4EPGSHIFT);
3081 			pvp = pv_get(pv->pv_pmap, pdp_pindex, NULL);
3082 			gotpvp = 1;
3083 		}
3084 
3085 		if (pvp) {
3086 			pd = pv_pte_lookup(pvp, pd_index &
3087 						((1ul << NPDPEPGSHIFT) - 1));
3088 			KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0);
3089 			p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
3090 			pmap_inval_bulk(bulk, (vm_offset_t)-1, pd, 0);
3091 		} else {
3092 			KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE);
3093 			p = pv->pv_m;		/* degenerate test later */
3094 		}
3095 		KKASSERT(pv->pv_m == p);	/* debugging */
3096 	} else if (ptepindex >= pmap_pt_pindex(0)) {
3097 		/*
3098 		 * Remove a PT page from the PD
3099 		 */
3100 		vm_pindex_t pd_pindex;
3101 		vm_pindex_t pt_index;
3102 		pd_entry_t *pt;
3103 
3104 		pt_index = ptepindex - pmap_pt_pindex(0);
3105 
3106 		if (pvp == NULL) {
3107 			pd_pindex = NUPTE_TOTAL + NUPT_TOTAL +
3108 				    (pt_index >> NPDPEPGSHIFT);
3109 			pvp = pv_get(pv->pv_pmap, pd_pindex, NULL);
3110 			KKASSERT(pvp);
3111 			gotpvp = 1;
3112 		}
3113 
3114 		pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1));
3115 #if 0
3116 		KASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0,
3117 			("*pt unexpectedly invalid %016jx "
3118 			 "gotpvp=%d ptepindex=%ld ptindex=%ld pv=%p pvp=%p",
3119 			*pt, gotpvp, ptepindex, pt_index, pv, pvp));
3120 		p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
3121 #else
3122 		if ((*pt & pmap->pmap_bits[PG_V_IDX]) == 0) {
3123 			kprintf("*pt unexpectedly invalid %016jx "
3124 			        "gotpvp=%d ptepindex=%ld ptindex=%ld "
3125 				"pv=%p pvp=%p\n",
3126 				*pt, gotpvp, ptepindex, pt_index, pv, pvp);
3127 			tsleep(pt, 0, "DEAD", 0);
3128 			p = pv->pv_m;
3129 		} else {
3130 			p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
3131 		}
3132 #endif
3133 		pmap_inval_bulk(bulk, (vm_offset_t)-1, pt, 0);
3134 		KKASSERT(pv->pv_m == p);	/* debugging */
3135 	} else {
3136 		KKASSERT(0);
3137 	}
3138 
3139 	/*
3140 	 * If requested, scrap the underlying pv->pv_m and the underlying
3141 	 * pv.  If this is a page-table-page we must also free the page.
3142 	 *
3143 	 * pvp must be returned locked.
3144 	 */
3145 	if (destroy == 1) {
3146 		/*
3147 		 * page table page (PT, PD, PDP, PML4), caller was responsible
3148 		 * for testing wired_count.
3149 		 */
3150 		KKASSERT(pv->pv_m->wire_count == 1);
3151 		p = pmap_remove_pv_page(pv, 1);
3152 		pv_free(pv, pvp);
3153 		pv = NULL;
3154 
3155 		vm_page_busy_wait(p, FALSE, "pgpun");
3156 		vm_page_unwire(p, 0);
3157 		vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
3158 		vm_page_free(p);
3159 	}
3160 
3161 	/*
3162 	 * If we acquired pvp ourselves then we are responsible for
3163 	 * recursively deleting it.
3164 	 */
3165 	if (pvp && gotpvp) {
3166 		/*
3167 		 * Recursively destroy higher-level page tables.
3168 		 *
3169 		 * This is optional.  If we do not, they will still
3170 		 * be destroyed when the process exits.
3171 		 *
3172 		 * NOTE: Do not destroy pv_entry's with extra hold refs,
3173 		 *	 a caller may have unlocked it and intends to
3174 		 *	 continue to use it.
3175 		 */
3176 		if (pmap_dynamic_delete &&
3177 		    pvp->pv_m &&
3178 		    pvp->pv_m->wire_count == 1 &&
3179 		    (pvp->pv_hold & PV_HOLD_MASK) == 2 &&
3180 		    pvp->pv_pindex < pmap_pml4_pindex()) {
3181 			if (pmap != kernel_pmap) {
3182 				pmap_remove_pv_pte(pvp, NULL, bulk, 1);
3183 				pvp = NULL;	/* safety */
3184 			} else {
3185 				kprintf("Attempt to remove kernel_pmap pindex "
3186 					"%jd\n", pvp->pv_pindex);
3187 				pv_put(pvp);
3188 			}
3189 		} else {
3190 			pv_put(pvp);
3191 		}
3192 	}
3193 }
3194 
3195 /*
3196  * Remove the vm_page association to a pv.  The pv must be locked.
3197  */
3198 static
3199 vm_page_t
3200 pmap_remove_pv_page(pv_entry_t pv, int clrpgbits)
3201 {
3202 	vm_page_t m;
3203 
3204 	m = pv->pv_m;
3205 	pv->pv_m = NULL;
3206 	if (clrpgbits)
3207 		vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
3208 
3209 	return(m);
3210 }
3211 
3212 /*
3213  * Grow the number of kernel page table entries, if needed.
3214  *
3215  * This routine is always called to validate any address space
3216  * beyond KERNBASE (for kldloads).  kernel_vm_end only governs the address
3217  * space below KERNBASE.
3218  *
3219  * kernel_map must be locked exclusively by the caller.
3220  */
3221 void
3222 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
3223 {
3224 	vm_paddr_t paddr;
3225 	vm_offset_t ptppaddr;
3226 	vm_page_t nkpg;
3227 	pd_entry_t *pt, newpt;
3228 	pdp_entry_t *pd, newpd;
3229 	int update_kernel_vm_end;
3230 
3231 	/*
3232 	 * bootstrap kernel_vm_end on first real VM use
3233 	 */
3234 	if (kernel_vm_end == 0) {
3235 		kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
3236 
3237 		for (;;) {
3238 			pt = pmap_pt(kernel_pmap, kernel_vm_end);
3239 			if (pt == NULL)
3240 				break;
3241 			if ((*pt & kernel_pmap->pmap_bits[PG_V_IDX]) == 0)
3242 				break;
3243 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
3244 					~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1);
3245 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3246 				kernel_vm_end = vm_map_max(kernel_map);
3247 				break;
3248 			}
3249 		}
3250 	}
3251 
3252 	/*
3253 	 * Fill in the gaps.  kernel_vm_end is only adjusted for ranges
3254 	 * below KERNBASE.  Ranges above KERNBASE are kldloaded and we
3255 	 * do not want to force-fill 128G worth of page tables.
3256 	 */
3257 	if (kstart < KERNBASE) {
3258 		if (kstart > kernel_vm_end)
3259 			kstart = kernel_vm_end;
3260 		KKASSERT(kend <= KERNBASE);
3261 		update_kernel_vm_end = 1;
3262 	} else {
3263 		update_kernel_vm_end = 0;
3264 	}
3265 
3266 	kstart = rounddown2(kstart, (vm_offset_t)(PAGE_SIZE * NPTEPG));
3267 	kend = roundup2(kend, (vm_offset_t)(PAGE_SIZE * NPTEPG));
3268 
3269 	if (kend - 1 >= vm_map_max(kernel_map))
3270 		kend = vm_map_max(kernel_map);
3271 
3272 	while (kstart < kend) {
3273 		pt = pmap_pt(kernel_pmap, kstart);
3274 		if (pt == NULL) {
3275 			/*
3276 			 * We need a new PD entry
3277 			 */
3278 			nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++,
3279 			                     VM_ALLOC_NORMAL |
3280 					     VM_ALLOC_SYSTEM |
3281 					     VM_ALLOC_INTERRUPT);
3282 			if (nkpg == NULL) {
3283 				panic("pmap_growkernel: no memory to grow "
3284 				      "kernel");
3285 			}
3286 			paddr = VM_PAGE_TO_PHYS(nkpg);
3287 			pmap_zero_page(paddr);
3288 			pd = pmap_pd(kernel_pmap, kstart);
3289 
3290 			newpd = (pdp_entry_t)
3291 			    (paddr |
3292 			    kernel_pmap->pmap_bits[PG_V_IDX] |
3293 			    kernel_pmap->pmap_bits[PG_RW_IDX] |
3294 			    kernel_pmap->pmap_bits[PG_A_IDX]);
3295 			atomic_swap_long(pd, newpd);
3296 
3297 #if 0
3298 			kprintf("NEWPD pd=%p pde=%016jx phys=%016jx\n",
3299 				pd, newpd, paddr);
3300 #endif
3301 
3302 			continue; /* try again */
3303 		}
3304 
3305 		if ((*pt & kernel_pmap->pmap_bits[PG_V_IDX]) != 0) {
3306 			kstart = (kstart + PAGE_SIZE * NPTEPG) &
3307 				 ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1);
3308 			if (kstart - 1 >= vm_map_max(kernel_map)) {
3309 				kstart = vm_map_max(kernel_map);
3310 				break;
3311 			}
3312 			continue;
3313 		}
3314 
3315 		/*
3316 		 * We need a new PT
3317 		 *
3318 		 * This index is bogus, but out of the way
3319 		 */
3320 		nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++,
3321 				     VM_ALLOC_NORMAL |
3322 				     VM_ALLOC_SYSTEM |
3323 				     VM_ALLOC_INTERRUPT);
3324 		if (nkpg == NULL)
3325 			panic("pmap_growkernel: no memory to grow kernel");
3326 
3327 		vm_page_wire(nkpg);
3328 		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
3329 		pmap_zero_page(ptppaddr);
3330 		newpt = (pd_entry_t)(ptppaddr |
3331 				     kernel_pmap->pmap_bits[PG_V_IDX] |
3332 				     kernel_pmap->pmap_bits[PG_RW_IDX] |
3333 				     kernel_pmap->pmap_bits[PG_A_IDX]);
3334 		atomic_swap_long(pt, newpt);
3335 
3336 		kstart = (kstart + PAGE_SIZE * NPTEPG) &
3337 			  ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1);
3338 
3339 		if (kstart - 1 >= vm_map_max(kernel_map)) {
3340 			kstart = vm_map_max(kernel_map);
3341 			break;
3342 		}
3343 	}
3344 
3345 	/*
3346 	 * Only update kernel_vm_end for areas below KERNBASE.
3347 	 */
3348 	if (update_kernel_vm_end && kernel_vm_end < kstart)
3349 		kernel_vm_end = kstart;
3350 }
3351 
3352 /*
3353  *	Add a reference to the specified pmap.
3354  */
3355 void
3356 pmap_reference(pmap_t pmap)
3357 {
3358 	if (pmap != NULL)
3359 		atomic_add_int(&pmap->pm_count, 1);
3360 }
3361 
3362 void
3363 pmap_maybethreaded(pmap_t pmap)
3364 {
3365 	atomic_set_int(&pmap->pm_flags, PMAP_MULTI);
3366 }
3367 
3368 /*
3369  * Called while page is hard-busied to clear the PG_MAPPED and PG_WRITEABLE
3370  * flags if able.  This can happen when the pmap code is unable to clear
3371  * the bits in prior actions due to not holding the page hard-busied at
3372  * the time.
3373  *
3374  * The clearing of PG_MAPPED/WRITEABLE is an optional optimization done
3375  * when the pte is removed and only if the pte has not been multiply-mapped.
3376  * The caller may have to call vm_page_protect() if the bits are still set
3377  * here.
3378  *
3379  * This function is expected to be quick.
3380  */
3381 int
3382 pmap_mapped_sync(vm_page_t m)
3383 {
3384 	return (m->flags);
3385 }
3386 
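/*
 * Illustrative sketch (not compiled): caller-side use while the page is
 * hard-busied, falling back to vm_page_protect() as described above.
 */
#if 0
	if (pmap_mapped_sync(m) & (PG_MAPPED | PG_WRITEABLE))
		vm_page_protect(m, VM_PROT_NONE);
#endif
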
3387 /***************************************************
3388  * page management routines.
3389  ***************************************************/
3390 
3391 /*
3392  * Hold a pv without locking it
3393  */
3394 #if 0
3395 static void
3396 pv_hold(pv_entry_t pv)
3397 {
3398 	atomic_add_int(&pv->pv_hold, 1);
3399 }
3400 #endif
3401 
3402 /*
3403  * Hold a pv_entry, preventing its destruction.  TRUE is returned if the pv
3404  * was successfully locked, FALSE if it wasn't.  The caller must dispose of
3405  * the pv properly.
3406  *
3407  * Either the pmap->pm_spin or the related vm_page_spin (if traversing a
3408  * pv list via its page) must be held by the caller in order to stabilize
3409  * the pv.
3410  */
3411 static int
3412 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL)
3413 {
3414 	u_int count;
3415 
3416 	/*
3417 	 * Critical path shortcut expects pv to already have one ref
3418 	 * (for the pv->pv_pmap).
3419 	 */
3420 	count = pv->pv_hold;
3421 	cpu_ccfence();
3422 	for (;;) {
3423 		if ((count & PV_HOLD_LOCKED) == 0) {
3424 			if (atomic_fcmpset_int(&pv->pv_hold, &count,
3425 					      (count + 1) | PV_HOLD_LOCKED)) {
3426 #ifdef PMAP_DEBUG
3427 				pv->pv_func = func;
3428 				pv->pv_line = lineno;
3429 #endif
3430 				return TRUE;
3431 			}
3432 		} else {
3433 			if (atomic_fcmpset_int(&pv->pv_hold, &count, count + 1))
3434 				return FALSE;
3435 		}
3436 		/* retry */
3437 	}
3438 }
3439 
3440 /*
3441  * Drop a previously held pv_entry which could not be locked, allowing its
3442  * destruction.
3443  *
3444  * Must not be called with a spinlock held as we might zfree() the pv if it
3445  * is no longer associated with a pmap and this was the last hold count.
3446  */
3447 static void
3448 pv_drop(pv_entry_t pv)
3449 {
3450 	u_int count;
3451 
3452 	for (;;) {
3453 		count = pv->pv_hold;
3454 		cpu_ccfence();
3455 		KKASSERT((count & PV_HOLD_MASK) > 0);
3456 		KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) !=
3457 			 (PV_HOLD_LOCKED | 1));
3458 		if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) {
3459 			if ((count & PV_HOLD_MASK) == 1) {
3460 #ifdef PMAP_DEBUG2
3461 				if (pmap_enter_debug > 0) {
3462 					--pmap_enter_debug;
3463 					kprintf("pv_drop: free pv %p\n", pv);
3464 				}
3465 #endif
3466 				KKASSERT(count == 1);
3467 				KKASSERT(pv->pv_pmap == NULL);
3468 				zfree(pvzone, pv);
3469 			}
3470 			return;
3471 		}
3472 		/* retry */
3473 	}
3474 }
3475 
3476 /*
3477  * Find or allocate the requested PV entry, returning a locked, held pv.
3478  *
3479  * If (*isnew) is non-zero, the returned pv will have two hold counts, one
3480  * for the caller and one representing the pmap and vm_page association.
3481  *
3482  * If (*isnew) is zero, the returned pv will have only one hold count.
3483  *
3484  * Since both associations can only be adjusted while the pv is locked,
3485  * together they represent just one additional hold.
3486  */
3487 static
3488 pv_entry_t
3489 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL)
3490 {
3491 	struct mdglobaldata *md = mdcpu;
3492 	pv_entry_t pv;
3493 	pv_entry_t pnew;
3494 	int pmap_excl = 0;
3495 
3496 	pnew = NULL;
3497 	if (md->gd_newpv) {
3498 #if 1
3499 		pnew = atomic_swap_ptr((void *)&md->gd_newpv, NULL);
3500 #else
3501 		crit_enter();
3502 		pnew = md->gd_newpv;	/* might race NULL */
3503 		md->gd_newpv = NULL;
3504 		crit_exit();
3505 #endif
3506 	}
3507 	if (pnew == NULL)
3508 		pnew = zalloc(pvzone);
3509 
3510 	spin_lock_shared(&pmap->pm_spin);
3511 	for (;;) {
3512 		/*
3513 		 * Shortcut cache
3514 		 */
3515 		pv = pv_entry_lookup(pmap, pindex);
3516 		if (pv == NULL) {
3517 			vm_pindex_t *pmark;
3518 
3519 			/*
3520 			 * Requires exclusive pmap spinlock
3521 			 */
3522 			if (pmap_excl == 0) {
3523 				pmap_excl = 1;
3524 				if (!spin_lock_upgrade_try(&pmap->pm_spin)) {
3525 					spin_unlock_shared(&pmap->pm_spin);
3526 					spin_lock(&pmap->pm_spin);
3527 					continue;
3528 				}
3529 			}
3530 
3531 			/*
3532 			 * We need to block if someone is holding our
3533 			 * placemarker.  As long as we determine the
3534 			 * placemarker has not been acquired we do not
3535 			 * need to get it as acquisition also requires
3536 			 * the pmap spin lock.
3537 			 *
3538 			 * However, we can race the wakeup.
3539 			 */
3540 			pmark = pmap_placemarker_hash(pmap, pindex);
3541 
3542 			if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) {
3543 				tsleep_interlock(pmark, 0);
3544 				atomic_set_long(pmark, PM_PLACEMARK_WAKEUP);
3545 				if (((*pmark ^ pindex) &
3546 				     ~PM_PLACEMARK_WAKEUP) == 0) {
3547 					spin_unlock(&pmap->pm_spin);
3548 					tsleep(pmark, PINTERLOCKED, "pvplc", 0);
3549 					spin_lock(&pmap->pm_spin);
3550 				}
3551 				continue;
3552 			}
3553 
3554 			/*
3555 			 * Setup the new entry
3556 			 */
3557 			pnew->pv_pmap = pmap;
3558 			pnew->pv_pindex = pindex;
3559 			pnew->pv_hold = PV_HOLD_LOCKED | 2;
3560 			pnew->pv_flags = 0;
3561 #ifdef PMAP_DEBUG
3562 			pnew->pv_func = func;
3563 			pnew->pv_line = lineno;
3564 			if (pnew->pv_line_lastfree > 0) {
3565 				pnew->pv_line_lastfree =
3566 						-pnew->pv_line_lastfree;
3567 			}
3568 #endif
3569 			pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew);
3570 			atomic_add_long(&pmap->pm_stats.resident_count, 1);
3571 			spin_unlock(&pmap->pm_spin);
3572 			*isnew = 1;
3573 
3574 			KASSERT(pv == NULL, ("pv insert failed %p->%p", pnew, pv));
3575 			return(pnew);
3576 		}
3577 
3578 		/*
3579 		 * We already have an entry, cleanup the staged pnew if
3580 		 * we can get the lock, otherwise block and retry.
3581 		 */
3582 		if (__predict_true(_pv_hold_try(pv PMAP_DEBUG_COPY))) {
3583 			if (pmap_excl)
3584 				spin_unlock(&pmap->pm_spin);
3585 			else
3586 				spin_unlock_shared(&pmap->pm_spin);
3587 #if 1
3588 			pnew = atomic_swap_ptr((void *)&md->gd_newpv, pnew);
3589 			if (pnew)
3590 				zfree(pvzone, pnew);
3591 #else
3592 			crit_enter();
3593 			if (md->gd_newpv == NULL)
3594 				md->gd_newpv = pnew;
3595 			else
3596 				zfree(pvzone, pnew);
3597 			crit_exit();
3598 #endif
3599 			KKASSERT(pv->pv_pmap == pmap &&
3600 				 pv->pv_pindex == pindex);
3601 			*isnew = 0;
3602 			return(pv);
3603 		}
3604 		if (pmap_excl) {
3605 			spin_unlock(&pmap->pm_spin);
3606 			_pv_lock(pv PMAP_DEBUG_COPY);
3607 			pv_put(pv);
3608 			spin_lock(&pmap->pm_spin);
3609 		} else {
3610 			spin_unlock_shared(&pmap->pm_spin);
3611 			_pv_lock(pv PMAP_DEBUG_COPY);
3612 			pv_put(pv);
3613 			spin_lock_shared(&pmap->pm_spin);
3614 		}
3615 	}
3616 	/* NOT REACHED */
3617 }
3618 
3619 /*
3620  * Find the requested PV entry, returning a locked+held pv or NULL
3621  */
3622 static
3623 pv_entry_t
3624 _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp PMAP_DEBUG_DECL)
3625 {
3626 	pv_entry_t pv;
3627 	int pmap_excl = 0;
3628 
3629 	spin_lock_shared(&pmap->pm_spin);
3630 	for (;;) {
3631 		/*
3632 		 * Shortcut cache
3633 		 */
3634 		pv = pv_entry_lookup(pmap, pindex);
3635 		if (pv == NULL) {
3636 			/*
3637 			 * Block if there is ANY placemarker.  If we are to
3638 			 * return it, we must also acquire the spot, so we
3639 			 * have to block even if the placemarker is held on
3640 			 * a different address.
3641 			 *
3642 			 * OPTIMIZATION: If pmarkp is passed as NULL the
3643 			 * caller is just probing (or looking for a real
3644 			 * pv_entry), and in this case we only need to check
3645 			 * to see if the placemarker matches pindex.
3646 			 */
3647 			vm_pindex_t *pmark;
3648 
3649 			/*
3650 			 * Requires exclusive pmap spinlock
3651 			 */
3652 			if (pmap_excl == 0) {
3653 				pmap_excl = 1;
3654 				if (!spin_lock_upgrade_try(&pmap->pm_spin)) {
3655 					spin_unlock_shared(&pmap->pm_spin);
3656 					spin_lock(&pmap->pm_spin);
3657 					continue;
3658 				}
3659 			}
3660 
3661 			pmark = pmap_placemarker_hash(pmap, pindex);
3662 
3663 			if ((pmarkp && *pmark != PM_NOPLACEMARK) ||
3664 			    ((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) {
3665 				tsleep_interlock(pmark, 0);
3666 				atomic_set_long(pmark, PM_PLACEMARK_WAKEUP);
3667 				if ((pmarkp && *pmark != PM_NOPLACEMARK) ||
3668 				    ((*pmark ^ pindex) &
3669 				     ~PM_PLACEMARK_WAKEUP) == 0) {
3670 					spin_unlock(&pmap->pm_spin);
3671 					tsleep(pmark, PINTERLOCKED, "pvpld", 0);
3672 					spin_lock(&pmap->pm_spin);
3673 				}
3674 				continue;
3675 			}
3676 			if (pmarkp) {
3677 				if (atomic_swap_long(pmark, pindex) !=
3678 				    PM_NOPLACEMARK) {
3679 					panic("_pv_get: pmark race");
3680 				}
3681 				*pmarkp = pmark;
3682 			}
3683 			spin_unlock(&pmap->pm_spin);
3684 			return NULL;
3685 		}
3686 		if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
3687 			if (pmap_excl)
3688 				spin_unlock(&pmap->pm_spin);
3689 			else
3690 				spin_unlock_shared(&pmap->pm_spin);
3691 			KKASSERT(pv->pv_pmap == pmap &&
3692 				 pv->pv_pindex == pindex);
3693 			return(pv);
3694 		}
3695 		if (pmap_excl) {
3696 			spin_unlock(&pmap->pm_spin);
3697 			_pv_lock(pv PMAP_DEBUG_COPY);
3698 			pv_put(pv);
3699 			spin_lock(&pmap->pm_spin);
3700 		} else {
3701 			spin_unlock_shared(&pmap->pm_spin);
3702 			_pv_lock(pv PMAP_DEBUG_COPY);
3703 			pv_put(pv);
3704 			spin_lock_shared(&pmap->pm_spin);
3705 		}
3706 	}
3707 }
3708 
3709 /*
3710  * Lookup, hold, and attempt to lock (pmap,pindex).
3711  *
3712  * If the entry does not exist NULL is returned; *errorp is set to 0
3713  * only if a placemark could also be established, else it is set to 1.
3714  *
3715  * If the entry exists and could be locked, it is returned and *errorp is 0.
3716  *
3717  * If the entry exists but could NOT be successfully locked it is returned
3718  * held and *errorp is set to 1.
3719  *
3720  * If the entry is placemarked by someone else NULL is returned and *errorp
3721  * is set to 1.
3722  */
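/*
 * A typical caller (see pmap_scan_callback() below) handles the error
 * return roughly as follows, honoring the bottom-up pv lock order:
 *
 *	pv = pv_get_try(pmap, pindex, &pmark, &error);
 *	if (error) {
 *		(drop any higher-level pv locks first)
 *		if (pv) {
 *			pv_lock(pv);	(block until lockable)
 *			pv_put(pv);
 *		} else {
 *			pv_placemarker_wait(pmap, pmark);
 *		}
 *		(retry the lookup)
 *	}
 */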
3723 static
3724 pv_entry_t
3725 pv_get_try(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp, int *errorp)
3726 {
3727 	pv_entry_t pv;
3728 
3729 	spin_lock_shared(&pmap->pm_spin);
3730 
3731 	pv = pv_entry_lookup(pmap, pindex);
3732 	if (pv == NULL) {
3733 		vm_pindex_t *pmark;
3734 
3735 		pmark = pmap_placemarker_hash(pmap, pindex);
3736 
3737 		if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) {
3738 			*errorp = 1;
3739 		} else if (pmarkp &&
3740 			   atomic_cmpset_long(pmark, PM_NOPLACEMARK, pindex)) {
3741 			*errorp = 0;
3742 		} else {
3743 			/*
3744 			 * Can't set a placemark with a NULL pmarkp, or if
3745 			 * pmarkp is non-NULL but we failed to set our
3746 			 * placemark.
3747 			 */
3748 			*errorp = 1;
3749 		}
3750 		if (pmarkp)
3751 			*pmarkp = pmark;
3752 		spin_unlock_shared(&pmap->pm_spin);
3753 
3754 		return NULL;
3755 	}
3756 
3757 	/*
3758 	 * XXX This has problems if the lock is shared, why?
3759 	 */
3760 	if (pv_hold_try(pv)) {
3761 		spin_unlock_shared(&pmap->pm_spin);
3762 		*errorp = 0;
3763 		KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex);
3764 		return(pv);	/* lock succeeded */
3765 	}
3766 	spin_unlock_shared(&pmap->pm_spin);
3767 	*errorp = 1;
3768 
3769 	return (pv);		/* lock failed */
3770 }
3771 
3772 /*
3773  * Lock a held pv, keeping the hold count
3774  */
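/*
 * As manipulated here and in pv_unlock()/pv_put(), pv_hold packs the
 * hold count (PV_HOLD_MASK) together with the PV_HOLD_LOCKED and
 * PV_HOLD_WAITING flag bits, so lock, wakeup-request, and refcount
 * transitions can each be done with a single cmpset.
 */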
3775 static
3776 void
3777 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL)
3778 {
3779 	u_int count;
3780 
3781 	for (;;) {
3782 		count = pv->pv_hold;
3783 		cpu_ccfence();
3784 		if ((count & PV_HOLD_LOCKED) == 0) {
3785 			if (atomic_cmpset_int(&pv->pv_hold, count,
3786 					      count | PV_HOLD_LOCKED)) {
3787 #ifdef PMAP_DEBUG
3788 				pv->pv_func = func;
3789 				pv->pv_line = lineno;
3790 #endif
3791 				return;
3792 			}
3793 			continue;
3794 		}
3795 		tsleep_interlock(pv, 0);
3796 		if (atomic_cmpset_int(&pv->pv_hold, count,
3797 				      count | PV_HOLD_WAITING)) {
3798 #ifdef PMAP_DEBUG2
3799 			if (pmap_enter_debug > 0) {
3800 				--pmap_enter_debug;
3801 				kprintf("pv waiting on %s:%d\n",
3802 					pv->pv_func, pv->pv_line);
3803 			}
3804 #endif
3805 			tsleep(pv, PINTERLOCKED, "pvwait", hz);
3806 		}
3807 		/* retry */
3808 	}
3809 }
3810 
3811 /*
3812  * Unlock a held and locked pv, keeping the hold count.
3813  */
3814 static
3815 void
3816 pv_unlock(pv_entry_t pv)
3817 {
3818 	u_int count;
3819 
3820 	for (;;) {
3821 		count = pv->pv_hold;
3822 		cpu_ccfence();
3823 		KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >=
3824 			 (PV_HOLD_LOCKED | 1));
3825 		if (atomic_cmpset_int(&pv->pv_hold, count,
3826 				      count &
3827 				      ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) {
3828 			if (count & PV_HOLD_WAITING)
3829 				wakeup(pv);
3830 			break;
3831 		}
3832 	}
3833 }
3834 
3835 /*
3836  * Unlock and drop a pv.  If the pv is no longer associated with a pmap
3837  * and the hold count drops to zero we will free it.
3838  *
3839  * Caller should not hold any spin locks.  We are protected from hold races
3840  * by virtue of holds occurring only with a pmap_spin or vm_page_spin
3841  * lock held.  A pv cannot be located otherwise.
3842  */
3843 static
3844 void
3845 pv_put(pv_entry_t pv)
3846 {
3847 #ifdef PMAP_DEBUG2
3848 	if (pmap_enter_debug > 0) {
3849 		--pmap_enter_debug;
3850 		kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold);
3851 	}
3852 #endif
3853 
3854 	/*
3855 	 * Normal put-aways must have a pv_m associated with the pv,
3856 	 * but allow the case where the pv has been destructed due
3857 	 * to pmap_dynamic_delete.
3858 	 */
3859 	KKASSERT(pv->pv_pmap == NULL || pv->pv_m != NULL);
3860 
3861 	/*
3862 	 * Fast - shortcut most common condition
3863 	 */
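	/*
	 * A locked pv holding exactly two refs transitions straight to
	 * unlocked with one remaining ref in a single atomic op, avoiding
	 * the separate pv_unlock()/pv_drop() below.
	 */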
3864 	if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1))
3865 		return;
3866 
3867 	/*
3868 	 * Slow
3869 	 */
3870 	pv_unlock(pv);
3871 	pv_drop(pv);
3872 }
3873 
3874 /*
3875  * Remove the pmap association from a pv, requiring that pv_m already be removed,
3876  * then unlock and drop the pv.  Any pte operations must have already been
3877  * completed.  This call may result in a last-drop which will physically free
3878  * the pv.
3879  *
3880  * Removing the pmap association entails an additional drop.
3881  *
3882  * pv must be exclusively locked on call and will be disposed of on return.
3883  */
3884 static
3885 void
3886 _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL)
3887 {
3888 	pmap_t pmap;
3889 
3890 #ifdef PMAP_DEBUG
3891 	pv->pv_func_lastfree = func;
3892 	pv->pv_line_lastfree = lineno;
3893 #endif
3894 	KKASSERT(pv->pv_m == NULL);
3895 	KKASSERT((pv->pv_hold & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >=
3896 		  (PV_HOLD_LOCKED|1));
3897 	if ((pmap = pv->pv_pmap) != NULL) {
3898 		spin_lock(&pmap->pm_spin);
3899 		KKASSERT(pv->pv_pmap == pmap);
3900 		if (pmap->pm_pvhint_pt == pv)
3901 			pmap->pm_pvhint_pt = NULL;
3902 		if (pmap->pm_pvhint_unused == pv)
3903 			pmap->pm_pvhint_unused = NULL;
3904 		pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
3905 		atomic_add_long(&pmap->pm_stats.resident_count, -1);
3906 		pv->pv_pmap = NULL;
3907 		pv->pv_pindex = 0;
3908 		spin_unlock(&pmap->pm_spin);
3909 
3910 		/*
3911 		 * Try to shortcut three atomic ops, otherwise fall through
3912 		 * and do it normally.  Drop two refs and the lock all in
3913 		 * one go.
3914 		 */
3915 		if (pvp) {
3916 			if (vm_page_unwire_quick(pvp->pv_m))
3917 				panic("_pv_free: bad wirecount on pvp");
3918 		}
3919 		if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) {
3920 #ifdef PMAP_DEBUG2
3921 			if (pmap_enter_debug > 0) {
3922 				--pmap_enter_debug;
3923 				kprintf("pv_free: free pv %p\n", pv);
3924 			}
3925 #endif
3926 			zfree(pvzone, pv);
3927 			return;
3928 		}
3929 		pv_drop(pv);	/* ref for pv_pmap */
3930 	}
3931 	pv_unlock(pv);
3932 	pv_drop(pv);
3933 }
3934 
3935 /*
3936  * This routine is very drastic, but can save the system
3937  * in a pinch.
3938  */
3939 void
3940 pmap_collect(void)
3941 {
3942 	int i;
3943 	vm_page_t m;
3944 	static int warningdone=0;
3945 
3946 	if (pmap_pagedaemon_waken == 0)
3947 		return;
3948 	pmap_pagedaemon_waken = 0;
3949 	if (warningdone < 5) {
3950 		kprintf("pmap_collect: pv_entries exhausted -- "
3951 			"suggest increasing vm.pmap_pv_entries above %ld\n",
3952 			vm_pmap_pv_entries);
3953 		warningdone++;
3954 	}
3955 
3956 	for (i = 0; i < vm_page_array_size; i++) {
3957 		m = &vm_page_array[i];
3958 		if (m->wire_count || m->hold_count)
3959 			continue;
3960 		if (vm_page_busy_try(m, TRUE) == 0) {
3961 			if (m->wire_count == 0 && m->hold_count == 0) {
3962 				pmap_remove_all(m);
3963 			}
3964 			vm_page_wakeup(m);
3965 		}
3966 	}
3967 }
3968 
3969 /*
3970  * Scan the pmap for active page table entries and issue a callback.
3971  * The callback must dispose of pte_pv, whose PTE entry is at *ptep in
3972  * its parent page table.
3973  *
3974  * pte_pv will be NULL if the page or page table is unmanaged.
3975  * pt_pv will point to the page table page containing the pte for the page.
3976  *
3977  * NOTE! If we come across an unmanaged page TABLE (versus an unmanaged page),
3978  *	 we pass a NULL pte_pv and we pass a pt_pv pointing to the passed
3979  *	 process pmap's PD and page to the callback function.  This can be
3980  *	 confusing because the pt_pv is really a pd_pv, and the target page
3981  *	 table page is simply aliased by the pmap and not owned by it.
3982  *
3983  * It is assumed that the start and end are properly rounded to the page size.
3984  *
3985  * It is assumed that PD pages and above are managed and thus in the RB tree,
3986  * allowing us to use RB_SCAN from the PD pages down for ranged scans.
3987  */
3988 struct pmap_scan_info {
3989 	struct pmap *pmap;
3990 	vm_offset_t sva;
3991 	vm_offset_t eva;
3992 	vm_pindex_t sva_pd_pindex;
3993 	vm_pindex_t eva_pd_pindex;
3994 	void (*func)(pmap_t, struct pmap_scan_info *,
3995 		     vm_pindex_t *, pv_entry_t, vm_offset_t,
3996 		     pt_entry_t *, void *);
3997 	void *arg;
3998 	pmap_inval_bulk_t bulk_core;
3999 	pmap_inval_bulk_t *bulk;
4000 	int count;
4001 	int stop;
4002 };
4003 
4004 static int pmap_scan_cmp(pv_entry_t pv, void *data);
4005 static int pmap_scan_callback(pv_entry_t pv, void *data);
4006 
4007 static void
4008 pmap_scan(struct pmap_scan_info *info, int smp_inval)
4009 {
4010 	struct pmap *pmap = info->pmap;
4011 	pv_entry_t pt_pv;	/* A page table PV */
4012 	pv_entry_t pte_pv;	/* A page table entry PV */
4013 	vm_pindex_t *pte_placemark;
4014 	vm_pindex_t *pt_placemark;
4015 	pt_entry_t *ptep;
4016 	pt_entry_t oldpte;
4017 	struct pv_entry dummy_pv;
4018 
4019 	info->stop = 0;
4020 	if (pmap == NULL)
4021 		return;
4022 	if (info->sva == info->eva)
4023 		return;
4024 	if (smp_inval) {
4025 		info->bulk = &info->bulk_core;
4026 		pmap_inval_bulk_init(&info->bulk_core, pmap);
4027 	} else {
4028 		info->bulk = NULL;
4029 	}
4030 
4031 	/*
4032 	 * Hold the token for stability; if the pmap is empty we have nothing
4033 	 * to do.
4034 	 */
4035 #if 0
4036 	if (pmap->pm_stats.resident_count == 0) {
4037 		return;
4038 	}
4039 #endif
4040 
4041 	info->count = 0;
4042 
4043 	/*
4044 	 * Special handling for scanning one page, which is a very common
4045 	 * operation (it is?).
4046 	 *
4047 	 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4
4048 	 */
4049 	if (info->sva + PAGE_SIZE == info->eva) {
4050 		if (info->sva >= VM_MAX_USER_ADDRESS) {
4051 			/*
4052 			 * Kernel mappings do not track wire counts on
4053 			 * page table pages and only maintain pd_pv and
4054 			 * pte_pv levels so pmap_scan() works.
4055 			 */
4056 			pt_pv = NULL;
4057 			pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva),
4058 					&pte_placemark);
4059 			KKASSERT(pte_pv == NULL);
4060 			ptep = vtopte(info->sva);
4061 		} else {
4062 			/*
4063 			 * We hold pte_placemark across the operation for
4064 			 * unmanaged pages.
4065 			 *
4066 			 * WARNING!  We must hold pt_placemark across the
4067 			 *	     *ptep test to prevent misinterpreting
4068 			 *	     a non-zero *ptep as a shared page
4069 			 *	     table page.  Hold it across the function
4070 			 *	     callback as well for SMP safety.
4071 			 */
4072 			pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva),
4073 					&pte_placemark);
4074 			KKASSERT(pte_pv == NULL);
4075 			pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva),
4076 				       &pt_placemark);
4077 			if (pt_pv == NULL) {
4078 #if 0
4079 				KKASSERT(0);
4080 				pd_pv = pv_get(pmap,
4081 					       pmap_pd_pindex(info->sva),
4082 					       NULL);
4083 				if (pd_pv) {
4084 					ptep = pv_pte_lookup(pd_pv,
4085 						    pmap_pt_index(info->sva));
4086 					if (*ptep) {
4087 						info->func(pmap, info,
4088 						     pt_placemark, pd_pv,
4089 						     info->sva, ptep,
4090 						     info->arg);
4091 					} else {
4092 						pv_placemarker_wakeup(pmap,
4093 								  pt_placemark);
4094 					}
4095 					pv_put(pd_pv);
4096 				} else {
4097 					pv_placemarker_wakeup(pmap,
4098 							      pt_placemark);
4099 				}
4100 #else
4101 				pv_placemarker_wakeup(pmap, pt_placemark);
4102 #endif
4103 				pv_placemarker_wakeup(pmap, pte_placemark);
4104 				goto fast_skip;
4105 			}
4106 			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva));
4107 		}
4108 
4109 		/*
4110 		 * NOTE: *ptep can't be ripped out from under us if we hold
4111 		 *	 pte_pv (or pte_placemark) locked, but bits can
4112 		 *	 change.
4113 		 */
4114 		oldpte = *ptep;
4115 		cpu_ccfence();
4116 		if (oldpte == 0) {
4117 			KKASSERT(pte_pv == NULL);
4118 			pv_placemarker_wakeup(pmap, pte_placemark);
4119 		} else {
4120 			KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]) ==
4121 				pmap->pmap_bits[PG_V_IDX],
4122 			    ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL",
4123 			    *ptep, oldpte, info->sva));
4124 			info->func(pmap, info, pte_placemark, pt_pv,
4125 				   info->sva, ptep, info->arg);
4126 		}
4127 		if (pt_pv)
4128 			pv_put(pt_pv);
4129 fast_skip:
4130 		pmap_inval_bulk_flush(info->bulk);
4131 		return;
4132 	}
4133 
4134 	/*
4135 	 * Nominal scan case, RB_SCAN() for PD pages and iterate from
4136 	 * there.
4137 	 *
4138 	 * WARNING! eva can overflow our standard ((N + mask) >> bits)
4139 	 *	    bounds, resulting in a pd_pindex of 0.  To solve the
4140 	 *	    problem we use an inclusive range.
4141 	 */
4142 	info->sva_pd_pindex = pmap_pd_pindex(info->sva);
4143 	info->eva_pd_pindex = pmap_pd_pindex(info->eva - PAGE_SIZE);
4144 
4145 	if (info->sva >= VM_MAX_USER_ADDRESS) {
4146 		/*
4147 		 * The kernel does not currently maintain any pv_entry's for
4148 		 * higher-level page tables.
4149 		 */
4150 		bzero(&dummy_pv, sizeof(dummy_pv));
4151 		dummy_pv.pv_pindex = info->sva_pd_pindex;
4152 		spin_lock(&pmap->pm_spin);
4153 		while (dummy_pv.pv_pindex <= info->eva_pd_pindex) {
4154 			pmap_scan_callback(&dummy_pv, info);
4155 			++dummy_pv.pv_pindex;
4156 			if (dummy_pv.pv_pindex < info->sva_pd_pindex) /*wrap*/
4157 				break;
4158 		}
4159 		spin_unlock(&pmap->pm_spin);
4160 	} else {
4161 		/*
4162 		 * User page tables maintain local PML4, PDP, PD, and PT
4163 		 * pv_entry's.  pv_entry's are not used for PTEs.
4164 		 */
4165 		spin_lock(&pmap->pm_spin);
4166 		pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot, pmap_scan_cmp,
4167 					 pmap_scan_callback, info);
4168 		spin_unlock(&pmap->pm_spin);
4169 	}
4170 	pmap_inval_bulk_flush(info->bulk);
4171 }
4172 
4173 /*
4174  * WARNING! pmap->pm_spin held
4175  *
4176  * WARNING! eva can overflow our standard ((N + mask) >> bits)
4177  *	    bounds, resulting in a pd_pindex of 0.  To solve the
4178  *	    problem we use an inclusive range.
4179  */
4180 static int
4181 pmap_scan_cmp(pv_entry_t pv, void *data)
4182 {
4183 	struct pmap_scan_info *info = data;
4184 	if (pv->pv_pindex < info->sva_pd_pindex)
4185 		return(-1);
4186 	if (pv->pv_pindex > info->eva_pd_pindex)
4187 		return(1);
4188 	return(0);
4189 }
4190 
4191 /*
4192  * pmap_scan() by PDs
4193  *
4194  * WARNING! pmap->pm_spin held
4195  */
4196 static int
4197 pmap_scan_callback(pv_entry_t pv, void *data)
4198 {
4199 	struct pmap_scan_info *info = data;
4200 	struct pmap *pmap = info->pmap;
4201 	pv_entry_t pd_pv;	/* A page directory PV */
4202 	pv_entry_t pt_pv;	/* A page table PV */
4203 	vm_pindex_t *pt_placemark;
4204 	pt_entry_t *ptep;
4205 	pt_entry_t oldpte;
4206 	vm_offset_t sva;
4207 	vm_offset_t eva;
4208 	vm_offset_t va_next;
4209 	vm_pindex_t pd_pindex;
4210 	int error;
4211 
4212 	/*
4213 	 * Stop if requested
4214 	 */
4215 	if (info->stop)
4216 		return -1;
4217 
4218 	/*
4219 	 * Pull the PD pindex from the pv before releasing the spinlock.
4220 	 *
4221 	 * WARNING: pv is faked for kernel pmap scans.
4222 	 */
4223 	pd_pindex = pv->pv_pindex;
4224 	spin_unlock(&pmap->pm_spin);
4225 	pv = NULL;	/* invalid after spinlock unlocked */
4226 
4227 	/*
4228 	 * Calculate the page range within the PD.  SIMPLE pmaps are
4229 	 * direct-mapped for the entire 2^64 address space.  Normal pmaps
4230 	 * reflect the user and kernel address space which requires
4231 	 * cannonicalization w/regards to converting pd_pindex's back
4232 	 * canonicalization with regard to converting pd_pindex's back
4233 	 */
4234 	sva = (pd_pindex - pmap_pd_pindex(0)) << PDPSHIFT;
4235 	if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 &&
4236 	    (sva & PML4_SIGNMASK)) {
4237 		sva |= PML4_SIGNMASK;
4238 	}
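	/*
	 * The sign extension above re-canonicalizes kernel-half addresses:
	 * x86-64 virtual addresses must have bits 63..47 equal to bit 47,
	 * so if any PML4_SIGNMASK bit came out set we set them all.
	 */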
4239 	eva = sva + NBPDP;	/* can overflow */
4240 	if (sva < info->sva)
4241 		sva = info->sva;
4242 	if (eva < info->sva || eva > info->eva)
4243 		eva = info->eva;
4244 
4245 	/*
4246 	 * NOTE: kernel mappings do not track page table pages, only
4247 	 * 	 terminal pages.
4248 	 *
4249 	 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4.
4250 	 *	 However, for the scan to be efficient we try to
4251 	 *	 cache items top-down.
4252 	 */
4253 	pd_pv = NULL;
4254 	pt_pv = NULL;
4255 
4256 	for (; sva < eva; sva = va_next) {
4257 		if (info->stop)
4258 			break;
4259 		if (sva >= VM_MAX_USER_ADDRESS) {
4260 			if (pt_pv) {
4261 				pv_put(pt_pv);
4262 				pt_pv = NULL;
4263 			}
4264 			goto kernel_skip;
4265 		}
4266 
4267 		/*
4268 		 * PD cache, scan shortcut if it doesn't exist.
4269 		 */
4270 		if (pd_pv == NULL) {
4271 			pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL);
4272 		} else if (pd_pv->pv_pmap != pmap ||
4273 			   pd_pv->pv_pindex != pmap_pd_pindex(sva)) {
4274 			pv_put(pd_pv);
4275 			pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL);
4276 		}
4277 		if (pd_pv == NULL) {
4278 			va_next = (sva + NBPDP) & ~PDPMASK;
4279 			if (va_next < sva)
4280 				va_next = eva;
4281 			continue;
4282 		}
4283 
4284 		/*
4285 		 * PT cache
4286 		 *
4287 		 * NOTE: The cached pt_pv can be removed from the pmap when
4288 		 *	 pmap_dynamic_delete is enabled.
4289 		 */
4290 		if (pt_pv && (pt_pv->pv_pmap != pmap ||
4291 			      pt_pv->pv_pindex != pmap_pt_pindex(sva))) {
4292 			pv_put(pt_pv);
4293 			pt_pv = NULL;
4294 		}
4295 		if (pt_pv == NULL) {
4296 			pt_pv = pv_get_try(pmap, pmap_pt_pindex(sva),
4297 					   &pt_placemark, &error);
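			/*
			 * On error, back off in lock order: drop pd_pv
			 * first, then either block on the pt_pv lock or
			 * wait out the foreign placemarker, and retry
			 * this sva.
			 */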
4298 			if (error) {
4299 				pv_put(pd_pv);	/* lock order */
4300 				pd_pv = NULL;
4301 				if (pt_pv) {
4302 					pv_lock(pt_pv);
4303 					pv_put(pt_pv);
4304 					pt_pv = NULL;
4305 				} else {
4306 					pv_placemarker_wait(pmap, pt_placemark);
4307 				}
4308 				va_next = sva;
4309 				continue;
4310 			}
4311 			/* may have to re-check later if pt_pv is NULL here */
4312 		}
4313 
4314 		/*
4315 		 * If pt_pv is NULL we either have a shared page table
4316 		 * page (NOT IMPLEMENTED XXX) and must issue a callback
4317 		 * specific to that case, or there is no page table page.
4318 		 *
4319 		 * Either way we can skip the page table page.
4320 		 *
4321 		 * WARNING! pt_pv can also be NULL due to a pv creation
4322 		 *	    race where we find it to be NULL and then
4323 		 *	    later see a pte_pv.  But it's possible the pt_pv
4324 		 *	    got created in between the two operations, so
4325 		 *	    we must check.
4326 		 *
4327 		 *	    XXX This should no longer be the case because
4328 		 *	    we have pt_placemark.
4329 		 */
4330 		if (pt_pv == NULL) {
4331 #if 0
4332 			/* XXX REMOVED */
4333 			/*
4334 			 * Possible unmanaged (shared from another pmap)
4335 			 * page table page.
4336 			 *
4337 			 * WARNING!  We must hold pt_placemark across the
4338 			 *	     *ptep test to prevent misinterpreting
4339 			 *	     a non-zero *ptep as a shared page
4340 			 *	     table page.  Hold it across the function
4341 			 *	     callback as well for SMP safety.
4342 			 */
4343 			KKASSERT(0);
4344 			ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva));
4345 			if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
4346 				info->func(pmap, info, pt_placemark, pd_pv,
4347 					   sva, ptep, info->arg);
4348 			} else {
4349 				pv_placemarker_wakeup(pmap, pt_placemark);
4350 			}
4351 #else
4352 			pv_placemarker_wakeup(pmap, pt_placemark);
4353 #endif
4354 
4355 			/*
4356 			 * Done, move to next page table page.
4357 			 */
4358 			va_next = (sva + NBPDR) & ~PDRMASK;
4359 			if (va_next < sva)
4360 				va_next = eva;
4361 			continue;
4362 		}
4363 
4364 		/*
4365 		 * From this point in the loop testing pt_pv for non-NULL
4366 		 * means we are in UVM, else if it is NULL we are in KVM.
4367 		 *
4368 		 * Limit our scan to either the end of the va represented
4369 		 * by the current page table page, or to the end of the
4370 		 * range being removed.
4371 		 */
4372 kernel_skip:
4373 		va_next = (sva + NBPDR) & ~PDRMASK;
4374 		if (va_next < sva)
4375 			va_next = eva;
4376 		if (va_next > eva)
4377 			va_next = eva;
4378 
4379 		/*
4380 		 * Scan the page table for pages.  Some pages may not be
4381 		 * managed (might not have a pv_entry).
4382 		 *
4383 		 * There is no page table management for kernel pages so
4384 		 * pt_pv will be NULL in that case, but otherwise pt_pv
4385 		 * is non-NULL, locked, and referenced.
4386 		 */
4387 
4388 		/*
4389 		 * At this point a non-NULL pt_pv means a UVA, and a NULL
4390 		 * pt_pv means a KVA.
4391 		 */
4392 		if (pt_pv)
4393 			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
4394 		else
4395 			ptep = vtopte(sva);
4396 
4397 		while (sva < va_next) {
4398 			vm_pindex_t *pte_placemark;
4399 			pv_entry_t pte_pv;
4400 
4401 			/*
4402 			 * Yield every 64 pages, stop if requested.
4403 			 */
4404 			if ((++info->count & 63) == 0)
4405 				lwkt_user_yield();
4406 			if (info->stop)
4407 				break;
4408 
4409 			/*
4410 			 * We can shortcut our scan if *ptep == 0.  This is
4411 			 * an unlocked check.
4412 			 */
4413 			if (*ptep == 0) {
4414 				sva += PAGE_SIZE;
4415 				++ptep;
4416 				continue;
4417 			}
4418 			cpu_ccfence();
4419 
4420 			/*
4421 			 * Acquire the pte_placemark.  pte_pv's won't exist
4422 			 * for leaf pages.
4423 			 *
4424 			 * A multitude of races are possible here so if we
4425 			 * cannot lock definite state we clean out our cache
4426 			 * and break the inner while() loop to force a loop
4427 			 * up to the top of the for().
4428 			 *
4429 			 * XXX unlock/relock pd_pv, pt_pv, and re-test their
4430 			 *     validity instead of looping up?
4431 			 */
4432 			pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva),
4433 					    &pte_placemark, &error);
4434 			KKASSERT(pte_pv == NULL);
4435 			if (error) {
4436 				if (pd_pv) {
4437 					pv_put(pd_pv);	/* lock order */
4438 					pd_pv = NULL;
4439 				}
4440 				if (pt_pv) {
4441 					pv_put(pt_pv);	/* lock order */
4442 					pt_pv = NULL;
4443 				}
4444 				pv_placemarker_wait(pmap, pte_placemark);
4445 				va_next = sva;		/* retry */
4446 				break;
4447 			}
4448 
4449 			/*
4450 			 * Reload *ptep after successfully locking the
4451 			 * pindex.
4452 			 */
4453 			cpu_ccfence();
4454 			oldpte = *ptep;
4455 			if (oldpte == 0) {
4456 				pv_placemarker_wakeup(pmap, pte_placemark);
4457 				sva += PAGE_SIZE;
4458 				++ptep;
4459 				continue;
4460 			}
4461 
4462 			/*
4463 			 * We can't hold pd_pv across the callback (because
4464 			 * we don't pass it to the callback and the callback
4465 			 * might deadlock)
4466 			 */
4467 			if (pd_pv) {
4468 				vm_page_wire_quick(pd_pv->pv_m);
4469 				pv_unlock(pd_pv);
4470 			}
4471 
4472 			/*
4473 			 * Ready for the callback.  The locked placemarker
4474 			 * is consumed by the callback.
4475 			 */
4476 			if (oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) {
4477 				/*
4478 				 * Managed pte
4479 				 */
4480 				KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]),
4481 				    ("badC *ptep %016lx/%016lx sva %016lx",
4482 				    *ptep, oldpte, sva));
4483 				/*
4484 				 * We must unlock pd_pv across the callback
4485 				 * to avoid deadlocks on any recursive
4486 				 * disposal.  Re-check that it still exists
4487 				 * after re-locking.
4488 				 *
4489 				 * Call target disposes of pte_placemark
4490 				 * and may destroy but will not dispose
4491 				 * of pt_pv.
4492 				 */
4493 				info->func(pmap, info, pte_placemark, pt_pv,
4494 					   sva, ptep, info->arg);
4495 			} else {
4496 				/*
4497 				 * Unmanaged pte
4498 				 *
4499 				 * We must unlock pd_pv across the callback
4500 				 * to avoid deadlocks on any recursive
4501 				 * disposal.  Re-check that it still exists
4502 				 * after re-locking.
4503 				 *
4504 				 * Call target disposes of pte_placemark
4505 				 * and may destroy but will not dispose
4506 				 * of pt_pv.
4507 				 */
4508 				KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]),
4509 				    ("badD *ptep %016lx/%016lx sva %016lx ",
4510 				     *ptep, oldpte, sva));
4511 				info->func(pmap, info, pte_placemark, pt_pv,
4512 					   sva, ptep, info->arg);
4513 			}
4514 			if (pd_pv) {
4515 				pv_lock(pd_pv);
4516 				if (vm_page_unwire_quick(pd_pv->pv_m)) {
4517 					panic("pmap_scan_callback: "
4518 					      "bad wirecount on pd_pv");
4519 				}
4520 				if (pd_pv->pv_pmap == NULL) {
4521 					va_next = sva;		/* retry */
4522 					break;
4523 				}
4524 			}
4525 
4526 			/*
4527 			 * NOTE: The cached pt_pv can be removed from the
4528 			 *	 pmap when pmap_dynamic_delete is enabled,
4529 			 *	 which will cause ptep to become stale.
4530 			 *
4531 			 *	 This also means that no pages remain under
4532 			 *	 the PT, so we can just break out of the inner
4533 			 *	 loop and let the outer loop clean everything
4534 			 *	 up.
4535 			 */
4536 			if (pt_pv && pt_pv->pv_pmap != pmap)
4537 				break;
4538 			sva += PAGE_SIZE;
4539 			++ptep;
4540 		}
4541 	}
4542 	if (pd_pv) {
4543 		pv_put(pd_pv);
4544 		pd_pv = NULL;
4545 	}
4546 	if (pt_pv) {
4547 		pv_put(pt_pv);
4548 		pt_pv = NULL;
4549 	}
4550 	if ((++info->count & 7) == 0)
4551 		lwkt_user_yield();
4552 
4553 	/*
4554 	 * Relock before returning.
4555 	 */
4556 	spin_lock(&pmap->pm_spin);
4557 	return (0);
4558 }
4559 
4560 void
4561 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
4562 {
4563 	struct pmap_scan_info info;
4564 
4565 	info.pmap = pmap;
4566 	info.sva = sva;
4567 	info.eva = eva;
4568 	info.func = pmap_remove_callback;
4569 	info.arg = NULL;
4570 	pmap_scan(&info, 1);
4571 #if 0
4572 	cpu_invltlb();
4573 	if (eva - sva < 1024*1024) {
4574 		while (sva < eva) {
4575 			cpu_invlpg((void *)sva);
4576 			sva += PAGE_SIZE;
4577 		}
4578 	}
4579 #endif
4580 }
4581 
4582 static void
4583 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
4584 {
4585 	struct pmap_scan_info info;
4586 
4587 	info.pmap = pmap;
4588 	info.sva = sva;
4589 	info.eva = eva;
4590 	info.func = pmap_remove_callback;
4591 	info.arg = NULL;
4592 	pmap_scan(&info, 0);
4593 }
4594 
4595 static void
4596 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
4597 		     vm_pindex_t *pte_placemark, pv_entry_t pt_pv,
4598 		     vm_offset_t va, pt_entry_t *ptep, void *arg __unused)
4599 {
4600 	pt_entry_t pte;
4601 	vm_page_t oldm;
4602 
4603 	/*
4604 	 * Managed or unmanaged pte (pte_placemark is non-NULL)
4605 	 *
4606 	 * pt_pv's wire_count is still bumped by unmanaged pages
4607 	 * so we must decrement it manually.
4608 	 *
4609 	 * We have to unwire the target page table page.
4610 	 */
4611 	pte = *ptep;
4612 	if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) {
4613 		oldm = PHYS_TO_VM_PAGE(pte & PG_FRAME);
4614 		atomic_add_long(&oldm->md.interlock_count, 1);
4615 	} else {
4616 		oldm = NULL;
4617 	}
4618 
4619 	pte = pmap_inval_bulk(info->bulk, va, ptep, 0);
4620 	if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) {
4621 		vm_page_t p;
4622 
4623 		p = PHYS_TO_VM_PAGE(pte & PG_FRAME);
4624 		KKASSERT(pte & pmap->pmap_bits[PG_V_IDX]);
4625 		if (pte & pmap->pmap_bits[PG_M_IDX])
4626 			vm_page_dirty(p);
4627 		if (pte & pmap->pmap_bits[PG_A_IDX])
4628 			vm_page_flag_set(p, PG_REFERENCED);
4629 
4630 		/*
4631 		 * (p) is not hard-busied.
4632 		 *
4633 		 * We can safely clear PG_MAPPED and PG_WRITEABLE only
4634 		 * if PG_MAPPEDMULTI is not set, atomically.
4635 		 */
4636 		pmap_removed_pte(p, pte);
4637 	}
4638 	if (pte & pmap->pmap_bits[PG_V_IDX]) {
4639 		atomic_add_long(&pmap->pm_stats.resident_count, -1);
4640 		if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m))
4641 			panic("pmap_remove: insufficient wirecount");
4642 	}
4643 	if (pte & pmap->pmap_bits[PG_W_IDX])
4644 		atomic_add_long(&pmap->pm_stats.wired_count, -1);
4645 	if (pte & pmap->pmap_bits[PG_G_IDX])
4646 		cpu_invlpg((void *)va);
4647 	pv_placemarker_wakeup(pmap, pte_placemark);
4648 	if (oldm) {
4649 		if ((atomic_fetchadd_long(&oldm->md.interlock_count, -1) &
4650 		     0x7FFFFFFFFFFFFFFFLU) == 0x4000000000000001LU) {
4651 			atomic_clear_long(&oldm->md.interlock_count,
4652 					  0x4000000000000000LU);
4653 			wakeup(&oldm->md.interlock_count);
4654 		}
4655 	}
4656 }
4657 
4658 /*
4659  * Removes this physical page from all physical maps in which it resides.
4660  * Reflects back modify bits to the pager.
4661  *
4662  * This routine may not be called from an interrupt.
4663  *
4664  * The page must be busied by its caller, preventing new ptes from being
4665  * installed.  This allows us to assert that pmap_count is zero and safely
4666  * clear the MAPPED and WRITEABLE bits upon completion.
4667  */
4668 static
4669 void
4670 pmap_remove_all(vm_page_t m)
4671 {
4672 	long icount;
4673 	int retry;
4674 
4675 	if (__predict_false(!pmap_initialized))
4676 		return;
4677 
4678 	/*
4679 	 * pmap_count doesn't cover fictitious pages, but PG_MAPPED does
4680 	 * (albeit without certain race protections).
4681 	 */
4682 #if 0
4683 	if (m->md.pmap_count == 0)
4684 		return;
4685 #endif
4686 	if ((m->flags & PG_MAPPED) == 0)
4687 		return;
4688 
4689 	retry = ticks + hz * 60;
4690 again:
4691 	PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
4692 		if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0))
4693 			PMAP_PAGE_BACKING_RETRY;
4694 		if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) {
4695 			if (ipte & ipmap->pmap_bits[PG_M_IDX])
4696 				vm_page_dirty(m);
4697 			if (ipte & ipmap->pmap_bits[PG_A_IDX])
4698 				vm_page_flag_set(m, PG_REFERENCED);
4699 
4700 			/*
4701 			 * NOTE: m is not hard-busied so it is not safe to
4702 			 *	 clear PG_MAPPED and PG_WRITEABLE on the 1->0
4703 			 *	 transition against them being set in
4704 			 *	 pmap_enter().
4705 			 */
4706 			pmap_removed_pte(m, ipte);
4707 		}
4708 
4709 		/*
4710 		 * Clean up various tracking counters.  pt_pv can't go away
4711 		 * due to our wired ref.
4712 		 */
4713 		if (ipmap != kernel_pmap) {
4714 			pv_entry_t pt_pv;
4715 
4716 			spin_lock_shared(&ipmap->pm_spin);
4717 			pt_pv = pv_entry_lookup(ipmap, pmap_pt_pindex(iva));
4718 			spin_unlock_shared(&ipmap->pm_spin);
4719 
4720 			if (pt_pv) {
4721 				if (vm_page_unwire_quick(pt_pv->pv_m)) {
4722 					panic("pmap_remove_all: bad "
4723 					      "wire_count on pt_pv");
4724 				}
4725 				atomic_add_long(
4726 					&ipmap->pm_stats.resident_count, -1);
4727 			}
4728 		}
4729 		if (ipte & ipmap->pmap_bits[PG_W_IDX])
4730 			atomic_add_long(&ipmap->pm_stats.wired_count, -1);
4731 		if (ipte & ipmap->pmap_bits[PG_G_IDX])
4732 			cpu_invlpg((void *)iva);
4733 	} PMAP_PAGE_BACKING_DONE;
4734 
4735 	/*
4736 	 * If our scan lost a pte swap race, m->md.interlock_count might have
4737 	 * been bumped by the pmap_enter() code (which refers to this page as
4738 	 * its oldm).  If so sleep a little and try again.
4739 	 */
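	/*
	 * In this scheme the low bits of interlock_count track pte swap
	 * races still in flight from pmap_enter()/pmap_remove_callback(),
	 * 0x4000000000000000LU is a wakeup-request flag cleared by the
	 * releasing side, and the 0x8000000000000000LU added below is
	 * ignored by that release test.
	 */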
4740 	icount = atomic_fetchadd_long(&m->md.interlock_count,
4741 				      0x8000000000000000LU) +
4742 		 0x8000000000000000LU;
4743 	cpu_ccfence();
4744 	while (icount & 0x3FFFFFFFFFFFFFFFLU) {
4745 		tsleep_interlock(&m->md.interlock_count, 0);
4746 		if (atomic_fcmpset_long(&m->md.interlock_count, &icount,
4747 					icount | 0x4000000000000000LU)) {
4748 			tsleep(&m->md.interlock_count, PINTERLOCKED,
4749 			       "pgunm", 1);
4750 			icount = m->md.interlock_count;
4751 			if (retry - ticks > 0)
4752 				goto again;
4753 			panic("pmap_remove_all: cannot return interlock_count "
4754 			      "to 0 (%p, %ld)",
4755 			      m, m->md.interlock_count);
4756 		}
4757 	}
4758 	vm_page_flag_clear(m, PG_MAPPED | PG_MAPPEDMULTI | PG_WRITEABLE);
4759 }
4760 
4761 /*
4762  * Removes the page from a particular pmap.
4763  *
4764  * The page must be busied by the caller.
4765  */
4766 void
4767 pmap_remove_specific(pmap_t pmap_match, vm_page_t m)
4768 {
4769 	if (__predict_false(!pmap_initialized))
4770 		return;
4771 
4772 	/*
4773 	 * PG_MAPPED test works for both non-fictitious and fictitious pages.
4774 	 */
4775 	if ((m->flags & PG_MAPPED) == 0)
4776 		return;
4777 
4778 	PMAP_PAGE_BACKING_SCAN(m, pmap_match, ipmap, iptep, ipte, iva) {
4779 		if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0))
4780 			PMAP_PAGE_BACKING_RETRY;
4781 		if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) {
4782 			if (ipte & ipmap->pmap_bits[PG_M_IDX])
4783 				vm_page_dirty(m);
4784 			if (ipte & ipmap->pmap_bits[PG_A_IDX])
4785 				vm_page_flag_set(m, PG_REFERENCED);
4786 
4787 			/*
4788 			 * NOTE: m is not hard-busied so it is not safe to
4789 			 *	 clear PG_MAPPED and PG_WRITEABLE on the 1->0
4790 			 *	 transition against them being set in
4791 			 *	 pmap_enter().
4792 			 */
4793 			pmap_removed_pte(m, ipte);
4794 		}
4795 
4796 		/*
4797 		 * Clean up various tracking counters.  pt_pv can't go away
4798 		 * due to our wired ref.
4799 		 */
4800 		if (ipmap != kernel_pmap) {
4801 			pv_entry_t pt_pv;
4802 
4803 			spin_lock_shared(&ipmap->pm_spin);
4804 			pt_pv = pv_entry_lookup(ipmap, pmap_pt_pindex(iva));
4805 			spin_unlock_shared(&ipmap->pm_spin);
4806 
4807 			if (pt_pv) {
4808 				atomic_add_long(
4809 					&ipmap->pm_stats.resident_count, -1);
4810 				if (vm_page_unwire_quick(pt_pv->pv_m)) {
4811 					panic("pmap_remove_specific: bad "
4812 					      "wire_count on pt_pv");
4813 				}
4814 			}
4815 		}
4816 		if (ipte & ipmap->pmap_bits[PG_W_IDX])
4817 			atomic_add_long(&ipmap->pm_stats.wired_count, -1);
4818 		if (ipte & ipmap->pmap_bits[PG_G_IDX])
4819 			cpu_invlpg((void *)iva);
4820 	} PMAP_PAGE_BACKING_DONE;
4821 }
4822 
4823 /*
4824  * Set the physical protection on the specified range of this map
4825  * as requested.  This function is typically only used for debug watchpoints
4826  * and COW pages.
4827  *
4828  * This function may not be called from an interrupt if the map is
4829  * not the kernel_pmap.
4830  *
4831  * NOTE!  For shared page table pages we just unmap the page.
4832  */
4833 void
4834 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4835 {
4836 	struct pmap_scan_info info;
4837 	/* JG review for NX */
4838 
4839 	if (pmap == NULL)
4840 		return;
4841 	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == VM_PROT_NONE) {
4842 		pmap_remove(pmap, sva, eva);
4843 		return;
4844 	}
4845 	if (prot & VM_PROT_WRITE)
4846 		return;
4847 	info.pmap = pmap;
4848 	info.sva = sva;
4849 	info.eva = eva;
4850 	info.func = pmap_protect_callback;
4851 	info.arg = &prot;
4852 	pmap_scan(&info, 1);
4853 }
4854 
4855 static
4856 void
4857 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info,
4858 		      vm_pindex_t *pte_placemark,
4859 		      pv_entry_t pt_pv, vm_offset_t va,
4860 		      pt_entry_t *ptep, void *arg __unused)
4861 {
4862 	pt_entry_t pbits;
4863 	pt_entry_t cbits;
4864 	vm_page_t m;
4865 
4866 again:
4867 	pbits = *ptep;
4868 	cpu_ccfence();
4869 	cbits = pbits;
4870 	if (pbits & pmap->pmap_bits[PG_MANAGED_IDX]) {
4871 		cbits &= ~pmap->pmap_bits[PG_A_IDX];
4872 		cbits &= ~pmap->pmap_bits[PG_M_IDX];
4873 	}
4874 	/* else unmanaged page, adjust bits, no wire changes */
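	/*
	 * The cmpset below restarts from the top if the pte changed between
	 * the read and the update, so accumulated A/M bits are never lost;
	 * they are propagated to the vm_page from the observed pbits value.
	 */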
4875 
4876 	if (ptep) {
4877 		cbits &= ~pmap->pmap_bits[PG_RW_IDX];
4878 #ifdef PMAP_DEBUG2
4879 		if (pmap_enter_debug > 0) {
4880 			--pmap_enter_debug;
4881 			kprintf("pmap_protect va=%lx ptep=%p "
4882 				"pt_pv=%p cbits=%08lx\n",
4883 				va, ptep, pt_pv, cbits
4884 			);
4885 		}
4886 #endif
4887 		if (pbits != cbits) {
4888 			if (!pmap_inval_smp_cmpset(pmap, va,
4889 						   ptep, pbits, cbits)) {
4890 				goto again;
4891 			}
4892 		}
4893 		if (pbits & pmap->pmap_bits[PG_MANAGED_IDX]) {
4894 			m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4895 			if (pbits & pmap->pmap_bits[PG_A_IDX])
4896 				vm_page_flag_set(m, PG_REFERENCED);
4897 			if (pbits & pmap->pmap_bits[PG_M_IDX])
4898 				vm_page_dirty(m);
4899 		}
4900 	}
4901 	pv_placemarker_wakeup(pmap, pte_placemark);
4902 }
4903 
4904 /*
4905  * Insert the vm_page (m) at the virtual address (va), replacing any prior
4906  * mapping at that address.  Set protection and wiring as requested.
4907  *
4908  * If entry is non-NULL we check to see if the SEG_SIZE optimization is
4909  * possible.  If it is we enter the page into the appropriate shared pmap
4910  * hanging off the related VM object instead of the passed pmap, then we
4911  * share the page table page from the VM object's pmap into the current pmap.
4912  *
4913  * NOTE: This routine MUST insert the page into the pmap now, it cannot
4914  *	 lazy-evaluate.
4915  */
4916 void
4917 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4918 	   boolean_t wired, vm_map_entry_t entry)
4919 {
4920 	pv_entry_t pt_pv;	/* page table */
4921 	pv_entry_t pte_pv;	/* page table entry */
4922 	vm_pindex_t *pte_placemark;
4923 	pt_entry_t *ptep;
4924 	pt_entry_t origpte;
4925 	vm_paddr_t opa;
4926 	vm_page_t oldm;
4927 	pt_entry_t newpte;
4928 	vm_paddr_t pa;
4929 	int flags;
4930 	int nflags;
4931 
4932 	if (pmap == NULL)
4933 		return;
4934 	va = trunc_page(va);
4935 #ifdef PMAP_DIAGNOSTIC
4936 	if (va >= KvaEnd)
4937 		panic("pmap_enter: toobig");
4938 	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
4939 		panic("pmap_enter: invalid to pmap_enter page table "
4940 		      "pages (va: 0x%lx)", va);
4941 #endif
4942 	if (va < UPT_MAX_ADDRESS && pmap == kernel_pmap) {
4943 		kprintf("Warning: pmap_enter called on UVA with "
4944 			"kernel_pmap\n");
4945 #ifdef DDB
4946 		db_print_backtrace();
4947 #endif
4948 	}
4949 	if (va >= UPT_MAX_ADDRESS && pmap != kernel_pmap) {
4950 		kprintf("Warning: pmap_enter called on KVA without "
4951 			"kernel_pmap\n");
4952 #ifdef DDB
4953 		db_print_backtrace();
4954 #endif
4955 	}
4956 
4957 	/*
4958 	 * Get the locked page table page (pt_pv) for our new page table
4959 	 * entry, allocating it if necessary.
4960 	 *
4961 	 * There is no pte_pv for a terminal pte so the terminal pte will
4962 	 * be locked via pte_placemark.
4963 	 *
4964 	 * Only MMU actions by the CPU itself can modify the ptep out from
4965 	 * under us.
4966 	 *
4967 	 * If the pmap is still being initialized we assume existing
4968 	 * page tables.
4969 	 *
4970 	 * NOTE: Kernel mappings do not track page table pages
4971 	 *	 (i.e. there is no pt_pv structure).
4972 	 *
4973 	 * NOTE: origpte here is 'tentative', used only to check for
4974 	 *	 the degenerate case where the entry already exists and
4975 	 *	 matches.
4976 	 */
4977 	if (__predict_false(pmap_initialized == FALSE)) {
4978 		pte_pv = NULL;
4979 		pt_pv = NULL;
4980 		pte_placemark = NULL;
4981 		ptep = vtopte(va);
4982 		origpte = *ptep;
4983 	} else {
4984 		pte_pv = pv_get(pmap, pmap_pte_pindex(va), &pte_placemark);
4985 		KKASSERT(pte_pv == NULL);
4986 		if (va >= VM_MAX_USER_ADDRESS) {
4987 			pt_pv = NULL;
4988 			ptep = vtopte(va);
4989 		} else {
4990 			pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL);
4991 			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
4992 		}
4993 		origpte = *ptep;
4994 		cpu_ccfence();
4995 	}
4996 
4997 	pa = VM_PAGE_TO_PHYS(m);
4998 
4999 	/*
5000 	 * Calculate the new PTE.
5001 	 */
5002 	newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) |
5003 		 pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]);
5004 	if (wired)
5005 		newpte |= pmap->pmap_bits[PG_W_IDX];
5006 	if (va < VM_MAX_USER_ADDRESS)
5007 		newpte |= pmap->pmap_bits[PG_U_IDX];
5008 	if ((m->flags & PG_FICTITIOUS) == 0)
5009 		newpte |= pmap->pmap_bits[PG_MANAGED_IDX];
5010 //	if (pmap == kernel_pmap)
5011 //		newpte |= pgeflag;
5012 	newpte |= pmap->pmap_cache_bits_pte[m->pat_mode];
5013 
5014 	/*
5015 	 * It is possible for multiple faults to occur in threaded
5016 	 * environments, the existing pte might be correct.
5017 	 * environments; the existing pte might be correct.
5018 	if (((origpte ^ newpte) &
5019 	    ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] |
5020 			  pmap->pmap_bits[PG_A_IDX])) == 0) {
5021 		goto done;
5022 	}
5023 
5024 	/*
5025 	 * Adjust page flags.  The page is soft-busied or hard-busied, we
5026 	 * Adjust page flags.  The page is soft-busied or hard-busied; we
5027 	 * soft-busy.
5028 	 *
5029 	 * The pmap_count and writeable_count are only tracked for
5030 	 * non-fictitious pages.  As a bit of a safety, bump pmap_count
5031 	 * and set the PG_* bits before mapping the page.  If another part
5032 	 * of the system does not properly hard-busy the page (against our
5033 	 * soft-busy or hard-busy) in order to remove mappings it might not
5034 	 * see the pte that we are about to add and thus will not be able to
5035 	 * drop pmap_count to 0.
5036 	 *
5037 	 * The PG_MAPPED and PG_WRITEABLE flags are set for any type of page.
5038 	 *
5039 	 * NOTE! PG_MAPPED and PG_WRITEABLE can only be cleared when
5040 	 *	 the page is hard-busied AND pmap_count is 0.  This
5041 	 *	 interlocks our setting of the flags here.
5042 	 */
5043 	/*vm_page_spin_lock(m);*/
5044 
5045 	/*
5046 	 * In advanced mode we keep track of single mappings versus
5047 	 * multiple mappings in order to avoid unnecessary vm_page_protect()
5048 	 * calls (particularly on the kernel_map).
5049 	 *
5050 	 * If non-advanced mode we track the mapping count for similar effect.
5051 	 *
5052 	 * Avoid modifying the vm_page as much as possible, conditionalize
5053 	 * updates to reduce cache line ping-ponging.
5054 	 */
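	/*
	 * The fcmpset loop below is a lock-free update: it exits early if
	 * the required PG_* bits are already set, otherwise it retries
	 * until the compare-and-set succeeds.  PG_MAPPEDMULTI is added
	 * when the page was already PG_MAPPED, i.e. on at least the
	 * second mapping.
	 */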
5055 	flags = m->flags;
5056 	cpu_ccfence();
5057 	for (;;) {
5058 		nflags = PG_MAPPED;
5059 		if (newpte & pmap->pmap_bits[PG_RW_IDX])
5060 			nflags |= PG_WRITEABLE;
5061 		if (flags & PG_MAPPED)
5062 			nflags |= PG_MAPPEDMULTI;
5063 		if (flags == (flags | nflags))
5064 			break;
5065 		if (atomic_fcmpset_int(&m->flags, &flags, flags | nflags))
5066 			break;
5067 	}
5068 	/*vm_page_spin_unlock(m);*/
5069 
5070 	/*
5071 	 * A race can develop when replacing an existing mapping.  The new
5072 	 * page has been busied and the pte is placemark-locked, but the
5073 	 * old page could be ripped out from under us at any time by
5074 	 * a backing scan.
5075 	 *
5076 	 * If we do nothing, a concurrent backing scan may clear
5077 	 * PG_WRITEABLE and PG_MAPPED before we can act on oldm.
5078 	 */
5079 	opa = origpte & PG_FRAME;
5080 	if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) {
5081 		oldm = PHYS_TO_VM_PAGE(opa);
5082 		KKASSERT(opa == oldm->phys_addr);
5083 		KKASSERT(entry != NULL);
5084 		atomic_add_long(&oldm->md.interlock_count, 1);
5085 	} else {
5086 		oldm = NULL;
5087 	}
5088 
5089 	/*
5090 	 * Swap the new and old PTEs and perform any necessary SMP
5091 	 * synchronization.
5092 	 */
5093 	if ((prot & VM_PROT_NOSYNC) || (opa == 0 && pt_pv != NULL)) {
5094 		/*
5095 		 * Explicitly permitted to avoid pmap cpu mask synchronization
5096 		 * or the prior content of a non-kernel-related pmap was
5097 		 * invalid.
5098 		 */
5099 		origpte = atomic_swap_long(ptep, newpte);
5100 		if (opa)
5101 			cpu_invlpg((void *)va);
5102 	} else {
5103 		/*
5104 		 * Not permitted to avoid pmap cpu mask synchronization
5105 		 * or the prior content is being replaced or this is a kernel
5106 		 * related pmap.
5107 		 *
5108 		 * Due to other kernel optimizations, we cannot assume a
5109 		 * 0->non_zero transition of *ptep can be done with a swap.
5110 		 */
5111 		origpte = pmap_inval_smp(pmap, va, 1, ptep, newpte);
5112 	}
5113 	opa = origpte & PG_FRAME;
5114 
5115 #ifdef PMAP_DEBUG2
5116 	if (pmap_enter_debug > 0) {
5117 		--pmap_enter_debug;
5118 		kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p"
5119 			" pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n",
5120 			va, m,
5121 			origpte, newpte, ptep,
5122 			pte_pv, pt_pv, opa, prot);
5123 	}
5124 #endif
5125 
5126 	/*
5127 	 * Account for the changes in the pt_pv and pmap.
5128 	 *
5129 	 * Retain the same wiring count due to replacing an existing page,
5130 	 * or bump the wiring count for a new page.
5131 	 */
5132 	if (pt_pv && opa == 0) {
5133 		vm_page_wire_quick(pt_pv->pv_m);
5134 		atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1);
5135 	}
5136 	if (wired && (origpte & pmap->pmap_bits[PG_W_IDX]) == 0)
5137 		atomic_add_long(&pmap->pm_stats.wired_count, 1);
5138 
5139 	/*
5140 	 * Account for the removal of the old page.  pmap and pt_pv stats
5141 	 * have already been fully adjusted for both.
5142 	 *
5143 	 * WARNING! oldm is not soft or hard-busied.  The pte at worst can
5144 	 *	    only be removed out from under us since we hold the
5145 	 *	    placemarker.  So if it is still there, it must not have
5146 	 *	    changed.
5147 	 *
5148 	 * WARNING! A backing scan can clear PG_WRITEABLE and/or PG_MAPPED
5149 	 *	    and rip oldm away from us, possibly even freeing or
5150 	 *	    paging it, and not setting our dirtying below.
5151 	 *
5152 	 *	    To deal with this, oldm->md.interlock_count is bumped
5153 	 *	    to indicate that we might (only might) have won the pte
5154 	 *	    swap race, and then released below.
5155 	 */
5156 	if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) {
5157 		KKASSERT(oldm == PHYS_TO_VM_PAGE(opa));
5158 		if (origpte & pmap->pmap_bits[PG_M_IDX])
5159 			vm_page_dirty(oldm);
5160 		if (origpte & pmap->pmap_bits[PG_A_IDX])
5161 			vm_page_flag_set(oldm, PG_REFERENCED);
5162 
5163 		/*
5164 		 * NOTE: oldm is not hard-busied so it is not safe to
5165 		 *	 clear PG_MAPPED and PG_WRITEABLE on the 1->0
5166 		 *	 transition against them being set in
5167 		 *	 pmap_enter().
5168 		 */
5169 		pmap_removed_pte(oldm, origpte);
5170 	}
5171 	if (oldm) {
5172 		if ((atomic_fetchadd_long(&oldm->md.interlock_count, -1) &
5173 		     0x7FFFFFFFFFFFFFFFLU) == 0x4000000000000001LU) {
5174 			atomic_clear_long(&oldm->md.interlock_count,
5175 					  0x4000000000000000LU);
5176 			wakeup(&oldm->md.interlock_count);
5177 		}
5178 	}
5179 
5180 done:
5181 	KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 ||
5182 		 (m->flags & PG_MAPPED));
5183 
5184 	/*
5185 	 * Clean up the pv entry, allowing other accessors.  If the new page
5186 	 * is not managed but we have a pte_pv (which was locking our
5187 	 * operation), we can free it now.  pte_pv->pv_m should be NULL.
5188 	 */
5189 	if (pte_placemark)
5190 		pv_placemarker_wakeup(pmap, pte_placemark);
5191 	if (pt_pv)
5192 		pv_put(pt_pv);
5193 }
5194 
5195 /*
5196  * Make a temporary mapping for a physical address.  This is only intended
5197  * to be used for panic dumps.
5198  *
5199  * The caller is responsible for calling smp_invltlb().
5200  */
5201 void *
5202 pmap_kenter_temporary(vm_paddr_t pa, long i)
5203 {
5204 	pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
5205 	return ((void *)crashdumpmap);
5206 }
5207 
5208 #if 0
5209 #define MAX_INIT_PT (96)
5210 
5211 /*
5212  * This routine preloads the ptes for a given object into the specified pmap.
5213  * This eliminates the blast of soft faults on process startup and
5214  * immediately after an mmap.
5215  */
5216 static int pmap_object_init_pt_callback(vm_page_t p, void *data);
5217 #endif
5218 
5219 void
5220 pmap_object_init_pt(pmap_t pmap, vm_map_entry_t entry,
5221 		    vm_offset_t addr, vm_size_t size, int limit)
5222 {
5223 #if 0
5224 	vm_prot_t prot = entry->protection;
5225 	vm_object_t object = entry->ba.object;
5226 	vm_pindex_t pindex = atop(entry->ba.offset + (addr - entry->ba.start));
5227 	struct rb_vm_page_scan_info info;
5228 	struct lwp *lp;
5229 	vm_size_t psize;
5230 
5231 	/*
5232 	 * We can't preinit if read access isn't set or there is no pmap
5233 	 * or object.
5234 	 */
5235 	if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
5236 		return;
5237 
5238 	/*
5239 	 * We can't preinit if the pmap is not the current pmap
5240 	 */
5241 	lp = curthread->td_lwp;
5242 	if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
5243 		return;
5244 
5245 	/*
5246 	 * Misc additional checks
5247 	 */
5248 	psize = x86_64_btop(size);
5249 
5250 	if ((object->type != OBJT_VNODE) ||
5251 		((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
5252 			(object->resident_page_count > MAX_INIT_PT))) {
5253 		return;
5254 	}
5255 
5256 	if (pindex + psize > object->size) {
5257 		if (object->size < pindex)
5258 			return;
5259 		psize = object->size - pindex;
5260 	}
5261 
5262 	if (psize == 0)
5263 		return;
5264 
5265 	/*
5266 	 * If everything is segment-aligned do not pre-init here.  Instead
5267 	 * allow the normal vm_fault path to pass a segment hint to
5268 	 * pmap_enter() which will then use an object-referenced shared
5269 	 * page table page.
5270 	 */
5271 	if ((addr & SEG_MASK) == 0 &&
5272 	    (ctob(psize) & SEG_MASK) == 0 &&
5273 	    (ctob(pindex) & SEG_MASK) == 0) {
5274 		return;
5275 	}
5276 
5277 	/*
5278 	 * Use a red-black scan to traverse the requested range and load
5279 	 * any valid pages found into the pmap.
5280 	 *
5281 	 * We cannot safely scan the object's memq without holding the
5282 	 * object token.
5283 	 */
5284 	info.start_pindex = pindex;
5285 	info.end_pindex = pindex + psize - 1;
5286 	info.limit = limit;
5287 	info.mpte = NULL;
5288 	info.addr = addr;
5289 	info.pmap = pmap;
5290 	info.object = object;
5291 	info.entry = entry;
5292 
5293 	/*
5294 	 * By using the NOLK scan, the callback function must be sure
5295 	 * to return -1 if the VM page falls out of the object.
5296 	 */
5297 	vm_object_hold_shared(object);
5298 	vm_page_rb_tree_RB_SCAN_NOLK(&object->rb_memq, rb_vm_page_scancmp,
5299 				     pmap_object_init_pt_callback, &info);
5300 	vm_object_drop(object);
5301 #endif
5302 }
5303 
5304 #if 0
5305 
5306 static
5307 int
5308 pmap_object_init_pt_callback(vm_page_t p, void *data)
5309 {
5310 	struct rb_vm_page_scan_info *info = data;
5311 	vm_pindex_t rel_index;
5312 	int hard_busy;
5313 
5314 	/*
5315 	 * don't allow an madvise to blow away our really
5316 	 * free pages by allocating pv entries.
5317 	 */
5318 	if ((info->limit & MAP_PREFAULT_MADVISE) &&
5319 		vmstats.v_free_count < vmstats.v_free_reserved) {
5320 		    return(-1);
5321 	}
5322 
5323 	/*
5324 	 * Ignore list markers and ignore pages we cannot instantly
5325 	 * busy (while holding the object token).
5326 	 */
5327 	if (p->flags & PG_MARKER)
5328 		return 0;
5329 	hard_busy = 0;
5330 again:
5331 	if (hard_busy) {
5332 		if (vm_page_busy_try(p, TRUE))
5333 			return 0;
5334 	} else {
5335 		if (vm_page_sbusy_try(p))
5336 			return 0;
5337 	}
5338 	if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
5339 	    (p->flags & PG_FICTITIOUS) == 0) {
5340 		if ((p->queue - p->pc) == PQ_CACHE) {
5341 			if (hard_busy == 0) {
5342 				vm_page_sbusy_drop(p);
5343 				hard_busy = 1;
5344 				goto again;
5345 			}
5346 			vm_page_deactivate(p);
5347 		}
5348 		rel_index = p->pindex - info->start_pindex;
5349 		pmap_enter(info->pmap, info->addr + x86_64_ptob(rel_index), p,
5350 			   VM_PROT_READ, FALSE, info->entry);
5351 	}
5352 	if (hard_busy)
5353 		vm_page_wakeup(p);
5354 	else
5355 		vm_page_sbusy_drop(p);
5356 
5357 	/*
5358 	 * We are using an unlocked scan (that is, the scan expects its
5359 	 * current element to remain in the tree on return).  So we have
5360 	 * to check here and abort the scan if it isn't.
5361 	 */
5362 	if (p->object != info->object)
5363 		return -1;
5364 	lwkt_yield();
5365 	return(0);
5366 }
5367 
5368 #endif
5369 
5370 /*
5371  * Return TRUE if the pmap is in shape to trivially pre-fault the specified
5372  * address.
5373  *
5374  * Returns FALSE if it would be non-trivial or if a pte is already loaded
5375  * into the slot.
5376  *
5377  * The address must reside within a vm_map mapped range to ensure that the
5378  * page table doesn't get ripped out from under us.
5379  *
5380  * XXX This is safe only because page table pages are not freed.
5381  */
5382 int
5383 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
5384 {
5385 	pt_entry_t *pte;
5386 
5387 	/*spin_lock(&pmap->pm_spin);*/
5388 	if ((pte = pmap_pte(pmap, addr)) != NULL) {
5389 		if (*pte & pmap->pmap_bits[PG_V_IDX]) {
5390 			/*spin_unlock(&pmap->pm_spin);*/
5391 			return FALSE;
5392 		}
5393 	}
5394 	/*spin_unlock(&pmap->pm_spin);*/
5395 	return TRUE;
5396 }
5397 
5398 /*
5399  * Change the wiring attribute for a pmap/va pair.  The mapping must already
5400  * exist in the pmap.  The mapping may or may not be managed.  The wiring in
5401  * the page is not changed; the page is returned so the caller can adjust
5402  * its wiring (the page is not locked in any way).
5403  *
5404  * Wiring is not a hardware characteristic so there is no need to invalidate
5405  * TLB.  However, in an SMP environment we must use a locked bus cycle to
5406  * update the pte (if we are not using the pmap_inval_*() API that is)...
5407  * it's ok to do this for simple wiring changes.
5408  */
5409 vm_page_t
5410 pmap_unwire(pmap_t pmap, vm_offset_t va)
5411 {
5412 	pt_entry_t *ptep;
5413 	pv_entry_t pt_pv;
5414 	vm_paddr_t pa;
5415 	vm_page_t m;
5416 
5417 	if (pmap == NULL)
5418 		return NULL;
5419 
5420 	/*
5421 	 * Assume elements in the kernel pmap are stable
5422 	 */
5423 	if (pmap == kernel_pmap) {
5424 		if (pmap_pt(pmap, va) == 0)
5425 			return NULL;
5426 		ptep = pmap_pte_quick(pmap, va);
5427 		if (pmap_pte_v(pmap, ptep)) {
5428 			if (pmap_pte_w(pmap, ptep))
5429 				atomic_add_long(&pmap->pm_stats.wired_count,-1);
5430 			atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]);
5431 			pa = *ptep & PG_FRAME;
5432 			m = PHYS_TO_VM_PAGE(pa);
5433 		} else {
5434 			m = NULL;
5435 		}
5436 	} else {
5437 		/*
5438 		 * We can only [un]wire pmap-local pages (we cannot wire
5439 		 * shared pages)
5440 		 */
5441 		pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL);
5442 		if (pt_pv == NULL)
5443 			return NULL;
5444 
5445 		ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
5446 		if ((*ptep & pmap->pmap_bits[PG_V_IDX]) == 0) {
5447 			pv_put(pt_pv);
5448 			return NULL;
5449 		}
5450 
5451 		if (pmap_pte_w(pmap, ptep)) {
5452 			atomic_add_long(&pt_pv->pv_pmap->pm_stats.wired_count,
5453 					-1);
5454 		}
5455 		/* XXX else return NULL so caller doesn't unwire m ? */
5456 
5457 		atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]);
5458 
5459 		pa = *ptep & PG_FRAME;
5460 		m = PHYS_TO_VM_PAGE(pa);	/* held by wired count */
5461 		pv_put(pt_pv);
5462 	}
5463 	return m;
5464 }
5465 
5466 /*
5467  * Copy the range specified by src_addr/len from the source map to
5468  * the range dst_addr/len in the destination map.
5469  *
5470  * This routine is only advisory and need not do anything.
5471  */
5472 void
5473 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
5474 	  vm_size_t len, vm_offset_t src_addr)
5475 {
5476 }
5477 
5478 /*
5479  * pmap_zero_page:
5480  *
5481  *	Zero the specified physical page.
5482  *
5483  *	This function may be called from an interrupt and no locking is
5484  *	required.
5485  */
5486 void
5487 pmap_zero_page(vm_paddr_t phys)
5488 {
5489 	vm_offset_t va = PHYS_TO_DMAP(phys);
5490 
5491 	pagezero((void *)va);
5492 }
5493 
5494 /*
5495  * pmap_zero_page_area:
5496  *
5497  *	Zero part of a physical page by mapping it into memory and clearing
5498  *	its contents with bzero.
5499  *
5500  *	off and size may not cover an area beyond a single hardware page.
5501  */
5502 void
5503 pmap_zero_page_area(vm_paddr_t phys, int off, int size)
5504 {
5505 	vm_offset_t virt = PHYS_TO_DMAP(phys);
5506 
5507 	bzero((char *)virt + off, size);
5508 }
5509 
5510 /*
5511  * pmap_copy_page:
5512  *
5513  *	Copy the physical page from the source PA to the target PA.
5514  *	This function may be called from an interrupt.  No locking
5515  *	is required.
5516  */
5517 void
5518 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
5519 {
5520 	vm_offset_t src_virt, dst_virt;
5521 
5522 	src_virt = PHYS_TO_DMAP(src);
5523 	dst_virt = PHYS_TO_DMAP(dst);
5524 	bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE);
5525 }
5526 
5527 /*
5528  * pmap_copy_page_frag:
5529  *
5530  *	Copy a fragment of a physical page from the source PA to the target PA.
5531  *	This function may be called from an interrupt.  No locking
5532  *	is required.
5533  */
5534 void
5535 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
5536 {
5537 	vm_offset_t src_virt, dst_virt;
5538 
5539 	src_virt = PHYS_TO_DMAP(src);
5540 	dst_virt = PHYS_TO_DMAP(dst);
5541 
5542 	bcopy((char *)src_virt + (src & PAGE_MASK),
5543 	      (char *)dst_virt + (dst & PAGE_MASK),
5544 	      bytes);
5545 }
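
/*
 * Usage sketch (not compiled): the low PAGE_MASK bits of src and dst
 * select the intra-page offsets, so byte-granular physical addresses can
 * be passed directly.  src_pa, dst_pa, off and len are hypothetical.
 */
#if 0
	pmap_copy_page_frag(src_pa + off, dst_pa + off, len);
#endif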
5546 
5547 /*
5548  * Remove all pages from the specified address space; this aids process
5549  * exit speed.  Also, this code may be special-cased for the current
5550  * process only.
5551  */
5552 void
5553 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5554 {
5555 	pmap_remove_noinval(pmap, sva, eva);
5556 	cpu_invltlb();
5557 }
5558 
5559 /*
5560  * pmap_testbit tests bits in ptes.  Note that the testbit/clearbit
5561  * routines are inline, so a lot of things are compile-time evaluated.
5562  *
5563  * Currently only used to test the 'M'odified bit.  If the page
5564  * is not PG_WRITEABLE, the 'M'odified bit cannot be set and we
5565  * return immediately.  Fictitious pages do not track this bit.
5566  */
5567 static
5568 boolean_t
5569 pmap_testbit(vm_page_t m, int bit)
5570 {
5571 	int res = FALSE;
5572 
5573 	if (__predict_false(!pmap_initialized || (m->flags & PG_FICTITIOUS)))
5574 		return FALSE;
5575 	/*
5576 	 * Nothing to do if all the mappings are already read-only.
5577 	 * The page's [M]odify bits have already been synchronized
5578 	 * to the vm_page_t and cleaned out.
5579 	 */
5580 	if (bit == PG_M_IDX && (m->flags & PG_WRITEABLE) == 0)
5581 		return FALSE;
5582 
5583 	/*
5584 	 * Iterate the mappings
5585 	 */
5586 	PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
5587 		if (ipte & ipmap->pmap_bits[bit]) {
5588 			res = TRUE;
5589 			break;
5590 		}
5591 	} PMAP_PAGE_BACKING_DONE;
5592 	return res;
5593 }
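
/*
 * Descriptive note: as used in this file, PMAP_PAGE_BACKING_SCAN/_DONE
 * iterate every pte currently mapping (m), exposing the pmap, the pte
 * pointer, the pte value and the va for each mapping, while
 * PMAP_PAGE_BACKING_RETRY re-evaluates the current entry after a lost
 * cmpset race.
 */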
5594 
5595 /*
5596  * This routine is used to modify bits in ptes.  Only one bit should be
5597  * specified.  PG_RW requires special handling.  This call works with
5598  * any sort of mapped page.  PG_FICTITIOUS pages might not be optimal.
5599  *
5600  * Caller must NOT hold any spin locks
5601  * Caller must hold (m) hard-busied
5602  *
5603  * NOTE: When clearing PG_M we could also (not implemented) drop
5604  *       through to the PG_RW code and clear PG_RW too, forcing
5605  *       a fault on write to redetect PG_M for virtual kernels, but
5606  *       it isn't necessary since virtual kernels invalidate the
5607  *       pte when they clear the VPTE_M bit in their virtual page
5608  *       tables.
5609  *
5610  * NOTE: Does not re-dirty the page when clearing only PG_M.
5611  *
5612  * NOTE: Because we do not lock the pv, *pte can be in a state of
5613  *       flux.  Despite this the value of *pte is still somewhat
5614  *       related while we hold the vm_page spin lock.
5615  *
5616  *       *pte can be zero due to this race.  Since we are clearing
5617  *       bits we basically do no harm when this race occurs.
5618  */
5619 static __inline
5620 void
5621 pmap_clearbit(vm_page_t m, int bit_index)
5622 {
5623 	pt_entry_t npte;
5624 	int retry;
5625 	long icount;
5626 
5627 	/*
5628 	 * Too early in the boot
5629 	 */
5630 	if (__predict_false(!pmap_initialized)) {
5631 		if (bit_index == PG_RW_IDX)
5632 			vm_page_flag_clear(m, PG_WRITEABLE);
5633 		return;
5634 	}
5635 	if ((m->flags & (PG_MAPPED | PG_WRITEABLE)) == 0)
5636 		return;
5637 
5638 	/*
5639 	 * We are being asked to clear bits other than PG_RW.  We do not
5640 	 * track those bits, so we have to iterate the mappings.
5641 	 *
5642 	 * pmap_clear_reference() is called (into here) with the page
5643 	 * hard-busied to check whether the page is still mapped and
5644 	 * will clear PG_MAPPED and PG_WRITEABLE if it isn't.
5645 	 */
5646 	if (bit_index != PG_RW_IDX) {
5647 #if 0
5648 		long icount;
5649 
5650 		icount = 0;
5651 #endif
5652 		PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
5653 #if 0
5654 			++icount;
5655 #endif
5656 			if (ipte & ipmap->pmap_bits[bit_index]) {
5657 				atomic_clear_long(iptep,
5658 						  ipmap->pmap_bits[bit_index]);
5659 			}
5660 		} PMAP_PAGE_BACKING_DONE;
5661 #if 0
5662 		if (icount == 0) {
5663 			icount = atomic_fetchadd_long(&m->md.interlock_count,
5664 						      0x8000000000000000LU);
5665 			if ((icount & 0x3FFFFFFFFFFFFFFFLU) == 0) {
5666 				vm_page_flag_clear(m, PG_MAPPED |
5667 						      PG_MAPPEDMULTI |
5668 						      PG_WRITEABLE);
5669 			}
5670 		}
5671 #endif
5672 		return;
5673 	}
5674 
5675 	/*
5676 	 * Being asked to clear the RW bit.
5677 	 *
5678 	 * Nothing to do if all the mappings are already read-only
5679 	 */
5680 	if ((m->flags & PG_WRITEABLE) == 0)
5681 		return;
5682 
5683 	/*
5684 	 * Iterate the mappings and check.
5685 	 */
5686 	retry = ticks + hz * 60;
5687 again:
5688 	/*
5689 	 * Clear PG_RW. This also clears PG_M and marks the page dirty if
5690 	 * PG_M was set.
5691 	 *
5692 	 * Since the caller holds the page hard-busied we can safely clear
5693 	 * PG_WRITEABLE, and callers expect us to for the PG_RW_IDX path.
5694 	 */
5695 	PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
5696 #if 0
5697 		if ((ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) == 0)
5698 			continue;
5699 #endif
5700 		if ((ipte & ipmap->pmap_bits[PG_RW_IDX]) == 0)
5701 			continue;
5702 		npte = ipte & ~(ipmap->pmap_bits[PG_RW_IDX] |
5703 				ipmap->pmap_bits[PG_M_IDX]);
5704 		if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, npte))
5705 			PMAP_PAGE_BACKING_RETRY;
5706 		if (ipte & ipmap->pmap_bits[PG_M_IDX])
5707 			vm_page_dirty(m);
5708 
5709 		/*
5710 		 * NOTE: m is not hard-busied so it is not safe to
5711 		 *	 clear PG_WRITEABLE on the 1->0 transition
5712 		 *	 against it being set in pmap_enter().
5713 		 *
5714 		 *	 pmap_count and writeable_count are only applicable
5715 		 *	 to non-fictitious pages (PG_MANAGED_IDX from pte)
5716 		 */
5717 	} PMAP_PAGE_BACKING_DONE;
5718 
5719 	/*
5720 	 * If our scan lost a pte swap race, m->md.interlock_count might
5721 	 * be set from the pmap_enter() code.  If so, sleep a little and
5722 	 * try again.
5723 	 *
5724 	 * Use an atomic op to access interlock_count to ensure ordering.
5725 	 */
5726 	icount = atomic_fetchadd_long(&m->md.interlock_count,
5727 				      0x8000000000000000LU) +
5728 		 0x8000000000000000LU;
5729 	cpu_ccfence();
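	/*
	 * Descriptive note (inferred from the masks used here): the low
	 * 62 bits of interlock_count must drain to zero before we can
	 * clear PG_WRITEABLE, and bit 62 asks the releasing side to
	 * issue a wakeup on the field.
	 */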
5730 	while (icount & 0x3FFFFFFFFFFFFFFFLU) {
5731 		tsleep_interlock(&m->md.interlock_count, 0);
5732 		if (atomic_fcmpset_long(&m->md.interlock_count, &icount,
5733 				        icount | 0x4000000000000000LU)) {
5734 			tsleep(&m->md.interlock_count, PINTERLOCKED,
5735 			       "pgunm", 1);
5736 			icount = m->md.interlock_count;
5737 			if (retry - ticks > 0)
5738 				goto again;
5739 			panic("pmap_clearbit: cannot return interlock_count "
5740 			      "to 0 (%p, %ld)",
5741 			      m, m->md.interlock_count);
5742 		}
5743 	}
5744 	vm_page_flag_clear(m, PG_WRITEABLE);
5745 }
5746 
5747 /*
5748  * Lower the permission for all mappings to a given page.
5749  *
5750  * Page must be hard-busied by caller.  Because the page is busied by the
5751  * caller, this should not be able to race a pmap_enter().
5752  */
5753 void
5754 pmap_page_protect(vm_page_t m, vm_prot_t prot)
5755 {
5756 	/* JG NX support? */
5757 	if ((prot & VM_PROT_WRITE) == 0) {
5758 		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
5759 			/*
5760 			 * NOTE: pmap_clearbit(.. PG_RW) also clears
5761 			 *	 the PG_WRITEABLE flag in (m).
5762 			 */
5763 			pmap_clearbit(m, PG_RW_IDX);
5764 		} else {
5765 			pmap_remove_all(m);
5766 		}
5767 	}
5768 }
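
/*
 * Usage sketch (not compiled): the two meaningful downgrades handled by
 * pmap_page_protect() above.
 */
#if 0
	pmap_page_protect(m, VM_PROT_READ);	/* revoke write access */
	pmap_page_protect(m, VM_PROT_NONE);	/* remove all mappings */
#endif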
5769 
5770 vm_paddr_t
5771 pmap_phys_address(vm_pindex_t ppn)
5772 {
5773 	return (x86_64_ptob(ppn));
5774 }
5775 
5776 /*
5777  * Return a count of reference bits for a page, clearing those bits.
5778  * It is not necessary for every reference bit to be cleared, but it
5779  * is necessary that 0 only be returned when there are truly no
5780  * reference bits set.
5781  *
5782  * XXX: The exact number of bits to check and clear is a matter that
5783  * should be tested and standardized at some point in the future for
5784  * optimal aging of shared pages.
5785  *
5786  * This routine may not block.
5787  */
5788 int
5789 pmap_ts_referenced(vm_page_t m)
5790 {
5791 	int rval = 0;
5792 	pt_entry_t npte;
5793 
5794 	if (__predict_false(!pmap_initialized || (m->flags & PG_FICTITIOUS)))
5795 		return rval;
5796 	PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
5797 		if (ipte & ipmap->pmap_bits[PG_A_IDX]) {
5798 			npte = ipte & ~ipmap->pmap_bits[PG_A_IDX];
5799 			if (!atomic_cmpset_long(iptep, ipte, npte))
5800 				PMAP_PAGE_BACKING_RETRY;
5801 			++rval;
5802 			if (rval > 4)
5803 				break;
5804 		}
5805 	} PMAP_PAGE_BACKING_DONE;
5806 	return rval;
5807 }
5808 
5809 /*
5810  *	pmap_is_modified:
5811  *
5812  *	Return whether or not the specified physical page was modified
5813  *	in any physical maps.
5814  */
5815 boolean_t
5816 pmap_is_modified(vm_page_t m)
5817 {
5818 	boolean_t res;
5819 
5820 	res = pmap_testbit(m, PG_M_IDX);
5821 	return (res);
5822 }
5823 
5824 /*
5825  * Clear the modify bit on the vm_page.
5826  *
5827  * The page must be hard-busied.
5828  */
5829 void
5830 pmap_clear_modify(vm_page_t m)
5831 {
5832 	pmap_clearbit(m, PG_M_IDX);
5833 }
5834 
5835 /*
5836  *	pmap_clear_reference:
5837  *
5838  *	Clear the reference bit on the specified physical page.
5839  */
5840 void
5841 pmap_clear_reference(vm_page_t m)
5842 {
5843 	pmap_clearbit(m, PG_A_IDX);
5844 }
5845 
5846 /*
5847  * Miscellaneous support routines follow
5848  */
5849 
5850 static
5851 void
5852 x86_64_protection_init(void)
5853 {
5854 	uint64_t *kp;
5855 	int prot;
5856 
5857 	/*
5858 	 * NX supported? (boot time loader.conf override only)
5859 	 *
5860 	 * -1	Automatic (sets mode 1)
5861 	 *  0	Disabled
5862 	 *  1	NX implemented, differentiates PROT_READ vs PROT_READ|PROT_EXEC
5863 	 *  2	NX implemented for all cases
5864 	 */
5865 	TUNABLE_INT_FETCH("machdep.pmap_nx_enable", &pmap_nx_enable);
5866 	if ((amd_feature & AMDID_NX) == 0) {
5867 		pmap_bits_default[PG_NX_IDX] = 0;
5868 		pmap_nx_enable = 0;
5869 	} else if (pmap_nx_enable < 0) {
5870 		pmap_nx_enable = 1;		/* default to mode 1 (READ) */
5871 	}
5872 
5873 	/*
5874 	 * A protection code of 0 is basically read-only access.  We also set
5875 	 * the NX (no-execute) bit when VM_PROT_EXECUTE is not specified.
5876 	 */
5877 	kp = protection_codes;
5878 	for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) {
5879 		switch (prot) {
5880 		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
5881 			/*
5882 			 * This case is handled elsewhere
5883 			 */
5884 			*kp = 0;
5885 			break;
5886 		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
5887 			/*
5888 			 * Read-only is 0|NX	(pmap_nx_enable mode >= 1)
5889 			 */
5890 			if (pmap_nx_enable >= 1)
5891 				*kp = pmap_bits_default[PG_NX_IDX];
5892 			break;
5893 		case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
5894 		case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
5895 			/*
5896 			 * Execute requires read access
5897 			 */
5898 			*kp = 0;
5899 			break;
5900 		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
5901 		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
5902 			/*
5903 			 * Write without execute is RW|NX
5904 			 *			(pmap_nx_enable mode >= 2)
5905 			 */
5906 			*kp = pmap_bits_default[PG_RW_IDX];
5907 			if (pmap_nx_enable >= 2)
5908 				*kp |= pmap_bits_default[PG_NX_IDX];
5909 			break;
5910 		case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
5911 		case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
5912 			/*
5913 			 * Write with execute is RW
5914 			 */
5915 			*kp = pmap_bits_default[PG_RW_IDX];
5916 			break;
5917 		}
5918 		++kp;
5919 	}
5920 }
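
/*
 * Sketch only (an assumption about how the table is consumed elsewhere):
 * the VM_PROT_* combination indexes protection_codes[] and the resulting
 * RW/NX bits are or'd into a new terminal pte.  newpte, pa and prot are
 * hypothetical locals.
 */
#if 0
	newpte = pa | kernel_pmap->pmap_bits[PG_V_IDX] | protection_codes[prot];
#endif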
5921 
5922 /*
5923  * Map a set of physical memory pages into the kernel virtual
5924  * address space. Return a pointer to where it is mapped. This
5925  * routine is intended to be used for mapping device memory,
5926  * NOT real memory.
5927  *
5928  * NOTE: We can't use pgeflag unless we invalidate the pages one at
5929  *	 a time.
5930  *
5931  * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE}
5932  *	 work whether the cpu supports PAT or not.  The remaining PAT
5933  *	 attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu
5934  *	 supports PAT.
5935  */
5936 void *
5937 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5938 {
5939 	return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5940 }
5941 
5942 void *
5943 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size)
5944 {
5945 	return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5946 }
5947 
5948 void *
5949 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5950 {
5951 	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5952 }
5953 
5954 /*
5955  * Map a set of physical memory pages into the kernel virtual
5956  * address space. Return a pointer to where it is mapped. This
5957  * routine is intended to be used for mapping device memory,
5958  * NOT real memory.
5959  */
5960 void *
5961 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5962 {
5963 	vm_offset_t va, tmpva, offset;
5964 	pt_entry_t *pte;
5965 	vm_size_t tmpsize;
5966 
5967 	offset = pa & PAGE_MASK;
5968 	size = roundup(offset + size, PAGE_SIZE);
5969 
5970 	va = kmem_alloc_nofault(kernel_map, size, VM_SUBSYS_MAPDEV, PAGE_SIZE);
5971 	if (va == 0)
5972 		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
5973 
5974 	pa = pa & ~PAGE_MASK;
5975 	for (tmpva = va, tmpsize = size; tmpsize > 0;) {
5976 		pte = vtopte(tmpva);
5977 		*pte = pa |
5978 		    kernel_pmap->pmap_bits[PG_RW_IDX] |
5979 		    kernel_pmap->pmap_bits[PG_V_IDX] | /* pgeflag | */
5980 		    kernel_pmap->pmap_cache_bits_pte[mode];
5981 		tmpsize -= PAGE_SIZE;
5982 		tmpva += PAGE_SIZE;
5983 		pa += PAGE_SIZE;
5984 	}
5985 	pmap_invalidate_range(kernel_pmap, va, va + size);
5986 	pmap_invalidate_cache_range(va, va + size);
5987 
5988 	return ((void *)(va + offset));
5989 }
5990 
5991 void
5992 pmap_unmapdev(vm_offset_t va, vm_size_t size)
5993 {
5994 	vm_offset_t base, offset;
5995 
5996 	base = va & ~PAGE_MASK;
5997 	offset = va & PAGE_MASK;
5998 	size = roundup(offset + size, PAGE_SIZE);
5999 	pmap_qremove(va, size >> PAGE_SHIFT);
6000 	kmem_free(kernel_map, base, size);
6001 }
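
/*
 * Driver-style sketch (not compiled): pmap_mapdev*() and pmap_unmapdev()
 * are normally used in pairs.  bar_pa and bar_size are hypothetical.
 */
#if 0
	void *regs;

	regs = pmap_mapdev_uncacheable(bar_pa, bar_size);
	/* ... access device registers ... */
	pmap_unmapdev((vm_offset_t)regs, bar_size);
#endif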
6002 
6003 /*
6004  * Sets the memory attribute for the specified page.
6005  */
6006 void
6007 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6008 {
6009 
6010 	m->pat_mode = ma;
6011 
6012 	/*
6013 	 * If "m" is a normal page, update its direct mapping.  This update
6014 	 * can be relied upon to perform any cache operations that are
6015 	 * required for data coherence.
6016 	 */
6017 	if ((m->flags & PG_FICTITIOUS) == 0)
6018 		pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 1, m->pat_mode);
6019 }
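
/*
 * Sketch (not compiled): e.g. marking a page write-combining for a
 * framebuffer.  Assumes the VM_MEMATTR_WRITE_COMBINING constant and PAT
 * support; see the PAT notes above pmap_mapdev().
 */
#if 0
	pmap_page_set_memattr(m, VM_MEMATTR_WRITE_COMBINING);
#endif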
6020 
6021 /*
6022  * Change the PAT attribute on an existing kernel memory map.  Caller
6023  * must ensure that the virtual memory in question is not accessed
6024  * during the adjustment.
6025  *
6026  * If the va is within the DMAP we cannot use vtopte() because the DMAP
6027  * utilizes 2MB or 1GB pages.  2MB is forced at the moment, so calculate
6028  * the pd_entry pointer based on that.
6029  */
6030 void
6031 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode)
6032 {
6033 	pt_entry_t *pte;
6034 	vm_offset_t base;
6035 	int changed = 0;
6036 
6037 	if (va == 0)
6038 		panic("pmap_change_attr: va is NULL");
6039 	base = trunc_page(va);
6040 
6041 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
6042 		pd_entry_t *pd;
6043 
6044 		KKASSERT(va < DMapMaxAddress);
6045 		pd = (pd_entry_t *)PHYS_TO_DMAP(DMPDphys);
6046 		pd += (va - DMAP_MIN_ADDRESS) >> PDRSHIFT;
6047 
6048 		while ((long)count > 0) {
6049 			*pd =
6050 			   (*pd & ~(pd_entry_t)(kernel_pmap->pmap_cache_mask_pde)) |
6051 			   kernel_pmap->pmap_cache_bits_pde[mode];
6052 			count -= NBPDR / PAGE_SIZE;
6053 			va += NBPDR;
6054 			++pd;
6055 		}
6056 	} else {
6057 		while (count) {
6058 			pte = vtopte(va);
6059 			*pte =
6060 			   (*pte & ~(pt_entry_t)(kernel_pmap->pmap_cache_mask_pte)) |
6061 			   kernel_pmap->pmap_cache_bits_pte[mode];
6062 			--count;
6063 			va += PAGE_SIZE;
6064 		}
6065 	}
6066 
6067 	changed = 1;	/* XXX: not optimal */
6068 
6069 	/*
6070 	 * Flush CPU caches if required, to make sure no data remains cached
6071 	 * with the old attributes.
6072 	 */
6073 	if (changed) {
6074 		pmap_invalidate_range(kernel_pmap, base, va);
6075 		pmap_invalidate_cache_range(base, va);
6076 	}
6077 }
6078 
6079 /*
6080  * Perform the pmap work for mincore().
6081  */
6082 int
6083 pmap_mincore(pmap_t pmap, vm_offset_t addr)
6084 {
6085 	pt_entry_t *ptep, pte;
6086 	vm_page_t m;
6087 	int val = 0;
6088 
6089 	ptep = pmap_pte(pmap, addr);
6090 
6091 	if (ptep && (pte = *ptep) != 0) {
6092 		vm_offset_t pa;
6093 
6094 		val = MINCORE_INCORE;
6095 		pa = pte & PG_FRAME;
6096 		if (pte & pmap->pmap_bits[PG_MANAGED_IDX])
6097 			m = PHYS_TO_VM_PAGE(pa);
6098 		else
6099 			m = NULL;
6100 
6101 		/*
6102 		 * Modified by us
6103 		 */
6104 		if (pte & pmap->pmap_bits[PG_M_IDX])
6105 			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
6106 
6107 		/*
6108 		 * Modified by someone
6109 		 */
6110 		else if (m && (m->dirty || pmap_is_modified(m)))
6111 			val |= MINCORE_MODIFIED_OTHER;
6112 
6113 		/*
6114 		 * Referenced by us, or someone else.
6115 		 */
6116 		if (pte & pmap->pmap_bits[PG_A_IDX]) {
6117 			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
6118 		} else if (m && ((m->flags & PG_REFERENCED) ||
6119 				 pmap_ts_referenced(m))) {
6120 			val |= MINCORE_REFERENCED_OTHER;
6121 			vm_page_flag_set(m, PG_REFERENCED);
6122 		}
6123 	}
6124 	return val;
6125 }
6126 
6127 /*
6128  * Replace p->p_vmspace with a new one.  If adjrefs is non-zero the new
6129  * vmspace will be ref'd and the old one will be deref'd.
6130  *
6131  * The vmspace for all lwps associated with the process will be adjusted
6132  * and cr3 will be reloaded if any lwp is the current lwp.
6133  *
6134  * The process must hold the vmspace->vm_map.token for oldvm and newvm.
6135  */
6136 void
6137 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
6138 {
6139 	struct vmspace *oldvm;
6140 	struct lwp *lp;
6141 
6142 	oldvm = p->p_vmspace;
6143 	if (oldvm != newvm) {
6144 		if (adjrefs)
6145 			vmspace_ref(newvm);
6146 		p->p_vmspace = newvm;
6147 		KKASSERT(p->p_nthreads == 1);
6148 		lp = RB_ROOT(&p->p_lwp_tree);
6149 		pmap_setlwpvm(lp, newvm);
6150 		if (adjrefs)
6151 			vmspace_rel(oldvm);
6152 	}
6153 }
6154 
6155 /*
6156  * Set the vmspace for an LWP.  The vmspace is almost always the same as
6157  * the process vmspace, but virtual kernels need to swap out contexts
6158  * on a per-lwp basis.
6159  *
6160  * Caller does not necessarily hold any vmspace tokens.  Caller must control
6161  * the lwp (typically by being in the context of the lwp).  We use a critical
6162  * section to protect against statclock and hardclock (statistics collection).
6163  */
6164 void
6165 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
6166 {
6167 	struct vmspace *oldvm;
6168 	struct pmap *pmap;
6169 	thread_t td;
6170 
6171 	oldvm = lp->lwp_vmspace;
6172 
6173 	if (oldvm != newvm) {
6174 		crit_enter();
6175 		td = curthread;
6176 		KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0);
6177 		lp->lwp_vmspace = newvm;
6178 		if (td->td_lwp == lp) {
6179 			pmap = vmspace_pmap(newvm);
6180 			ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid);
6181 			if (pmap->pm_active_lock & CPULOCK_EXCL)
6182 				pmap_interlock_wait(newvm);
6183 #if defined(SWTCH_OPTIM_STATS)
6184 			tlb_flush_count++;
6185 #endif
6186 			if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) {
6187 				td->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
6188 				if (meltdown_mitigation && pmap->pm_pmlpv_iso) {
6189 					td->td_pcb->pcb_cr3_iso =
6190 						vtophys(pmap->pm_pml4_iso);
6191 					td->td_pcb->pcb_flags |= PCB_ISOMMU;
6192 				} else {
6193 					td->td_pcb->pcb_cr3_iso = 0;
6194 					td->td_pcb->pcb_flags &= ~PCB_ISOMMU;
6195 				}
6196 			} else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) {
6197 				td->td_pcb->pcb_cr3 = KPML4phys;
6198 				td->td_pcb->pcb_cr3_iso = 0;
6199 				td->td_pcb->pcb_flags &= ~PCB_ISOMMU;
6200 			} else {
6201 				panic("pmap_setlwpvm: unknown pmap type\n");
6202 			}
6203 
6204 			/*
6205 			 * The MMU separation fields need to be updated
6206 			 * (the trampoline can't access the pcb directly
6207 			 * from the restricted user pmap).
6208 			 */
6209 			{
6210 				struct trampframe *tramp;
6211 
6212 				tramp = &pscpu->trampoline;
6213 				tramp->tr_pcb_cr3 = td->td_pcb->pcb_cr3;
6214 				tramp->tr_pcb_cr3_iso = td->td_pcb->pcb_cr3_iso;
6215 				tramp->tr_pcb_flags = td->td_pcb->pcb_flags;
6216 				tramp->tr_pcb_rsp = (register_t)td->td_pcb;
6217 				/* tr_pcb_rsp doesn't change */
6218 			}
6219 
6220 			/*
6221 			 * In kernel-land we always use the normal PML4E
6222 			 * so the kernel is fully mapped and can also access
6223 			 * user memory.
6224 			 */
6225 			load_cr3(td->td_pcb->pcb_cr3);
6226 			pmap = vmspace_pmap(oldvm);
6227 			ATOMIC_CPUMASK_NANDBIT(pmap->pm_active,
6228 					       mycpu->gd_cpuid);
6229 		}
6230 		crit_exit();
6231 	}
6232 }
6233 
6234 /*
6235  * Called when switching to a locked pmap, used to interlock against pmaps
6236  * undergoing modifications to prevent us from activating the MMU for the
6237  * target pmap until all such modifications have completed.  We have to do
6238  * this because the thread making the modifications has already set up its
6239  * SMP synchronization mask.
6240  *
6241  * This function cannot sleep!
6242  *
6243  * No requirements.
6244  */
6245 void
6246 pmap_interlock_wait(struct vmspace *vm)
6247 {
6248 	struct pmap *pmap = &vm->vm_pmap;
6249 
6250 	if (pmap->pm_active_lock & CPULOCK_EXCL) {
6251 		crit_enter();
6252 		KKASSERT(curthread->td_critcount >= 2);
6253 		DEBUG_PUSH_INFO("pmap_interlock_wait");
6254 		while (pmap->pm_active_lock & CPULOCK_EXCL) {
6255 			cpu_ccfence();
6256 			lwkt_process_ipiq();
6257 		}
6258 		DEBUG_POP_INFO();
6259 		crit_exit();
6260 	}
6261 }
6262 
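/*
 * Return an address hint for a mapping.  For large device-backed objects
 * the hint is rounded up to a 2MB (NBPDR) boundary, presumably so the
 * mapping can later take advantage of superpages.
 */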
6263 vm_offset_t
6264 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
6265 {
6266 
6267 	if ((obj == NULL) || (size < NBPDR) ||
6268 	    ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) {
6269 		return addr;
6270 	}
6271 
6272 	addr = roundup2(addr, NBPDR);
6273 	return addr;
6274 }
6275 
6276 /*
6277  * Used by kmalloc/kfree; the page already exists at va.
6278  */
6279 vm_page_t
6280 pmap_kvtom(vm_offset_t va)
6281 {
6282 	pt_entry_t *ptep = vtopte(va);
6283 
6284 	return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
6285 }
6286 
6287 /*
6288  * Initialize machine-specific shared page directory support.  This
6289  * is executed when a VM object is created.
6290  */
6291 void
6292 pmap_object_init(vm_object_t object)
6293 {
6294 }
6295 
6296 /*
6297  * Clean up machine-specific shared page directory support.  This
6298  * is executed when a VM object is destroyed.
6299  */
6300 void
6301 pmap_object_free(vm_object_t object)
6302 {
6303 }
6304 
6305 /*
6306  * pmap_pgscan_callback - Used by pmap_pgscan to acquire the related
6307  * VM page and issue a pginfo->callback.
6308  */
6309 static
6310 void
6311 pmap_pgscan_callback(pmap_t pmap, struct pmap_scan_info *info,
6312 		      vm_pindex_t *pte_placemark,
6313 		      pv_entry_t pt_pv, vm_offset_t va,
6314 		      pt_entry_t *ptep, void *arg)
6315 {
6316 	struct pmap_pgscan_info *pginfo = arg;
6317 	vm_page_t m;
6318 	pt_entry_t pte;
6319 
6320 	pte = *ptep;
6321 	cpu_ccfence();
6322 
6323 	if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) {
6324 		/*
6325 		 * Try to busy the page while we hold the pte_placemark locked.
6326 		 */
6327 		m = PHYS_TO_VM_PAGE(*ptep & PG_FRAME);
6328 		if (vm_page_busy_try(m, TRUE) == 0) {
6329 			if (m == PHYS_TO_VM_PAGE(*ptep & PG_FRAME)) {
6330 				/*
6331 				 * The callback is issued with the pt_pv
6332 				 * unlocked.
6333 				 */
6334 				pv_placemarker_wakeup(pmap, pte_placemark);
6335 				if (pt_pv) {
6336 					vm_page_wire_quick(pt_pv->pv_m);
6337 					pv_unlock(pt_pv);
6338 				}
6339 				if (pginfo->callback(pginfo, va, m) < 0)
6340 					info->stop = 1;
6341 				if (pt_pv) {
6342 					pv_lock(pt_pv);
6343 					if (vm_page_unwire_quick(pt_pv->pv_m)) {
6344 						panic("pmap_pgscan: bad wire_"
6345 						      "count on pt_pv");
6346 					}
6347 				}
6348 			} else {
6349 				vm_page_wakeup(m);
6350 				pv_placemarker_wakeup(pmap, pte_placemark);
6351 			}
6352 		} else {
6353 			++pginfo->busycount;
6354 			pv_placemarker_wakeup(pmap, pte_placemark);
6355 		}
6356 	} else {
6357 		/*
6358 		 * Shared page table or unmanaged page (sharept or !sharept)
6359 		 */
6360 		pv_placemarker_wakeup(pmap, pte_placemark);
6361 	}
6362 }
6363 
6364 void
6365 pmap_pgscan(struct pmap_pgscan_info *pginfo)
6366 {
6367 	struct pmap_scan_info info;
6368 
6369 	pginfo->offset = pginfo->beg_addr;
6370 	info.pmap = pginfo->pmap;
6371 	info.sva = pginfo->beg_addr;
6372 	info.eva = pginfo->end_addr;
6373 	info.func = pmap_pgscan_callback;
6374 	info.arg = pginfo;
6375 	pmap_scan(&info, 0);
6376 	if (info.stop == 0)
6377 		pginfo->offset = pginfo->end_addr;
6378 }
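
/*
 * Caller sketch (not compiled; field usage inferred from this file):
 * populate a pmap_pgscan_info and scan a range.  The callback receives
 * each managed page hard-busied and returns < 0 to stop the scan.
 * example_cb, sva and eva are hypothetical.
 */
#if 0
	struct pmap_pgscan_info pginfo;

	bzero(&pginfo, sizeof(pginfo));
	pginfo.pmap = pmap;
	pginfo.beg_addr = sva;
	pginfo.end_addr = eva;
	pginfo.callback = example_cb;
	pmap_pgscan(&pginfo);
#endif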
6379 
6380 /*
6381  * Wait for a placemarker that we do not own to clear.  The placemarker
6382  * in question is not necessarily set to the pindex we want; we may have
6383  * to wait on the element because we want to reserve it ourselves.
6384  *
6385  * NOTE: PM_PLACEMARK_WAKEUP sets a bit which is already set in
6386  *	 PM_NOPLACEMARK, so it does not interfere with placemarks
6387  *	 which have already been woken up.
6388  *
6389  * NOTE: This routine is called without the pmap spin-lock and so can
6390  *	 race changes to *pmark.  Due to the sensitivity of the routine
6391  *	 to possible MULTIPLE interactions from other cpus, and the
6392  *	 overloading of the WAKEUP bit on PM_NOPLACEMARK, we have to
6393  *	 use a cmpset loop to avoid a race that might cause the WAKEUP
6394  *	 bit to be lost.
6395  *
6396  * Caller is expected to retry its operation upon return.
6397  */
6398 static
6399 void
6400 pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark)
6401 {
6402 	vm_pindex_t mark;
6403 
6404 	mark = *pmark;
6405 	cpu_ccfence();
6406 	while (mark != PM_NOPLACEMARK) {
6407 		tsleep_interlock(pmark, 0);
6408 		if (atomic_fcmpset_long(pmark, &mark,
6409 				       mark | PM_PLACEMARK_WAKEUP)) {
6410 			tsleep(pmark, PINTERLOCKED, "pvplw", 0);
6411 			break;
6412 		}
6413 	}
6414 }
6415 
6416 /*
6417  * Wakeup a placemarker that we own.  Replace the entry with
6418  * PM_NOPLACEMARK and issue a wakeup() if necessary.
6419  */
6420 static
6421 void
6422 pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark)
6423 {
6424 	vm_pindex_t pindex;
6425 
6426 	pindex = atomic_swap_long(pmark, PM_NOPLACEMARK);
6427 	KKASSERT(pindex != PM_NOPLACEMARK);
6428 	if (pindex & PM_PLACEMARK_WAKEUP)
6429 		wakeup(pmark);
6430 }
6431