xref: /netbsd/sys/arch/x86/x86/pmap.c (revision fe9347a8)
1 /*	$NetBSD: pmap.c,v 1.425 2023/07/26 21:45:28 riastradh Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright 2001 (c) Wasabi Systems, Inc.
74  * All rights reserved.
75  *
76  * Written by Frank van der Linden for Wasabi Systems, Inc.
77  *
78  * Redistribution and use in source and binary forms, with or without
79  * modification, are permitted provided that the following conditions
80  * are met:
81  * 1. Redistributions of source code must retain the above copyright
82  *    notice, this list of conditions and the following disclaimer.
83  * 2. Redistributions in binary form must reproduce the above copyright
84  *    notice, this list of conditions and the following disclaimer in the
85  *    documentation and/or other materials provided with the distribution.
86  * 3. All advertising materials mentioning features or use of this software
87  *    must display the following acknowledgement:
88  *      This product includes software developed for the NetBSD Project by
89  *      Wasabi Systems, Inc.
90  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
91  *    or promote products derived from this software without specific prior
92  *    written permission.
93  *
94  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
95  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
96  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
97  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
98  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
99  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
100  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
101  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
102  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
103  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
104  * POSSIBILITY OF SUCH DAMAGE.
105  */
106 
107 /*
108  * Copyright (c) 1997 Charles D. Cranor and Washington University.
109  * All rights reserved.
110  *
111  * Redistribution and use in source and binary forms, with or without
112  * modification, are permitted provided that the following conditions
113  * are met:
114  * 1. Redistributions of source code must retain the above copyright
115  *    notice, this list of conditions and the following disclaimer.
116  * 2. Redistributions in binary form must reproduce the above copyright
117  *    notice, this list of conditions and the following disclaimer in the
118  *    documentation and/or other materials provided with the distribution.
119  *
120  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
121  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
122  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
123  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
124  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
125  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
129  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 #include <sys/cdefs.h>
133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.425 2023/07/26 21:45:28 riastradh Exp $");
134 
135 #include "opt_user_ldt.h"
136 #include "opt_lockdebug.h"
137 #include "opt_multiprocessor.h"
138 #include "opt_xen.h"
139 #include "opt_svs.h"
140 #include "opt_kaslr.h"
141 #include "opt_efi.h"
142 
143 #define	__MUTEX_PRIVATE	/* for assertions */
144 
145 #include <sys/param.h>
146 #include <sys/systm.h>
147 #include <sys/proc.h>
148 #include <sys/pool.h>
149 #include <sys/kernel.h>
150 #include <sys/atomic.h>
151 #include <sys/cpu.h>
152 #include <sys/intr.h>
153 #include <sys/xcall.h>
154 #include <sys/kcore.h>
155 #include <sys/kmem.h>
156 #include <sys/asan.h>
157 #include <sys/msan.h>
158 #include <sys/entropy.h>
159 
160 #include <uvm/uvm.h>
161 #include <uvm/pmap/pmap_pvt.h>
162 
163 #include <dev/isa/isareg.h>
164 
165 #include <machine/specialreg.h>
166 #include <machine/gdt.h>
167 #include <machine/isa_machdep.h>
168 #include <machine/cpuvar.h>
169 #include <machine/cputypes.h>
170 #include <machine/pmap_private.h>
171 
172 #include <x86/bootspace.h>
173 #include <x86/pat.h>
174 #include <x86/pmap_pv.h>
175 
176 #include <x86/i82489reg.h>
177 #include <x86/i82489var.h>
178 
179 #ifdef XEN
180 #include <xen/include/public/xen.h>
181 #include <xen/hypervisor.h>
182 #include <xen/xenpmap.h>
183 #endif
184 
185 #ifdef __HAVE_DIRECT_MAP
186 #include <crypto/nist_hash_drbg/nist_hash_drbg.h>
187 #endif
188 
189 /*
190  * general info:
191  *
192  *  - for an explanation of how the x86 MMU hardware works see
193  *    the comments in <machine/pte.h>.
194  *
195  *  - for an explanation of the general memory structure used by
196  *    this pmap (including the recursive mapping), see the comments
197  *    in <machine/pmap.h>.
198  *
199  * this file contains the code for the "pmap module."   the module's
200  * job is to manage the hardware's virtual to physical address mappings.
201  * note that there are two levels of mapping in the VM system:
202  *
203  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
204  *      to map ranges of virtual address space to objects/files.  for
205  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
206  *      to the file /bin/ls starting at offset zero."   note that
207  *      the upper layer mapping is not concerned with how individual
208  *      vm_pages are mapped.
209  *
210  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
211  *      from virtual addresses.   it is concerned with which vm_page is
212  *      mapped where.   for example, when you run /bin/ls and start
213  *      at page 0x1000 the fault routine may lookup the correct page
214  *      of the /bin/ls file and then ask the pmap layer to establish
215  *      a mapping for it.
216  *
217  * note that information in the lower layer of the VM system can be
218  * thrown away since it can easily be reconstructed from the info
219  * in the upper layer.
220  *
221  * data structures we use include:
222  *
223  *  - struct pmap: describes the address space of one thread
224  *  - struct pmap_page: describes one pv-tracked page, without
225  *    necessarily a corresponding vm_page
226  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
227  *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
228  *    physical memory.   the pp_pvlist points to a list of pv_entry
229  *    structures which describe all the <PMAP,VA> pairs that this
230  *    page is mapped in.    this is critical for page based operations
231  *    such as pmap_page_protect() [change protection on _all_ mappings
232  *    of a page]
233  */
234 
235 /*
236  * Locking
237  *
238  * We have the following locks that we must deal with, listed in the order
239  * that they are acquired:
240  *
241  * pg->uobject->vmobjlock, pg->uanon->an_lock
242  *
243  *	For managed pages, these per-object locks are taken by the VM system
244  *	before calling into the pmap module - either a read or write hold.
245  *	The lock hold prevents pages from changing identity while the pmap is
246  *	operating on them.  For example, the same lock is held across a call
247  *	to pmap_remove() and the following call to pmap_update(), so that a
248  *	page does not gain a new identity while its TLB visibility is stale.
249  *
250  * pmap->pm_lock
251  *
252  *	This lock protects the fields in the pmap structure including the
253  *	non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
254  *	structures.  For modifying unmanaged kernel PTEs it is not needed as
255  *	kernel PDEs are never freed, and the kernel is expected to be self
256  *	consistent (and the lock can't be taken for unmanaged kernel PTEs,
257  *	because they can be modified from interrupt context).
258  *
259  * pmaps_lock
260  *
261  *	This lock protects the list of active pmaps (headed by "pmaps").
262  *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
263  *
264  * pp_lock
265  *
266  *	This per-page lock protects PV entry lists and the embedded PV entry
267  *	in each vm_page, allowing for concurrent operation on pages by
268  *	different pmaps.  This is a spin mutex at IPL_VM, because at the
269  *	points it is taken context switching is usually not tolerable, and
270  *	spin mutexes must block out interrupts that could take kernel_lock.
271  */
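
/*
 * A minimal sketch of the lock order documented above, assuming the caller
 * already holds the object lock (vmobjlock or an_lock).  "example_lock_order"
 * is a hypothetical helper shown for illustration only, kept under #if 0.
 */
#if 0
static void
example_lock_order(struct pmap *pmap, struct pmap_page *pp)
{
	/* pg->uobject->vmobjlock or pg->uanon->an_lock held by the caller */
	mutex_enter(&pmap->pm_lock);		/* adaptive, IPL_NONE */
	mutex_spin_enter(&pp->pp_lock);		/* spin mutex at IPL_VM */
	/* ... operate on the PV list and PTEs ... */
	mutex_spin_exit(&pp->pp_lock);
	mutex_exit(&pmap->pm_lock);
}
#endif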
272 
273 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
274 #ifdef DIAGNOSTIC
275 #define	PMAP_DUMMY_LOCK(pm)	rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
276 #define	PMAP_DUMMY_UNLOCK(pm)	rw_exit(&(pm)->pm_dummy_lock)
277 #else
278 #define	PMAP_DUMMY_LOCK(pm)
279 #define	PMAP_DUMMY_UNLOCK(pm)
280 #endif
281 
282 static const struct uvm_pagerops pmap_pager = {
283 	/* nothing */
284 };
285 
286 /*
287  * pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X)
288  */
289 #define pl_i(va, lvl) \
290         (((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1])
291 
292 #define	pl_i_roundup(va, lvl)	pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl))
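
/*
 * A worked example of the two macros above, assuming the usual amd64
 * constants (ptp_frames[1] == L2_FRAME, ptp_shifts[1] == L2_SHIFT == 21,
 * so NBPD_L2 == 2MB).  Statement-level sketch, illustration only.
 */
#if 0
	/* VA 0x400000 lives in L2 slot 2 (the third 2MB slot). */
	KASSERT(pl_i(0x400000, 2) == 2);
	/* pl_i_roundup() rounds the VA up to the next level boundary first. */
	KASSERT(pl_i_roundup(0x400001, 2) == 3);
#endif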
293 
294 /*
295  * PTP macros:
296  *   a PTP's index is the PD index of the PDE that points to it
297  *   a PTP's offset is the byte-offset in the PTE space that this PTP is at
298  *   a PTP's VA is the first VA mapped by that PTP
299  */
300 
301 #define ptp_va2o(va, lvl)	(pl_i(va, (lvl)+1) * PAGE_SIZE)
302 
303 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
304 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
305 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
306 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
307 const long nbpd[] = NBPD_INITIALIZER;
308 #ifdef i386
309 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
310 #else
311 pd_entry_t *normal_pdes[3];
312 #endif
313 
314 long nkptp[] = NKPTP_INITIALIZER;
315 
316 struct pmap_head pmaps;
317 kmutex_t pmaps_lock __cacheline_aligned;
318 
319 struct pcpu_area *pcpuarea __read_mostly;
320 
321 static vaddr_t pmap_maxkvaddr;
322 
323 /*
324  * Misc. event counters.
325  */
326 struct evcnt pmap_iobmp_evcnt;
327 struct evcnt pmap_ldt_evcnt;
328 
329 /*
330  * PAT
331  */
332 static bool cpu_pat_enabled __read_mostly = false;
333 
334 /*
335  * Global data structures
336  */
337 
338 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
339 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
340 static rb_tree_t pmap_kernel_rb __cacheline_aligned;
341 
342 struct bootspace bootspace __read_mostly;
343 struct slotspace slotspace __read_mostly;
344 
345 /* Set to PTE_NX if supported. */
346 pd_entry_t pmap_pg_nx __read_mostly = 0;
347 
348 /* Set to PTE_G if supported. */
349 pd_entry_t pmap_pg_g __read_mostly = 0;
350 
351 /* Set to true if large pages are supported. */
352 int pmap_largepages __read_mostly = 0;
353 
354 paddr_t lowmem_rsvd __read_mostly;
355 paddr_t avail_start __read_mostly; /* PA of first available physical page */
356 paddr_t avail_end __read_mostly; /* PA of last available physical page */
357 
358 #ifdef XENPV
359 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
360 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
361 #endif
362 
363 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
364 #define	PMAP_CHECK_PP(pp) \
365     KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
366 
367 #define PAGE_ALIGNED(pp)	\
368 	__builtin_assume_aligned((void *)(pp), PAGE_SIZE)
369 
370 /*
371  * Other data structures
372  */
373 
374 static pt_entry_t protection_codes[8] __read_mostly;
375 
376 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
377 
378 /*
379  * The following two vaddr_t's are used during system startup to keep track of
380  * how much of the kernel's VM space we have used. Once the system is started,
381  * the management of the remaining kernel VM space is turned over to the
382  * kernel_map vm_map.
383  */
384 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
385 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
386 
387 #ifndef XENPV
388 /*
389  * LAPIC virtual address, and fake physical address.
390  */
391 volatile vaddr_t local_apic_va __read_mostly;
392 paddr_t local_apic_pa __read_mostly;
393 #endif
394 
395 /*
396  * pool that pmap structures are allocated from
397  */
398 struct pool_cache pmap_cache;
399 static int  pmap_ctor(void *, void *, int);
400 static void pmap_dtor(void *, void *);
401 
402 /*
403  * pv_page cache
404  */
405 static struct pool_cache pmap_pvp_cache;
406 
407 #ifdef __HAVE_DIRECT_MAP
408 vaddr_t pmap_direct_base __read_mostly;
409 vaddr_t pmap_direct_end __read_mostly;
410 #endif
411 
412 #ifndef __HAVE_DIRECT_MAP
413 /*
414  * Special VAs and the PTEs that map them
415  */
416 static pt_entry_t *early_zero_pte;
417 static void pmap_vpage_cpualloc(struct cpu_info *);
418 #ifdef XENPV
419 char *early_zerop; /* also referenced from xen_locore() */
420 #else
421 static char *early_zerop;
422 #endif
423 #endif
424 
425 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
426 
427 /* PDP pool and its callbacks */
428 static struct pool pmap_pdp_pool;
429 static void pmap_pdp_init(pd_entry_t *);
430 static void pmap_pdp_fini(pd_entry_t *);
431 
432 #ifdef PAE
433 /* need to allocate items of 4 pages */
434 static void *pmap_pdp_alloc(struct pool *, int);
435 static void pmap_pdp_free(struct pool *, void *);
436 static struct pool_allocator pmap_pdp_allocator = {
437 	.pa_alloc = pmap_pdp_alloc,
438 	.pa_free = pmap_pdp_free,
439 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
440 };
441 #endif
442 
443 extern vaddr_t idt_vaddr;
444 extern paddr_t idt_paddr;
445 extern vaddr_t gdt_vaddr;
446 extern paddr_t gdt_paddr;
447 extern vaddr_t ldt_vaddr;
448 extern paddr_t ldt_paddr;
449 
450 #ifdef i386
451 /* stuff to fix the pentium f00f bug */
452 extern vaddr_t pentium_idt_vaddr;
453 #endif
454 
455 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */
456 struct pmap_ptparray {
457 	struct vm_page *pg[PTP_LEVELS + 1];
458 	bool alloced[PTP_LEVELS + 1];
459 };
460 
461 /*
462  * PV entries are allocated in page-sized chunks and cached per-pmap to
463  * avoid intense pressure on memory allocators.
464  */
465 
466 struct pv_page {
467 	LIST_HEAD(, pv_entry)	pvp_pves;
468 	LIST_ENTRY(pv_page)	pvp_list;
469 	long			pvp_nfree;
470 	struct pmap		*pvp_pmap;
471 };
472 
473 #define	PVE_PER_PVP	((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
474 
475 /*
476  * PV tree prototypes
477  */
478 
479 static int	pmap_compare_key(void *, const void *, const void *);
480 static int	pmap_compare_nodes(void *, const void *, const void *);
481 
482 /* Red-black tree */
483 static const rb_tree_ops_t pmap_rbtree_ops = {
484 	.rbto_compare_nodes = pmap_compare_nodes,
485 	.rbto_compare_key = pmap_compare_key,
486 	.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
487 	.rbto_context = NULL
488 };
489 
490 /*
491  * Local prototypes
492  */
493 
494 #ifdef __HAVE_PCPU_AREA
495 static void pmap_init_pcpu(void);
496 #endif
497 #ifdef __HAVE_DIRECT_MAP
498 static void pmap_init_directmap(struct pmap *);
499 #endif
500 #if !defined(XENPV)
501 static void pmap_remap_global(void);
502 #endif
503 #ifndef XENPV
504 static void pmap_init_lapic(void);
505 static void pmap_remap_largepages(void);
506 #endif
507 
508 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
509     struct vm_page **);
510 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
511 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
512     pd_entry_t * const *);
513 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
514 static void pmap_freepage(struct pmap *, struct vm_page *, int);
515 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
516     pt_entry_t *, pd_entry_t * const *);
517 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
518     vaddr_t);
519 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
520     vaddr_t);
521 static int pmap_pvp_ctor(void *, void *, int);
522 static void pmap_pvp_dtor(void *, void *);
523 static struct pv_entry *pmap_alloc_pv(struct pmap *);
524 static void pmap_free_pv(struct pmap *, struct pv_entry *);
525 static void pmap_drain_pv(struct pmap *);
526 
527 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
528 
529 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
530 static void pmap_reactivate(struct pmap *);
531 
532 long
533 pmap_resident_count(struct pmap *pmap)
534 {
535 
536 	return pmap->pm_stats.resident_count;
537 }
538 
539 long
540 pmap_wired_count(struct pmap *pmap)
541 {
542 
543 	return pmap->pm_stats.wired_count;
544 }
545 
546 /*
547  * p m a p   h e l p e r   f u n c t i o n s
548  */
549 
550 static inline void
551 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
552 {
553 
554 	KASSERT(cold || mutex_owned(&pmap->pm_lock));
555 	pmap->pm_stats.resident_count += resid_diff;
556 	pmap->pm_stats.wired_count += wired_diff;
557 }
558 
559 static inline void
560 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
561 {
562 	int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
563 	int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
564 
565 	KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
566 	KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
567 
568 	pmap_stats_update(pmap, resid_diff, wired_diff);
569 }
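
/*
 * A small usage sketch for the helper above, assuming "opte" is a present,
 * unwired PTE: re-entering it with PTE_WIRED set leaves resident_count
 * unchanged and bumps wired_count by one.  Illustration only.
 */
#if 0
	pmap_stats_update_bypte(pmap, opte | PTE_WIRED, opte);
#endif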
570 
571 /*
572  * ptp_to_pmap: lookup pmap by ptp
573  */
574 static inline struct pmap *
575 ptp_to_pmap(struct vm_page *ptp)
576 {
577 	struct pmap *pmap;
578 
579 	if (ptp == NULL) {
580 		return pmap_kernel();
581 	}
582 	pmap = (struct pmap *)ptp->uobject;
583 	KASSERT(pmap != NULL);
584 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
585 	return pmap;
586 }
587 
588 static inline struct pv_pte *
589 pve_to_pvpte(struct pv_entry *pve)
590 {
591 
592 	if (pve == NULL)
593 		return NULL;
594 	KASSERT((void *)&pve->pve_pte == (void *)pve);
595 	return &pve->pve_pte;
596 }
597 
598 static inline struct pv_entry *
599 pvpte_to_pve(struct pv_pte *pvpte)
600 {
601 	struct pv_entry *pve = (void *)pvpte;
602 
603 	KASSERT(pve_to_pvpte(pve) == pvpte);
604 	return pve;
605 }
606 
607 /*
608  * Return true if the pmap page has an embedded PV entry.
609  */
610 static inline bool
611 pv_pte_embedded(struct pmap_page *pp)
612 {
613 
614 	KASSERT(mutex_owned(&pp->pp_lock));
615 	return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
616 }
617 
618 /*
619  * pv_pte_first, pv_pte_next: PV list iterator.
620  */
621 static inline struct pv_pte *
622 pv_pte_first(struct pmap_page *pp)
623 {
624 
625 	KASSERT(mutex_owned(&pp->pp_lock));
626 	if (pv_pte_embedded(pp)) {
627 		return &pp->pp_pte;
628 	}
629 	return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
630 }
631 
632 static inline struct pv_pte *
633 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
634 {
635 
636 	KASSERT(mutex_owned(&pp->pp_lock));
637 	KASSERT(pvpte != NULL);
638 	if (pvpte == &pp->pp_pte) {
639 		return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
640 	}
641 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
642 }
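
/*
 * A minimal sketch of how the iterator above is meant to be used: walk
 * every <pmap,va> mapping recorded for a pv-tracked page with pp_lock held.
 * "example_walk_pvlist" is a hypothetical helper, illustration only.
 */
#if 0
static void
example_walk_pvlist(struct pmap_page *pp)
{
	struct pv_pte *pvpte;

	mutex_spin_enter(&pp->pp_lock);
	for (pvpte = pv_pte_first(pp); pvpte != NULL;
	     pvpte = pv_pte_next(pp, pvpte)) {
		/*
		 * pvpte->pte_ptp (NULL for the kernel pmap) and
		 * pvpte->pte_va identify one mapping of this page.
		 */
	}
	mutex_spin_exit(&pp->pp_lock);
}
#endif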
643 
644 static inline uint8_t
645 pmap_pte_to_pp_attrs(pt_entry_t pte)
646 {
647 	uint8_t ret = 0;
648 	if (pte & PTE_D)
649 		ret |= PP_ATTRS_D;
650 	if (pte & PTE_A)
651 		ret |= PP_ATTRS_A;
652 	if (pte & PTE_W)
653 		ret |= PP_ATTRS_W;
654 	return ret;
655 }
656 
657 static inline pt_entry_t
658 pmap_pp_attrs_to_pte(uint8_t attrs)
659 {
660 	pt_entry_t pte = 0;
661 	if (attrs & PP_ATTRS_D)
662 		pte |= PTE_D;
663 	if (attrs & PP_ATTRS_A)
664 		pte |= PTE_A;
665 	if (attrs & PP_ATTRS_W)
666 		pte |= PTE_W;
667 	return pte;
668 }
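
/*
 * The two helpers above are inverses over the D/A/W bits; a statement-level
 * sketch of the round-trip property, kept under #if 0 for illustration only.
 */
#if 0
	KASSERT(pmap_pp_attrs_to_pte(pmap_pte_to_pp_attrs(pte)) ==
	    (pte & (PTE_D | PTE_A | PTE_W)));
#endif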
669 
670 /*
671  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
672  * of course the kernel is always loaded
673  */
674 bool
675 pmap_is_curpmap(struct pmap *pmap)
676 {
677 	return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
678 }
679 
680 inline void
681 pmap_reference(struct pmap *pmap)
682 {
683 
684 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
685 }
686 
687 /*
688  * rbtree: compare two nodes.
689  */
690 static int
691 pmap_compare_nodes(void *context, const void *n1, const void *n2)
692 {
693 	const struct pv_entry *pve1 = n1;
694 	const struct pv_entry *pve2 = n2;
695 
696 	KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
697 
698 	if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
699 		return -1;
700 	}
701 	if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
702 		return 1;
703 	}
704 	return 0;
705 }
706 
707 /*
708  * rbtree: compare a node and a key.
709  */
710 static int
711 pmap_compare_key(void *context, const void *n, const void *k)
712 {
713 	const struct pv_entry *pve = n;
714 	const vaddr_t key = (vaddr_t)k;
715 
716 	if (pve->pve_pte.pte_va < key) {
717 		return -1;
718 	}
719 	if (pve->pve_pte.pte_va > key) {
720 		return 1;
721 	}
722 	return 0;
723 }
724 
725 /*
726  * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
727  */
728 static inline void
729 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
730 {
731 	vaddr_t *min = (vaddr_t *)&ptp->uanon;
732 
733 	if (va < *min) {
734 		*min = va;
735 	}
736 }
737 
738 /*
739  * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
740  */
741 static inline void
742 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
743 {
744 	vaddr_t sclip;
745 
746 	if (ptp == NULL) {
747 		return;
748 	}
749 
750 	sclip = (vaddr_t)ptp->uanon;
751 	sclip = (*startva < sclip ? sclip : *startva);
752 	*pte += (sclip - *startva) / PAGE_SIZE;
753 	*startva = sclip;
754 }
755 
756 /*
757  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
758  *
759  * there are several pmaps involved.  some or all of them might be the same.
760  *
761  *	- the pmap given by the first argument
762  *		our caller wants to access this pmap's PTEs.
763  *
764  *	- pmap_kernel()
765  *		the kernel pmap.  note that it only contains the kernel part
766  *		of the address space which is shared by any pmap.  ie. any
767  *		pmap can be used instead of pmap_kernel() for our purpose.
768  *
769  *	- ci->ci_pmap
770  *		pmap currently loaded on the cpu.
771  *
772  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
773  *		current process' pmap.
774  *
775  * => caller must lock pmap first (if not the kernel pmap)
776  * => must be undone with pmap_unmap_ptes before returning
777  * => disables kernel preemption
778  */
779 void
780 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
781     pd_entry_t * const **pdeppp)
782 {
783 	struct pmap *curpmap;
784 	struct cpu_info *ci;
785 	lwp_t *l;
786 
787 	kpreempt_disable();
788 
789 	/* The kernel's pmap is always accessible. */
790 	if (pmap == pmap_kernel()) {
791 		*pmap2 = NULL;
792 		*ptepp = PTE_BASE;
793 		*pdeppp = normal_pdes;
794 		return;
795 	}
796 
797 	KASSERT(mutex_owned(&pmap->pm_lock));
798 
799 	l = curlwp;
800 	ci = l->l_cpu;
801 	curpmap = ci->ci_pmap;
802 	if (pmap == curpmap) {
803 		/*
804 		 * Already on the CPU: make it valid.  This is very
805 		 * often the case during exit(), when we have switched
806 		 * to the kernel pmap in order to destroy a user pmap.
807 		 */
808 		if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
809 			pmap_reactivate(pmap);
810 		}
811 		*pmap2 = NULL;
812 	} else {
813 		/*
814 		 * Toss current pmap from CPU and install new pmap, but keep
815 		 * a reference to the old one.  Dropping the reference can
816 		 * block as it needs to take locks, so defer that to
817 		 * pmap_unmap_ptes().
818 		 */
819 		pmap_reference(pmap);
820 		pmap_load1(l, pmap, curpmap);
821 		*pmap2 = curpmap;
822 	}
823 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
824 #ifdef DIAGNOSTIC
825 	pmap->pm_ncsw = lwp_pctr();
826 #endif
827 	*ptepp = PTE_BASE;
828 
829 #if defined(XENPV) && defined(__x86_64__)
830 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
831 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
832 	*pdeppp = ci->ci_normal_pdes;
833 #else
834 	*pdeppp = normal_pdes;
835 #endif
836 }
837 
838 /*
839  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
840  *
841  * => we cannot tolerate context switches while mapped in: assert this.
842  * => reenables kernel preemption.
843  * => does not unlock pmap.
844  */
845 void
846 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
847 {
848 	struct cpu_info *ci;
849 	struct pmap *mypmap;
850 	struct lwp *l;
851 
852 	KASSERT(kpreempt_disabled());
853 
854 	/* The kernel's pmap is always accessible. */
855 	if (pmap == pmap_kernel()) {
856 		kpreempt_enable();
857 		return;
858 	}
859 
860 	l = curlwp;
861 	ci = l->l_cpu;
862 
863 	KASSERT(mutex_owned(&pmap->pm_lock));
864 	KASSERT(pmap->pm_ncsw == lwp_pctr());
865 
866 #if defined(XENPV) && defined(__x86_64__)
867 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
868 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
869 #endif
870 
871 	/* If not our own pmap, mark whatever's on the CPU now as lazy. */
872 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
873 	mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
874 	if (ci->ci_pmap == mypmap) {
875 		ci->ci_want_pmapload = 0;
876 	} else {
877 		ci->ci_want_pmapload = (mypmap != pmap_kernel());
878 		ci->ci_tlbstate = TLBSTATE_LAZY;
879 	}
880 
881 	/* Now safe to re-enable preemption. */
882 	kpreempt_enable();
883 
884 	/* Toss reference to other pmap taken earlier. */
885 	if (pmap2 != NULL) {
886 		pmap_destroy(pmap2);
887 	}
888 }
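
/*
 * A minimal usage sketch for the pair above, following the rules stated in
 * their comments: lock the pmap, map its PTEs, do the work, then unmap and
 * unlock.  "example_map_unmap" is a hypothetical helper, illustration only.
 */
#if 0
static void
example_map_unmap(struct pmap *pmap, vaddr_t va)
{
	pt_entry_t *ptes;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;

	mutex_enter(&pmap->pm_lock);
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* disables preemption */
	/* ... inspect or modify ptes[pl1_i(va)] ... */
	pmap_unmap_ptes(pmap, pmap2);			/* re-enables preemption */
	mutex_exit(&pmap->pm_lock);
}
#endif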
889 
890 inline static void
891 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
892 {
893 
894 #if !defined(__x86_64__)
895 	if (curproc == NULL || curproc->p_vmspace == NULL ||
896 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
897 		return;
898 
899 	if ((opte ^ npte) & PTE_X)
900 		pmap_update_pg(va);
901 
902 	/*
903 	 * Executability was removed on the last executable change.
904 	 * Reset the code segment to something conservative and
905 	 * let the trap handler deal with setting the right limit;
906 	 * we can't compute the limit here due to locking constraints on the vm map.
907 	 */
908 
909 	if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
910 		struct trapframe *tf = curlwp->l_md.md_regs;
911 
912 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
913 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
914 	}
915 #endif /* !defined(__x86_64__) */
916 }
917 
918 #if !defined(__x86_64__)
919 /*
920  * Fixup the code segment to cover all potential executable mappings.
921  * returns 0 if no changes to the code segment were made.
922  */
923 int
924 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
925 {
926 	struct vm_map_entry *ent;
927 	struct pmap *pm = vm_map_pmap(map);
928 	vaddr_t va = 0;
929 
930 	vm_map_lock_read(map);
931 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
932 		/*
933 		 * This entry has greater va than the entries before.
934 		 * We need to make it point to the last page, not past it.
935 		 */
936 		if (ent->protection & VM_PROT_EXECUTE)
937 			va = trunc_page(ent->end) - PAGE_SIZE;
938 	}
939 	vm_map_unlock_read(map);
940 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
941 		return 0;
942 
943 	pm->pm_hiexec = va;
944 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
945 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
946 	} else {
947 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
948 		return 0;
949 	}
950 	return 1;
951 }
952 #endif /* !defined(__x86_64__) */
953 
954 void
955 pat_init(struct cpu_info *ci)
956 {
957 #ifndef XENPV
958 	uint64_t pat;
959 
960 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
961 		return;
962 
963 	/* We change WT to WC. Leave all other entries the default values. */
964 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
965 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
966 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
967 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
968 
969 	wrmsr(MSR_CR_PAT, pat);
970 	cpu_pat_enabled = true;
971 #endif
972 }
973 
974 static pt_entry_t
975 pmap_pat_flags(u_int flags)
976 {
977 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
978 
979 	if (!cpu_pat_enabled) {
980 		switch (cacheflags) {
981 		case PMAP_NOCACHE:
982 		case PMAP_NOCACHE_OVR:
983 			/* results in PGC_UCMINUS on cpus which have
984 			 * the cpuid PAT but PAT "disabled"
985 			 */
986 			return PTE_PCD;
987 		default:
988 			return 0;
989 		}
990 	}
991 
992 	switch (cacheflags) {
993 	case PMAP_NOCACHE:
994 		return PGC_UC;
995 	case PMAP_WRITE_COMBINE:
996 		return PGC_WC;
997 	case PMAP_WRITE_BACK:
998 		return PGC_WB;
999 	case PMAP_NOCACHE_OVR:
1000 		return PGC_UCMINUS;
1001 	}
1002 
1003 	return 0;
1004 }
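
/*
 * A small sketch of the PAT translation above: a write-combining request
 * yields PGC_WC once the PAT has been programmed by pat_init(), and simply
 * falls back to the default cacheable behaviour otherwise.  Statement-level
 * sketch, illustration only.
 */
#if 0
	KASSERT(pmap_pat_flags(PMAP_WRITE_COMBINE) ==
	    (cpu_pat_enabled ? PGC_WC : 0));
#endif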
1005 
1006 /*
1007  * p m a p   k e n t e r   f u n c t i o n s
1008  *
1009  * functions to quickly enter/remove pages from the kernel address
1010  * space.   pmap_kremove is exported to MI kernel.  we make use of
1011  * the recursive PTE mappings.
1012  */
1013 
1014 /*
1015  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1016  *
1017  * => no need to lock anything, assume va is already allocated
1018  * => should be faster than normal pmap enter function
1019  */
1020 void
1021 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1022 {
1023 	pt_entry_t *pte, opte, npte;
1024 
1025 	KASSERT(!(prot & ~VM_PROT_ALL));
1026 
1027 	if (va < VM_MIN_KERNEL_ADDRESS)
1028 		pte = vtopte(va);
1029 	else
1030 		pte = kvtopte(va);
1031 #if defined(XENPV) && defined(DOM0OPS)
1032 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1033 #ifdef DEBUG
1034 		printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
1035 		    " outside range\n", __func__, pa, va);
1036 #endif /* DEBUG */
1037 		npte = pa;
1038 	} else
1039 #endif /* XENPV && DOM0OPS */
1040 		npte = pmap_pa2pte(pa);
1041 	npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
1042 	npte |= pmap_pat_flags(flags);
1043 	opte = pmap_pte_testset(pte, npte); /* zap! */
1044 
1045 	/*
1046 	 * XXX: make sure we are not dealing with a large page, since the only
1047 	 * large pages created are for the kernel image, and they should never
1048 	 * be kentered.
1049 	 */
1050 	KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
1051 
1052 	if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
1053 		/* This should not happen. */
1054 		printf_nolog("%s: mapping already present\n", __func__);
1055 		kpreempt_disable();
1056 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1057 		kpreempt_enable();
1058 	}
1059 }
1060 
1061 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1062 
1063 #if defined(__x86_64__)
1064 /*
1065  * Change protection for a virtual address. Local for a CPU only, don't
1066  * care about TLB shootdowns.
1067  *
1068  * => must be called with preemption disabled
1069  */
1070 void
1071 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1072 {
1073 	pt_entry_t *pte, opte, npte;
1074 
1075 	KASSERT(kpreempt_disabled());
1076 
1077 	if (va < VM_MIN_KERNEL_ADDRESS)
1078 		pte = vtopte(va);
1079 	else
1080 		pte = kvtopte(va);
1081 
1082 	npte = opte = *pte;
1083 
1084 	if ((prot & VM_PROT_WRITE) != 0)
1085 		npte |= PTE_W;
1086 	else
1087 		npte &= ~(PTE_W|PTE_D);
1088 
1089 	if (opte != npte) {
1090 		pmap_pte_set(pte, npte);
1091 		pmap_pte_flush();
1092 		invlpg(va);
1093 	}
1094 }
1095 #endif /* defined(__x86_64__) */
1096 
1097 /*
1098  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1099  *
1100  * => no need to lock anything
1101  * => caller must dispose of any vm_page mapped in the va range
1102  * => note: not an inline function
1103  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1104  * => we assume kernel only unmaps valid addresses and thus don't bother
1105  *    checking the valid bit before doing TLB flushing
1106  * => must be followed by call to pmap_update() before reuse of page
1107  */
1108 static void
1109 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1110 {
1111 	pt_entry_t *pte, opte;
1112 	vaddr_t va, eva;
1113 
1114 	eva = sva + len;
1115 
1116 	kpreempt_disable();
1117 	for (va = sva; va < eva; va += PAGE_SIZE) {
1118 		pte = kvtopte(va);
1119 		opte = pmap_pte_testset(pte, 0); /* zap! */
1120 		if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
1121 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1122 			    TLBSHOOT_KREMOVE);
1123 		}
1124 		KASSERTMSG((opte & PTE_PS) == 0,
1125 		    "va %#" PRIxVADDR " is a large page", va);
1126 		KASSERTMSG((opte & PTE_PVLIST) == 0,
1127 		    "va %#" PRIxVADDR " is a pv tracked page", va);
1128 	}
1129 	if (localonly) {
1130 		tlbflushg();
1131 	}
1132 	kpreempt_enable();
1133 }
1134 
1135 void
1136 pmap_kremove(vaddr_t sva, vsize_t len)
1137 {
1138 
1139 	pmap_kremove1(sva, len, false);
1140 }
1141 
1142 /*
1143  * pmap_kremove_local: like pmap_kremove(), but only worry about
1144  * TLB invalidations on the current CPU.  this is only intended
1145  * for use while writing kernel crash dumps, either after panic
1146  * or via reboot -d.
1147  */
1148 void
1149 pmap_kremove_local(vaddr_t sva, vsize_t len)
1150 {
1151 
1152 	pmap_kremove1(sva, len, true);
1153 }
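
/*
 * A minimal sketch of the usual kenter/kremove pairing for an unmanaged
 * kernel mapping; per the comment above pmap_kremove1(), pmap_update() must
 * run before the VA is reused.  "example_kenter_kremove" is a hypothetical
 * helper, illustration only.
 */
#if 0
static void
example_kenter_kremove(vaddr_t va, paddr_t pa)
{
	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
	/* ... use the mapping ... */
	pmap_kremove(va, PAGE_SIZE);
	pmap_update(pmap_kernel());
}
#endif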
1154 
1155 /*
1156  * p m a p   i n i t   f u n c t i o n s
1157  *
1158  * pmap_bootstrap and pmap_init are called during system startup
1159  * to init the pmap module.   pmap_bootstrap() does a low level
1160  * init just to get things rolling.   pmap_init() finishes the job.
1161  */
1162 
1163 /*
1164  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1165  * This function is to be used before any VM system has been set up.
1166  *
1167  * The va is taken from virtual_avail.
1168  */
1169 static vaddr_t
1170 pmap_bootstrap_valloc(size_t npages)
1171 {
1172 	vaddr_t va = virtual_avail;
1173 	virtual_avail += npages * PAGE_SIZE;
1174 	return va;
1175 }
1176 
1177 /*
1178  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1179  * This function is to be used before any VM system has been set up.
1180  *
1181  * The pa is taken from avail_start.
1182  */
1183 static paddr_t
1184 pmap_bootstrap_palloc(size_t npages)
1185 {
1186 	paddr_t pa = avail_start;
1187 	avail_start += npages * PAGE_SIZE;
1188 	return pa;
1189 }
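
/*
 * How pmap_bootstrap() below uses the two bump allocators: KVA and physical
 * pages are carved out separately, and the actual mapping is established
 * later (e.g. via pmap_kenter_pa()).  Statement-level sketch, illustration
 * only.
 */
#if 0
	vaddr_t va = pmap_bootstrap_valloc(1);	/* advances virtual_avail */
	paddr_t pa = pmap_bootstrap_palloc(1);	/* advances avail_start */
#endif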
1190 
1191 /*
1192  * pmap_bootstrap: get the system in a state where it can run with VM properly
1193  * enabled (called before main()). The VM system is fully init'd later.
1194  *
1195  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1196  *    kernel, and nkpde PTP's for the kernel.
1197  * => kva_start is the first free virtual address in kernel space.
1198  */
1199 void
1200 pmap_bootstrap(vaddr_t kva_start)
1201 {
1202 	struct pmap *kpm;
1203 	int i;
1204 	vaddr_t kva;
1205 
1206 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
1207 
1208 	/*
1209 	 * Set up our local static global vars that keep track of the usage of
1210 	 * KVM before kernel_map is set up.
1211 	 */
1212 	virtual_avail = kva_start;		/* first free KVA */
1213 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1214 
1215 	/*
1216 	 * Set up protection_codes: we need to be able to convert from a MI
1217 	 * protection code (some combo of VM_PROT...) to something we can jam
1218 	 * into a x86 PTE.
1219 	 */
1220 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1221 	protection_codes[VM_PROT_EXECUTE] = PTE_X;
1222 	protection_codes[VM_PROT_READ] = pmap_pg_nx;
1223 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
1224 	protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
1225 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
1226 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
1227 	protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
1228 
1229 	/*
1230 	 * Now we init the kernel's pmap.
1231 	 *
1232 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1233 	 * the pm_obj contains the list of active PTPs.
1234 	 */
1235 	kpm = pmap_kernel();
1236 	mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1237 	rw_init(&kpm->pm_dummy_lock);
1238 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1239 		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
1240 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
1241 		kpm->pm_ptphint[i] = NULL;
1242 	}
1243 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1244 
1245 	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1246 	for (i = 0; i < PDP_SIZE; i++)
1247 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1248 
1249 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1250 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1251 
1252 	kcpuset_create(&kpm->pm_cpus, true);
1253 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1254 
1255 	kpm->pm_ldt = NULL;
1256 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1257 
1258 	/*
1259 	 * the above is just a rough estimate and not critical to the proper
1260 	 * operation of the system.
1261 	 */
1262 
1263 #if !defined(XENPV)
1264 	/*
1265 	 * Begin to enable global TLB entries if they are supported: add PTE_G
1266 	 * attribute to already mapped kernel pages. Do that only if SVS is
1267 	 * disabled.
1268 	 *
1269 	 * The G bit has no effect until the CR4_PGE bit is set in CR4, which
1270 	 * happens later in cpu_init().
1271 	 */
1272 #ifdef SVS
1273 	if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
1274 #else
1275 	if (cpu_feature[0] & CPUID_PGE) {
1276 #endif
1277 		pmap_pg_g = PTE_G;
1278 		pmap_remap_global();
1279 	}
1280 #endif
1281 
1282 #ifndef XENPV
1283 	/*
1284 	 * Enable large pages if they are supported.
1285 	 */
1286 	if (cpu_feature[0] & CPUID_PSE) {
1287 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1288 		pmap_largepages = 1;	/* enable software */
1289 
1290 		/*
1291 		 * The TLB must be flushed after enabling large pages on Pentium
1292 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1293 		 * Software Developer's Manual, Volume 3: System Programming".
1294 		 */
1295 		tlbflushg();
1296 
1297 		/* Remap the kernel. */
1298 		pmap_remap_largepages();
1299 	}
1300 	pmap_init_lapic();
1301 #endif /* !XENPV */
1302 
1303 #ifdef __HAVE_PCPU_AREA
1304 	pmap_init_pcpu();
1305 #endif
1306 
1307 #ifdef __HAVE_DIRECT_MAP
1308 	pmap_init_directmap(kpm);
1309 #else
1310 	pmap_vpage_cpualloc(&cpu_info_primary);
1311 
1312 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1313 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1314 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1315 	} else { /* amd64 */
1316 		/*
1317 		 * zero_pte is stuck at the end of mapped space for the kernel
1318 		 * image (disjunct from kva space). This is done so that it
1319 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1320 		 * when it's called for the first time.
1321 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1322 		 */
1323 #ifdef XENPV
1324 		/* early_zerop initialized in xen_locore() */
1325 #else
1326 		early_zerop = (void *)bootspace.spareva;
1327 #endif
1328 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1329 	}
1330 #endif
1331 
1332 #if defined(XENPV) && defined(__x86_64__)
1333 	extern vaddr_t xen_dummy_page;
1334 	paddr_t xen_dummy_user_pgd;
1335 
1336 	/*
1337 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1338 	 * Xen will still consider it active. So we set user PGD to this one
1339 	 * to lift all protection on the now inactive page tables set.
1340 	 */
1341 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1342 
1343 	/* Zero fill it; the fewer checks Xen has to do, the better */
1344 	memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1345 	/* Mark read-only */
1346 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1347 	    pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
1348 	    UVMF_INVLPG);
1349 	/* Pin as L4 */
1350 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1351 #endif
1352 
1353 	/*
1354 	 * Allocate space for the Interrupt Descriptor Table (IDT),
1355 	 * Global Descriptor Table (GDT), and Local Descriptor Table
1356 	 * (LDT).
1357 	 *
1358 	 * Currently there is an initial temporary GDT allocated on the
1359 	 * stack by the caller of init386/init_x86_64, which is (among
1360 	 * other things) needed on i386 for %fs-relative addressing for
1361 	 * CPU-local data (CPUVAR(...), curcpu(), curlwp).  This
1362 	 * initial temporary GDT will be popped off the stack before we
1363 	 * can enter main, so we need to make sure there is space for a
1364 	 * second temporary GDT to continue existing when we enter main
1365 	 * before we allocate space for the permanent GDT with
1366 	 * uvm_km(9) in gdt_init via cpu_startup and switch to that.
1367 	 */
1368 	idt_vaddr = pmap_bootstrap_valloc(1);
1369 	idt_paddr = pmap_bootstrap_palloc(1);
1370 
1371 	gdt_vaddr = pmap_bootstrap_valloc(1);
1372 	gdt_paddr = pmap_bootstrap_palloc(1);
1373 
1374 #ifdef __HAVE_PCPU_AREA
1375 	ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1376 #else
1377 	ldt_vaddr = pmap_bootstrap_valloc(1);
1378 #endif
1379 	ldt_paddr = pmap_bootstrap_palloc(1);
1380 
1381 #if !defined(__x86_64__)
1382 	/* pentium f00f bug stuff */
1383 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1384 #endif
1385 
1386 #if defined(XENPVHVM)
1387 	/* XXX: move to hypervisor.c with appropriate API adjustments */
1388 	extern paddr_t HYPERVISOR_shared_info_pa;
1389 	extern volatile struct xencons_interface *xencons_interface; /* XXX */
1390 	extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
1391 
1392 	if (vm_guest != VM_GUEST_XENPVH) {
1393 		HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
1394 		HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
1395 	}
1396 	xencons_interface = (void *) pmap_bootstrap_valloc(1);
1397 	xenstore_interface = (void *) pmap_bootstrap_valloc(1);
1398 #endif
1399 	/*
1400 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1401 	 */
1402 	virtual_avail = reserve_dumppages(virtual_avail);
1403 
1404 	/*
1405 	 * Init the global lock and global list.
1406 	 */
1407 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1408 	LIST_INIT(&pmaps);
1409 
1410 	/*
1411 	 * Ensure the TLB is sync'd with reality by flushing it...
1412 	 */
1413 	tlbflushg();
1414 
1415 	/*
1416 	 * Calculate pmap_maxkvaddr from nkptp[].
1417 	 */
1418 	kva = VM_MIN_KERNEL_ADDRESS;
1419 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1420 		kva += nkptp[i] * nbpd[i];
1421 	}
1422 	pmap_maxkvaddr = kva;
1423 }
1424 
1425 #ifndef XENPV
1426 static void
1427 pmap_init_lapic(void)
1428 {
1429 	/*
1430 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1431 	 * x86 implementation relies a lot on this address to be valid, so just
1432 	 * allocate a fake physical page that will be kentered into
1433 	 * local_apic_va by machdep.
1434 	 *
1435 	 * If the LAPIC is present, the va will be remapped somewhere else
1436 	 * later in lapic_map.
1437 	 */
1438 	local_apic_va = pmap_bootstrap_valloc(1);
1439 	local_apic_pa = pmap_bootstrap_palloc(1);
1440 }
1441 #endif
1442 
1443 #ifdef __x86_64__
1444 static size_t
1445 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1446 {
1447 	size_t npages;
1448 	npages = (roundup(endva, pgsz) / pgsz) -
1449 	    (rounddown(startva, pgsz) / pgsz);
1450 	return npages;
1451 }
1452 #endif
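
/*
 * A worked example for pmap_pagetree_nentries_range() above: a range that
 * straddles an NBPD_L4 boundary needs two L4 slots even if it is much
 * smaller than NBPD_L4.  Statement-level sketch, illustration only.
 */
#if 0
	KASSERT(pmap_pagetree_nentries_range(0, NBPD_L4, NBPD_L4) == 1);
	KASSERT(pmap_pagetree_nentries_range(NBPD_L4 - PAGE_SIZE,
	    NBPD_L4 + PAGE_SIZE, NBPD_L4) == 2);
#endif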
1453 
1454 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
1455 static inline void
1456 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
1457 {
1458 	size_t sslot = slotspace.area[type].sslot;
1459 	size_t nslot = slotspace.area[type].nslot;
1460 
1461 	memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
1462 }
1463 #endif
1464 
1465 #ifdef __x86_64__
1466 /*
1467  * Randomize the location of an area. We count the holes in the VM space. We
1468  * randomly select one hole, and then randomly select an area within that hole.
1469  * Finally we update the associated entry in the slotspace structure.
1470  */
1471 vaddr_t
1472 slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
1473     vaddr_t randva)
1474 {
1475 	struct {
1476 		int start;
1477 		int end;
1478 	} holes[SLSPACE_NAREAS+1];
1479 	size_t i, nholes, hole;
1480 	size_t startsl, endsl, nslots, winsize;
1481 	vaddr_t startva, va;
1482 
1483 	sz = roundup(sz, align);
1484 
1485 	/*
1486 	 * Take one more slot with +NBPD_L4, because we may end up choosing
1487 	 * an area that crosses slots:
1488 	 *     +------+------+------+
1489 	 *     | Slot | Slot | Slot |
1490 	 *     +------+------+------+
1491 	 *        [Chosen Area]
1492 	 * And in that case we must take into account the additional slot
1493 	 * consumed.
1494 	 */
1495 	nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
1496 
1497 	/* Get the holes. */
1498 	nholes = 0;
1499 	size_t curslot = 0 + 256; /* end of SLAREA_USER */
1500 	while (1) {
1501 		/*
1502 		 * Find the first occupied slot after the current one.
1503 		 * The area between the two is a hole.
1504 		 */
1505 		size_t minsslot = 512;
1506 		size_t minnslot = 0;
1507 		for (i = 0; i < SLSPACE_NAREAS; i++) {
1508 			if (!slotspace.area[i].active)
1509 				continue;
1510 			if (slotspace.area[i].sslot >= curslot &&
1511 			    slotspace.area[i].sslot < minsslot) {
1512 				minsslot = slotspace.area[i].sslot;
1513 				minnslot = slotspace.area[i].nslot;
1514 			}
1515 		}
1516 
1517 		/* No hole anymore, stop here. */
1518 		if (minsslot == 512) {
1519 			break;
1520 		}
1521 
1522 		/* Register the hole. */
1523 		if (minsslot - curslot >= nslots) {
1524 			holes[nholes].start = curslot;
1525 			holes[nholes].end = minsslot;
1526 			nholes++;
1527 		}
1528 
1529 		/* Skip that hole, and iterate again. */
1530 		curslot = minsslot + minnslot;
1531 	}
1532 
1533 	if (nholes == 0) {
1534 		panic("%s: impossible", __func__);
1535 	}
1536 
1537 	/* Select a hole. */
1538 	hole = randhole;
1539 #ifdef NO_X86_ASLR
1540 	hole = 0;
1541 #endif
1542 	hole %= nholes;
1543 	startsl = holes[hole].start;
1544 	endsl = holes[hole].end;
1545 	startva = VA_SIGN_NEG(startsl * NBPD_L4);
1546 
1547 	/* Select an area within the hole. */
1548 	va = randva;
1549 #ifdef NO_X86_ASLR
1550 	va = 0;
1551 #endif
1552 	winsize = ((endsl - startsl) * NBPD_L4) - sz;
1553 	va %= winsize;
1554 	va = rounddown(va, align);
1555 	va += startva;
1556 
1557 	/* Update the entry. */
1558 	slotspace.area[type].sslot = pl4_i(va);
1559 	slotspace.area[type].nslot =
1560 	    pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
1561 	slotspace.area[type].active = true;
1562 
1563 	return va;
1564 }
1565 #endif
1566 
1567 #ifdef __HAVE_PCPU_AREA
1568 static void
1569 pmap_init_pcpu(void)
1570 {
1571 	const vaddr_t startva = PMAP_PCPU_BASE;
1572 	size_t nL4e, nL3e, nL2e, nL1e;
1573 	size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1574 	paddr_t pa;
1575 	vaddr_t endva;
1576 	vaddr_t tmpva;
1577 	pt_entry_t *pte;
1578 	size_t size;
1579 	int i;
1580 
1581 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1582 
1583 	size = sizeof(struct pcpu_area);
1584 
1585 	endva = startva + size;
1586 
1587 	/* We will use this temporary va. */
1588 	tmpva = bootspace.spareva;
1589 	pte = PTE_BASE + pl1_i(tmpva);
1590 
1591 	/* Build L4 */
1592 	L4e_idx = pl4_i(startva);
1593 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1594 	KASSERT(nL4e  == 1);
1595 	for (i = 0; i < nL4e; i++) {
1596 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1597 
1598 		pa = pmap_bootstrap_palloc(1);
1599 		*pte = (pa & PTE_FRAME) | pteflags;
1600 		pmap_update_pg(tmpva);
1601 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1602 
1603 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1604 	}
1605 
1606 	/* Build L3 */
1607 	L3e_idx = pl3_i(startva);
1608 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1609 	for (i = 0; i < nL3e; i++) {
1610 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1611 
1612 		pa = pmap_bootstrap_palloc(1);
1613 		*pte = (pa & PTE_FRAME) | pteflags;
1614 		pmap_update_pg(tmpva);
1615 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1616 
1617 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1618 	}
1619 
1620 	/* Build L2 */
1621 	L2e_idx = pl2_i(startva);
1622 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1623 	for (i = 0; i < nL2e; i++) {
1624 
1625 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1626 
1627 		pa = pmap_bootstrap_palloc(1);
1628 		*pte = (pa & PTE_FRAME) | pteflags;
1629 		pmap_update_pg(tmpva);
1630 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1631 
1632 		L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
1633 	}
1634 
1635 	/* Build L1 */
1636 	L1e_idx = pl1_i(startva);
1637 	nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1638 	for (i = 0; i < nL1e; i++) {
1639 		/*
1640 		 * Nothing to do, the PTEs will be entered via
1641 		 * pmap_kenter_pa.
1642 		 */
1643 		KASSERT(L1_BASE[L1e_idx+i] == 0);
1644 	}
1645 
1646 	*pte = 0;
1647 	pmap_update_pg(tmpva);
1648 
1649 	pcpuarea = (struct pcpu_area *)startva;
1650 
1651 	tlbflush();
1652 }
1653 #endif
1654 
1655 #ifdef __HAVE_DIRECT_MAP
1656 static void
1657 randomize_hole(size_t *randholep, vaddr_t *randvap)
1658 {
1659 	struct nist_hash_drbg drbg;
1660 	uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES];
1661 	const char p[] = "x86/directmap";
1662 	int error;
1663 
1664 	entropy_extract(seed, sizeof(seed), 0);
1665 
1666 	error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed),
1667 	    /*nonce*/NULL, 0,
1668 	    /*personalization*/p, strlen(p));
1669 	KASSERTMSG(error == 0, "error=%d", error);
1670 
1671 	error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep),
1672 	    /*additional*/NULL, 0);
1673 	KASSERTMSG(error == 0, "error=%d", error);
1674 
1675 	error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap),
1676 	    /*additional*/NULL, 0);
1677 	KASSERTMSG(error == 0, "error=%d", error);
1678 
1679 	explicit_memset(seed, 0, sizeof(seed));
1680 	explicit_memset(&drbg, 0, sizeof(drbg));
1681 }
1682 
1683 /*
1684  * Create the amd64 direct map. Called only once at boot time. We map all of
1685  * the physical memory contiguously using 2MB large pages, with RW permissions.
1686  * However there is a hole: the kernel is mapped with RO permissions.
1687  */
1688 static void
1689 pmap_init_directmap(struct pmap *kpm)
1690 {
1691 	extern phys_ram_seg_t mem_clusters[];
1692 	extern int mem_cluster_cnt;
1693 
1694 	vaddr_t startva;
1695 	size_t nL4e, nL3e, nL2e;
1696 	size_t L4e_idx, L3e_idx, L2e_idx;
1697 	size_t spahole, epahole;
1698 	paddr_t lastpa, pa;
1699 	vaddr_t endva;
1700 	vaddr_t tmpva;
1701 	pt_entry_t *pte;
1702 	phys_ram_seg_t *mc;
1703 	int i;
1704 	size_t randhole;
1705 	vaddr_t randva;
1706 
1707 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1708 	const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
1709 
1710 	CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1711 
1712 	spahole = roundup(bootspace.head.pa, NBPD_L2);
1713 	epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1714 
1715 	/* Get the last physical address available */
1716 	lastpa = 0;
1717 	for (i = 0; i < mem_cluster_cnt; i++) {
1718 		mc = &mem_clusters[i];
1719 		lastpa = MAX(lastpa, mc->start + mc->size);
1720 	}
1721 
1722 	/*
1723 	 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1724 	 */
1725 	if (lastpa > MAXPHYSMEM) {
1726 		panic("pmap_init_directmap: lastpa incorrect");
1727 	}
1728 
1729 	randomize_hole(&randhole, &randva);
1730 	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
1731 	    randhole, randva);
1732 	endva = startva + lastpa;
1733 
1734 	/* We will use this temporary va. */
1735 	tmpva = bootspace.spareva;
1736 	pte = PTE_BASE + pl1_i(tmpva);
1737 
1738 	/* Build L4 */
1739 	L4e_idx = pl4_i(startva);
1740 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1741 	KASSERT(nL4e <= NL4_SLOT_DIRECT);
1742 	for (i = 0; i < nL4e; i++) {
1743 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1744 
1745 		pa = pmap_bootstrap_palloc(1);
1746 		*pte = (pa & PTE_FRAME) | pteflags;
1747 		pmap_update_pg(tmpva);
1748 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1749 
1750 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1751 	}
1752 
1753 	/* Build L3 */
1754 	L3e_idx = pl3_i(startva);
1755 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1756 	for (i = 0; i < nL3e; i++) {
1757 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1758 
1759 		pa = pmap_bootstrap_palloc(1);
1760 		*pte = (pa & PTE_FRAME) | pteflags;
1761 		pmap_update_pg(tmpva);
1762 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1763 
1764 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1765 	}
1766 
1767 	/* Build L2 */
1768 	L2e_idx = pl2_i(startva);
1769 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1770 	for (i = 0; i < nL2e; i++) {
1771 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1772 
1773 		pa = (paddr_t)(i * NBPD_L2);
1774 
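		/*
		 * [spahole, epahole) is the hole described above: the
		 * kernel's pages are mapped with holepteflags, i.e. without
		 * PTE_W, so the kernel is read-only through the direct map.
		 */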
1775 		if (spahole <= pa && pa < epahole) {
1776 			L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
1777 			    PTE_PS | pmap_pg_g;
1778 		} else {
1779 			L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
1780 			    PTE_PS | pmap_pg_g;
1781 		}
1782 	}
1783 
1784 	*pte = 0;
1785 	pmap_update_pg(tmpva);
1786 
1787 	pmap_direct_base = startva;
1788 	pmap_direct_end = endva;
1789 
1790 	tlbflush();
1791 }
1792 #endif /* __HAVE_DIRECT_MAP */
1793 
1794 #if !defined(XENPV)
1795 /*
1796  * Remap all of the virtual pages created so far with the PTE_G bit.
1797  */
1798 static void
1799 pmap_remap_global(void)
1800 {
1801 	vaddr_t kva, kva_end;
1802 	unsigned long p1i;
1803 	size_t i;
1804 
1805 	/* head */
1806 	kva = bootspace.head.va;
1807 	kva_end = kva + bootspace.head.sz;
1808 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1809 		p1i = pl1_i(kva);
1810 		if (pmap_valid_entry(PTE_BASE[p1i]))
1811 			PTE_BASE[p1i] |= pmap_pg_g;
1812 	}
1813 
1814 	/* kernel segments */
1815 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1816 		if (bootspace.segs[i].type == BTSEG_NONE) {
1817 			continue;
1818 		}
1819 		kva = bootspace.segs[i].va;
1820 		kva_end = kva + bootspace.segs[i].sz;
1821 		for ( ; kva < kva_end; kva += PAGE_SIZE) {
1822 			p1i = pl1_i(kva);
1823 			if (pmap_valid_entry(PTE_BASE[p1i]))
1824 				PTE_BASE[p1i] |= pmap_pg_g;
1825 		}
1826 	}
1827 
1828 	/* boot space */
1829 	kva = bootspace.boot.va;
1830 	kva_end = kva + bootspace.boot.sz;
1831 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1832 		p1i = pl1_i(kva);
1833 		if (pmap_valid_entry(PTE_BASE[p1i]))
1834 			PTE_BASE[p1i] |= pmap_pg_g;
1835 	}
1836 }
1837 #endif
1838 
1839 #ifndef XENPV
1840 /*
1841  * Remap several kernel segments with large pages. We cover as many pages as we
1842  * can. Called only once at boot time, if the CPU supports large pages.
1843  */
1844 static void
1845 pmap_remap_largepages(void)
1846 {
1847 	pd_entry_t *pde;
1848 	vaddr_t kva, kva_end;
1849 	paddr_t pa;
1850 	size_t i;
1851 
1852 	/* Remap the kernel text using large pages. */
1853 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1854 		if (bootspace.segs[i].type != BTSEG_TEXT) {
1855 			continue;
1856 		}
1857 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1858 		if (kva < bootspace.segs[i].va) {
1859 			continue;
1860 		}
1861 		kva_end = rounddown(bootspace.segs[i].va +
1862 			bootspace.segs[i].sz, NBPD_L2);
1863 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1864 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1865 			pde = &L2_BASE[pl2_i(kva)];
1866 			*pde = pa | pmap_pg_g | PTE_PS | PTE_P;
1867 			tlbflushg();
1868 		}
1869 	}
1870 
1871 	/* Remap the kernel rodata using large pages. */
1872 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1873 		if (bootspace.segs[i].type != BTSEG_RODATA) {
1874 			continue;
1875 		}
1876 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1877 		if (kva < bootspace.segs[i].va) {
1878 			continue;
1879 		}
1880 		kva_end = rounddown(bootspace.segs[i].va +
1881 			bootspace.segs[i].sz, NBPD_L2);
1882 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1883 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1884 			pde = &L2_BASE[pl2_i(kva)];
1885 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
1886 			tlbflushg();
1887 		}
1888 	}
1889 
1890 	/* Remap the kernel data+bss using large pages. */
1891 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1892 		if (bootspace.segs[i].type != BTSEG_DATA) {
1893 			continue;
1894 		}
1895 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1896 		if (kva < bootspace.segs[i].va) {
1897 			continue;
1898 		}
1899 		kva_end = rounddown(bootspace.segs[i].va +
1900 			bootspace.segs[i].sz, NBPD_L2);
1901 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1902 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1903 			pde = &L2_BASE[pl2_i(kva)];
1904 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
1905 			tlbflushg();
1906 		}
1907 	}
1908 }
1909 #endif /* !XENPV */
1910 
1911 /*
1912  * pmap_init: called from uvm_init, our job is to get the pmap system ready
1913  * to manage mappings.
1914  */
1915 void
1916 pmap_init(void)
1917 {
1918 	int flags;
1919 
1920 	/*
1921 	 * initialize caches.
1922 	 */
1923 
1924 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1925 	    0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1926 
1927 #ifdef XENPV
1928 	/*
1929 	 * pool_cache(9) should not touch cached objects, since they
1930 	 * are pinned on xen and R/O for the domU
1931 	 */
1932 	flags = PR_NOTOUCH;
1933 #else
1934 	flags = 0;
1935 #endif
1936 
1937 #ifdef PAE
1938 	pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1939 	    "pdppl", &pmap_pdp_allocator, IPL_NONE);
1940 #else
1941 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1942 	    "pdppl", NULL, IPL_NONE);
1943 #endif
1944 	pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1945 	     0, 0, "pvpage", &pool_allocator_kmem,
1946 	    IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1947 
1948 	pmap_tlb_init();
1949 
1950 	/* XXX: cpu_hatch() does this only for secondary CPUs; cover the boot CPU here. */
1951 	pmap_tlb_cpu_init(curcpu());
1952 
1953 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1954 	    NULL, "x86", "io bitmap copy");
1955 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1956 	    NULL, "x86", "ldt sync");
1957 
1958 	/*
1959 	 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1960 	 * to hang a tree of pv_entry records.  Dynamically allocated
1961 	 * pv_entry lists are not heavily used in the kernel's pmap (the
1962 	 * usual case is embedded), so cop out and use a single RB tree
1963 	 * to cover them.
1964 	 */
1965 	rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1966 
1967 	/*
1968 	 * done: pmap module is up (and ready for business)
1969 	 */
1970 
1971 	pmap_initialized = true;
1972 }
1973 
1974 #ifndef XENPV
1975 /*
1976  * pmap_cpu_init_late: perform late per-CPU initialization.
1977  */
1978 void
1979 pmap_cpu_init_late(struct cpu_info *ci)
1980 {
1981 	/*
1982 	 * The BP already has its own PD page, allocated during early
1983 	 * MD startup.
1984 	 */
1985 	if (ci == &cpu_info_primary)
1986 		return;
1987 #ifdef PAE
1988 	cpu_alloc_l3_page(ci);
1989 #endif
1990 }
1991 #endif
1992 
1993 #ifndef __HAVE_DIRECT_MAP
1994 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1995 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1996 
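/*
 * pmap_vpage_cpualloc: allocate this CPU's temporary VA windows.  The VA
 * range is sized and aligned so that its PTEs fill exactly one cache line
 * (see the CTASSERTs above), keeping each CPU's window PTEs on a separate
 * line.
 */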
1997 static void
1998 pmap_vpage_cpualloc(struct cpu_info *ci)
1999 {
2000 	bool primary = (ci == &cpu_info_primary);
2001 	size_t i, npages;
2002 	vaddr_t vabase;
2003 	vsize_t vrange;
2004 
2005 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
2006 	KASSERT(npages >= VPAGE_MAX);
2007 	vrange = npages * PAGE_SIZE;
2008 
2009 	if (primary) {
2010 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
2011 			/* Waste some pages to align properly */
2012 		}
2013 		/* The base is aligned, allocate the rest (contiguous) */
2014 		pmap_bootstrap_valloc(npages - 1);
2015 	} else {
2016 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
2017 		    UVM_KMF_VAONLY);
2018 		if (vabase == 0) {
2019 			panic("%s: failed to allocate tmp VA for CPU %d\n",
2020 			    __func__, cpu_index(ci));
2021 		}
2022 	}
2023 
2024 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
2025 
2026 	for (i = 0; i < VPAGE_MAX; i++) {
2027 		ci->vpage[i] = vabase + i * PAGE_SIZE;
2028 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
2029 	}
2030 }
2031 
2032 void
2033 pmap_vpage_cpu_init(struct cpu_info *ci)
2034 {
2035 	if (ci == &cpu_info_primary) {
2036 		/* cpu0 already taken care of in pmap_bootstrap */
2037 		return;
2038 	}
2039 
2040 	pmap_vpage_cpualloc(ci);
2041 }
2042 #endif
2043 
2044 /*
2045  * p v _ e n t r y   f u n c t i o n s
2046  */
2047 
2048 /*
2049  * pmap_pvp_ctor: pool_cache constructor for PV pages.
2050  */
2051 static int
2052 pmap_pvp_ctor(void *arg, void *obj, int flags)
2053 {
2054 	struct pv_page *pvp = (struct pv_page *)obj;
2055 	struct pv_entry *pve = (struct pv_entry *)obj + 1;
2056 	struct pv_entry *maxpve = pve + PVE_PER_PVP;
2057 
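	/*
	 * The pv_page header occupies the first pv_entry-sized slot of the
	 * page; the remaining PVE_PER_PVP slots become free entries on
	 * pvp_pves.
	 */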
2058 	KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
2059 	KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
2060 
2061 	LIST_INIT(&pvp->pvp_pves);
2062 	pvp->pvp_nfree = PVE_PER_PVP;
2063 	pvp->pvp_pmap = NULL;
2064 
2065 	for (; pve < maxpve; pve++) {
2066 		LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2067 	}
2068 
2069 	return 0;
2070 }
2071 
2072 /*
2073  * pmap_pvp_dtor: pool_cache destructor for PV pages.
2074  */
2075 static void
2076 pmap_pvp_dtor(void *arg, void *obj)
2077 {
2078 	struct pv_page *pvp __diagused = obj;
2079 
2080 	KASSERT(pvp->pvp_pmap == NULL);
2081 	KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2082 }
2083 
2084 /*
2085  * pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
2086  */
2087 static struct pv_entry *
2088 pmap_alloc_pv(struct pmap *pmap)
2089 {
2090 	struct pv_entry *pve;
2091 	struct pv_page *pvp;
2092 
2093 	KASSERT(mutex_owned(&pmap->pm_lock));
2094 
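	/*
	 * Take an entry from a partially-used PV page if one exists;
	 * otherwise grab an entirely-free page from pm_pvp_full, or
	 * allocate a fresh one from the pool cache.
	 */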
2095 	if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
2096 		if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2097 			LIST_REMOVE(pvp, pvp_list);
2098 		} else {
2099 			pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
2100 		}
2101 		if (__predict_false(pvp == NULL)) {
2102 			return NULL;
2103 		}
2104 		/* full -> part */
2105 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2106 		pvp->pvp_pmap = pmap;
2107 	}
2108 
2109 	KASSERT(pvp->pvp_pmap == pmap);
2110 	KASSERT(pvp->pvp_nfree > 0);
2111 
2112 	pve = LIST_FIRST(&pvp->pvp_pves);
2113 	LIST_REMOVE(pve, pve_list);
2114 	pvp->pvp_nfree--;
2115 
2116 	if (__predict_false(pvp->pvp_nfree == 0)) {
2117 		/* part -> empty */
2118 		KASSERT(LIST_EMPTY(&pvp->pvp_pves));
2119 		LIST_REMOVE(pvp, pvp_list);
2120 		LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
2121 	} else {
2122 		KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
2123 	}
2124 
2125 	return pve;
2126 }
2127 
2128 /*
2129  * pmap_free_pv: delayed free of a PV entry.
2130  */
2131 static void
2132 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
2133 {
2134 	struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
2135 
2136 	KASSERT(mutex_owned(&pmap->pm_lock));
2137 	KASSERT(pvp->pvp_pmap == pmap);
2138 	KASSERT(pvp->pvp_nfree >= 0);
2139 
2140 	LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2141 	pvp->pvp_nfree++;
2142 
2143 	if (__predict_false(pvp->pvp_nfree == 1)) {
2144 		/* empty -> part */
2145 		LIST_REMOVE(pvp, pvp_list);
2146 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2147 	} else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
2148 		/* part -> full */
2149 		LIST_REMOVE(pvp, pvp_list);
2150 		LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
2151 	}
2152 }
2153 
2154 /*
2155  * pmap_drain_pv: free full PV pages.
2156  */
2157 static void
2158 pmap_drain_pv(struct pmap *pmap)
2159 {
2160 	struct pv_page *pvp;
2161 
2162 	KASSERT(mutex_owned(&pmap->pm_lock));
2163 
2164 	while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2165 		LIST_REMOVE(pvp, pvp_list);
2166 		KASSERT(pvp->pvp_pmap == pmap);
2167 		KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2168 		pvp->pvp_pmap = NULL;
2169 		pool_cache_put(&pmap_pvp_cache, pvp);
2170 	}
2171 }
2172 
2173 /*
2174  * pmap_check_pv: verify whether the page tracks a {VA, PTP} pair, as expected
2175  */
2176 static void
2177 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
2178     vaddr_t va, bool tracked)
2179 {
2180 #ifdef DEBUG
2181 	struct pv_pte *pvpte;
2182 
2183 	PMAP_CHECK_PP(pp);
2184 
2185 	mutex_spin_enter(&pp->pp_lock);
2186 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
2187 		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
2188 			break;
2189 		}
2190 	}
2191 	mutex_spin_exit(&pp->pp_lock);
2192 
2193 	if (pvpte && !tracked) {
2194 		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
2195 	} else if (!pvpte && tracked) {
2196 		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
2197 	}
2198 #endif
2199 }
2200 
2201 /*
2202  * pmap_treelookup_pv: search the PV tree for a dynamic entry
2203  *
2204  * => pmap must be locked
2205  */
2206 static struct pv_entry *
2207 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2208     const rb_tree_t *tree, const vaddr_t va)
2209 {
2210 	struct pv_entry *pve;
2211 	rb_node_t *node;
2212 
2213 	/*
2214 	 * Inlined lookup tailored to exactly what's needed here; it is
2215 	 * quite a bit faster than using rb_tree_find_node().
2216 	 */
2217 	for (node = tree->rbt_root;;) {
2218 		if (__predict_false(RB_SENTINEL_P(node))) {
2219 			return NULL;
2220 		}
2221 		pve = (struct pv_entry *)
2222 		    ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
2223 		if (pve->pve_pte.pte_va == va) {
2224 			KASSERT(pve->pve_pte.pte_ptp == ptp);
2225 			return pve;
2226 		}
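		/* No match: descend into the subtree that can contain va. */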
2227 		node = node->rb_nodes[pve->pve_pte.pte_va < va];
2228 	}
2229 }
2230 
2231 /*
2232  * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
2233  *
2234  * => a PV entry must be known present (doesn't check for existence)
2235  * => pmap must be locked
2236  */
2237 static struct pv_entry *
2238 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2239     const struct pmap_page * const old_pp, const vaddr_t va)
2240 {
2241 	struct pv_entry *pve;
2242 	const rb_tree_t *tree;
2243 
2244 	KASSERT(mutex_owned(&pmap->pm_lock));
2245 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2246 
2247 	/*
2248 	 * [This mostly deals with the case of process-private pages, i.e.
2249 	 * anonymous memory allocations or COW.]
2250 	 *
2251 	 * If the page is tracked with an embedded entry then the tree
2252 	 * lookup can be avoided.  It's safe to check for this specific
2253 	 * set of values without pp_lock because both will only ever be
2254 	 * set together for this pmap.
2255 	 *
2256 	 */
2257 	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
2258 	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
2259 		return NULL;
2260 	}
2261 
2262 	/*
2263 	 * [This mostly deals with shared mappings, for example shared libs
2264 	 * and executables.]
2265 	 *
2266 	 * Optimise for pmap_remove_ptes() which works by ascending scan:
2267 	 * look at the lowest numbered node in the tree first.  The tree is
2268 	 * known non-empty because of the check above.  For short lived
2269 	 * processes where pmap_remove() isn't used much this gets close to
2270 	 * a 100% hit rate.
2271 	 */
2272 	tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2273 	KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2274 	pve = (struct pv_entry *)
2275 	    ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2276 	    offsetof(struct pv_entry, pve_rb));
2277 	if (__predict_true(pve->pve_pte.pte_va == va)) {
2278 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2279 		return pve;
2280 	}
2281 
2282 	/* Search the RB tree for the key (uncommon). */
2283 	return pmap_treelookup_pv(pmap, ptp, tree, va);
2284 }
2285 
2286 /*
2287  * pmap_enter_pv: enter a mapping onto a pmap_page list
2288  *
2289  * => pmap must be locked
2290  * => does NOT insert dynamic entries into the tree (pmap_enter() does that later)
2291  */
2292 static int
2293 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2294     vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2295     bool *samepage, bool *new_embedded, rb_tree_t *tree)
2296 {
2297 	struct pv_entry *pve;
2298 	int error;
2299 
2300 	KASSERT(mutex_owned(&pmap->pm_lock));
2301 	KASSERT(ptp_to_pmap(ptp) == pmap);
2302 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2303 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2304 	PMAP_CHECK_PP(pp);
2305 
2306 	/*
2307 	 * If entering the same page and it's already tracked with an
2308 	 * embedded entry, we can avoid the expense below.  It's safe
2309 	 * to check for this very specific set of values without a lock
2310 	 * because both will only ever be set together for this pmap.
2311 	 */
2312 	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2313 	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2314 		*samepage = true;
2315 		pmap_check_pv(pmap, ptp, pp, va, true);
2316 		return 0;
2317 	}
2318 
2319 	/*
2320 	 * Check for an existing dynamic mapping at this address.  If it's
2321 	 * for the same page, then it will be reused and nothing needs to be
2322 	 * changed.
2323 	 */
2324 	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2325 	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2326 		*samepage = true;
2327 		pmap_check_pv(pmap, ptp, pp, va, true);
2328 		return 0;
2329 	}
2330 
2331 	/*
2332 	 * Need to put a new mapping in place.  Grab a spare pv_entry in
2333 	 * case it's needed; won't know for sure until the lock is taken.
2334 	 */
2335 	if (pmap->pm_pve == NULL) {
2336 		pmap->pm_pve = pmap_alloc_pv(pmap);
2337 	}
2338 
2339 	error = 0;
2340 	pmap_check_pv(pmap, ptp, pp, va, false);
2341 	mutex_spin_enter(&pp->pp_lock);
2342 	if (!pv_pte_embedded(pp)) {
2343 		/*
2344 		 * Embedded PV tracking available - easy.
2345 		 */
2346 		pp->pp_pte.pte_ptp = ptp;
2347 		pp->pp_pte.pte_va = va;
2348 		*new_embedded = true;
2349 	} else if (__predict_false(pmap->pm_pve == NULL)) {
2350 		/*
2351 		 * No memory.
2352 		 */
2353 		error = ENOMEM;
2354 	} else {
2355 		/*
2356 		 * Install new pv_entry on the page.
2357 		 */
2358 		pve = pmap->pm_pve;
2359 		pmap->pm_pve = NULL;
2360 		*new_pve = pve;
2361 		pve->pve_pte.pte_ptp = ptp;
2362 		pve->pve_pte.pte_va = va;
2363 		pve->pve_pp = pp;
2364 		LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
2365 	}
2366 	mutex_spin_exit(&pp->pp_lock);
2367 	if (error == 0) {
2368 		pmap_check_pv(pmap, ptp, pp, va, true);
2369 	}
2370 
2371 	return error;
2372 }
2373 
2374 /*
2375  * pmap_remove_pv: try to remove a mapping from a pv_list
2376  *
2377  * => pmap must be locked
2378  * => removes dynamic entries from tree and frees them
2379  * => caller should adjust ptp's wire_count and free PTP if needed
2380  */
2381 static void
2382 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2383     vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
2384 {
2385 	rb_tree_t *tree = (ptp != NULL ?
2386 	    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2387 
2388 	KASSERT(mutex_owned(&pmap->pm_lock));
2389 	KASSERT(ptp_to_pmap(ptp) == pmap);
2390 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2391 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2392 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2393 
2394 	pmap_check_pv(pmap, ptp, pp, va, true);
2395 
2396 	if (pve == NULL) {
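		/* The mapping is tracked by the page's embedded entry. */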
2397 		mutex_spin_enter(&pp->pp_lock);
2398 		KASSERT(pp->pp_pte.pte_ptp == ptp);
2399 		KASSERT(pp->pp_pte.pte_va == va);
2400 		pp->pp_attrs |= oattrs;
2401 		pp->pp_pte.pte_ptp = NULL;
2402 		pp->pp_pte.pte_va = 0;
2403 		mutex_spin_exit(&pp->pp_lock);
2404 	} else {
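		/* The mapping is tracked by a dynamic pv_entry. */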
2405 		mutex_spin_enter(&pp->pp_lock);
2406 		KASSERT(pp->pp_pte.pte_ptp != ptp ||
2407 		    pp->pp_pte.pte_va != va);
2408 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2409 		KASSERT(pve->pve_pte.pte_va == va);
2410 		KASSERT(pve->pve_pp == pp);
2411 		pp->pp_attrs |= oattrs;
2412 		LIST_REMOVE(pve, pve_list);
2413 		mutex_spin_exit(&pp->pp_lock);
2414 
2415 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
2416 		rb_tree_remove_node(tree, pve);
2417 #ifdef DIAGNOSTIC
2418 		memset(pve, 0, sizeof(*pve));
2419 #endif
2420 		pmap_free_pv(pmap, pve);
2421 	}
2422 
2423 	KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
2424 	pmap_check_pv(pmap, ptp, pp, va, false);
2425 }
2426 
2427 /*
2428  * p t p   f u n c t i o n s
2429  */
2430 
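/*
 * pmap_find_ptp: look up the PTP covering va at the given level, trying the
 * per-level hint before falling back to a uvm_object lookup.
 */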
2431 static struct vm_page *
2432 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
2433 {
2434 	int lidx = level - 1;
2435 	off_t off = ptp_va2o(va, level);
2436 	struct vm_page *pg;
2437 
2438 	KASSERT(mutex_owned(&pmap->pm_lock));
2439 
2440 	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
2441 		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
2442 		pg = pmap->pm_ptphint[lidx];
2443 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2444 		return pg;
2445 	}
2446 	PMAP_DUMMY_LOCK(pmap);
2447 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
2448 	PMAP_DUMMY_UNLOCK(pmap);
2449 	if (pg != NULL && __predict_false(pg->wire_count == 0)) {
2450 		/* This page is queued to be freed - ignore. */
2451 		pg = NULL;
2452 	}
2453 	if (pg != NULL) {
2454 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2455 	}
2456 	pmap->pm_ptphint[lidx] = pg;
2457 	return pg;
2458 }
2459 
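/*
 * pmap_freepage: detach a PTP from the pmap's bookkeeping and queue it on
 * pm_gc_ptp for pmap_update() to free later.
 */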
2460 static inline void
2461 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2462 {
2463 	int lidx;
2464 
2465 	KASSERT(ptp->wire_count <= 1);
2466 	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
2467 
2468 	lidx = level - 1;
2469 	pmap_stats_update(pmap, -ptp->wire_count, 0);
2470 	if (pmap->pm_ptphint[lidx] == ptp)
2471 		pmap->pm_ptphint[lidx] = NULL;
2472 	ptp->wire_count = 0;
2473 	ptp->uanon = NULL;
2474 	KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
2475 
2476 	/*
2477 	 * Enqueue the PTP to be freed by pmap_update().  We can't remove
2478 	 * the page from the uvm_object, as that can take further locks
2479 	 * (intolerable right now because the PTEs are likely mapped in).
2480 	 * Instead mark the PTP as free and if we bump into it again, we'll
2481 	 * either ignore or reuse (depending on what's useful at the time).
2482 	 */
2483 	LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
2484 }
2485 
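/*
 * pmap_free_ptp: free the PTP for va and walk up the paging hierarchy,
 * freeing any parent PTPs that become empty as a result.
 */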
2486 static void
2487 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2488 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
2489 {
2490 	unsigned long index;
2491 	int level;
2492 	vaddr_t invaladdr;
2493 	pd_entry_t opde;
2494 
2495 	KASSERT(pmap != pmap_kernel());
2496 	KASSERT(mutex_owned(&pmap->pm_lock));
2497 	KASSERT(kpreempt_disabled());
2498 
2499 	level = 1;
2500 	do {
2501 		index = pl_i(va, level + 1);
2502 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2503 
2504 		/*
2505 		 * On Xen-amd64 or SVS, we need to sync the top level page
2506 		 * directory on each CPU.
2507 		 */
2508 #if defined(XENPV) && defined(__x86_64__)
2509 		if (level == PTP_LEVELS - 1) {
2510 			xen_kpm_sync(pmap, index);
2511 		}
2512 #elif defined(SVS)
2513 		if (svs_enabled && level == PTP_LEVELS - 1 &&
2514 		    pmap_is_user(pmap)) {
2515 			svs_pmap_sync(pmap, index);
2516 		}
2517 #endif
2518 
2519 		invaladdr = level == 1 ? (vaddr_t)ptes :
2520 		    (vaddr_t)pdes[level - 2];
2521 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2522 		    opde, TLBSHOOT_FREE_PTP);
2523 
2524 #if defined(XENPV)
2525 		pmap_tlb_shootnow();
2526 #endif
2527 
2528 		pmap_freepage(pmap, ptp, level);
2529 		if (level < PTP_LEVELS - 1) {
2530 			ptp = pmap_find_ptp(pmap, va, level + 1);
2531 			ptp->wire_count--;
2532 			if (ptp->wire_count > 1)
2533 				break;
2534 		}
2535 	} while (++level < PTP_LEVELS);
2536 	pmap_pte_flush();
2537 }
2538 
2539 /*
2540  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2541  *
2542  * => pmap should NOT be pmap_kernel()
2543  * => pmap should be locked
2544  * => we are not touching any PTEs yet, so they need not be mapped in
2545  */
2546 static int
2547 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2548     int flags, struct vm_page **resultp)
2549 {
2550 	struct vm_page *ptp;
2551 	int i, aflags;
2552 	struct uvm_object *obj;
2553 	voff_t off;
2554 
2555 	KASSERT(pmap != pmap_kernel());
2556 	KASSERT(mutex_owned(&pmap->pm_lock));
2557 
2558 	/*
2559 	 * Loop through all page table levels allocating a page
2560 	 * for any level where we don't already have one.
2561 	 */
2562 	memset(pt, 0, sizeof(*pt));
2563 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2564 		UVM_PGA_ZERO;
2565 	for (i = PTP_LEVELS; i > 1; i--) {
2566 		obj = &pmap->pm_obj[i - 2];
2567 		off = ptp_va2o(va, i - 1);
2568 
2569 		PMAP_DUMMY_LOCK(pmap);
2570 		pt->pg[i] = uvm_pagelookup(obj, off);
2571 
2572 		if (pt->pg[i] == NULL) {
2573 			pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
2574 			pt->alloced[i] = (pt->pg[i] != NULL);
2575 		} else if (pt->pg[i]->wire_count == 0) {
2576 			/* This page was queued to be freed; dequeue it. */
2577 			LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
2578 			pt->alloced[i] = true;
2579 		}
2580 		PMAP_DUMMY_UNLOCK(pmap);
2581 		if (pt->pg[i] == NULL) {
2582 			pmap_unget_ptp(pmap, pt);
2583 			return ENOMEM;
2584 		} else if (pt->alloced[i]) {
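			/*
			 * Fresh PTP: uanon tracks the lowest VA entered via
			 * this PTP (see pmap_zap_ptp), so start it at ~0.
			 */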
2585 			pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
2586 			rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
2587 			    &pmap_rbtree_ops);
2588 			PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2589 		}
2590 	}
2591 	ptp = pt->pg[2];
2592 	KASSERT(ptp != NULL);
2593 	*resultp = ptp;
2594 	pmap->pm_ptphint[0] = ptp;
2595 	return 0;
2596 }
2597 
2598 /*
2599  * pmap_install_ptp: install any freshly allocated PTPs
2600  *
2601  * => pmap should NOT be pmap_kernel()
2602  * => pmap should be locked
2603  * => PTEs must be mapped
2604  * => preemption must be disabled
2605  */
2606 static void
2607 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2608     pd_entry_t * const *pdes)
2609 {
2610 	struct vm_page *ptp;
2611 	unsigned long index;
2612 	pd_entry_t *pva;
2613 	paddr_t pa;
2614 	int i;
2615 
2616 	KASSERT(pmap != pmap_kernel());
2617 	KASSERT(mutex_owned(&pmap->pm_lock));
2618 	KASSERT(kpreempt_disabled());
2619 
2620 	/*
2621 	 * Now that we have all the pages looked up or allocated,
2622 	 * loop through again installing any new ones into the tree.
2623 	 */
2624 	for (i = PTP_LEVELS; i > 1; i--) {
2625 		index = pl_i(va, i);
2626 		pva = pdes[i - 2];
2627 
2628 		if (pmap_valid_entry(pva[index])) {
2629 			KASSERT(!pt->alloced[i]);
2630 			continue;
2631 		}
2632 
2633 		ptp = pt->pg[i];
2634 		ptp->flags &= ~PG_BUSY; /* never busy */
2635 		ptp->wire_count = 1;
2636 		pmap->pm_ptphint[i - 2] = ptp;
2637 		pa = VM_PAGE_TO_PHYS(ptp);
2638 		pmap_pte_set(&pva[index], (pd_entry_t)
2639 		    (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
2640 
2641 		/*
2642 		 * On Xen-amd64 or SVS, we need to sync the top level page
2643 		 * directory on each CPU.
2644 		 */
2645 #if defined(XENPV) && defined(__x86_64__)
2646 		if (i == PTP_LEVELS) {
2647 			xen_kpm_sync(pmap, index);
2648 		}
2649 #elif defined(SVS)
2650 		if (svs_enabled && i == PTP_LEVELS &&
2651 		    pmap_is_user(pmap)) {
2652 			svs_pmap_sync(pmap, index);
2653 		}
2654 #endif
2655 
2656 		pmap_pte_flush();
2657 		pmap_stats_update(pmap, 1, 0);
2658 
2659 		/*
2660 		 * If we're not in the top level, increase the
2661 		 * wire count of the parent page.
2662 		 */
2663 		if (i < PTP_LEVELS) {
2664 			pt->pg[i + 1]->wire_count++;
2665 		}
2666 	}
2667 }
2668 
2669 /*
2670  * pmap_unget_ptp: free unused PTPs
2671  *
2672  * => pmap should NOT be pmap_kernel()
2673  * => pmap should be locked
2674  */
2675 static void
2676 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
2677 {
2678 	int i;
2679 
2680 	KASSERT(pmap != pmap_kernel());
2681 	KASSERT(mutex_owned(&pmap->pm_lock));
2682 
2683 	for (i = PTP_LEVELS; i > 1; i--) {
2684 		if (!pt->alloced[i]) {
2685 			continue;
2686 		}
2687 		KASSERT(pt->pg[i]->wire_count == 0);
2688 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2689 		pmap_freepage(pmap, pt->pg[i], i - 1);
2690 	}
2691 }
2692 
2693 /*
2694  * p m a p   l i f e c y c l e   f u n c t i o n s
2695  */
2696 
2697 /*
2698  * pmap_pdp_init: construct a new PDP.
2699  */
2700 static void
2701 pmap_pdp_init(pd_entry_t *pdir)
2702 {
2703 	paddr_t pdirpa = 0;
2704 	vaddr_t object;
2705 	int i;
2706 
2707 #if !defined(XENPV) || !defined(__x86_64__)
2708 	int npde;
2709 #endif
2710 #ifdef XENPV
2711 	int s;
2712 #endif
2713 
2714 	memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);
2715 
2716 	/*
2717 	 * NOTE: This is all done unlocked, but we will check afterwards
2718 	 * if we have raced with pmap_growkernel().
2719 	 */
2720 
2721 #if defined(XENPV) && defined(__x86_64__)
2722 	/* Fetch the physical address of the page directory */
2723 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2724 
2725 	/*
2726 	 * This pdir will NEVER be active in kernel mode, so mark
2727 	 * recursive entry invalid.
2728 	 */
2729 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2730 
2731 	/*
2732 	 * PDP constructed this way won't be for the kernel, hence we
2733 	 * don't put kernel mappings on Xen.
2734 	 *
2735 	 * But we need to make pmap_create() happy, so put a dummy
2736 	 * (without PTE_P) value at the right place.
2737 	 */
2738 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2739 	     (pd_entry_t)-1 & PTE_FRAME;
2740 #else /* XENPV && __x86_64__*/
2741 	object = (vaddr_t)pdir;
2742 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2743 		/* Fetch the physical address of the page directory */
2744 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2745 
2746 		/* Put in recursive PDE to map the PTEs */
2747 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
2748 		    pmap_pg_nx;
2749 #ifndef XENPV
2750 		pdir[PDIR_SLOT_PTE + i] |= PTE_W;
2751 #endif
2752 	}
2753 
2754 	/* Copy the kernel's top level PDE */
2755 	npde = nkptp[PTP_LEVELS - 1];
2756 
2757 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2758 	    npde * sizeof(pd_entry_t));
2759 
2760 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2761 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2762 		pdir[idx] = PDP_BASE[idx];
2763 	}
2764 
2765 #ifdef __HAVE_PCPU_AREA
2766 	pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2767 #endif
2768 #ifdef __HAVE_DIRECT_MAP
2769 	slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
2770 #endif
2771 #ifdef KASAN
2772 	slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
2773 #endif
2774 #ifdef KMSAN
2775 	slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
2776 #endif
2777 #endif /* XENPV  && __x86_64__*/
2778 
2779 #ifdef XENPV
2780 	s = splvm();
2781 	object = (vaddr_t)pdir;
2782 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2783 	    VM_PROT_READ);
2784 	pmap_update(pmap_kernel());
2785 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2786 		/*
2787 		 * Pin as an L2/L4 page; we have to do the page with the
2788 		 * PDIR_SLOT_PTE entries last.
2789 		 */
2790 #ifdef PAE
2791 		if (i == l2tol3(PDIR_SLOT_PTE))
2792 			continue;
2793 #endif
2794 
2795 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2796 #ifdef __x86_64__
2797 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2798 #else
2799 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2800 #endif
2801 	}
2802 #ifdef PAE
2803 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2804 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2805 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2806 #endif
2807 	splx(s);
2808 #endif /* XENPV */
2809 }
2810 
2811 /*
2812  * pmap_pdp_fini: destructor for the PDPs.
2813  */
2814 static void
2815 pmap_pdp_fini(pd_entry_t *pdir)
2816 {
2817 #ifdef XENPV
2818 	paddr_t pdirpa = 0;	/* XXX: GCC */
2819 	vaddr_t object = (vaddr_t)pdir;
2820 	int i;
2821 	int s = splvm();
2822 	pt_entry_t *pte;
2823 
2824 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2825 		/* fetch the physical address of the page directory. */
2826 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2827 		/* unpin page table */
2828 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2829 	}
2830 	object = (vaddr_t)pdir;
2831 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2832 		/* Set page RW again */
2833 		pte = kvtopte(object);
2834 		pmap_pte_set(pte, *pte | PTE_W);
2835 		xen_bcast_invlpg((vaddr_t)object);
2836 	}
2837 	splx(s);
2838 #endif  /* XENPV */
2839 }
2840 
2841 #ifdef PAE
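/*
 * Custom pool backend for PAE page directories, which need PDP_SIZE
 * contiguous pages of wired KVA, aligned to the same size.
 */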
2842 static void *
2843 pmap_pdp_alloc(struct pool *pp, int flags)
2844 {
2845 	return (void *)uvm_km_alloc(kernel_map,
2846 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2847 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2848 	    UVM_KMF_WIRED);
2849 }
2850 
2851 static void
2852 pmap_pdp_free(struct pool *pp, void *v)
2853 {
2854 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2855 	    UVM_KMF_WIRED);
2856 }
2857 #endif /* PAE */
2858 
2859 /*
2860  * pmap_ctor: constructor for the pmap cache.
2861  */
2862 static int
2863 pmap_ctor(void *arg, void *obj, int flags)
2864 {
2865 	struct pmap *pmap = obj;
2866 	pt_entry_t p;
2867 	int i;
2868 
2869 	KASSERT((flags & PR_WAITOK) != 0);
2870 
2871 	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2872 	rw_init(&pmap->pm_dummy_lock);
2873 	kcpuset_create(&pmap->pm_cpus, true);
2874 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2875 #ifdef XENPV
2876 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2877 #endif
2878 	LIST_INIT(&pmap->pm_gc_ptp);
2879 	pmap->pm_pve = NULL;
2880 	LIST_INIT(&pmap->pm_pvp_full);
2881 	LIST_INIT(&pmap->pm_pvp_part);
2882 	LIST_INIT(&pmap->pm_pvp_empty);
2883 
2884 	/* allocate and init PDP */
2885 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2886 
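	/*
	 * Initialize the PDP, retrying if pmap_growkernel() raced us: with
	 * pmaps_lock held, a zero in the topmost kernel PDE slot means our
	 * copy of the kernel PDEs is already stale.
	 */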
2887 	for (;;) {
2888 		pmap_pdp_init(pmap->pm_pdir);
2889 		mutex_enter(&pmaps_lock);
2890 		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2891 		if (__predict_true(p != 0)) {
2892 			break;
2893 		}
2894 		mutex_exit(&pmaps_lock);
2895 	}
2896 
2897 	for (i = 0; i < PDP_SIZE; i++)
2898 		pmap->pm_pdirpa[i] =
2899 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2900 
2901 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2902 	mutex_exit(&pmaps_lock);
2903 
2904 	return 0;
2905 }
2906 
2907 /*
2908  * pmap_dtor: destructor for the pmap cache.
2909  */
2910 static void
2911 pmap_dtor(void *arg, void *obj)
2912 {
2913 	struct pmap *pmap = obj;
2914 
2915 	mutex_enter(&pmaps_lock);
2916 	LIST_REMOVE(pmap, pm_list);
2917 	mutex_exit(&pmaps_lock);
2918 
2919 	pmap_pdp_fini(pmap->pm_pdir);
2920 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2921 	mutex_destroy(&pmap->pm_lock);
2922 	rw_destroy(&pmap->pm_dummy_lock);
2923 	kcpuset_destroy(pmap->pm_cpus);
2924 	kcpuset_destroy(pmap->pm_kernel_cpus);
2925 #ifdef XENPV
2926 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2927 #endif
2928 }
2929 
2930 /*
2931  * pmap_create: create a pmap object.
2932  */
2933 struct pmap *
2934 pmap_create(void)
2935 {
2936 	struct pmap *pmap;
2937 	int i;
2938 
2939 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2940 
2941 	/* init uvm_object */
2942 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2943 		uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
2944 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
2945 		pmap->pm_ptphint[i] = NULL;
2946 	}
2947 	pmap->pm_stats.wired_count = 0;
2948 	/* count the PDP allocd below */
2949 	pmap->pm_stats.resident_count = PDP_SIZE;
2950 #if !defined(__x86_64__)
2951 	pmap->pm_hiexec = 0;
2952 #endif
2953 
2954 	/* Used by NVMM and Xen */
2955 	pmap->pm_enter = NULL;
2956 	pmap->pm_extract = NULL;
2957 	pmap->pm_remove = NULL;
2958 	pmap->pm_sync_pv = NULL;
2959 	pmap->pm_pp_remove_ent = NULL;
2960 	pmap->pm_write_protect = NULL;
2961 	pmap->pm_unwire = NULL;
2962 	pmap->pm_tlb_flush = NULL;
2963 	pmap->pm_data = NULL;
2964 
2965 	/* init the LDT */
2966 	pmap->pm_ldt = NULL;
2967 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2968 
2969 	return pmap;
2970 }
2971 
2972 /*
2973  * pmap_check_ptps: verify that none of the pmap's page table objects
2974  * have any pages allocated to them.
2975  */
2976 static void
2977 pmap_check_ptps(struct pmap *pmap)
2978 {
2979 	int i;
2980 
2981 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2982 		KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
2983 		    "pmap %p level %d still has %d pages",
2984 		    pmap, i, (int)pmap->pm_obj[i].uo_npages);
2985 	}
2986 }
2987 
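/*
 * pmap_check_inuse: (DEBUG) verify that the pmap being destroyed is not
 * still loaded on any CPU.
 */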
2988 static void
2989 pmap_check_inuse(struct pmap *pmap)
2990 {
2991 #ifdef DEBUG
2992 	CPU_INFO_ITERATOR cii;
2993 	struct cpu_info *ci;
2994 
2995 	for (CPU_INFO_FOREACH(cii, ci)) {
2996 		if (ci->ci_pmap == pmap)
2997 			panic("destroying pmap being used");
2998 #if defined(XENPV) && defined(__x86_64__)
2999 		for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
3000 			if (pmap->pm_pdir[i] != 0 &&
3001 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
3002 				printf("pmap_destroy(%p) pmap_kernel %p "
3003 				    "curcpu %d cpu %d ci_pmap %p "
3004 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
3005 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
3006 				    pmap, pmap_kernel(), curcpu()->ci_index,
3007 				    ci->ci_index, ci->ci_pmap,
3008 				    i, ci->ci_kpm_pdir[i],
3009 				    i, pmap->pm_pdir[i]);
3010 				panic("%s: used pmap", __func__);
3011 			}
3012 		}
3013 #endif
3014 	}
3015 #endif /* DEBUG */
3016 }
3017 
3018 /*
3019  * pmap_destroy:  drop reference count on pmap.  free pmap if reference
3020  * count goes to zero.
3021  *
3022  * => we can be called from pmap_unmap_ptes() with a different, unrelated
3023  *    pmap's lock held.  be careful!
3024  */
3025 void
3026 pmap_destroy(struct pmap *pmap)
3027 {
3028 	int i;
3029 
3030 	/*
3031 	 * drop reference count and verify not in use.
3032 	 */
3033 
3034 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
3035 		return;
3036 	}
3037 	pmap_check_inuse(pmap);
3038 
3039 	/*
3040 	 * handle any deferred frees.
3041 	 */
3042 
3043 	mutex_enter(&pmap->pm_lock);
3044 	if (pmap->pm_pve != NULL) {
3045 		pmap_free_pv(pmap, pmap->pm_pve);
3046 		pmap->pm_pve = NULL;
3047 	}
3048 	pmap_drain_pv(pmap);
3049 	mutex_exit(&pmap->pm_lock);
3050 	pmap_update(pmap);
3051 
3052 	/*
3053 	 * Reference count is zero, free pmap resources and then free pmap.
3054 	 */
3055 
3056 	pmap_check_ptps(pmap);
3057 	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
3058 
3059 #ifdef USER_LDT
3060 	if (pmap->pm_ldt != NULL) {
3061 		/*
3062 		 * No need to switch the LDT; this address space is gone,
3063 		 * nothing is using it.
3064 		 *
3065 		 * No need to lock the pmap for ldt_free (or anything else),
3066 		 * we're the last one to use it.
3067 		 */
3068 		/* XXXAD can't take cpu_lock here - fix soon. */
3069 		mutex_enter(&cpu_lock);
3070 		ldt_free(pmap->pm_ldt_sel);
3071 		mutex_exit(&cpu_lock);
3072 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
3073 		    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3074 	}
3075 #endif
3076 
3077 	for (i = 0; i < PTP_LEVELS - 1; i++) {
3078 		uvm_obj_destroy(&pmap->pm_obj[i], false);
3079 	}
3080 	kcpuset_zero(pmap->pm_cpus);
3081 	kcpuset_zero(pmap->pm_kernel_cpus);
3082 #ifdef XENPV
3083 	kcpuset_zero(pmap->pm_xen_ptp_cpus);
3084 #endif
3085 
3086 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
3087 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
3088 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
3089 
3090 	pmap_check_ptps(pmap);
3091 	if (__predict_false(pmap->pm_enter != NULL)) {
3092 		/* XXX make this a different cache */
3093 		pool_cache_destruct_object(&pmap_cache, pmap);
3094 	} else {
3095 		pool_cache_put(&pmap_cache, pmap);
3096 	}
3097 }
3098 
3099 /*
3100  * pmap_zap_ptp: clear out an entire PTP without modifying PTEs
3101  *
3102  * => caller must hold pmap's lock
3103  * => PTP must be mapped into KVA
3104  * => must be called with kernel preemption disabled
3105  * => does as little work as possible
3106  */
3107 static void
3108 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3109     vaddr_t startva, vaddr_t blkendva)
3110 {
3111 #ifndef XENPV
3112 	struct pv_entry *pve;
3113 	struct vm_page *pg;
3114 	struct pmap_page *pp;
3115 	pt_entry_t opte;
3116 	rb_tree_t *tree;
3117 	vaddr_t va;
3118 	int wired;
3119 	uint8_t oattrs;
3120 	u_int cnt;
3121 
3122 	KASSERT(mutex_owned(&pmap->pm_lock));
3123 	KASSERT(kpreempt_disabled());
3124 	KASSERT(pmap != pmap_kernel());
3125 	KASSERT(ptp->wire_count > 1);
3126 	KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
3127 
3128 	/*
3129 	 * Start at the lowest entered VA, and scan until there are no more
3130 	 * PTEs in the PTPs.
3131 	 */
3132 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
3133 	pve = RB_TREE_MIN(tree);
3134 	wired = 0;
3135 	va = (vaddr_t)ptp->uanon;
3136 	pte += ((va - startva) >> PAGE_SHIFT);
3137 
3138 	for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
3139 		/*
3140 		 * No need for an atomic to clear the PTE.  Nothing else can
3141 		 * see the address space any more and speculative access (if
3142 		 * possible) won't modify.  Therefore there's no need to
3143 		 * track the accessed/dirty bits.
3144 		 */
3145 		opte = *pte;
3146 		if (!pmap_valid_entry(opte)) {
3147 			continue;
3148 		}
3149 
3150 		/*
3151 		 * Count the PTE.  If it's not for a managed mapping
3152 		 * there's nothing more to do.
3153 		 */
3154 		cnt--;
3155 		wired -= (opte & PTE_WIRED);
3156 		if ((opte & PTE_PVLIST) == 0) {
3157 #ifndef DOM0OPS
3158 			KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3159 			    "managed page without PTE_PVLIST for %#"
3160 			    PRIxVADDR, va);
3161 			KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3162 			    "pv-tracked page without PTE_PVLIST for %#"
3163 			    PRIxVADDR, va);
3164 #endif
3165 			KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
3166 			    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
3167 			    va) == NULL);
3168 			continue;
3169 		}
3170 
3171 		/*
3172 		 * "pve" now points to the lowest (by VA) dynamic PV entry
3173 		 * in the PTP.  If it's for this VA, take advantage of it to
3174 		 * avoid calling PHYS_TO_VM_PAGE().  Avoid modifying the RB
3175 		 * tree by skipping to the next VA in the tree whenever
3176 		 * there is a match here.  The tree will be cleared out in
3177 		 * one pass before return to pmap_remove_all().
3178 		 */
3179 		oattrs = pmap_pte_to_pp_attrs(opte);
3180 		if (pve != NULL && pve->pve_pte.pte_va == va) {
3181 			pp = pve->pve_pp;
3182 			KASSERT(pve->pve_pte.pte_ptp == ptp);
3183 			KASSERT(pp->pp_pte.pte_ptp != ptp ||
3184 			    pp->pp_pte.pte_va != va);
3185 			mutex_spin_enter(&pp->pp_lock);
3186 			pp->pp_attrs |= oattrs;
3187 			LIST_REMOVE(pve, pve_list);
3188 			mutex_spin_exit(&pp->pp_lock);
3189 
3190 			/*
3191 			 * pve won't be touched again until pmap_drain_pv(),
3192 			 * so it's still safe to traverse the tree.
3193 			 */
3194 			pmap_free_pv(pmap, pve);
3195 			pve = RB_TREE_NEXT(tree, pve);
3196 			continue;
3197 		}
3198 
3199 		/*
3200 		 * No entry in the tree so it must be embedded.  Look up the
3201 		 * page and cancel the embedded entry.
3202 		 */
3203 		if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3204 			pp = VM_PAGE_TO_PP(pg);
3205 		} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3206 			paddr_t pa = pmap_pte2pa(opte);
3207 			panic("%s: PTE_PVLIST with pv-untracked page"
3208 			    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR
3209 			    "(%#"PRIxPADDR")", __func__, va, pa, atop(pa));
3210 		}
3211 		mutex_spin_enter(&pp->pp_lock);
3212 		KASSERT(pp->pp_pte.pte_ptp == ptp);
3213 		KASSERT(pp->pp_pte.pte_va == va);
3214 		pp->pp_attrs |= oattrs;
3215 		pp->pp_pte.pte_ptp = NULL;
3216 		pp->pp_pte.pte_va = 0;
3217 		mutex_spin_exit(&pp->pp_lock);
3218 	}
3219 
3220 	/* PTP now empty - adjust the tree & stats to match. */
3221 	pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
3222 	ptp->wire_count = 1;
3223 #ifdef DIAGNOSTIC
3224 	rb_tree_init(tree, &pmap_rbtree_ops);
3225 #endif
3226 #else	/* !XENPV */
3227 	/*
3228 	 * XXXAD For XEN, it's not clear to me that we can do this, because
3229 	 * I guess the hypervisor keeps track of PTEs too.
3230 	 */
3231 	pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
3232 #endif	/* !XENPV */
3233 }
3234 
3235 /*
3236  * pmap_remove_all: remove all mappings from pmap in bulk.
3237  *
3238  * Ordinarily when removing mappings it's important to hold the UVM object's
3239  * lock, so that pages do not gain a new identity while retaining stale TLB
3240  * entries (the same lock hold covers both pmap_remove() and pmap_update()).
3241  * Here it's known that the address space is no longer visible to any user
3242  * process, so we don't need to worry about that.
3243  */
3244 bool
3245 pmap_remove_all(struct pmap *pmap)
3246 {
3247 	struct vm_page *ptps[32];
3248 	vaddr_t va, blkendva;
3249 	struct pmap *pmap2;
3250 	pt_entry_t *ptes;
3251 	pd_entry_t pde __diagused;
3252 	pd_entry_t * const *pdes;
3253 	int lvl __diagused, i, n;
3254 
3255 	/* XXX Can't handle EPT just yet. */
3256 	if (pmap->pm_remove != NULL) {
3257 		return false;
3258 	}
3259 
3260 	for (;;) {
3261 		/* Fetch a block of PTPs from tree. */
3262 		mutex_enter(&pmap->pm_lock);
3263 		n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
3264 		    (void **)ptps, __arraycount(ptps), false);
3265 		if (n == 0) {
3266 			mutex_exit(&pmap->pm_lock);
3267 			break;
3268 		}
3269 
3270 		/* Remove all mappings in the set of PTPs. */
3271 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3272 		for (i = 0; i < n; i++) {
3273 			if (ptps[i]->wire_count == 0) {
3274 				/* It's dead: pmap_update() will expunge. */
3275 				continue;
3276 			}
3277 
3278 			/* Determine range of block. */
3279 			va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
3280 			blkendva = x86_round_pdr(va + 1);
3281 
3282 			/* Make sure everything squares up... */
3283 			KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
3284 			KASSERT(lvl == 1);
3285 			KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
3286 
3287 			/* Zap! */
3288 			pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
3289 			    blkendva);
3290 
3291 			/* PTP should now be unused - free it. */
3292 			KASSERT(ptps[i]->wire_count == 1);
3293 			pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
3294 		}
3295 		pmap_unmap_ptes(pmap, pmap2);
3296 		pmap_drain_pv(pmap);
3297 		pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
3298 		mutex_exit(&pmap->pm_lock);
3299 
3300 		/* Process deferred frees. */
3301 		pmap_update(pmap);
3302 
3303 		/* A breathing point. */
3304 		preempt_point();
3305 	}
3306 
3307 	/* Verify that the pmap is now completely empty. */
3308 	pmap_check_ptps(pmap);
3309 	KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
3310 	    "pmap %p not empty", pmap);
3311 
3312 	return true;
3313 }
3314 
3315 #if defined(PMAP_FORK)
3316 /*
3317  * pmap_fork: perform any necessary data structure manipulation when
3318  * a VM space is forked.
3319  */
3320 void
3321 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
3322 {
3323 #ifdef USER_LDT
3324 	union descriptor *new_ldt;
3325 	int sel;
3326 
3327 	if (__predict_true(pmap1->pm_ldt == NULL)) {
3328 		return;
3329 	}
3330 
3331 	/*
3332 	 * Copy the LDT into the new process.
3333 	 *
3334 	 * Read pmap1's ldt pointer unlocked; if it changes behind our back
3335 	 * we'll retry. This will starve if there's a stream of LDT changes
3336 	 * in another thread but that should not happen.
3337 	 */
3338 
3339 retry:
3340 	if (pmap1->pm_ldt != NULL) {
3341 		/* Allocate space for the new process's LDT */
3342 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
3343 		    MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
3344 		if (new_ldt == NULL) {
3345 			printf("WARNING: %s: unable to allocate LDT space\n",
3346 			    __func__);
3347 			return;
3348 		}
3349 		mutex_enter(&cpu_lock);
3350 		/* Get a GDT slot for it */
3351 		sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
3352 		if (sel == -1) {
3353 			mutex_exit(&cpu_lock);
3354 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3355 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3356 			printf("WARNING: %s: unable to allocate LDT selector\n",
3357 			    __func__);
3358 			return;
3359 		}
3360 	} else {
3361 		/* Wasn't anything there after all. */
3362 		new_ldt = NULL;
3363 		sel = -1;
3364 		mutex_enter(&cpu_lock);
3365 	}
3366 
3367 	/*
3368 	 * Now that we have cpu_lock, ensure the LDT status is the same.
3369 	 */
3370 	if (pmap1->pm_ldt != NULL) {
3371 		if (new_ldt == NULL) {
3372 			/* A wild LDT just appeared. */
3373 			mutex_exit(&cpu_lock);
3374 			goto retry;
3375 		}
3376 
3377 		/* Copy the LDT data and install it in pmap2 */
3378 		memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
3379 		pmap2->pm_ldt = new_ldt;
3380 		pmap2->pm_ldt_sel = sel;
3381 		mutex_exit(&cpu_lock);
3382 	} else {
3383 		if (new_ldt != NULL) {
3384 			/* The LDT disappeared, drop what we did. */
3385 			ldt_free(sel);
3386 			mutex_exit(&cpu_lock);
3387 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3388 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3389 			return;
3390 		}
3391 
3392 		/* We're good, just leave. */
3393 		mutex_exit(&cpu_lock);
3394 	}
3395 #endif /* USER_LDT */
3396 }
3397 #endif /* PMAP_FORK */
3398 
3399 #ifdef USER_LDT
3400 
3401 /*
3402  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
3403  * is active, reload LDTR.
3404  */
3405 static void
3406 pmap_ldt_xcall(void *arg1, void *arg2)
3407 {
3408 	struct pmap *pm;
3409 
3410 	kpreempt_disable();
3411 	pm = arg1;
3412 	if (curcpu()->ci_pmap == pm) {
3413 #if defined(SVS)
3414 		if (svs_enabled) {
3415 			svs_ldt_sync(pm);
3416 		} else
3417 #endif
3418 		lldt(pm->pm_ldt_sel);
3419 	}
3420 	kpreempt_enable();
3421 }
3422 
3423 /*
3424  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
3425  * in the new selector on all CPUs.
3426  */
3427 void
3428 pmap_ldt_sync(struct pmap *pm)
3429 {
3430 	uint64_t where;
3431 
3432 	KASSERT(mutex_owned(&cpu_lock));
3433 
3434 	pmap_ldt_evcnt.ev_count++;
3435 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
3436 	xc_wait(where);
3437 }
3438 
3439 /*
3440  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
3441  * restore the default.
3442  */
3443 void
3444 pmap_ldt_cleanup(struct lwp *l)
3445 {
3446 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
3447 	union descriptor *ldt;
3448 	int sel;
3449 
3450 	if (__predict_true(pmap->pm_ldt == NULL)) {
3451 		return;
3452 	}
3453 
3454 	mutex_enter(&cpu_lock);
3455 	if (pmap->pm_ldt != NULL) {
3456 		sel = pmap->pm_ldt_sel;
3457 		ldt = pmap->pm_ldt;
3458 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
3459 		pmap->pm_ldt = NULL;
3460 		pmap_ldt_sync(pmap);
3461 		ldt_free(sel);
3462 		uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
3463 		    UVM_KMF_WIRED);
3464 	}
3465 	mutex_exit(&cpu_lock);
3466 }
3467 #endif /* USER_LDT */
3468 
3469 /*
3470  * pmap_activate: activate a process' pmap
3471  *
3472  * => must be called with kernel preemption disabled
3473  * => if lwp is the curlwp, then set ci_want_pmapload so that
3474  *    actual MMU context switch will be done by pmap_load() later
3475  */
3476 void
3477 pmap_activate(struct lwp *l)
3478 {
3479 	struct cpu_info *ci;
3480 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3481 
3482 	KASSERT(kpreempt_disabled());
3483 
3484 	ci = curcpu();
3485 
3486 	if (l != ci->ci_curlwp)
3487 		return;
3488 
3489 	KASSERT(ci->ci_want_pmapload == 0);
3490 	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
3491 
3492 	/*
3493 	 * no need to switch to kernel vmspace because
3494 	 * it's a subset of any vmspace.
3495 	 */
3496 
3497 	if (pmap == pmap_kernel()) {
3498 		ci->ci_want_pmapload = 0;
3499 		return;
3500 	}
3501 
3502 	ci->ci_want_pmapload = 1;
3503 }
3504 
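/*
 * KASSERT_PDIRPA: assert that the given pmap's page directory is the one
 * currently loaded on this CPU (%cr3, the PAE L3, or the Xen user pgd, as
 * appropriate).
 */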
3505 #if defined(XENPV) && defined(__x86_64__)
3506 #define	KASSERT_PDIRPA(pmap) \
3507 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
3508 	    pmap == pmap_kernel())
3509 #elif defined(PAE)
3510 #define	KASSERT_PDIRPA(pmap) \
3511 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
3512 #elif !defined(XENPV)
3513 #define	KASSERT_PDIRPA(pmap) \
3514 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
3515 #else
3516 #define	KASSERT_PDIRPA(pmap)	KASSERT(true)	/* nothing to do */
3517 #endif
3518 
3519 /*
3520  * pmap_reactivate: try to regain reference to the pmap.
3521  *
3522  * => Must be called with kernel preemption disabled.
3523  */
3524 static void
3525 pmap_reactivate(struct pmap *pmap)
3526 {
3527 	struct cpu_info * const ci = curcpu();
3528 	const cpuid_t cid = cpu_index(ci);
3529 
3530 	KASSERT(kpreempt_disabled());
3531 	KASSERT_PDIRPA(pmap);
3532 
3533 	/*
3534 	 * If we still have a lazy reference to this pmap, we can assume
3535 	 * that there was no TLB shootdown for this pmap in the meantime.
3536 	 *
3537 	 * The order of events here is important as we must synchronize
3538 	 * with TLB shootdown interrupts.  Declare interest in invalidations
3539 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
3540 	 * change only when the state is TLBSTATE_LAZY.
3541 	 */
3542 
3543 	ci->ci_tlbstate = TLBSTATE_VALID;
3544 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3545 
3546 	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
3547 		/* We have the reference, state is valid. */
3548 	} else {
3549 		/*
3550 		 * Must reload the TLB: the pmap was changed while we
3551 		 * were deactivated.
3552 		 */
3553 		kcpuset_atomic_set(pmap->pm_cpus, cid);
3554 
3555 		tlbflush();
3556 	}
3557 }
3558 
3559 /*
3560  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
3561  * and relevant LDT info.
3562  *
3563  * Ensures that the current process' pmap is loaded on the current CPU's
3564  * MMU and that there are no stale TLB entries.
3565  *
3566  * => The caller should disable kernel preemption or do check-and-retry
3567  *    to prevent a preemption from undoing our efforts.
3568  * => This function may block.
3569  */
3570 void
3571 pmap_load(void)
3572 {
3573 	struct cpu_info *ci;
3574 	struct pmap *pmap, *oldpmap;
3575 	struct lwp *l;
3576 	uint64_t ncsw;
3577 	int ilevel __diagused;
3578 	u_long psl __diagused;
3579 
3580 	kpreempt_disable();
3581  retry:
3582 	ci = curcpu();
3583 	if (!ci->ci_want_pmapload) {
3584 		kpreempt_enable();
3585 		return;
3586 	}
3587 	l = ci->ci_curlwp;
3588 	ncsw = l->l_ncsw;
3589 	__insn_barrier();
3590 
3591 	/* should be able to take ipis. */
3592 	KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel);
3593 #ifdef XENPV
3594 	/* Check to see if interrupts are enabled (i.e. no events are masked) */
3595 	KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl);
3596 #else
3597 	KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl);
3598 #endif
3599 
3600 	KASSERT(l != NULL);
3601 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3602 	KASSERT(pmap != pmap_kernel());
3603 	oldpmap = ci->ci_pmap;
3604 
3605 	if (pmap == oldpmap) {
3606 		pmap_reactivate(pmap);
3607 		ci->ci_want_pmapload = 0;
3608 		kpreempt_enable();
3609 		return;
3610 	}
3611 
3612 	/*
3613 	 * Acquire a reference to the new pmap and perform the switch.
3614 	 */
3615 
3616 	pmap_reference(pmap);
3617 	pmap_load1(l, pmap, oldpmap);
3618 	ci->ci_want_pmapload = 0;
3619 
3620 	/*
3621 	 * we're now running with the new pmap.  drop the reference
3622 	 * to the old pmap.  if we block, we need to go around again.
3623 	 */
3624 
3625 	pmap_destroy(oldpmap);
3626 	__insn_barrier();
3627 	if (l->l_ncsw != ncsw) {
3628 		goto retry;
3629 	}
3630 
3631 	kpreempt_enable();
3632 }
3633 
3634 /*
3635  * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
3636  * pmap_load().  It's critically important that this function does not
3637  * block.
3638  */
3639 static void
3640 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
3641 {
3642 	struct cpu_info *ci;
3643 	struct pcb *pcb;
3644 	cpuid_t cid;
3645 
3646 	KASSERT(kpreempt_disabled());
3647 
3648 	pcb = lwp_getpcb(l);
3649 	ci = l->l_cpu;
3650 	cid = cpu_index(ci);
3651 
3652 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3653 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3654 
3655 	KASSERT_PDIRPA(oldpmap);
3656 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3657 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3658 
3659 	/*
3660 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
3661 	 * with TLB shootdown interrupts, so set the state VALID first,
3662 	 * then register us for shootdown events on this pmap.
3663 	 */
3664 	ci->ci_tlbstate = TLBSTATE_VALID;
3665 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3666 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3667 	ci->ci_pmap = pmap;
3668 
3669 	/*
3670 	 * update tss.  now that we have registered for invalidations
3671 	 * from other CPUs, we're good to load the page tables.
3672 	 */
3673 #ifdef PAE
3674 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
3675 #else
3676 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
3677 #endif
3678 
3679 #ifdef i386
3680 #ifndef XENPV
3681 	ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
3682 	ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
3683 #endif
3684 #endif
3685 
3686 #if defined(SVS) && defined(USER_LDT)
3687 	if (svs_enabled) {
3688 		svs_ldt_sync(pmap);
3689 	} else
3690 #endif
3691 	lldt(pmap->pm_ldt_sel);
3692 
3693 	cpu_load_pmap(pmap, oldpmap);
3694 }
3695 
3696 /*
3697  * pmap_deactivate: deactivate a process' pmap.
3698  *
3699  * => Must be called with kernel preemption disabled (high IPL is enough).
3700  */
3701 void
3702 pmap_deactivate(struct lwp *l)
3703 {
3704 	struct pmap *pmap;
3705 	struct cpu_info *ci;
3706 
3707 	KASSERT(kpreempt_disabled());
3708 
3709 	if (l != curlwp) {
3710 		return;
3711 	}
3712 
3713 	/*
3714 	 * Wait for pending TLB shootdowns to complete.  Necessary because
3715 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
3716 	 * the CPU before it has a chance to call pmap_update(), e.g. due
3717 	 * to kernel preemption or a blocking routine in between.
3718 	 */
3719 	pmap_tlb_shootnow();
3720 
3721 	ci = curcpu();
3722 
3723 	if (ci->ci_want_pmapload) {
3724 		/*
3725 		 * ci_want_pmapload means that our pmap is not loaded on
3726 		 * the CPU, or the TLB might be stale.  note that pmap_kernel()
3727 		 * is always considered loaded.
3728 		 */
3729 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3730 		    != pmap_kernel());
3731 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3732 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3733 
3734 		/*
3735 		 * userspace has not been touched.
3736 		 * nothing to do here.
3737 		 */
3738 
3739 		ci->ci_want_pmapload = 0;
3740 		return;
3741 	}
3742 
3743 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3744 
3745 	if (pmap == pmap_kernel()) {
3746 		return;
3747 	}
3748 
3749 	KASSERT_PDIRPA(pmap);
3750 	KASSERT(ci->ci_pmap == pmap);
3751 
3752 	/*
3753 	 * we aren't interested in TLB invalidations for this pmap,
3754 	 * at least for the time being.
3755 	 */
3756 
3757 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3758 	ci->ci_tlbstate = TLBSTATE_LAZY;
3759 }
3760 
3761 #ifdef EFI_RUNTIME
3762 
3763 extern struct pmap *efi_runtime_pmap;
3764 
3765 /*
3766  * pmap_is_user: true if pmap, which must not be the kernel pmap, is
3767  * for an unprivileged user process
3768  */
3769 bool
3770 pmap_is_user(struct pmap *pmap)
3771 {
3772 
3773 	KASSERT(pmap != pmap_kernel());
3774 	return (pmap != efi_runtime_pmap);
3775 }
3776 
3777 /*
3778  * pmap_activate_sync: synchronously activate specified pmap.
3779  *
3780  * => Must be called with kernel preemption disabled (high IPL is enough).
3781  * => Must not sleep before pmap_deactivate_sync.
3782  */
3783 void *
3784 pmap_activate_sync(struct pmap *pmap)
3785 {
3786 	struct cpu_info *ci = curcpu();
3787 	struct pmap *oldpmap = ci->ci_pmap;
3788 	unsigned cid = cpu_index(ci);
3789 
3790 	KASSERT(kpreempt_disabled());
3791 	KASSERT(pmap != pmap_kernel());
3792 
3793 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3794 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3795 
3796 	if (oldpmap) {
3797 		KASSERT_PDIRPA(oldpmap);
3798 		kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3799 		kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3800 	}
3801 
3802 	ci->ci_tlbstate = TLBSTATE_VALID;
3803 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3804 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3805 	ci->ci_pmap = pmap;
3806 
3807 #if defined(SVS) && defined(USER_LDT)
3808 	if (svs_enabled) {
3809 		svs_ldt_sync(pmap);
3810 	} else
3811 #endif
3812 	lldt(pmap->pm_ldt_sel);
3813 
3814 	cpu_load_pmap(pmap, oldpmap);
3815 
3816 	return oldpmap;
3817 }
3818 
3819 /*
3820  * pmap_deactivate_sync: synchronously deactivate specified pmap and
3821  * restore whatever was active before pmap_activate_sync.
3822  *
3823  * => Must be called with kernel preemption disabled (high IPL is enough).
3824  * => Must not have slept since pmap_activate_sync.
3825  */
3826 void
3827 pmap_deactivate_sync(struct pmap *pmap, void *cookie)
3828 {
3829 	struct cpu_info *ci = curcpu();
3830 	struct pmap *oldpmap = cookie;
3831 	unsigned cid = cpu_index(ci);
3832 
3833 	KASSERT(kpreempt_disabled());
3834 	KASSERT(pmap != pmap_kernel());
3835 	KASSERT(ci->ci_pmap == pmap);
3836 
3837 	KASSERT_PDIRPA(pmap);
3838 
3839 	KASSERT(kcpuset_isset(pmap->pm_cpus, cid));
3840 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3841 
3842 	pmap_tlb_shootnow();
3843 
3844 	kcpuset_atomic_clear(pmap->pm_cpus, cid);
3845 	kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid);
3846 
3847 	ci->ci_tlbstate = TLBSTATE_VALID;
3848 	ci->ci_pmap = oldpmap;
3849 	if (oldpmap) {
3850 		kcpuset_atomic_set(oldpmap->pm_cpus, cid);
3851 		kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid);
3852 #if defined(SVS) && defined(USER_LDT)
3853 		if (svs_enabled) {
3854 			svs_ldt_sync(oldpmap);
3855 		} else
3856 #endif
3857 		lldt(oldpmap->pm_ldt_sel);
3858 		cpu_load_pmap(oldpmap, pmap);
3859 	} else {
3860 		lcr3(pmap_pdirpa(pmap_kernel(), 0));
3861 	}
3862 }
3863 
3864 #endif	/* EFI_RUNTIME */
3865 
3866 /*
3867  * some misc. functions
3868  */
3869 
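/*
 * pmap_pdes_valid: walk the page directory hierarchy for a VA, from the
 * top level down.
 *
 * => returns false (with the failing level in *lastlvl) at the first
 *    non-present entry
 * => stops early at a large page (PTE_PS) mapping
 * => on success, returns the last PDE examined via *lastpde (if not NULL)
 *    and its level via *lastlvl
 */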
3870 bool
3871 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
3872     int *lastlvl)
3873 {
3874 	unsigned long index;
3875 	pd_entry_t pde;
3876 	int i;
3877 
3878 	for (i = PTP_LEVELS; i > 1; i--) {
3879 		index = pl_i(va, i);
3880 		pde = pdes[i - 2][index];
3881 		if ((pde & PTE_P) == 0) {
3882 			*lastlvl = i;
3883 			return false;
3884 		}
3885 		if (pde & PTE_PS)
3886 			break;
3887 	}
3888 	if (lastpde != NULL)
3889 		*lastpde = pde;
3890 	*lastlvl = i;
3891 	return true;
3892 }
3893 
3894 /*
3895  * pmap_extract: extract a PA for the given VA
3896  */
3897 bool
3898 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3899 {
3900 	pt_entry_t *ptes, pte;
3901 	pd_entry_t pde;
3902 	pd_entry_t * const *pdes;
3903 	struct pmap *pmap2;
3904 	paddr_t pa;
3905 	bool rv;
3906 	int lvl;
3907 
3908 	if (__predict_false(pmap->pm_extract != NULL)) {
3909 		return (*pmap->pm_extract)(pmap, va, pap);
3910 	}
3911 
3912 #ifdef __HAVE_DIRECT_MAP
3913 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3914 		if (pap != NULL) {
3915 			*pap = PMAP_DIRECT_UNMAP(va);
3916 		}
3917 		return true;
3918 	}
3919 #endif
3920 
3921 	rv = false;
3922 	pa = 0;
3923 
3924 	if (pmap != pmap_kernel()) {
3925 		mutex_enter(&pmap->pm_lock);
3926 	}
3927 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3928 	if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
3929 		if (lvl == 2) {
3930 			pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
3931 			rv = true;
3932 		} else {
3933 			KASSERT(lvl == 1);
3934 			pte = ptes[pl1_i(va)];
3935 			if (__predict_true((pte & PTE_P) != 0)) {
3936 				pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3937 				rv = true;
3938 			}
3939 		}
3940 	}
3941 	pmap_unmap_ptes(pmap, pmap2);
3942 	if (pmap != pmap_kernel()) {
3943 		mutex_exit(&pmap->pm_lock);
3944 	}
3945 	if (pap != NULL) {
3946 		*pap = pa;
3947 	}
3948 
3949 	return rv;
3950 }
3951 
3952 /*
3953  * vtophys: virtual address to physical address.  For use by
3954  * machine-dependent code only.
3955  */
3956 paddr_t
3957 vtophys(vaddr_t va)
3958 {
3959 	paddr_t pa;
3960 
3961 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
3962 		return pa;
3963 	return 0;
3964 }
3965 
3966 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3967 
3968 #ifdef XENPV
3969 /*
3970  * vtomach: virtual address to machine address.  For use by
3971  * machine-dependent code only.
3972  */
3973 paddr_t
3974 vtomach(vaddr_t va)
3975 {
3976 	paddr_t pa;
3977 
3978 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3979 		return pa;
3980 	return 0;
3981 }
3982 #endif
3983 
3984 /*
3985  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3986  * determine the bounds of the kernel virtual address space.
3987  */
3988 void
3989 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3990 {
3991 	*startp = virtual_avail;
3992 	*endp = virtual_end;
3993 }
3994 
3995 void
3996 pmap_zero_page(paddr_t pa)
3997 {
3998 #if defined(__HAVE_DIRECT_MAP)
3999 	memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
4000 #else
4001 #if defined(XENPV)
4002 	if (XEN_VERSION_SUPPORTED(3, 4)) {
4003 		xen_pagezero(pa);
4004 		return;
4005 	}
4006 #endif
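	/*
	 * No direct map: borrow this CPU's reserved VPAGE_ZER window.
	 * Map the target page there, zero it through the window, and
	 * (on DIAGNOSTIC or XENPV kernels) tear the mapping down again.
	 */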
4007 	struct cpu_info *ci;
4008 	pt_entry_t *zpte;
4009 	vaddr_t zerova;
4010 
4011 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
4012 
4013 	kpreempt_disable();
4014 
4015 	ci = curcpu();
4016 	zerova = ci->vpage[VPAGE_ZER];
4017 	zpte = ci->vpage_pte[VPAGE_ZER];
4018 
4019 	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
4020 
4021 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
4022 	pmap_pte_flush();
4023 	pmap_update_pg(zerova);		/* flush TLB */
4024 
4025 	memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);
4026 
4027 #if defined(DIAGNOSTIC) || defined(XENPV)
4028 	pmap_pte_set(zpte, 0);				/* zap ! */
4029 	pmap_pte_flush();
4030 #endif
4031 
4032 	kpreempt_enable();
4033 #endif /* defined(__HAVE_DIRECT_MAP) */
4034 }
4035 
4036 void
4037 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
4038 {
4039 #if defined(__HAVE_DIRECT_MAP)
4040 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
4041 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
4042 
4043 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
4044 #else
4045 #if defined(XENPV)
4046 	if (XEN_VERSION_SUPPORTED(3, 4)) {
4047 		xen_copy_page(srcpa, dstpa);
4048 		return;
4049 	}
4050 #endif
4051 	struct cpu_info *ci;
4052 	pt_entry_t *srcpte, *dstpte;
4053 	vaddr_t srcva, dstva;
4054 
4055 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
4056 
4057 	kpreempt_disable();
4058 
4059 	ci = curcpu();
4060 	srcva = ci->vpage[VPAGE_SRC];
4061 	dstva = ci->vpage[VPAGE_DST];
4062 	srcpte = ci->vpage_pte[VPAGE_SRC];
4063 	dstpte = ci->vpage_pte[VPAGE_DST];
4064 
4065 	KASSERT(*srcpte == 0 && *dstpte == 0);
4066 
4067 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
4068 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
4069 	pmap_pte_flush();
4070 	pmap_update_pg(srcva);
4071 	pmap_update_pg(dstva);
4072 
4073 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
4074 
4075 #if defined(DIAGNOSTIC) || defined(XENPV)
4076 	pmap_pte_set(srcpte, 0);
4077 	pmap_pte_set(dstpte, 0);
4078 	pmap_pte_flush();
4079 #endif
4080 
4081 	kpreempt_enable();
4082 #endif /* defined(__HAVE_DIRECT_MAP) */
4083 }
4084 
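/*
 * pmap_map_ptp: map a PTP for temporary access by the current CPU.
 *
 * => with a direct map, simply returns the PTP's direct-map address
 * => otherwise, installs the PTP at this CPU's reserved VPAGE_PTP
 *    window and returns that virtual address
 * => must be called with kernel preemption disabled
 */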
4085 static pt_entry_t *
4086 pmap_map_ptp(struct vm_page *ptp)
4087 {
4088 #ifdef __HAVE_DIRECT_MAP
4089 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
4090 #else
4091 	struct cpu_info *ci;
4092 	pt_entry_t *ptppte;
4093 	vaddr_t ptpva;
4094 
4095 	KASSERT(kpreempt_disabled());
4096 
4097 #ifndef XENPV
4098 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
4099 #else
4100 	const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
4101 #endif
4102 
4103 	ci = curcpu();
4104 	ptpva = ci->vpage[VPAGE_PTP];
4105 	ptppte = ci->vpage_pte[VPAGE_PTP];
4106 
4107 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
4108 
4109 	pmap_pte_flush();
4110 	pmap_update_pg(ptpva);
4111 
4112 	return (pt_entry_t *)ptpva;
4113 #endif
4114 }
4115 
4116 static void
4117 pmap_unmap_ptp(void)
4118 {
4119 #ifndef __HAVE_DIRECT_MAP
4120 #if defined(DIAGNOSTIC) || defined(XENPV)
4121 	struct cpu_info *ci;
4122 	pt_entry_t *pte;
4123 
4124 	KASSERT(kpreempt_disabled());
4125 
4126 	ci = curcpu();
4127 	pte = ci->vpage_pte[VPAGE_PTP];
4128 
4129 	if (*pte != 0) {
4130 		pmap_pte_set(pte, 0);
4131 		pmap_pte_flush();
4132 	}
4133 #endif
4134 #endif
4135 }
4136 
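/*
 * pmap_map_pte: return a pointer to the PTE that maps 'va'.
 *
 * => uses the recursive PTE_BASE mapping if 'pmap' is the current pmap,
 *    otherwise temporarily maps the given PTP via pmap_map_ptp()
 * => undone with pmap_unmap_pte()
 */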
4137 static pt_entry_t *
4138 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
4139 {
4140 
4141 	KASSERT(kpreempt_disabled());
4142 	if (pmap_is_curpmap(pmap)) {
4143 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
4144 	}
4145 	KASSERT(ptp != NULL);
4146 	return pmap_map_ptp(ptp) + pl1_pi(va);
4147 }
4148 
4149 static void
4150 pmap_unmap_pte(void)
4151 {
4152 
4153 	KASSERT(kpreempt_disabled());
4154 
4155 	pmap_unmap_ptp();
4156 }
4157 
4158 /*
4159  * p m a p   r e m o v e   f u n c t i o n s
4160  *
4161  * functions that remove mappings
4162  */
4163 
4164 /*
4165  * pmap_remove_ptes: remove PTEs from a PTP
4166  *
4167  * => caller must hold pmap's lock
4168  * => PTP must be mapped into KVA
4169  * => PTP should be null if pmap == pmap_kernel()
4170  * => must be called with kernel preemption disabled
4171  * => issues TLB shootdowns (via pmap_remove_pte) for PTEs that need it
4172  */
4173 static void
4174 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
4175     vaddr_t startva, vaddr_t endva)
4176 {
4177 	pt_entry_t *pte = (pt_entry_t *)ptpva;
4178 
4179 	KASSERT(mutex_owned(&pmap->pm_lock));
4180 	KASSERT(kpreempt_disabled());
4181 
4182 	/*
4183 	 * mappings are very often sparse, so clip the given range to the
4184 	 * range of PTEs that are known present in the PTP.
4185 	 */
4186 	pmap_ptp_range_clip(ptp, &startva, &pte);
4187 
4188 	/*
4189 	 * note that ptpva points to the PTE that maps startva.   this may
4190 	 * or may not be the first PTE in the PTP.
4191 	 *
4192 	 * we loop through the PTP while there are still PTEs to look at
4193 	 * and the wire_count is greater than 1 (because we use the wire_count
4194 	 * to keep track of the number of real PTEs in the PTP).
4195 	 */
4196 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
4197 		(void)pmap_remove_pte(pmap, ptp, pte, startva);
4198 		startva += PAGE_SIZE;
4199 		pte++;
4200 	}
4201 }
4202 
4203 /*
4204  * pmap_remove_pte: remove a single PTE from a PTP.
4205  *
4206  * => caller must hold pmap's lock
4207  * => PTP must be mapped into KVA
4208  * => PTP should be null if pmap == pmap_kernel()
4209  * => returns true if we removed a mapping
4210  * => must be called with kernel preemption disabled
4211  */
4212 static bool
4213 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
4214     vaddr_t va)
4215 {
4216 	struct pv_entry *pve;
4217 	struct vm_page *pg;
4218 	struct pmap_page *pp;
4219 	pt_entry_t opte;
4220 
4221 	KASSERT(mutex_owned(&pmap->pm_lock));
4222 	KASSERT(kpreempt_disabled());
4223 
4224 	if (!pmap_valid_entry(*pte)) {
4225 		/* VA not mapped. */
4226 		return false;
4227 	}
4228 
4229 	/* Atomically save the old PTE and zap it. */
4230 	opte = pmap_pte_testset(pte, 0);
4231 	if (!pmap_valid_entry(opte)) {
4232 		return false;
4233 	}
4234 
4235 	pmap_exec_account(pmap, va, opte, 0);
4236 	pmap_stats_update_bypte(pmap, 0, opte);
4237 
4238 	if (ptp) {
4239 		/*
4240 		 * Dropping a PTE.  Make sure that the PDE is flushed.
4241 		 */
4242 		ptp->wire_count--;
4243 		if (ptp->wire_count <= 1) {
4244 			opte |= PTE_A;
4245 		}
4246 	}
4247 
4248 	if ((opte & PTE_A) != 0) {
4249 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
4250 	}
4251 
4252 	/*
4253 	 * If we are not on a pv list - we are done.
4254 	 */
4255 	if ((opte & PTE_PVLIST) == 0) {
4256 #ifndef DOM0OPS
4257 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
4258 		    "managed page without PTE_PVLIST for %#"PRIxVADDR, va);
4259 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
4260 		    "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
4261 #endif
4262 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
4263 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
4264 		return true;
4265 	}
4266 
4267 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4268 		pp = VM_PAGE_TO_PP(pg);
4269 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
4270 		paddr_t pa = pmap_pte2pa(opte);
4271 		panic("%s: PTE_PVLIST with pv-untracked page"
4272 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
4273 		    __func__, va, pa, atop(pa));
4274 	}
4275 
4276 	/* Sync R/M bits. */
4277 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
4278 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
4279 	return true;
4280 }
4281 
4282 static void
4283 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4284 {
4285 	pt_entry_t *ptes;
4286 	pd_entry_t pde;
4287 	pd_entry_t * const *pdes;
4288 	bool result;
4289 	vaddr_t blkendva, va = sva;
4290 	struct vm_page *ptp;
4291 	struct pmap *pmap2;
4292 	int lvl;
4293 
4294 	KASSERT(mutex_owned(&pmap->pm_lock));
4295 
4296 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4297 
4298 	/*
4299 	 * removing one page?  take shortcut function.
4300 	 */
4301 
4302 	if (va + PAGE_SIZE == eva) {
4303 		if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4304 			KASSERT(lvl == 1);
4305 
4306 			/* Get PTP if non-kernel mapping. */
4307 			if (pmap != pmap_kernel()) {
4308 				ptp = pmap_find_ptp(pmap, va, 1);
4309 				KASSERTMSG(ptp != NULL,
4310 				    "%s: unmanaged PTP detected", __func__);
4311 			} else {
4312 				/* Never free kernel PTPs. */
4313 				ptp = NULL;
4314 			}
4315 
4316 			result = pmap_remove_pte(pmap, ptp,
4317 			    &ptes[pl1_i(va)], va);
4318 
4319 			/*
4320 			 * if mapping removed and the PTP is no longer
4321 			 * being used, free it!
4322 			 */
4323 
4324 			if (result && ptp && ptp->wire_count <= 1)
4325 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4326 		}
4327 	} else for (/* null */ ; va < eva ; va = blkendva) {
4328 		/* determine range of block */
4329 		blkendva = x86_round_pdr(va+1);
4330 		if (blkendva > eva)
4331 			blkendva = eva;
4332 
4333 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4334 			/* Skip a range corresponding to an invalid pde. */
4335 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
4336 			continue;
4337 		}
4338 		KASSERT(lvl == 1);
4339 
4340 		/* Get PTP if non-kernel mapping. */
4341 		if (pmap != pmap_kernel()) {
4342 			ptp = pmap_find_ptp(pmap, va, 1);
4343 			KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
4344 			    __func__);
4345 		} else {
4346 			/* Never free kernel PTPs. */
4347 			ptp = NULL;
4348 		}
4349 
4350 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
4351 		    blkendva);
4352 
4353 		/* If PTP is no longer being used, free it. */
4354 		if (ptp && ptp->wire_count <= 1) {
4355 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4356 		}
4357 	}
4358 	pmap_unmap_ptes(pmap, pmap2);
4359 	pmap_drain_pv(pmap);
4360 }
4361 
4362 /*
4363  * pmap_remove: mapping removal function.
4364  *
4365  * => caller should not be holding any pmap locks
4366  */
4367 void
4368 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4369 {
4370 	if (__predict_false(pmap->pm_remove != NULL)) {
4371 		(*pmap->pm_remove)(pmap, sva, eva);
4372 		return;
4373 	}
4374 
4375 	mutex_enter(&pmap->pm_lock);
4376 	pmap_remove_locked(pmap, sva, eva);
4377 	mutex_exit(&pmap->pm_lock);
4378 }
4379 
4380 /*
4381  * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
4382  *
4383  * => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
4384  * => Caller should disable kernel preemption.
4385  * => issues tlb shootdowns if necessary.
4386  */
4387 static int
4388 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
4389     pt_entry_t *optep)
4390 {
4391 	struct pmap *pmap;
4392 	struct vm_page *ptp;
4393 	vaddr_t va;
4394 	pt_entry_t *ptep;
4395 	pt_entry_t opte;
4396 	pt_entry_t npte;
4397 	pt_entry_t expect;
4398 	bool need_shootdown;
4399 
4400 	ptp = pvpte->pte_ptp;
4401 	va = pvpte->pte_va;
4402 	KASSERT(ptp == NULL || ptp->uobject != NULL);
4403 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
4404 	pmap = ptp_to_pmap(ptp);
4405 	KASSERT(kpreempt_disabled());
4406 
4407 	if (__predict_false(pmap->pm_sync_pv != NULL)) {
4408 		return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
4409 		    optep);
4410 	}
4411 
4412 	expect = pmap_pa2pte(pa) | PTE_P;
4413 
4414 	if (clearbits != ~0) {
4415 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
4416 		clearbits = pmap_pp_attrs_to_pte(clearbits);
4417 	}
4418 
4419 	ptep = pmap_map_pte(pmap, ptp, va);
4420 	do {
4421 		opte = *ptep;
4422 		KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
4423 		KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
4424 		KASSERT(opte == 0 || (opte & PTE_P) != 0);
4425 		if ((opte & (PTE_FRAME | PTE_P)) != expect) {
4426 			/*
4427 			 * We lost a race with a V->P operation like
4428 			 * pmap_remove().  Wait for the competitor to
4429 			 * finish reflecting the PTE bits into pp_attrs.
4430 			 */
4431 			pmap_unmap_pte();
4432 			return EAGAIN;
4433 		}
4434 
4435 		/*
4436 		 * Check if there's anything to do on this PTE.
4437 		 */
4438 		if ((opte & clearbits) == 0) {
4439 			need_shootdown = false;
4440 			break;
4441 		}
4442 
4443 		/*
4444 		 * We need a shootdown if the PTE is cached in the TLB
4445 		 * (PTE_A), unless we are clearing only the PTE_W bit and
4446 		 * the entry has not been cached as writable (PTE_D clear).
4447 		 */
4448 		need_shootdown = (opte & PTE_A) != 0 &&
4449 		    !(clearbits == PTE_W && (opte & PTE_D) == 0);
4450 
4451 		npte = opte & ~clearbits;
4452 
4453 		/*
4454 		 * If we need a shootdown anyway, clear PTE_A and PTE_D.
4455 		 */
4456 		if (need_shootdown) {
4457 			npte &= ~(PTE_A | PTE_D);
4458 		}
4459 		KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
4460 		KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
4461 		KASSERT(npte == 0 || (opte & PTE_P) != 0);
4462 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4463 
4464 	if (need_shootdown) {
4465 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
4466 	}
4467 	pmap_unmap_pte();
4468 
4469 	*oattrs = pmap_pte_to_pp_attrs(opte);
4470 	if (optep != NULL)
4471 		*optep = opte;
4472 	return 0;
4473 }
4474 
4475 static void
4476 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
4477     vaddr_t va)
4478 {
4479 	struct pmap *pmap2;
4480 	pt_entry_t *ptes;
4481 	pd_entry_t * const *pdes;
4482 
4483 	KASSERT(mutex_owned(&pmap->pm_lock));
4484 
4485 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4486 	pmap_stats_update_bypte(pmap, 0, opte);
4487 	ptp->wire_count--;
4488 	if (ptp->wire_count <= 1) {
4489 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4490 	}
4491 	pmap_unmap_ptes(pmap, pmap2);
4492 }
4493 
4494 static void
4495 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
4496 {
4497 	struct pv_pte *pvpte;
4498 	struct vm_page *ptp;
4499 	uintptr_t sum;
4500 	uint8_t oattrs;
4501 	bool locked;
4502 
4503 	/*
4504 	 * Do an unlocked check to see if the page has no mappings, e.g. when
4505 	 * pmap_remove_all() was called before amap_wipeout() for a process
4506 	 * private amap - common.  The page being removed must be on the way
4507 	 * out, so we don't have to worry about concurrent attempts to enter
4508 	 * it (otherwise the caller either doesn't care or has screwed up).
4509 	 */
4510 	sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
4511 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
4512 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
4513 	if (sum == 0) {
4514 		return;
4515 	}
4516 
4517 	kpreempt_disable();
4518 	for (;;) {
4519 		struct pmap *pmap;
4520 		struct pv_entry *pve;
4521 		pt_entry_t opte;
4522 		vaddr_t va;
4523 
4524 		mutex_spin_enter(&pp->pp_lock);
4525 		if ((pvpte = pv_pte_first(pp)) == NULL) {
4526 			mutex_spin_exit(&pp->pp_lock);
4527 			break;
4528 		}
4529 
4530 		/*
4531 		 * Add a reference to the pmap before clearing the pte.
4532 		 * Otherwise the pmap can disappear behind us.
4533 		 */
4534 		ptp = pvpte->pte_ptp;
4535 		pmap = ptp_to_pmap(ptp);
4536 		KASSERT(pmap->pm_obj[0].uo_refs > 0);
4537 		if (ptp != NULL) {
4538 			pmap_reference(pmap);
4539 		}
4540 
4541 		/*
4542 		 * Now try to lock it.  We need a direct handoff between
4543 		 * pp_lock and pm_lock to know the pv_entry is kept intact
4544 		 * and kept associated with this pmap.  If that can't be
4545 		 * had, wait for the pmap's lock to become free and then
4546 		 * retry.
4547 		 */
4548 		locked = mutex_tryenter(&pmap->pm_lock);
4549 		mutex_spin_exit(&pp->pp_lock);
4550 		if (!locked) {
4551 			mutex_enter(&pmap->pm_lock);
4552 			/* nothing, just wait for it */
4553 			mutex_exit(&pmap->pm_lock);
4554 			if (ptp != NULL) {
4555 				pmap_destroy(pmap);
4556 			}
4557 			continue;
4558 		}
4559 		va = pvpte->pte_va;
4560 
4561 		KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
4562 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4563 		KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
4564 		    "va %lx pmap %p ptp %p is free", va, pmap, ptp);
4565 		KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
4566 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4567 
4568 #ifdef DEBUG
4569 		pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
4570 		rb_tree_t *tree = (ptp != NULL ?
4571 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
4572 		pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4573 		if (pve == NULL) {
4574 			KASSERTMSG(&pp->pp_pte == pvpte,
4575 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
4576 			    va, pmap, ptp, pvpte, pve);
4577 		} else {
4578 			KASSERTMSG(&pve->pve_pte == pvpte,
4579 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
4580 			    va, pmap, ptp, pvpte, pve);
4581 		}
4582 #endif
4583 
4584 		if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
4585 			panic("pmap_pp_remove: mapping not present");
4586 		}
4587 
4588 		pve = pmap_lookup_pv(pmap, ptp, pp, va);
4589 		pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
4590 
4591 		/* Update the PTP reference count. Free if last reference. */
4592 		if (ptp != NULL) {
4593 			KASSERT(pmap != pmap_kernel());
4594 			pmap_tlb_shootnow();
4595 			if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
4596 				(*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
4597 			} else {
4598 				pmap_pp_remove_ent(pmap, ptp, opte, va);
4599 			}
4600 		} else {
4601 			KASSERT(pmap == pmap_kernel());
4602 			pmap_stats_update_bypte(pmap, 0, opte);
4603 		}
4604 		pmap_tlb_shootnow();
4605 		pmap_drain_pv(pmap);
4606 		mutex_exit(&pmap->pm_lock);
4607 		if (ptp != NULL) {
4608 			pmap_destroy(pmap);
4609 		}
4610 	}
4611 	kpreempt_enable();
4612 }
4613 
4614 /*
4615  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
4616  *
4617  * => R/M bits are sync'd back to attrs
4618  */
4619 void
4620 pmap_page_remove(struct vm_page *pg)
4621 {
4622 	struct pmap_page *pp;
4623 	paddr_t pa;
4624 
4625 	pp = VM_PAGE_TO_PP(pg);
4626 	pa = VM_PAGE_TO_PHYS(pg);
4627 	pmap_pp_remove(pp, pa);
4628 }
4629 
4630 /*
4631  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4632  * that map it
4633  */
4634 void
4635 pmap_pv_remove(paddr_t pa)
4636 {
4637 	struct pmap_page *pp;
4638 
4639 	pp = pmap_pv_tracked(pa);
4640 	if (pp == NULL)
4641 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4642 	pmap_pp_remove(pp, pa);
4643 }
4644 
4645 /*
4646  * p m a p   a t t r i b u t e  f u n c t i o n s
4647  * functions that test/change managed page's attributes
4648  * since a page can be mapped multiple times we must check each PTE that
4649  * maps it by going down the pv lists.
4650  */
4651 
4652 /*
4653  * pmap_test_attrs: test a page's attributes
4654  */
4655 bool
4656 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4657 {
4658 	struct pmap_page *pp;
4659 	struct pv_pte *pvpte;
4660 	struct pmap *pmap;
4661 	uint8_t oattrs;
4662 	u_int result;
4663 	paddr_t pa;
4664 
4665 	pp = VM_PAGE_TO_PP(pg);
4666 	if ((pp->pp_attrs & testbits) != 0) {
4667 		return true;
4668 	}
4669 	pa = VM_PAGE_TO_PHYS(pg);
4670  startover:
4671 	mutex_spin_enter(&pp->pp_lock);
4672 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4673 		if ((pp->pp_attrs & testbits) != 0) {
4674 			break;
4675 		}
4676 		if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4677 			/*
4678 			 * raced with a V->P operation.  wait for the other
4679 			 * side to finish by acquiring the pmap's lock.  if we
4680 			 * didn't wait, updates to pp_attrs by the other side
4681 			 * could go unseen.
4682 			 */
4683 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4684 			pmap_reference(pmap);
4685 			mutex_spin_exit(&pp->pp_lock);
4686 			mutex_enter(&pmap->pm_lock);
4687 			/* nothing. */
4688 			mutex_exit(&pmap->pm_lock);
4689 			pmap_destroy(pmap);
4690 			goto startover;
4691 		}
4692 		pp->pp_attrs |= oattrs;
4693 	}
4694 	result = pp->pp_attrs & testbits;
4695 	mutex_spin_exit(&pp->pp_lock);
4696 
4697 	/*
4698 	 * note that we will exit the for loop with a non-null pvpte if
4699 	 * we have found the bits we are testing for.
4700 	 */
4701 
4702 	return result != 0;
4703 }
4704 
4705 static bool
4706 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4707 {
4708 	struct pv_pte *pvpte;
4709 	struct pmap *pmap;
4710 	uint8_t oattrs;
4711 	u_int result;
4712 
4713 startover:
4714 	mutex_spin_enter(&pp->pp_lock);
4715 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4716 		if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4717 			/*
4718 			 * raced with a V->P operation.  wait for the other
4719 			 * side to finish by acquiring pmap's lock.  it is
4720 			 * probably unmapping the page, and it will be gone
4721 			 * when the loop is restarted.
4722 			 */
4723 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4724 			pmap_reference(pmap);
4725 			mutex_spin_exit(&pp->pp_lock);
4726 			mutex_enter(&pmap->pm_lock);
4727 			/* nothing. */
4728 			mutex_exit(&pmap->pm_lock);
4729 			pmap_destroy(pmap);
4730 			goto startover;
4731 		}
4732 		pp->pp_attrs |= oattrs;
4733 	}
4734 	result = pp->pp_attrs & clearbits;
4735 	pp->pp_attrs &= ~clearbits;
4736 	pmap_tlb_shootnow();
4737 	mutex_spin_exit(&pp->pp_lock);
4738 
4739 	return result != 0;
4740 }
4741 
4742 /*
4743  * pmap_clear_attrs: clear the specified attribute for a page.
4744  *
4745  * => we return true if we cleared one of the bits we were asked to
4746  */
4747 bool
4748 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4749 {
4750 	struct pmap_page *pp;
4751 	paddr_t pa;
4752 
4753 	pp = VM_PAGE_TO_PP(pg);
4754 	pa = VM_PAGE_TO_PHYS(pg);
4755 
4756 	/*
4757 	 * If this is a new page, assert it has no mappings and simply zap
4758 	 * the stored attributes without taking any locks.
4759 	 */
4760 	if ((pg->flags & PG_FAKE) != 0) {
4761 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
4762 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
4763 		KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
4764 		atomic_store_relaxed(&pp->pp_attrs, 0);
4765 		return false;
4766 	} else {
4767 		return pmap_pp_clear_attrs(pp, pa, clearbits);
4768 	}
4769 }
4770 
4771 /*
4772  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4773  * pv-tracked page.
4774  */
4775 bool
4776 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4777 {
4778 	struct pmap_page *pp;
4779 
4780 	pp = pmap_pv_tracked(pa);
4781 	if (pp == NULL)
4782 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4783 
4784 	return pmap_pp_clear_attrs(pp, pa, clearbits);
4785 }
4786 
4787 /*
4788  * p m a p   p r o t e c t i o n   f u n c t i o n s
4789  */
4790 
4791 /*
4792  * pmap_page_protect: change the protection of all recorded mappings
4793  * of a managed page
4794  *
4795  * => NOTE: this is an inline function in pmap.h
4796  */
4797 
4798 /* see pmap.h */
4799 
4800 /*
4801  * pmap_pv_protect: change the protection of all recorded mappings
4802  * of an unmanaged pv-tracked page
4803  *
4804  * => NOTE: this is an inline function in pmap.h
4805  */
4806 
4807 /* see pmap.h */
4808 
4809 /*
4810  * pmap_protect: set the protection of the pages in a pmap
4811  *
4812  * => NOTE: this is an inline function in pmap.h
4813  */
4814 
4815 /* see pmap.h */
4816 
4817 /*
4818  * pmap_write_protect: write-protect pages in a pmap.
4819  *
4820  * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4821  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4822  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
4823  * present the page will still be considered as a kernel page, and the privilege
4824  * separation will be enforced correctly.
4825  */
4826 void
4827 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4828 {
4829 	pt_entry_t bit_rem, bit_put;
4830 	pt_entry_t *ptes;
4831 	pt_entry_t * const *pdes;
4832 	struct pmap *pmap2;
4833 	vaddr_t blockend, va;
4834 	int lvl, i;
4835 
4836 	if (__predict_false(pmap->pm_write_protect != NULL)) {
4837 		(*pmap->pm_write_protect)(pmap, sva, eva, prot);
4838 		return;
4839 	}
4840 
4841 	bit_rem = 0;
4842 	if (!(prot & VM_PROT_WRITE))
4843 		bit_rem = PTE_W;
4844 
4845 	bit_put = 0;
4846 	if (!(prot & VM_PROT_EXECUTE))
4847 		bit_put = pmap_pg_nx;
4848 
4849 	sva &= ~PAGE_MASK;
4850 	eva &= ~PAGE_MASK;
4851 
4852 	/*
4853 	 * Acquire pmap.  No need to lock the kernel pmap as we won't
4854 	 * be touching PV entries nor stats and kernel PDEs aren't
4855 	 * freed.
4856 	 */
4857 	if (pmap != pmap_kernel()) {
4858 		mutex_enter(&pmap->pm_lock);
4859 	}
4860 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4861 
4862 	for (va = sva ; va < eva; va = blockend) {
4863 		pt_entry_t *spte, *epte;
4864 
4865 		blockend = x86_round_pdr(va + 1);
4866 		if (blockend > eva)
4867 			blockend = eva;
4868 
4869 		/* Is it a valid block? */
4870 		if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4871 			continue;
4872 		}
4873 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4874 		KASSERT(lvl == 1);
4875 
4876 		spte = &ptes[pl1_i(va)];
4877 		epte = &ptes[pl1_i(blockend)];
4878 
4879 		for (i = 0; spte < epte; spte++, i++) {
4880 			pt_entry_t opte, npte;
4881 
4882 			do {
4883 				opte = *spte;
4884 				if (!pmap_valid_entry(opte)) {
4885 					goto next;
4886 				}
4887 				npte = (opte & ~bit_rem) | bit_put;
4888 			} while (pmap_pte_cas(spte, opte, npte) != opte);
4889 
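			/*
			 * As with pmap_sync_pv(), only entries that were
			 * cached as writable (PTE_D set) can still allow
			 * stale writes through the TLB; a write through a
			 * clean entry makes the CPU revisit the PTE to set
			 * PTE_D, so those need no immediate shootdown here.
			 */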
4890 			if ((opte & PTE_D) != 0) {
4891 				vaddr_t tva = va + x86_ptob(i);
4892 				pmap_tlb_shootdown(pmap, tva, opte,
4893 				    TLBSHOOT_WRITE_PROTECT);
4894 			}
4895 next:;
4896 		}
4897 	}
4898 
4899 	/* Release pmap. */
4900 	pmap_unmap_ptes(pmap, pmap2);
4901 	if (pmap != pmap_kernel()) {
4902 		mutex_exit(&pmap->pm_lock);
4903 	}
4904 }
4905 
4906 /*
4907  * pmap_unwire: clear the wired bit in the PTE.
4908  *
4909  * => Mapping should already be present.
4910  */
4911 void
4912 pmap_unwire(struct pmap *pmap, vaddr_t va)
4913 {
4914 	pt_entry_t *ptes, *ptep, opte;
4915 	pd_entry_t * const *pdes;
4916 	struct pmap *pmap2;
4917 	int lvl;
4918 
4919 	if (__predict_false(pmap->pm_unwire != NULL)) {
4920 		(*pmap->pm_unwire)(pmap, va);
4921 		return;
4922 	}
4923 
4924 	/*
4925 	 * Acquire pmap.  Need to lock the kernel pmap only to protect the
4926 	 * statistics.
4927 	 */
4928 	mutex_enter(&pmap->pm_lock);
4929 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4930 
4931 	if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4932 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4933 	}
4934 	KASSERT(lvl == 1);
4935 
4936 	ptep = &ptes[pl1_i(va)];
4937 	opte = *ptep;
4938 	KASSERT(pmap_valid_entry(opte));
4939 
4940 	if (opte & PTE_WIRED) {
4941 		pt_entry_t npte = opte & ~PTE_WIRED;
4942 
4943 		opte = pmap_pte_testset(ptep, npte);
4944 		pmap_stats_update_bypte(pmap, npte, opte);
4945 	} else {
4946 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
4947 		    " did not change!\n", __func__, pmap, va);
4948 	}
4949 
4950 	/* Release pmap. */
4951 	pmap_unmap_ptes(pmap, pmap2);
4952 	mutex_exit(&pmap->pm_lock);
4953 }
4954 
4955 /*
4956  * pmap_copy: copy mappings from one pmap to another
4957  *
4958  * => optional function
4959  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4960  */
4961 
4962 /*
4963  * defined as macro in pmap.h
4964  */
4965 
4966 __strict_weak_alias(pmap_enter, pmap_enter_default);
4967 
4968 int
4969 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4970     u_int flags)
4971 {
4972 	if (__predict_false(pmap->pm_enter != NULL)) {
4973 		return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
4974 	}
4975 
4976 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4977 }
4978 
4979 /*
4980  * pmap_enter: enter a mapping into a pmap
4981  *
4982  * => must be done "now" ... no lazy-evaluation
4983  */
4984 int
4985 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4986 	   vm_prot_t prot, u_int flags, int domid)
4987 {
4988 	pt_entry_t *ptes, opte, npte;
4989 	pt_entry_t *ptep;
4990 	pd_entry_t * const *pdes;
4991 	struct vm_page *ptp;
4992 	struct vm_page *new_pg, *old_pg;
4993 	struct pmap_page *new_pp, *old_pp;
4994 	struct pv_entry *old_pve, *new_pve;
4995 	bool wired = (flags & PMAP_WIRED) != 0;
4996 	struct pmap *pmap2;
4997 	struct pmap_ptparray pt;
4998 	int error;
4999 	bool getptp, samepage, new_embedded;
5000 	rb_tree_t *tree;
5001 
5002 	KASSERT(pmap_initialized);
5003 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5004 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5005 	    PRIxVADDR " over PDP!", __func__, va);
5006 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
5007 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
5008 	    "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
5009 
5010 #ifdef XENPV
5011 	KASSERT(domid == DOMID_SELF || pa == 0);
5012 #endif
5013 
5014 	npte = ma | protection_codes[prot] | PTE_P;
5015 	npte |= pmap_pat_flags(flags);
5016 	if (wired)
5017 		npte |= PTE_WIRED;
5018 	if (va < VM_MAXUSER_ADDRESS) {
5019 		KASSERTMSG(pmap != pmap_kernel(),
5020 		    "entering user va %#"PRIxVADDR" into kernel pmap",
5021 		    va);
5022 		if (pmap_is_user(pmap))
5023 			npte |= PTE_U;
5024 	}
5025 
5026 	if (pmap == pmap_kernel())
5027 		npte |= pmap_pg_g;
5028 	if (flags & VM_PROT_ALL) {
5029 		npte |= PTE_A;
5030 		if (flags & VM_PROT_WRITE) {
5031 			KASSERT((npte & PTE_W) != 0);
5032 			npte |= PTE_D;
5033 		}
5034 	}
5035 
5036 #ifdef XENPV
5037 	if (domid != DOMID_SELF)
5038 		new_pg = NULL;
5039 	else
5040 #endif
5041 		new_pg = PHYS_TO_VM_PAGE(pa);
5042 
5043 	if (new_pg != NULL) {
5044 		/* This is a managed page */
5045 		npte |= PTE_PVLIST;
5046 		new_pp = VM_PAGE_TO_PP(new_pg);
5047 		PMAP_CHECK_PP(new_pp);
5048 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
5049 		/* This is an unmanaged pv-tracked page */
5050 		npte |= PTE_PVLIST;
5051 		PMAP_CHECK_PP(new_pp);
5052 	} else {
5053 		new_pp = NULL;
5054 	}
5055 
5056 	/* Begin by locking the pmap. */
5057 	mutex_enter(&pmap->pm_lock);
5058 
5059 	/* Look up the PTP.  Allocate if none present. */
5060 	ptp = NULL;
5061 	getptp = false;
5062 	if (pmap != pmap_kernel()) {
5063 		ptp = pmap_find_ptp(pmap, va, 1);
5064 		if (ptp == NULL) {
5065 			getptp = true;
5066 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
5067 			if (error != 0) {
5068 				if (flags & PMAP_CANFAIL) {
5069 					mutex_exit(&pmap->pm_lock);
5070 					return error;
5071 				}
5072 				panic("%s: get ptp failed, error=%d", __func__,
5073 				    error);
5074 			}
5075 		}
5076 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5077 	} else {
5078 		/* Embedded PV entries rely on this. */
5079 		KASSERT(va != 0);
5080 		tree = &pmap_kernel_rb;
5081 	}
5082 
5083 	/*
5084 	 * Look up the old PV entry at this VA (if any), and insert a new PV
5085 	 * entry if required for the new mapping.  Temporarily track the old
5086 	 * and new mappings concurrently.  Only after the old mapping is
5087 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
5088 	 * our picture of modified/accessed state for either page could get
5089 	 * out of sync (we need any P->V operation for either page to stall
5090 	 * on pmap->pm_lock until done here).
5091 	 */
5092 	new_pve = NULL;
5093 	old_pve = NULL;
5094 	samepage = false;
5095 	new_embedded = false;
5096 
5097 	if (new_pp != NULL) {
5098 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
5099 		    &old_pve, &samepage, &new_embedded, tree);
5100 
5101 		/*
5102 		 * If a new pv_entry was needed and none was available, we
5103 		 * can go no further.
5104 		 */
5105 		if (error != 0) {
5106 			if (flags & PMAP_CANFAIL) {
5107 				if (getptp) {
5108 					pmap_unget_ptp(pmap, &pt);
5109 				}
5110 				mutex_exit(&pmap->pm_lock);
5111 				return error;
5112 			}
5113 			panic("%s: alloc pve failed", __func__);
5114 		}
5115 	} else {
5116 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5117 	}
5118 
5119 	/* Map PTEs into address space. */
5120 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5121 
5122 	/* Install any newly allocated PTPs. */
5123 	if (getptp) {
5124 		pmap_install_ptp(pmap, &pt, va, pdes);
5125 	}
5126 
5127 	/* Check if there is an existing mapping. */
5128 	ptep = &ptes[pl1_i(va)];
5129 	opte = *ptep;
5130 	bool have_oldpa = pmap_valid_entry(opte);
5131 	paddr_t oldpa = pmap_pte2pa(opte);
5132 
5133 	/*
5134 	 * Update the pte.
5135 	 */
5136 	do {
5137 		opte = *ptep;
5138 
5139 		/*
5140 		 * if the same page, inherit PTE_A and PTE_D.
5141 		 */
5142 		if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5143 			npte |= opte & (PTE_A | PTE_D);
5144 		}
5145 #if defined(XENPV)
5146 		if (domid != DOMID_SELF) {
5147 			/* pmap_pte_cas with error handling */
5148 			int s = splvm();
5149 			if (opte != *ptep) {
5150 				splx(s);
5151 				continue;
5152 			}
5153 			error = xpq_update_foreign(
5154 			    vtomach((vaddr_t)ptep), npte, domid, flags);
5155 			splx(s);
5156 			if (error) {
5157 				/* Undo pv_entry tracking - oof. */
5158 				if (new_pp != NULL) {
5159 					mutex_spin_enter(&new_pp->pp_lock);
5160 					if (new_pve != NULL) {
5161 						LIST_REMOVE(new_pve, pve_list);
5162 						KASSERT(pmap->pm_pve == NULL);
5163 						pmap->pm_pve = new_pve;
5164 					} else if (new_embedded) {
5165 						new_pp->pp_pte.pte_ptp = NULL;
5166 						new_pp->pp_pte.pte_va = 0;
5167 					}
5168 					mutex_spin_exit(&new_pp->pp_lock);
5169 				}
5170 				pmap_unmap_ptes(pmap, pmap2);
5171 				/* Free new PTP. */
5172 				if (ptp != NULL && ptp->wire_count <= 1) {
5173 					pmap_free_ptp(pmap, ptp, va, ptes,
5174 					    pdes);
5175 				}
5176 				mutex_exit(&pmap->pm_lock);
5177 				return error;
5178 			}
5179 			break;
5180 		}
5181 #endif /* defined(XENPV) */
5182 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
5183 
5184 	/*
5185 	 * Done with the PTEs: they can now be unmapped.
5186 	 */
5187 	pmap_unmap_ptes(pmap, pmap2);
5188 
5189 	/*
5190 	 * Update statistics and PTP's reference count.
5191 	 */
5192 	pmap_stats_update_bypte(pmap, npte, opte);
5193 	if (ptp != NULL) {
5194 		if (!have_oldpa) {
5195 			ptp->wire_count++;
5196 		}
5197 		/* Remember minimum VA in PTP. */
5198 		pmap_ptp_range_set(ptp, va);
5199 	}
5200 	KASSERT(ptp == NULL || ptp->wire_count > 1);
5201 
5202 	/*
5203 	 * If the same page, we can skip pv_entry handling.
5204 	 */
5205 	if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5206 		KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
5207 		if ((npte & PTE_PVLIST) != 0) {
5208 			KASSERT(samepage);
5209 			pmap_check_pv(pmap, ptp, new_pp, va, true);
5210 		}
5211 		goto same_pa;
5212 	} else if ((npte & PTE_PVLIST) != 0) {
5213 		KASSERT(!samepage);
5214 	}
5215 
5216 	/*
5217 	 * If old page is pv-tracked, remove pv_entry from its list.
5218 	 */
5219 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5220 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5221 			old_pp = VM_PAGE_TO_PP(old_pg);
5222 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5223 			panic("%s: PTE_PVLIST with pv-untracked page"
5224 			    " va = %#"PRIxVADDR
5225 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
5226 			    __func__, va, oldpa, atop(pa));
5227 		}
5228 
5229 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5230 		    pmap_pte_to_pp_attrs(opte));
5231 	} else {
5232 		KASSERT(old_pve == NULL);
5233 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5234 	}
5235 
5236 	/*
5237 	 * If new page is dynamically PV tracked, insert to tree.
5238 	 */
5239 	if (new_pve != NULL) {
5240 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5241 		old_pve = rb_tree_insert_node(tree, new_pve);
5242 		KASSERT(old_pve == new_pve);
5243 		pmap_check_pv(pmap, ptp, new_pp, va, true);
5244 	}
5245 
5246 same_pa:
5247 	/*
5248 	 * shootdown tlb if necessary.
5249 	 */
5250 
5251 	if ((~opte & (PTE_P | PTE_A)) == 0 &&
5252 	    ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
5253 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
5254 	}
5255 	pmap_drain_pv(pmap);
5256 	mutex_exit(&pmap->pm_lock);
5257 	return 0;
5258 }
5259 
5260 #if defined(XEN) && defined(DOM0OPS)
5261 
5262 struct pmap_data_gnt {
5263 	SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
5264 	vaddr_t pd_gnt_sva;
5265 	vaddr_t pd_gnt_eva; /* range covered by this gnt */
5266 	int pd_gnt_refs; /* ref counter */
5267 	struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
5268 };
5269 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
5270 
5271 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
5272 
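/*
 * pmap_find_gnt: look up the grant-map record covering [sva, eva) in
 * this pmap's list, or NULL if none.  A range may not partially overlap
 * an existing record.
 */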
5273 static struct pmap_data_gnt *
5274 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5275 {
5276 	struct pmap_data_gnt_head *headp;
5277 	struct pmap_data_gnt *pgnt;
5278 
5279 	KASSERT(mutex_owned(&pmap->pm_lock));
5280 	headp = pmap->pm_data;
5281 	KASSERT(headp != NULL);
5282 	SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
5283 		if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
5284 			return pgnt;
5285 		/* check that we're not overlapping part of a region */
5286 		KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
5287 	}
5288 	return NULL;
5289 }
5290 
5291 static void
5292 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
5293     const struct gnttab_map_grant_ref *ops)
5294 {
5295 	struct pmap_data_gnt_head *headp;
5296 	struct pmap_data_gnt *pgnt;
5297 	vaddr_t eva = sva + nentries * PAGE_SIZE;
5298 	KASSERT(mutex_owned(&pmap->pm_lock));
5299 	KASSERT(nentries >= 1);
5300 	if (pmap->pm_remove == NULL) {
5301 		pmap->pm_remove = pmap_remove_gnt;
5302 		KASSERT(pmap->pm_data == NULL);
5303 		headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
5304 		SLIST_INIT(headp);
5305 		pmap->pm_data = headp;
5306 	} else {
5307 		KASSERT(pmap->pm_remove == pmap_remove_gnt);
5308 		KASSERT(pmap->pm_data != NULL);
5309 		headp = pmap->pm_data;
5310 	}
5311 
5312 	pgnt = pmap_find_gnt(pmap, sva, eva);
5313 	if (pgnt != NULL) {
5314 		KASSERT(pgnt->pd_gnt_sva == sva);
5315 		KASSERT(pgnt->pd_gnt_eva == eva);
5316 		return;
5317 	}
5318 
5319 	/* new entry */
5320 	pgnt = kmem_alloc(sizeof(*pgnt) +
5321 	    (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
5322 	pgnt->pd_gnt_sva = sva;
5323 	pgnt->pd_gnt_eva = eva;
5324 	pgnt->pd_gnt_refs = 0;
5325 	memcpy(pgnt->pd_gnt_ops, ops,
5326 	    sizeof(struct gnttab_map_grant_ref) * nentries);
5327 	SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
5328 }
5329 
5330 static void
5331 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
5332 {
5333 	struct pmap_data_gnt_head *headp = pmap->pm_data;
5334 	int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
5335 	KASSERT(nentries >= 1);
5336 	KASSERT(mutex_owned(&pmap->pm_lock));
5337 	KASSERT(pgnt->pd_gnt_refs == 0);
5338 	SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
5339 	kmem_free(pgnt, sizeof(*pgnt) +
5340 		    (nentries - 1) * sizeof(struct gnttab_map_grant_ref));
5341 	if (SLIST_EMPTY(headp)) {
5342 		kmem_free(headp, sizeof(*headp));
5343 		pmap->pm_data = NULL;
5344 		pmap->pm_remove = NULL;
5345 	}
5346 }
5347 
5348 /*
5349  * pmap_enter_gnt: enter a grant entry into a pmap
5350  *
5351  * => must be done "now" ... no lazy-evaluation
5352  */
5353 int
5354 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
5355     const struct gnttab_map_grant_ref *oops)
5356 {
5357 	struct pmap_data_gnt *pgnt;
5358 	pt_entry_t *ptes, opte;
5359 #ifndef XENPV
5360 	pt_entry_t npte;
5361 #endif
5362 	pt_entry_t *ptep;
5363 	pd_entry_t * const *pdes;
5364 	struct vm_page *ptp;
5365 	struct vm_page *old_pg;
5366 	struct pmap_page *old_pp;
5367 	struct pv_entry *old_pve;
5368 	struct pmap *pmap2;
5369 	struct pmap_ptparray pt;
5370 	int error;
5371 	bool getptp;
5372 	rb_tree_t *tree;
5373 	struct gnttab_map_grant_ref *op;
5374 	int ret;
5375 	int idx;
5376 
5377 	KASSERT(pmap_initialized);
5378 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5379 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5380 	    PRIxVADDR " over PDP!", __func__, va);
5381 	KASSERT(pmap != pmap_kernel());
5382 
5383 	/* Begin by locking the pmap. */
5384 	mutex_enter(&pmap->pm_lock);
5385 	pmap_alloc_gnt(pmap, sva, nentries, oops);
5386 
5387 	pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5388 	KASSERT(pgnt != NULL);
5389 
5390 	/* Look up the PTP.  Allocate if none present. */
5391 	ptp = NULL;
5392 	getptp = false;
5393 	ptp = pmap_find_ptp(pmap, va, 1);
5394 	if (ptp == NULL) {
5395 		getptp = true;
5396 		error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
5397 		if (error != 0) {
5398 			mutex_exit(&pmap->pm_lock);
5399 			return error;
5400 		}
5401 	}
5402 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5403 
5404 	/*
5405 	 * Look up the old PV entry at this VA (if any), and insert a new PV
5406 	 * entry if required for the new mapping.  Temporarily track the old
5407 	 * and new mappings concurrently.  Only after the old mapping is
5408 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
5409 	 * our picture of modified/accessed state for either page could get
5410 	 * out of sync (we need any P->V operation for either page to stall
5411 	 * on pmap->pm_lock until done here).
5412 	 */
5413 	old_pve = NULL;
5414 
5415 	old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5416 
5417 	/* Map PTEs into address space. */
5418 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5419 
5420 	/* Install any newly allocated PTPs. */
5421 	if (getptp) {
5422 		pmap_install_ptp(pmap, &pt, va, pdes);
5423 	}
5424 
5425 	/* Check if there is an existing mapping. */
5426 	ptep = &ptes[pl1_i(va)];
5427 	opte = *ptep;
5428 	bool have_oldpa = pmap_valid_entry(opte);
5429 	paddr_t oldpa = pmap_pte2pa(opte);
5430 
5431 	/*
5432 	 * Update the pte.
5433 	 */
5434 
5435 	idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5436 	op = &pgnt->pd_gnt_ops[idx];
5437 
5438 #ifdef XENPV
5439 	KASSERT(op->flags & GNTMAP_contains_pte);
5440 	op->host_addr = xpmap_ptetomach(ptep);
5441 #else
5442 	KASSERT((op->flags & GNTMAP_contains_pte) == 0);
5443 	KASSERT(op->flags != 0);
5444 	KASSERT(op->host_addr != 0);
5445 #endif
5446 	op->dev_bus_addr = 0;
5447 	op->status = GNTST_general_error;
5448 	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5449 	if (__predict_false(ret)) {
5450 		printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5451 		    __func__, ret);
5452 		op->status = GNTST_general_error;
5453 	}
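	/*
	 * The hypervisor can answer GNTST_eagain while the granting domain
	 * is still setting the grant up; retry up to 256 times, pausing
	 * about 1ms between attempts.
	 */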
5454 	for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
5455 		kpause("gntmap", false, mstohz(1), NULL);
5456 		ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5457 		if (__predict_false(ret)) {
5458 			printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5459 			    __func__, ret);
5460 			op->status = GNTST_general_error;
5461 		}
5462 	}
5463 	if (__predict_false(op->status != GNTST_okay)) {
5464 		printf("%s: GNTTABOP_map_grant_ref status: %d\n",
5465 		    __func__, op->status);
5466 		if (have_oldpa) { /* XXX did the pte really change if XENPV  ?*/
5467 			ptp->wire_count--;
5468 		}
5469 	} else {
5470 #ifndef XENPV
5471 		npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P;
5472 		if ((op->flags & GNTMAP_readonly) == 0)
5473 			npte |= PTE_W;
5474 		do {
5475 			opte = *ptep;
5476 		} while (pmap_pte_cas(ptep, opte, npte) != opte);
5477 #endif
5478 		pgnt->pd_gnt_refs++;
5479 		if (!have_oldpa) {
5480 			ptp->wire_count++;
5481 		}
5482 		KASSERT(ptp->wire_count > 1);
5483 		/* Remember minimum VA in PTP. */
5484 		pmap_ptp_range_set(ptp, va);
5485 	}
5486 	if (ptp->wire_count <= 1)
5487 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5488 
5489 	/*
5490 	 * Done with the PTEs: they can now be unmapped.
5491 	 */
5492 	pmap_unmap_ptes(pmap, pmap2);
5493 
5494 	/*
5495 	 * Update statistics and PTP's reference count.
5496 	 */
5497 	pmap_stats_update_bypte(pmap, 0, opte);
5498 
5499 	/*
5500 	 * If old page is pv-tracked, remove pv_entry from its list.
5501 	 */
5502 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5503 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5504 			old_pp = VM_PAGE_TO_PP(old_pg);
5505 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5506 			panic("%s: PTE_PVLIST with pv-untracked page"
5507 			    " va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
5508 			    __func__, va, oldpa);
5509 		}
5510 
5511 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5512 		    pmap_pte_to_pp_attrs(opte));
5513 	} else {
5514 		KASSERT(old_pve == NULL);
5515 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5516 	}
5517 
5518 	pmap_drain_pv(pmap);
5519 	mutex_exit(&pmap->pm_lock);
5520 	return op->status;
5521 }
5522 
5523 /*
5524  * pmap_remove_gnt: grant mapping removal function.
5525  *
5526  * => caller should not be holding any pmap locks
5527  */
5528 static void
5529 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5530 {
5531 	struct pmap_data_gnt *pgnt;
5532 	pt_entry_t *ptes;
5533 	pd_entry_t pde;
5534 	pd_entry_t * const *pdes;
5535 	struct vm_page *ptp;
5536 	struct pmap *pmap2;
5537 	vaddr_t va;
5538 	int lvl;
5539 	int idx;
5540 	struct gnttab_map_grant_ref *op;
5541 	struct gnttab_unmap_grant_ref unmap_op;
5542 	int ret;
5543 
5544 	KASSERT(pmap != pmap_kernel());
5545 	KASSERT(pmap->pm_remove == pmap_remove_gnt);
5546 
5547 	mutex_enter(&pmap->pm_lock);
5548 	for (va = sva; va < eva; va += PAGE_SIZE) {
5549 		pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5550 		if (pgnt == NULL) {
5551 			pmap_remove_locked(pmap, sva, eva);
5552 			continue;
5553 		}
5554 
5555 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5556 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
5557 			panic("pmap_remove_gnt pdes not valid");
5558 		}
5559 
5560 		idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5561 		op = &pgnt->pd_gnt_ops[idx];
5562 		KASSERT(lvl == 1);
5563 
5564 		/* Get PTP if non-kernel mapping. */
5565 		ptp = pmap_find_ptp(pmap, va, 1);
5566 		KASSERTMSG(ptp != NULL,
5567 		    "%s: unmanaged PTP detected", __func__);
5568 
5569 		if (op->status == GNTST_okay)  {
5570 			KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
5571 #ifdef XENPV
5572 			unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
5573 #else
5574 			unmap_op.host_addr = op->host_addr;
5575 			pmap_pte_testset(&ptes[pl1_i(va)], 0);
5576 #endif
5577 			unmap_op.handle = op->handle;
5578 			unmap_op.dev_bus_addr = 0;
5579 			ret = HYPERVISOR_grant_table_op(
5580 			    GNTTABOP_unmap_grant_ref, &unmap_op, 1);
5581 			if (ret) {
5582 				printf("%s: GNTTABOP_unmap_grant_ref "
5583 				    "failed: %d\n", __func__, ret);
5584 			}
5585 
5586 			ptp->wire_count--;
5587 			pgnt->pd_gnt_refs--;
5588 		}
5589 		if (pgnt->pd_gnt_refs == 0) {
5590 			pmap_free_gnt(pmap, pgnt);
5591 		}
5592 		/*
5593 		 * if mapping removed and the PTP is no longer
5594 		 * being used, free it!
5595 		 */
5596 
5597 		if (ptp->wire_count <= 1)
5598 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5599 		pmap_unmap_ptes(pmap, pmap2);
5600 	}
5601 	mutex_exit(&pmap->pm_lock);
5602 }
5603 #endif /* XEN && DOM0OPS */
5604 
5605 paddr_t
5606 pmap_get_physpage(void)
5607 {
5608 	struct vm_page *ptp;
5609 	struct pmap *kpm = pmap_kernel();
5610 	paddr_t pa;
5611 
5612 	if (!uvm.page_init_done) {
5613 		/*
5614 		 * We're growing the kernel pmap early (from
5615 		 * uvm_pageboot_alloc()). This case must be
5616 		 * handled a little differently.
5617 		 */
5618 
5619 		if (!uvm_page_physget(&pa))
5620 			panic("%s: out of memory", __func__);
5621 #if defined(__HAVE_DIRECT_MAP)
5622 		memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
5623 #else
5624 #if defined(XENPV)
5625 		if (XEN_VERSION_SUPPORTED(3, 4)) {
5626 			xen_pagezero(pa);
5627 			return pa;
5628 		}
5629 #endif
5630 		kpreempt_disable();
5631 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
5632 		    PTE_W | pmap_pg_nx);
5633 		pmap_pte_flush();
5634 		pmap_update_pg((vaddr_t)early_zerop);
5635 		memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
5636 #if defined(DIAGNOSTIC) || defined(XENPV)
5637 		pmap_pte_set(early_zero_pte, 0);
5638 		pmap_pte_flush();
5639 #endif /* defined(DIAGNOSTIC) || defined(XENPV) */
5640 		kpreempt_enable();
5641 #endif /* defined(__HAVE_DIRECT_MAP) */
5642 	} else {
5643 		/* XXX */
5644 		ptp = uvm_pagealloc(NULL, 0, NULL,
5645 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
5646 		if (ptp == NULL)
5647 			panic("%s: out of memory", __func__);
5648 		ptp->flags &= ~PG_BUSY;
5649 		ptp->wire_count = 1;
5650 		pa = VM_PAGE_TO_PHYS(ptp);
5651 	}
5652 	pmap_stats_update(kpm, 1, 0);
5653 
5654 	return pa;
5655 }
5656 
5657 /*
5658  * Expand the page tree with the specified amount of PTPs, mapping virtual
5659  * addresses starting at kva. We populate all the levels but the last one
5660  * (L1). The nodes of the tree are created as RW, but the pages covered
5661  * will be kentered in L1, with proper permissions.
5662  *
5663  * Used only by pmap_growkernel.
5664  */
5665 static void
5666 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
5667 {
5668 	unsigned long i;
5669 	paddr_t pa;
5670 	unsigned long index, endindex;
5671 	int level;
5672 	pd_entry_t *pdep;
5673 #ifdef XENPV
5674 	int s = splvm(); /* protect xpq_* */
5675 #endif
5676 
5677 	for (level = PTP_LEVELS; level > 1; level--) {
5678 		if (level == PTP_LEVELS)
5679 			pdep = cpm->pm_pdir;
5680 		else
5681 			pdep = normal_pdes[level - 2];
5682 		index = pl_i_roundup(kva, level);
5683 		endindex = index + needed_ptps[level - 1] - 1;
5684 
5685 		for (i = index; i <= endindex; i++) {
5686 			pt_entry_t pte;
5687 
5688 			KASSERT(!pmap_valid_entry(pdep[i]));
5689 			pa = pmap_get_physpage();
5690 			pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
5691 #ifdef __x86_64__
5692 			pte |= pmap_pg_nx;
5693 #endif
5694 			pmap_pte_set(&pdep[i], pte);
5695 
5696 #ifdef XENPV
5697 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
5698 				if (__predict_true(
5699 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5700 					/* update per-cpu PMDs on all cpus */
5701 					xen_kpm_sync(pmap_kernel(), i);
5702 				} else {
5703 					/*
5704 					 * too early; update primary CPU
5705 					 * PMD only (without locks)
5706 					 */
5707 #ifdef __x86_64__
5708 					pd_entry_t *cpu_pdep =
5709 						&cpu_info_primary.ci_kpm_pdir[i];
5710 #else
5711 					pd_entry_t *cpu_pdep =
5712 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
5713 #endif
5714 					pmap_pte_set(cpu_pdep, pte);
5715 				}
5716 			}
5717 #endif
5718 
5719 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
5720 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
5721 			nkptp[level - 1]++;
5722 		}
5723 		pmap_pte_flush();
5724 	}
5725 #ifdef XENPV
5726 	splx(s);
5727 #endif
5728 }
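
/*
 * Worked example (a sketch, assuming the usual 4-level amd64 layout where an
 * L2 entry maps 2MB, an L3 entry 1GB and an L4 entry 512GB): growing kernel
 * VA by roughly 1GB translates into needed_ptps[1] ~= 512 new L2 entries,
 * needed_ptps[2] ~= 1 new L3 entry and normally needed_ptps[3] = 0 new L4
 * entries.  Each entry installed above points at a freshly zeroed page from
 * pmap_get_physpage(); the L1 leaves themselves are filled in later, e.g. by
 * pmap_kenter_pa().
 */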
5729 
5730 /*
5731  * pmap_growkernel: increase usage of KVM space.
5732  *
5733  * => we allocate new PTPs for the kernel and install them in all
5734  *    the pmaps on the system.
5735  */
5736 vaddr_t
5737 pmap_growkernel(vaddr_t maxkvaddr)
5738 {
5739 	struct pmap *kpm = pmap_kernel();
5740 	struct pmap *cpm;
5741 #if !defined(XENPV) || !defined(__x86_64__)
5742 	struct pmap *pm;
5743 	long old;
5744 #endif
5745 	int s, i;
5746 	long needed_kptp[PTP_LEVELS], target_nptp;
5747 	bool invalidate = false;
5748 
5749 	s = splvm();	/* to be safe */
5750 	mutex_enter(&kpm->pm_lock);
5751 
5752 	if (maxkvaddr <= pmap_maxkvaddr) {
5753 		mutex_exit(&kpm->pm_lock);
5754 		splx(s);
5755 		return pmap_maxkvaddr;
5756 	}
5757 
5758 	maxkvaddr = x86_round_pdr(maxkvaddr);
5759 #if !defined(XENPV) || !defined(__x86_64__)
5760 	old = nkptp[PTP_LEVELS - 1];
5761 #endif
5762 
5763 	/* Initialize needed_kptp. */
5764 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
5765 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
5766 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
5767 
5768 		if (target_nptp > nkptpmax[i])
5769 			panic("out of KVA space");
5770 		KASSERT(target_nptp >= nkptp[i]);
5771 		needed_kptp[i] = target_nptp - nkptp[i];
5772 	}
5773 
5774 #ifdef XENPV
5775 	/* only pmap_kernel(), or the per-cpu map, has kernel entries */
5776 	cpm = kpm;
5777 #else
5778 	/* Get the current pmap */
5779 	if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5780 		cpm = curcpu()->ci_pmap;
5781 	} else {
5782 		cpm = kpm;
5783 	}
5784 #endif
5785 
5786 	kasan_shadow_map((void *)pmap_maxkvaddr,
5787 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5788 	kmsan_shadow_map((void *)pmap_maxkvaddr,
5789 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5790 
5791 	pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
5792 
5793 	/*
5794 	 * If the number of top level entries changed, update all pmaps.
5795 	 */
5796 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
5797 #ifdef XENPV
5798 #ifdef __x86_64__
5799 		/* nothing, kernel entries are never entered in user pmap */
5800 #else
5801 		int pdkidx;
5802 
5803 		mutex_enter(&pmaps_lock);
5804 		LIST_FOREACH(pm, &pmaps, pm_list) {
5805 			for (pdkidx = PDIR_SLOT_KERN + old;
5806 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
5807 			    pdkidx++) {
5808 				pmap_pte_set(&pm->pm_pdir[pdkidx],
5809 				    kpm->pm_pdir[pdkidx]);
5810 			}
5811 			pmap_pte_flush();
5812 		}
5813 		mutex_exit(&pmaps_lock);
5814 #endif /* __x86_64__ */
5815 #else /* XENPV */
5816 		size_t newpdes;
5817 		newpdes = nkptp[PTP_LEVELS - 1] - old;
5818 		if (cpm != kpm) {
5819 			memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
5820 			    &cpm->pm_pdir[PDIR_SLOT_KERN + old],
5821 			    newpdes * sizeof(pd_entry_t));
5822 		}
5823 
5824 		mutex_enter(&pmaps_lock);
5825 		LIST_FOREACH(pm, &pmaps, pm_list) {
5826 			if (__predict_false(pm->pm_enter != NULL)) {
5827 				/*
5828 				 * Not a native pmap; the kernel is not mapped,
5829 				 * so nothing to synchronize.
5830 				 */
5831 				continue;
5832 			}
5833 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
5834 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
5835 			    newpdes * sizeof(pd_entry_t));
5836 		}
5837 		mutex_exit(&pmaps_lock);
5838 #endif
5839 		invalidate = true;
5840 	}
5841 	pmap_maxkvaddr = maxkvaddr;
5842 	mutex_exit(&kpm->pm_lock);
5843 	splx(s);
5844 
5845 	if (invalidate && pmap_initialized) {
5846 		/* Invalidate the pmap cache. */
5847 		pool_cache_invalidate(&pmap_cache);
5848 	}
5849 
5850 	return maxkvaddr;
5851 }
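
/*
 * A sketch of the expected caller pattern (illustrative; the caller-side
 * variable name is an assumption, not taken from this file):
 *
 *	if (uvm_maxkaddr < new_end)
 *		uvm_maxkaddr = pmap_growkernel(new_end);
 *
 * i.e. the caller asks for at least new_end of mapped kernel VA and records
 * the page-directory-aligned amount actually provided.
 */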
5852 
5853 #ifdef DEBUG
5854 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
5855 
5856 /*
5857  * pmap_dump: dump all the mappings from a pmap
5858  *
5859  * => caller should not be holding any pmap locks
5860  */
5861 void
5862 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5863 {
5864 	pt_entry_t *ptes, *pte;
5865 	pd_entry_t * const *pdes;
5866 	struct pmap *pmap2;
5867 	vaddr_t blkendva;
5868 	int lvl;
5869 
5870 	/*
5871 	 * if end is out of range truncate.
5872 	 * if (end <= start) update to max.
5873 	 */
5874 
5875 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
5876 		eva = VM_MAXUSER_ADDRESS;
5877 
5878 	mutex_enter(&pmap->pm_lock);
5879 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5880 
5881 	/*
5882 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
5883 	 * dumping a range of pages: we dump in PTP sized blocks (2MB or 4MB)
5884 
5885 	for (/* null */ ; sva < eva ; sva = blkendva) {
5886 
5887 		/* determine range of block */
5888 		blkendva = x86_round_pdr(sva+1);
5889 		if (blkendva > eva)
5890 			blkendva = eva;
5891 
5892 		/* valid block? */
5893 		if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
5894 			continue;
5895 		KASSERT(lvl == 1);
5896 
5897 		pte = &ptes[pl1_i(sva)];
5898 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
5899 			if (!pmap_valid_entry(*pte))
5900 				continue;
5901 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
5902 			    " (pte=%#" PRIxPADDR ")\n",
5903 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
5904 		}
5905 	}
5906 	pmap_unmap_ptes(pmap, pmap2);
5907 	mutex_exit(&pmap->pm_lock);
5908 }
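
/*
 * pmap_dump() is intended to be invoked by hand, typically from the kernel
 * debugger (e.g. via ddb(4)'s "call" command), when inspecting a pmap's
 * mappings; a direct call is simply
 *
 *	pmap_dump(pmap, sva, eva);
 *
 * which prints every valid leaf mapping in [sva, eva) in PTP-sized blocks.
 */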
5909 #endif
5910 
5911 /*
5912  * pmap_update: process deferred invalidations and frees.
5913  */
5914 void
5915 pmap_update(struct pmap *pmap)
5916 {
5917 	struct pmap_page *pp;
5918 	struct vm_page *ptp;
5919 
5920 	/*
5921 	 * Initiate any pending TLB shootdowns.  Wait for them to
5922 	 * complete before returning control to the caller.
5923 	 */
5924 	kpreempt_disable();
5925 	pmap_tlb_shootnow();
5926 	kpreempt_enable();
5927 
5928 	/*
5929 	 * Now that shootdowns are complete, process deferred frees.  This
5930 	 * is an unlocked check, but is safe as we're only interested in
5931 	 * work done in this LWP - we won't get a false negative.
5932 	 */
5933 	if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
5934 		return;
5935 	}
5936 
5937 	mutex_enter(&pmap->pm_lock);
5938 	while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
5939 		KASSERT(ptp->wire_count == 0);
5940 		KASSERT(ptp->uanon == NULL);
5941 		LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
5942 		pp = VM_PAGE_TO_PP(ptp);
5943 		LIST_INIT(&pp->pp_pvlist);
5944 		pp->pp_attrs = 0;
5945 		pp->pp_pte.pte_ptp = NULL;
5946 		pp->pp_pte.pte_va = 0;
5947 		PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
5948 
5949 		/*
5950 		 * XXX Hack to avoid extra locking, and lock
5951 		 * assertions in uvm_pagefree().  Despite uobject
5952 		 * being set, this isn't a managed page.
5953 		 */
5954 		PMAP_DUMMY_LOCK(pmap);
5955 		uvm_pagerealloc(ptp, NULL, 0);
5956 		PMAP_DUMMY_UNLOCK(pmap);
5957 		uvm_pagefree(ptp);
5958 	}
5959 	mutex_exit(&pmap->pm_lock);
5960 }
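
/*
 * Per the pmap(9) contract, callers batch their mapping changes and flush
 * once at the end.  A minimal sketch of the usual pattern (illustrative
 * values only):
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_kremove(old_va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 *
 * Deferring the TLB shootdowns and PTP frees to pmap_update() is what makes
 * the batching cheap.
 */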
5961 
5962 #if PTP_LEVELS > 4
5963 #error "Unsupported number of page table mappings"
5964 #endif
5965 
5966 paddr_t
5967 pmap_init_tmp_pgtbl(paddr_t pg)
5968 {
5969 	static bool maps_loaded;
5970 	static const paddr_t x86_tmp_pml_paddr[] = {
5971 	    4 * PAGE_SIZE,	/* L1 */
5972 	    5 * PAGE_SIZE,	/* L2 */
5973 	    6 * PAGE_SIZE,	/* L3 */
5974 	    7 * PAGE_SIZE	/* L4 */
5975 	};
5976 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
5977 
5978 	pd_entry_t *tmp_pml, *kernel_pml;
5979 
5980 	int level;
5981 
5982 	if (!maps_loaded) {
5983 		for (level = 0; level < PTP_LEVELS; ++level) {
5984 			x86_tmp_pml_vaddr[level] =
5985 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
5986 			    UVM_KMF_VAONLY);
5987 
5988 			if (x86_tmp_pml_vaddr[level] == 0)
5989 				panic("mapping of real mode PML failed\n");
5990 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
5991 			    x86_tmp_pml_paddr[level],
5992 			    VM_PROT_READ | VM_PROT_WRITE, 0);
5993 		}
5994 		pmap_update(pmap_kernel());
5995 		maps_loaded = true;
5996 	}
5997 
5998 	/* Zero levels 1-3 */
5999 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
6000 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
6001 		memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
6002 	}
6003 
6004 	/* Copy PML4 */
6005 	kernel_pml = pmap_kernel()->pm_pdir;
6006 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
6007 	memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
6008 
6009 #ifdef PAE
6010 	/*
6011 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
6012 	 * last entries are unlikely to be used for temporary mappings.
6013 	 * 508: maps 0->1GB (userland)
6014 	 * 509: unused
6015 	 * 510: unused
6016 	 * 511: maps 3->4GB (kernel)
6017 	 */
6018 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
6019 	tmp_pml[509] = 0;
6020 	tmp_pml[510] = 0;
6021 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
6022 #endif
6023 
6024 	for (level = PTP_LEVELS - 1; level > 0; --level) {
6025 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
6026 
6027 		tmp_pml[pl_i(pg, level + 1)] =
6028 		    (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
6029 	}
6030 
6031 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
6032 	tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
6033 
6034 #ifdef PAE
6035 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
6036 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
6037 #endif
6038 
6039 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
6040 }
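
/*
 * A sketch of the intended use (the caller-side names below are
 * illustrative): code that has to run on a transient low-memory page table,
 * such as a real-mode boot/wakeup trampoline, maps its trampoline page 1:1
 * and loads the returned value into %cr3 for the duration:
 *
 *	paddr_t tmp_cr3 = pmap_init_tmp_pgtbl(trampoline_pa);
 *	... switch to tmp_cr3, execute at trampoline_pa, switch back ...
 */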
6041 
6042 u_int
6043 x86_mmap_flags(paddr_t mdpgno)
6044 {
6045 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
6046 	u_int pflag = 0;
6047 
6048 	if (nflag & X86_MMAP_FLAG_PREFETCH)
6049 		pflag |= PMAP_WRITE_COMBINE;
6050 
6051 	return pflag;
6052 }
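
/*
 * The nflag bits are carried in the high bits of the machine-dependent page
 * number cookie produced by a driver's d_mmap routine.  A sketch of how a
 * driver would ask for a write-combined (prefetchable) mapping, assuming the
 * usual x86_btop() page-number encoding of that cookie:
 *
 *	return x86_btop(pa) | (X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
 */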
6053 
6054 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
6055 
6056 /*
6057  * -----------------------------------------------------------------------------
6058  * *****************************************************************************
6059  * *****************************************************************************
6060  * *****************************************************************************
6061  * *****************************************************************************
6062  * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
6063  * *****************************************************************************
6064  * *****************************************************************************
6065  * *****************************************************************************
6066  * *****************************************************************************
6067  * -----------------------------------------------------------------------------
6068  *
6069  * These functions are invoked as callbacks from the code above. Contrary to
6070  * native, EPT does not have a recursive slot; therefore, it is not possible
6071  * to call pmap_map_ptes(). Instead, we use the direct map and walk down the
6072  * tree manually.
6073  *
6074  * Apart from that, the logic is mostly the same as native. Once a pmap has
6075  * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
6076  * After that we're good, and the callbacks will handle the translations
6077  * for us.
6078  *
6079  * -----------------------------------------------------------------------------
6080  */
6081 
6082 /* Hardware bits. */
6083 #define EPT_R		__BIT(0)	/* read */
6084 #define EPT_W		__BIT(1)	/* write */
6085 #define EPT_X		__BIT(2)	/* execute */
6086 #define EPT_T		__BITS(5,3)	/* type */
6087 #define		TYPE_UC	0
6088 #define		TYPE_WC	1
6089 #define		TYPE_WT	4
6090 #define		TYPE_WP	5
6091 #define		TYPE_WB	6
6092 #define EPT_NOPAT	__BIT(6)
6093 #define EPT_L		__BIT(7)	/* large */
6094 #define EPT_A		__BIT(8)	/* accessed */
6095 #define EPT_D		__BIT(9)	/* dirty */
6096 /* Software bits. */
6097 #define EPT_PVLIST	__BIT(60)
6098 #define EPT_WIRED	__BIT(61)
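
/*
 * For illustration, a leaf entry for a readable/writable, write-back cached
 * guest page at physical address pa is composed as
 *
 *	pa | EPT_R | EPT_W | __SHIFTIN(TYPE_WB, EPT_T) | EPT_NOPAT
 *
 * which mirrors what pmap_ept_enter() builds via pmap_ept_prot() and
 * pmap_ept_type() below.
 */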
6099 
6100 #define pmap_ept_valid_entry(pte)	(pte & EPT_R)
6101 
6102 bool pmap_ept_has_ad __read_mostly;
6103 
6104 static inline void
6105 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
6106 {
6107 	int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
6108 	int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
6109 
6110 	KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6111 	KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6112 
6113 	pmap_stats_update(pmap, resid_diff, wired_diff);
6114 }
6115 
6116 static pt_entry_t
6117 pmap_ept_type(u_int flags)
6118 {
6119 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
6120 	pt_entry_t ret;
6121 
6122 	switch (cacheflags) {
6123 	case PMAP_NOCACHE:
6124 	case PMAP_NOCACHE_OVR:
6125 		ret = __SHIFTIN(TYPE_UC, EPT_T);
6126 		break;
6127 	case PMAP_WRITE_COMBINE:
6128 		ret = __SHIFTIN(TYPE_WC, EPT_T);
6129 		break;
6130 	case PMAP_WRITE_BACK:
6131 	default:
6132 		ret = __SHIFTIN(TYPE_WB, EPT_T);
6133 		break;
6134 	}
6135 
6136 	ret |= EPT_NOPAT;
6137 	return ret;
6138 }
6139 
6140 static inline pt_entry_t
6141 pmap_ept_prot(vm_prot_t prot)
6142 {
6143 	pt_entry_t res = 0;
6144 
6145 	if (prot & VM_PROT_READ)
6146 		res |= EPT_R;
6147 	if (prot & VM_PROT_WRITE)
6148 		res |= EPT_W;
6149 	if (prot & VM_PROT_EXECUTE)
6150 		res |= EPT_X;
6151 
6152 	return res;
6153 }
6154 
6155 static inline uint8_t
6156 pmap_ept_to_pp_attrs(pt_entry_t ept)
6157 {
6158 	uint8_t ret = 0;
6159 	if (pmap_ept_has_ad) {
6160 		if (ept & EPT_D)
6161 			ret |= PP_ATTRS_D;
6162 		if (ept & EPT_A)
6163 			ret |= PP_ATTRS_A;
6164 	} else {
6165 		ret |= (PP_ATTRS_D|PP_ATTRS_A);
6166 	}
6167 	if (ept & EPT_W)
6168 		ret |= PP_ATTRS_W;
6169 	return ret;
6170 }
6171 
6172 static inline pt_entry_t
6173 pmap_pp_attrs_to_ept(uint8_t attrs)
6174 {
6175 	pt_entry_t ept = 0;
6176 	if (attrs & PP_ATTRS_D)
6177 		ept |= EPT_D;
6178 	if (attrs & PP_ATTRS_A)
6179 		ept |= EPT_A;
6180 	if (attrs & PP_ATTRS_W)
6181 		ept |= EPT_W;
6182 	return ept;
6183 }
6184 
6185 /*
6186  * Helper for pmap_ept_free_ptp.
6187  * tree[0] = &L2[L2idx]
6188  * tree[1] = &L3[L3idx]
6189  * tree[2] = &L4[L4idx]
6190  */
6191 static void
6192 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
6193 {
6194 	pt_entry_t *pteva;
6195 	paddr_t ptepa;
6196 	int i, index;
6197 
6198 	ptepa = pmap->pm_pdirpa[0];
6199 	for (i = PTP_LEVELS; i > 1; i--) {
6200 		index = pl_pi(va, i);
6201 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6202 		KASSERT(pmap_ept_valid_entry(pteva[index]));
6203 		tree[i - 2] = &pteva[index];
6204 		ptepa = pmap_pte2pa(pteva[index]);
6205 	}
6206 }
6207 
6208 static void
6209 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
6210 {
6211 	pd_entry_t *tree[3];
6212 	int level;
6213 
6214 	KASSERT(pmap != pmap_kernel());
6215 	KASSERT(mutex_owned(&pmap->pm_lock));
6216 	KASSERT(kpreempt_disabled());
6217 
6218 	pmap_ept_get_tree(pmap, va, tree);
6219 
6220 	level = 1;
6221 	do {
6222 		(void)pmap_pte_testset(tree[level - 1], 0);
6223 
6224 		pmap_freepage(pmap, ptp, level);
6225 		if (level < PTP_LEVELS - 1) {
6226 			ptp = pmap_find_ptp(pmap, va, level + 1);
6227 			ptp->wire_count--;
6228 			if (ptp->wire_count > 1)
6229 				break;
6230 		}
6231 	} while (++level < PTP_LEVELS);
6232 	pmap_pte_flush();
6233 }
6234 
6235 /* Install the newly allocated L4->L3->L2 PTPs into the tree. */
6236 static void
6237 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
6238 {
6239 	struct vm_page *ptp;
6240 	unsigned long index;
6241 	pd_entry_t *pteva;
6242 	paddr_t ptepa;
6243 	int i;
6244 
6245 	KASSERT(pmap != pmap_kernel());
6246 	KASSERT(mutex_owned(&pmap->pm_lock));
6247 	KASSERT(kpreempt_disabled());
6248 
6249 	/*
6250 	 * Now that we have all the pages looked up or allocated,
6251 	 * loop through again installing any new ones into the tree.
6252 	 */
6253 	ptepa = pmap->pm_pdirpa[0];
6254 	for (i = PTP_LEVELS; i > 1; i--) {
6255 		index = pl_pi(va, i);
6256 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6257 
6258 		if (pmap_ept_valid_entry(pteva[index])) {
6259 			KASSERT(!pt->alloced[i]);
6260 			ptepa = pmap_pte2pa(pteva[index]);
6261 			continue;
6262 		}
6263 
6264 		ptp = pt->pg[i];
6265 		ptp->flags &= ~PG_BUSY; /* never busy */
6266 		ptp->wire_count = 1;
6267 		pmap->pm_ptphint[i - 2] = ptp;
6268 		ptepa = VM_PAGE_TO_PHYS(ptp);
6269 		pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
6270 
6271 		pmap_pte_flush();
6272 		pmap_stats_update(pmap, 1, 0);
6273 
6274 		/*
6275 		 * If we're not in the top level, increase the
6276 		 * wire count of the parent page.
6277 		 */
6278 		if (i < PTP_LEVELS) {
6279 			pt->pg[i + 1]->wire_count++;
6280 		}
6281 	}
6282 }
6283 
6284 static int
6285 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
6286     u_int flags)
6287 {
6288 	pt_entry_t *ptes, opte, npte;
6289 	pt_entry_t *ptep;
6290 	struct vm_page *ptp;
6291 	struct vm_page *new_pg, *old_pg;
6292 	struct pmap_page *new_pp, *old_pp;
6293 	struct pv_entry *old_pve, *new_pve;
6294 	bool wired = (flags & PMAP_WIRED) != 0;
6295 	bool accessed;
6296 	struct pmap_ptparray pt;
6297 	int error;
6298 	bool getptp, samepage, new_embedded;
6299 	rb_tree_t *tree;
6300 
6301 	KASSERT(pmap_initialized);
6302 	KASSERT(va < VM_MAXUSER_ADDRESS);
6303 
6304 	npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
6305 
6306 	if (wired)
6307 		npte |= EPT_WIRED;
6308 	if (flags & VM_PROT_ALL) {
6309 		npte |= EPT_A;
6310 		if (flags & VM_PROT_WRITE) {
6311 			KASSERT((npte & EPT_W) != 0);
6312 			npte |= EPT_D;
6313 		}
6314 	}
6315 
6316 	new_pg = PHYS_TO_VM_PAGE(pa);
6317 	if (new_pg != NULL) {
6318 		/* This is a managed page */
6319 		npte |= EPT_PVLIST;
6320 		new_pp = VM_PAGE_TO_PP(new_pg);
6321 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
6322 		/* This is an unmanaged pv-tracked page */
6323 		npte |= EPT_PVLIST;
6324 	} else {
6325 		new_pp = NULL;
6326 	}
6327 
6328 	/* Begin by locking the pmap. */
6329 	mutex_enter(&pmap->pm_lock);
6330 
6331 	/* Look up the PTP.  Allocate if none present. */
6332 	ptp = NULL;
6333 	getptp = false;
6334 	if (pmap != pmap_kernel()) {
6335 		ptp = pmap_find_ptp(pmap, va, 1);
6336 		if (ptp == NULL) {
6337 			getptp = true;
6338 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
6339 			if (error != 0) {
6340 				if (flags & PMAP_CANFAIL) {
6341 					mutex_exit(&pmap->pm_lock);
6342 					return error;
6343 				}
6344 				panic("%s: get ptp failed, error=%d", __func__,
6345 				    error);
6346 			}
6347 		}
6348 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
6349 	} else {
6350 		/* Embedded PV entries rely on this. */
6351 		KASSERT(va != 0);
6352 		tree = &pmap_kernel_rb;
6353 	}
6354 
6355 	/*
6356 	 * Look up the old PV entry at this VA (if any), and insert a new PV
6357 	 * entry if required for the new mapping.  Temporarily track the old
6358 	 * and new mappings concurrently.  Only after the old mapping is
6359 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
6360 	 * our picture of modified/accessed state for either page could get
6361 	 * out of sync (we need any P->V operation for either page to stall
6362 	 * on pmap->pm_lock until done here).
6363 	 */
6364 	new_pve = NULL;
6365 	old_pve = NULL;
6366 	samepage = false;
6367 	new_embedded = false;
6368 
6369 	if (new_pp != NULL) {
6370 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
6371 		    &old_pve, &samepage, &new_embedded, tree);
6372 
6373 		/*
6374 		 * If a new pv_entry was needed and none was available, we
6375 		 * can go no further.
6376 		 */
6377 		if (error != 0) {
6378 			if (flags & PMAP_CANFAIL) {
6379 				if (getptp) {
6380 					pmap_unget_ptp(pmap, &pt);
6381 				}
6382 				mutex_exit(&pmap->pm_lock);
6383 				return error;
6384 			}
6385 			panic("%s: alloc pve failed", __func__);
6386 		}
6387 	} else {
6388 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
6389 	}
6390 
6391 	/* Map PTEs into address space. */
6392 	kpreempt_disable();
6393 
6394 	/* Install any newly allocated PTPs. */
6395 	if (getptp) {
6396 		pmap_ept_install_ptp(pmap, &pt, va);
6397 	}
6398 
6399 	/* Check if there is an existing mapping. */
6400 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
6401 	ptep = &ptes[pl1_pi(va)];
6402 	opte = *ptep;
6403 	bool have_oldpa = pmap_ept_valid_entry(opte);
6404 	paddr_t oldpa = pmap_pte2pa(opte);
6405 
6406 	/*
6407 	 * Update the pte.
6408 	 */
6409 	do {
6410 		opte = *ptep;
6411 
6412 		/*
6413 		 * if the same page, inherit PTE_A and PTE_D.
6414 		 * if the same page, inherit EPT_A and EPT_D.
6415 		if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6416 			npte |= opte & (EPT_A | EPT_D);
6417 		}
6418 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6419 
6420 	/*
6421 	 * Done with the PTEs: they can now be unmapped.
6422 	 */
6423 	kpreempt_enable();
6424 
6425 	/*
6426 	 * Update statistics and PTP's reference count.
6427 	 */
6428 	pmap_ept_stats_update_bypte(pmap, npte, opte);
6429 	if (ptp != NULL) {
6430 		if (!have_oldpa) {
6431 			ptp->wire_count++;
6432 		}
6433 		/* Remember minimum VA in PTP. */
6434 		pmap_ptp_range_set(ptp, va);
6435 	}
6436 	KASSERT(ptp == NULL || ptp->wire_count > 1);
6437 
6438 	/*
6439 	 * If the same page, we can skip pv_entry handling.
6440 	 */
6441 	if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6442 		KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
6443 		if ((npte & EPT_PVLIST) != 0) {
6444 			KASSERT(samepage);
6445 			pmap_check_pv(pmap, ptp, new_pp, va, true);
6446 		}
6447 		goto same_pa;
6448 	} else if ((npte & EPT_PVLIST) != 0) {
6449 		KASSERT(!samepage);
6450 	}
6451 
6452 	/*
6453 	 * If old page is pv-tracked, remove pv_entry from its list.
6454 	 */
6455 	if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
6456 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
6457 			old_pp = VM_PAGE_TO_PP(old_pg);
6458 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
6459 			panic("%s: EPT_PVLIST with pv-untracked page"
6460 			    " va = %#"PRIxVADDR
6461 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
6462 			    __func__, va, oldpa, atop(pa));
6463 		}
6464 
6465 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
6466 		    pmap_ept_to_pp_attrs(opte));
6467 	} else {
6468 		KASSERT(old_pve == NULL);
6469 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6470 	}
6471 
6472 	/*
6473 	 * If new page is dynamically PV tracked, insert to tree.
6474 	 */
6475 	if (new_pve != NULL) {
6476 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6477 		old_pve = rb_tree_insert_node(tree, new_pve);
6478 		KASSERT(old_pve == new_pve);
6479 		pmap_check_pv(pmap, ptp, new_pp, va, true);
6480 	}
6481 
6482 same_pa:
6483 	/*
6484 	 * shootdown tlb if necessary.
6485 	 */
6486 
6487 	if (pmap_ept_has_ad) {
6488 		accessed = (~opte & (EPT_R | EPT_A)) == 0;
6489 	} else {
6490 		accessed = (opte & EPT_R) != 0;
6491 	}
6492 	if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
6493 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
6494 	}
6495 	pmap_drain_pv(pmap);
6496 	mutex_exit(&pmap->pm_lock);
6497 	return 0;
6498 }
6499 
6500 /* Pay close attention: the PDE handed back via lastpde is the L2 entry. */
6501 static int
6502 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
6503 {
6504 	pt_entry_t *pteva;
6505 	paddr_t ptepa;
6506 	int i, index;
6507 
6508 	KASSERT(mutex_owned(&pmap->pm_lock));
6509 
6510 	ptepa = pmap->pm_pdirpa[0];
6511 	for (i = PTP_LEVELS; i > 1; i--) {
6512 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6513 		index = pl_pi(va, i);
6514 		if (!pmap_ept_valid_entry(pteva[index]))
6515 			return i;
6516 		ptepa = pmap_pte2pa(pteva[index]);
6517 	}
6518 	if (lastpde != NULL) {
6519 		*lastpde = pteva[index];
6520 	}
6521 
6522 	return 0;
6523 }
6524 
6525 static bool
6526 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
6527 {
6528 	pt_entry_t *ptes, pte;
6529 	pd_entry_t pde;
6530 	paddr_t ptppa, pa;
6531 	bool rv;
6532 
6533 #ifdef __HAVE_DIRECT_MAP
6534 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
6535 		if (pap != NULL) {
6536 			*pap = PMAP_DIRECT_UNMAP(va);
6537 		}
6538 		return true;
6539 	}
6540 #endif
6541 
6542 	rv = false;
6543 	pa = 0;
6544 
6545 	mutex_enter(&pmap->pm_lock);
6546 	kpreempt_disable();
6547 
6548 	if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
6549 		ptppa = pmap_pte2pa(pde);
6550 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6551 		pte = ptes[pl1_pi(va)];
6552 		if (__predict_true((pte & EPT_R) != 0)) {
6553 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
6554 			rv = true;
6555 		}
6556 	}
6557 
6558 	kpreempt_enable();
6559 	mutex_exit(&pmap->pm_lock);
6560 
6561 	if (pap != NULL) {
6562 		*pap = pa;
6563 	}
6564 	return rv;
6565 }
6566 
6567 static bool
6568 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
6569     vaddr_t va)
6570 {
6571 	struct pv_entry *pve;
6572 	struct vm_page *pg;
6573 	struct pmap_page *pp;
6574 	pt_entry_t opte;
6575 	bool accessed;
6576 
6577 	KASSERT(pmap != pmap_kernel());
6578 	KASSERT(mutex_owned(&pmap->pm_lock));
6579 	KASSERT(kpreempt_disabled());
6580 
6581 	if (!pmap_ept_valid_entry(*pte)) {
6582 		/* VA not mapped. */
6583 		return false;
6584 	}
6585 
6586 	/* Atomically save the old PTE and zap it. */
6587 	opte = pmap_pte_testset(pte, 0);
6588 	if (!pmap_ept_valid_entry(opte)) {
6589 		return false;
6590 	}
6591 
6592 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6593 
6594 	if (ptp) {
6595 		/*
6596 		 * Dropping a PTE.  Make sure that the PDE is flushed.
6597 		 */
6598 		ptp->wire_count--;
6599 		if (ptp->wire_count <= 1) {
6600 			opte |= EPT_A;
6601 		}
6602 	}
6603 
6604 	if (pmap_ept_has_ad) {
6605 		accessed = (opte & EPT_A) != 0;
6606 	} else {
6607 		accessed = true;
6608 	}
6609 	if (accessed) {
6610 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
6611 	}
6612 
6613 	/*
6614 	 * If we are not on a pv list - we are done.
6615 	 */
6616 	if ((opte & EPT_PVLIST) == 0) {
6617 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
6618 		    "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
6619 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
6620 		    "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
6621 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
6622 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
6623 		return true;
6624 	}
6625 
6626 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
6627 		pp = VM_PAGE_TO_PP(pg);
6628 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
6629 		paddr_t pa = pmap_pte2pa(opte);
6630 		panic("%s: EPT_PVLIST with pv-untracked page"
6631 		    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
6632 		    __func__, va, pa, atop(pa));
6633 	}
6634 
6635 	/* Sync R/M bits. */
6636 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
6637 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
6638 	return true;
6639 }
6640 
6641 static void
6642 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
6643     vaddr_t startva, vaddr_t endva)
6644 {
6645 	pt_entry_t *pte = (pt_entry_t *)ptpva;
6646 
6647 	KASSERT(pmap != pmap_kernel());
6648 	KASSERT(mutex_owned(&pmap->pm_lock));
6649 	KASSERT(kpreempt_disabled());
6650 
6651 	/*
6652 	 * mappings are very often sparse, so clip the given range to the
6653 	 * range of PTEs that are known present in the PTP.
6654 	 */
6655 	pmap_ptp_range_clip(ptp, &startva, &pte);
6656 
6657 	/*
6658 	 * note that ptpva points to the PTE that maps startva.   this may
6659 	 * or may not be the first PTE in the PTP.
6660 	 *
6661 	 * we loop through the PTP while there are still PTEs to look at
6662 	 * and the wire_count is greater than 1 (because we use the wire_count
6663 	 * to keep track of the number of real PTEs in the PTP).
6664 	 */
6665 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
6666 		(void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
6667 		startva += PAGE_SIZE;
6668 		pte++;
6669 	}
6670 }
6671 
6672 static void
6673 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
6674 {
6675 	pt_entry_t *ptes;
6676 	pd_entry_t pde;
6677 	paddr_t ptppa;
6678 	vaddr_t blkendva, va = sva;
6679 	struct vm_page *ptp;
6680 
6681 	mutex_enter(&pmap->pm_lock);
6682 	kpreempt_disable();
6683 
6684 	for (/* null */ ; va < eva ; va = blkendva) {
6685 		int lvl;
6686 
6687 		/* determine range of block */
6688 		blkendva = x86_round_pdr(va+1);
6689 		if (blkendva > eva)
6690 			blkendva = eva;
6691 
6692 		lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
6693 		if (lvl != 0) {
6694 			/* Skip a range corresponding to an invalid pde. */
6695 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
6696 			continue;
6697 		}
6698 
6699 		/* PA of the PTP */
6700 		ptppa = pmap_pte2pa(pde);
6701 
6702 		ptp = pmap_find_ptp(pmap, va, 1);
6703 		KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
6704 		    __func__);
6705 
6706 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6707 
6708 		pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
6709 		    blkendva);
6710 
6711 		/* If PTP is no longer being used, free it. */
6712 		if (ptp && ptp->wire_count <= 1) {
6713 			pmap_ept_free_ptp(pmap, ptp, va);
6714 		}
6715 	}
6716 
6717 	kpreempt_enable();
6718 	pmap_drain_pv(pmap);
6719 	mutex_exit(&pmap->pm_lock);
6720 }
6721 
6722 static int
6723 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
6724     uint8_t *oattrs, pt_entry_t *optep)
6725 {
6726 	struct pmap *pmap;
6727 	pt_entry_t *ptep;
6728 	pt_entry_t opte;
6729 	pt_entry_t npte;
6730 	pt_entry_t expect;
6731 	bool need_shootdown;
6732 
6733 	expect = pmap_pa2pte(pa) | EPT_R;
6734 	pmap = ptp_to_pmap(ptp);
6735 
6736 	if (clearbits != ~0) {
6737 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
6738 		clearbits = pmap_pp_attrs_to_ept(clearbits);
6739 	}
6740 
6741 	ptep = pmap_map_pte(pmap, ptp, va);
6742 	do {
6743 		opte = *ptep;
6744 		KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
6745 		KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
6746 		KASSERT(opte == 0 || (opte & EPT_R) != 0);
6747 		if ((opte & (PTE_FRAME | EPT_R)) != expect) {
6748 			/*
6749 			 * We lost a race with a V->P operation like
6750 			 * pmap_remove().  Wait for the competitor to
6751 			 * reflect the pte bits into mp_attrs.
6752 			 */
6753 			pmap_unmap_pte();
6754 			return EAGAIN;
6755 		}
6756 
6757 		/*
6758 		 * Check if there's anything to do on this PTE.
6759 		 */
6760 		if ((opte & clearbits) == 0) {
6761 			need_shootdown = false;
6762 			break;
6763 		}
6764 
6765 		/*
6766 		 * We need a shootdown if the PTE is cached (EPT_A) ...
6767 		 * ... Unless we are clearing only the EPT_W bit and
6768 		 * it isn't cached as RW (EPT_D).
6769 		 */
6770 		if (pmap_ept_has_ad) {
6771 			need_shootdown = (opte & EPT_A) != 0 &&
6772 			    !(clearbits == EPT_W && (opte & EPT_D) == 0);
6773 		} else {
6774 			need_shootdown = true;
6775 		}
6776 
6777 		npte = opte & ~clearbits;
6778 
6779 		/*
6780 		 * If we need a shootdown anyway, clear EPT_A and EPT_D.
6781 		 */
6782 		if (need_shootdown) {
6783 			npte &= ~(EPT_A | EPT_D);
6784 		}
6785 		KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
6786 		KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
6787 		KASSERT(npte == 0 || (opte & EPT_R) != 0);
6788 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6789 
6790 	if (need_shootdown) {
6791 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
6792 	}
6793 	pmap_unmap_pte();
6794 
6795 	*oattrs = pmap_ept_to_pp_attrs(opte);
6796 	if (optep != NULL)
6797 		*optep = opte;
6798 	return 0;
6799 }
6800 
6801 static void
6802 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
6803     vaddr_t va)
6804 {
6805 
6806 	KASSERT(mutex_owned(&pmap->pm_lock));
6807 
6808 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6809 	ptp->wire_count--;
6810 	if (ptp->wire_count <= 1) {
6811 		pmap_ept_free_ptp(pmap, ptp, va);
6812 	}
6813 }
6814 
6815 static void
6816 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
6817 {
6818 	pt_entry_t bit_rem;
6819 	pt_entry_t *ptes, *spte;
6820 	pt_entry_t opte, npte;
6821 	pd_entry_t pde;
6822 	paddr_t ptppa;
6823 	vaddr_t va;
6824 	bool modified;
6825 
6826 	bit_rem = 0;
6827 	if (!(prot & VM_PROT_WRITE))
6828 		bit_rem = EPT_W;
6829 
6830 	sva &= PTE_FRAME;
6831 	eva &= PTE_FRAME;
6832 
6833 	/* Acquire pmap. */
6834 	mutex_enter(&pmap->pm_lock);
6835 	kpreempt_disable();
6836 
6837 	for (va = sva; va < eva; va += PAGE_SIZE) {
6838 		if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6839 			continue;
6840 		}
6841 
6842 		ptppa = pmap_pte2pa(pde);
6843 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6844 		spte = &ptes[pl1_pi(va)];
6845 
6846 		do {
6847 			opte = *spte;
6848 			if (!pmap_ept_valid_entry(opte)) {
6849 				goto next;
6850 			}
6851 			npte = (opte & ~bit_rem);
6852 		} while (pmap_pte_cas(spte, opte, npte) != opte);
6853 
6854 		if (pmap_ept_has_ad) {
6855 			modified = (opte & EPT_D) != 0;
6856 		} else {
6857 			modified = true;
6858 		}
6859 		if (modified) {
6860 			vaddr_t tva = x86_ptob(spte - ptes);
6861 			pmap_tlb_shootdown(pmap, tva, 0,
6862 			    TLBSHOOT_WRITE_PROTECT);
6863 		}
6864 next:;
6865 	}
6866 
6867 	kpreempt_enable();
6868 	mutex_exit(&pmap->pm_lock);
6869 }
6870 
6871 static void
6872 pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
6873 {
6874 	pt_entry_t *ptes, *ptep, opte;
6875 	pd_entry_t pde;
6876 	paddr_t ptppa;
6877 
6878 	/* Acquire pmap. */
6879 	mutex_enter(&pmap->pm_lock);
6880 	kpreempt_disable();
6881 
6882 	if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6883 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
6884 	}
6885 
6886 	ptppa = pmap_pte2pa(pde);
6887 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6888 	ptep = &ptes[pl1_pi(va)];
6889 	opte = *ptep;
6890 	KASSERT(pmap_ept_valid_entry(opte));
6891 
6892 	if (opte & EPT_WIRED) {
6893 		pt_entry_t npte = opte & ~EPT_WIRED;
6894 
6895 		opte = pmap_pte_testset(ptep, npte);
6896 		pmap_ept_stats_update_bypte(pmap, npte, opte);
6897 	} else {
6898 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
6899 		    " did not change!\n", __func__, pmap, va);
6900 	}
6901 
6902 	/* Release pmap. */
6903 	kpreempt_enable();
6904 	mutex_exit(&pmap->pm_lock);
6905 }
6906 
6907 /* -------------------------------------------------------------------------- */
6908 
6909 void
6910 pmap_ept_transform(struct pmap *pmap)
6911 {
6912 	pmap->pm_enter = pmap_ept_enter;
6913 	pmap->pm_extract = pmap_ept_extract;
6914 	pmap->pm_remove = pmap_ept_remove;
6915 	pmap->pm_sync_pv = pmap_ept_sync_pv;
6916 	pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6917 	pmap->pm_write_protect = pmap_ept_write_protect;
6918 	pmap->pm_unwire = pmap_ept_unwire;
6919 
6920 	memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
6921 }
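
/*
 * Expected lifecycle (a sketch based on the description at the top of the
 * EPT section; the NVMM side is not shown here): the host creates an
 * ordinary pmap and then converts it,
 *
 *	struct pmap *pm = pmap_create();
 *	pmap_ept_transform(pm);
 *
 * after which the generic entry points (pmap_enter(), pmap_remove(),
 * pmap_extract(), ...) are transparently redirected to the pmap_ept_*
 * callbacks installed above.
 */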
6922 
6923 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
6924