1 /* $NetBSD: pmap.c,v 1.425 2023/07/26 21:45:28 riastradh Exp $ */
2
3 /*
4 * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran, and by Maxime Villard.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 2007 Manuel Bouyer.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 *
44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54 */
55
56 /*
57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58 *
59 * Permission to use, copy, modify, and distribute this software for any
60 * purpose with or without fee is hereby granted, provided that the above
61 * copyright notice and this permission notice appear in all copies.
62 *
63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70 */
71
72 /*
73 * Copyright 2001 (c) Wasabi Systems, Inc.
74 * All rights reserved.
75 *
76 * Written by Frank van der Linden for Wasabi Systems, Inc.
77 *
78 * Redistribution and use in source and binary forms, with or without
79 * modification, are permitted provided that the following conditions
80 * are met:
81 * 1. Redistributions of source code must retain the above copyright
82 * notice, this list of conditions and the following disclaimer.
83 * 2. Redistributions in binary form must reproduce the above copyright
84 * notice, this list of conditions and the following disclaimer in the
85 * documentation and/or other materials provided with the distribution.
86 * 3. All advertising materials mentioning features or use of this software
87 * must display the following acknowledgement:
88 * This product includes software developed for the NetBSD Project by
89 * Wasabi Systems, Inc.
90 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
91 * or promote products derived from this software without specific prior
92 * written permission.
93 *
94 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
96 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
97 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
98 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
99 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
100 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
101 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
102 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
103 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
104 * POSSIBILITY OF SUCH DAMAGE.
105 */
106
107 /*
108 * Copyright (c) 1997 Charles D. Cranor and Washington University.
109 * All rights reserved.
110 *
111 * Redistribution and use in source and binary forms, with or without
112 * modification, are permitted provided that the following conditions
113 * are met:
114 * 1. Redistributions of source code must retain the above copyright
115 * notice, this list of conditions and the following disclaimer.
116 * 2. Redistributions in binary form must reproduce the above copyright
117 * notice, this list of conditions and the following disclaimer in the
118 * documentation and/or other materials provided with the distribution.
119 *
120 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
121 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
122 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
123 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
124 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
125 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
129 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130 */
131
132 #include <sys/cdefs.h>
133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.425 2023/07/26 21:45:28 riastradh Exp $");
134
135 #include "opt_user_ldt.h"
136 #include "opt_lockdebug.h"
137 #include "opt_multiprocessor.h"
138 #include "opt_xen.h"
139 #include "opt_svs.h"
140 #include "opt_kaslr.h"
141 #include "opt_efi.h"
142
143 #define __MUTEX_PRIVATE /* for assertions */
144
145 #include <sys/param.h>
146 #include <sys/systm.h>
147 #include <sys/proc.h>
148 #include <sys/pool.h>
149 #include <sys/kernel.h>
150 #include <sys/atomic.h>
151 #include <sys/cpu.h>
152 #include <sys/intr.h>
153 #include <sys/xcall.h>
154 #include <sys/kcore.h>
155 #include <sys/kmem.h>
156 #include <sys/asan.h>
157 #include <sys/msan.h>
158 #include <sys/entropy.h>
159
160 #include <uvm/uvm.h>
161 #include <uvm/pmap/pmap_pvt.h>
162
163 #include <dev/isa/isareg.h>
164
165 #include <machine/specialreg.h>
166 #include <machine/gdt.h>
167 #include <machine/isa_machdep.h>
168 #include <machine/cpuvar.h>
169 #include <machine/cputypes.h>
170 #include <machine/pmap_private.h>
171
172 #include <x86/bootspace.h>
173 #include <x86/pat.h>
174 #include <x86/pmap_pv.h>
175
176 #include <x86/i82489reg.h>
177 #include <x86/i82489var.h>
178
179 #ifdef XEN
180 #include <xen/include/public/xen.h>
181 #include <xen/hypervisor.h>
182 #include <xen/xenpmap.h>
183 #endif
184
185 #ifdef __HAVE_DIRECT_MAP
186 #include <crypto/nist_hash_drbg/nist_hash_drbg.h>
187 #endif
188
189 /*
190 * general info:
191 *
192 * - for an explanation of how the x86 MMU hardware works see
193 * the comments in <machine/pte.h>.
194 *
195 * - for an explanation of the general memory structure used by
196 * this pmap (including the recursive mapping), see the comments
197 * in <machine/pmap.h>.
198 *
199 * this file contains the code for the "pmap module." the module's
200 * job is to manage the hardware's virtual to physical address mappings.
201 * note that there are two levels of mapping in the VM system:
202 *
203 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
204 * to map ranges of virtual address space to objects/files. for
205 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
206 * to the file /bin/ls starting at offset zero." note that
207 * the upper layer mapping is not concerned with how individual
208 * vm_pages are mapped.
209 *
210 * [2] the lower layer of the VM system (the pmap) maintains the mappings
211 * from virtual addresses. it is concerned with which vm_page is
212 * mapped where. for example, when you run /bin/ls and start
213 * at page 0x1000 the fault routine may lookup the correct page
214 * of the /bin/ls file and then ask the pmap layer to establish
215 * a mapping for it.
216 *
217 * note that information in the lower layer of the VM system can be
218 * thrown away since it can easily be reconstructed from the info
219 * in the upper layer.
220 *
221 * data structures we use include:
222 *
223 * - struct pmap: describes the address space of one thread
224 * - struct pmap_page: describes one pv-tracked page, which does not
225 * necessarily have a corresponding vm_page
226 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
227 * - pmap_page::pp_pvlist: there is one list per pv-tracked page of
228 * physical memory. the pp_pvlist points to a list of pv_entry
229 * structures which describe all the <PMAP,VA> pairs that this
230 * page is mapped in. this is critical for page based operations
231 * such as pmap_page_protect() [change protection on _all_ mappings
232 * of a page]
233 */
234
235 /*
236 * Locking
237 *
238 * We have the following locks that we must deal with, listed in the order
239 * that they are acquired:
240 *
241 * pg->uobject->vmobjlock, pg->uanon->an_lock
242 *
243 * For managed pages, these per-object locks are taken by the VM system
244 * before calling into the pmap module - either a read or write hold.
245 * The lock hold prevents pages from changing identity while the pmap is
246 * operating on them. For example, the same lock is held across a call
247 * to pmap_remove() and the following call to pmap_update(), so that a
248 * page does not gain a new identity while its TLB visibility is stale.
249 *
250 * pmap->pm_lock
251 *
252 * This lock protects the fields in the pmap structure including the
253 * non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
254 * structures. For modifying unmanaged kernel PTEs it is not needed as
255 * kernel PDEs are never freed, and the kernel is expected to be self
256 * consistent (and the lock can't be taken for unmanaged kernel PTEs,
257 * because they can be modified from interrupt context).
258 *
259 * pmaps_lock
260 *
261 * This lock protects the list of active pmaps (headed by "pmaps").
262 * It's acquired when adding or removing pmaps or adjusting kernel PDEs.
263 *
264 * pp_lock
265 *
266 * This per-page lock protects PV entry lists and the embedded PV entry
267 * in each vm_page, allowing for concurrent operation on pages by
268 * different pmaps. This is a spin mutex at IPL_VM, because at the
269 * points it is taken, context switching is usually not tolerable, and
270 * spin mutexes must block out interrupts that could take kernel_lock.
271 */
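/*
 * Illustrative sketch only (not part of the implementation): given the
 * ordering above, a typical operation on a managed page nests its locks
 * roughly as follows, with the object lock taken by the VM system before
 * the pmap module is entered (the pmap and page shown are hypothetical):
 *
 *	rw_enter(pg->uobject->vmobjlock, RW_WRITER);
 *		mutex_enter(&pmap->pm_lock);
 *			mutex_spin_enter(&pp->pp_lock);
 *			... edit the PV list ...
 *			mutex_spin_exit(&pp->pp_lock);
 *		mutex_exit(&pmap->pm_lock);
 *	rw_exit(pg->uobject->vmobjlock);
 */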
272
273 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
274 #ifdef DIAGNOSTIC
275 #define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
276 #define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock)
277 #else
278 #define PMAP_DUMMY_LOCK(pm)
279 #define PMAP_DUMMY_UNLOCK(pm)
280 #endif
281
282 static const struct uvm_pagerops pmap_pager = {
283 /* nothing */
284 };
285
286 /*
287 * pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X)
288 */
289 #define pl_i(va, lvl) \
290 (((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1])
291
292 #define pl_i_roundup(va, lvl) pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl))
293
294 /*
295 * PTP macros:
296 * a PTP's index is the PD index of the PDE that points to it
297 * a PTP's offset is the byte-offset in the PTE space that this PTP is at
298 * a PTP's VA is the first VA mapped by that PTP
299 */
300
301 #define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE)
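/*
 * Example (illustrative, assuming the usual x86 PTP_FRAME/PTP_SHIFT
 * initializers, i.e. L1_FRAME and L1_SHIFT in the first slot):
 *
 *	pl_i(va, 1)     == ((VA_SIGN_POS(va) & L1_FRAME) >> L1_SHIFT)
 *	                == pl1_i(va)
 *	ptp_va2o(va, 1) == pl2_i(va) * PAGE_SIZE
 *
 * i.e. the index of va's PTE, and the byte offset within the PTE space of
 * the level-1 PTP that maps va.
 */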
302
303 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
304 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
305 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
306 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
307 const long nbpd[] = NBPD_INITIALIZER;
308 #ifdef i386
309 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
310 #else
311 pd_entry_t *normal_pdes[3];
312 #endif
313
314 long nkptp[] = NKPTP_INITIALIZER;
315
316 struct pmap_head pmaps;
317 kmutex_t pmaps_lock __cacheline_aligned;
318
319 struct pcpu_area *pcpuarea __read_mostly;
320
321 static vaddr_t pmap_maxkvaddr;
322
323 /*
324 * Misc. event counters.
325 */
326 struct evcnt pmap_iobmp_evcnt;
327 struct evcnt pmap_ldt_evcnt;
328
329 /*
330 * PAT
331 */
332 static bool cpu_pat_enabled __read_mostly = false;
333
334 /*
335 * Global data structures
336 */
337
338 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
339 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
340 static rb_tree_t pmap_kernel_rb __cacheline_aligned;
341
342 struct bootspace bootspace __read_mostly;
343 struct slotspace slotspace __read_mostly;
344
345 /* Set to PTE_NX if supported. */
346 pd_entry_t pmap_pg_nx __read_mostly = 0;
347
348 /* Set to PTE_G if supported. */
349 pd_entry_t pmap_pg_g __read_mostly = 0;
350
351 /* Set to true if large pages are supported. */
352 int pmap_largepages __read_mostly = 0;
353
354 paddr_t lowmem_rsvd __read_mostly;
355 paddr_t avail_start __read_mostly; /* PA of first available physical page */
356 paddr_t avail_end __read_mostly; /* PA of last available physical page */
357
358 #ifdef XENPV
359 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
360 paddr_t pmap_pa_end; /* PA of last physical page for this domain */
361 #endif
362
363 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp)
364 #define PMAP_CHECK_PP(pp) \
365 KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
366
367 #define PAGE_ALIGNED(pp) \
368 __builtin_assume_aligned((void *)(pp), PAGE_SIZE)
369
370 /*
371 * Other data structures
372 */
373
374 static pt_entry_t protection_codes[8] __read_mostly;
375
376 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
377
378 /*
379 * The following two vaddr_t's are used during system startup to keep track of
380 * how much of the kernel's VM space we have used. Once the system is started,
381 * the management of the remaining kernel VM space is turned over to the
382 * kernel_map vm_map.
383 */
384 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */
385 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */
386
387 #ifndef XENPV
388 /*
389 * LAPIC virtual address, and fake physical address.
390 */
391 volatile vaddr_t local_apic_va __read_mostly;
392 paddr_t local_apic_pa __read_mostly;
393 #endif
394
395 /*
396 * pool that pmap structures are allocated from
397 */
398 struct pool_cache pmap_cache;
399 static int pmap_ctor(void *, void *, int);
400 static void pmap_dtor(void *, void *);
401
402 /*
403 * pv_page cache
404 */
405 static struct pool_cache pmap_pvp_cache;
406
407 #ifdef __HAVE_DIRECT_MAP
408 vaddr_t pmap_direct_base __read_mostly;
409 vaddr_t pmap_direct_end __read_mostly;
410 #endif
411
412 #ifndef __HAVE_DIRECT_MAP
413 /*
414 * Special VAs and the PTEs that map them
415 */
416 static pt_entry_t *early_zero_pte;
417 static void pmap_vpage_cpualloc(struct cpu_info *);
418 #ifdef XENPV
419 char *early_zerop; /* also referenced from xen_locore() */
420 #else
421 static char *early_zerop;
422 #endif
423 #endif
424
425 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
426
427 /* PDP pool and its callbacks */
428 static struct pool pmap_pdp_pool;
429 static void pmap_pdp_init(pd_entry_t *);
430 static void pmap_pdp_fini(pd_entry_t *);
431
432 #ifdef PAE
433 /* need to allocate items of 4 pages */
434 static void *pmap_pdp_alloc(struct pool *, int);
435 static void pmap_pdp_free(struct pool *, void *);
436 static struct pool_allocator pmap_pdp_allocator = {
437 .pa_alloc = pmap_pdp_alloc,
438 .pa_free = pmap_pdp_free,
439 .pa_pagesz = PAGE_SIZE * PDP_SIZE,
440 };
441 #endif
442
443 extern vaddr_t idt_vaddr;
444 extern paddr_t idt_paddr;
445 extern vaddr_t gdt_vaddr;
446 extern paddr_t gdt_paddr;
447 extern vaddr_t ldt_vaddr;
448 extern paddr_t ldt_paddr;
449
450 #ifdef i386
451 /* stuff to fix the pentium f00f bug */
452 extern vaddr_t pentium_idt_vaddr;
453 #endif
454
455 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */
456 struct pmap_ptparray {
457 struct vm_page *pg[PTP_LEVELS + 1];
458 bool alloced[PTP_LEVELS + 1];
459 };
460
461 /*
462 * PV entries are allocated in page-sized chunks and cached per-pmap to
463 * avoid intense pressure on memory allocators.
464 */
465
466 struct pv_page {
467 LIST_HEAD(, pv_entry) pvp_pves;
468 LIST_ENTRY(pv_page) pvp_list;
469 long pvp_nfree;
470 struct pmap *pvp_pmap;
471 };
472
473 #define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
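/*
 * Worked example (illustrative figures): with a PAGE_SIZE of 4096 and a
 * hypothetical sizeof(struct pv_entry) of 64, PVE_PER_PVP evaluates to
 * 4096 / 64 - 1 = 63; the entry given up by the "- 1" pays for the
 * struct pv_page header that shares the page with the pv_entries.
 */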
474
475 /*
476 * PV tree prototypes
477 */
478
479 static int pmap_compare_key(void *, const void *, const void *);
480 static int pmap_compare_nodes(void *, const void *, const void *);
481
482 /* Red-black tree */
483 static const rb_tree_ops_t pmap_rbtree_ops = {
484 .rbto_compare_nodes = pmap_compare_nodes,
485 .rbto_compare_key = pmap_compare_key,
486 .rbto_node_offset = offsetof(struct pv_entry, pve_rb),
487 .rbto_context = NULL
488 };
489
490 /*
491 * Local prototypes
492 */
493
494 #ifdef __HAVE_PCPU_AREA
495 static void pmap_init_pcpu(void);
496 #endif
497 #ifdef __HAVE_DIRECT_MAP
498 static void pmap_init_directmap(struct pmap *);
499 #endif
500 #if !defined(XENPV)
501 static void pmap_remap_global(void);
502 #endif
503 #ifndef XENPV
504 static void pmap_init_lapic(void);
505 static void pmap_remap_largepages(void);
506 #endif
507
508 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
509 struct vm_page **);
510 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
511 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
512 pd_entry_t * const *);
513 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
514 static void pmap_freepage(struct pmap *, struct vm_page *, int);
515 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
516 pt_entry_t *, pd_entry_t * const *);
517 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
518 vaddr_t);
519 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
520 vaddr_t);
521 static int pmap_pvp_ctor(void *, void *, int);
522 static void pmap_pvp_dtor(void *, void *);
523 static struct pv_entry *pmap_alloc_pv(struct pmap *);
524 static void pmap_free_pv(struct pmap *, struct pv_entry *);
525 static void pmap_drain_pv(struct pmap *);
526
527 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
528
529 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
530 static void pmap_reactivate(struct pmap *);
531
532 long
533 pmap_resident_count(struct pmap *pmap)
534 {
535
536 return pmap->pm_stats.resident_count;
537 }
538
539 long
540 pmap_wired_count(struct pmap *pmap)
541 {
542
543 return pmap->pm_stats.wired_count;
544 }
545
546 /*
547 * p m a p h e l p e r f u n c t i o n s
548 */
549
550 static inline void
551 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
552 {
553
554 KASSERT(cold || mutex_owned(&pmap->pm_lock));
555 pmap->pm_stats.resident_count += resid_diff;
556 pmap->pm_stats.wired_count += wired_diff;
557 }
558
559 static inline void
560 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
561 {
562 int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
563 int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
564
565 KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
566 KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
567
568 pmap_stats_update(pmap, resid_diff, wired_diff);
569 }
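/*
 * Worked example (illustrative): zapping a present, wired PTE (npte == 0,
 * opte has PTE_P and PTE_WIRED set) yields resid_diff == -1 and
 * wired_diff == -1, so both counters drop by one; replacing one present,
 * unwired PTE with another leaves both counters unchanged.
 */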
570
571 /*
572 * ptp_to_pmap: lookup pmap by ptp
573 */
574 static inline struct pmap *
575 ptp_to_pmap(struct vm_page *ptp)
576 {
577 struct pmap *pmap;
578
579 if (ptp == NULL) {
580 return pmap_kernel();
581 }
582 pmap = (struct pmap *)ptp->uobject;
583 KASSERT(pmap != NULL);
584 KASSERT(&pmap->pm_obj[0] == ptp->uobject);
585 return pmap;
586 }
587
588 static inline struct pv_pte *
589 pve_to_pvpte(struct pv_entry *pve)
590 {
591
592 if (pve == NULL)
593 return NULL;
594 KASSERT((void *)&pve->pve_pte == (void *)pve);
595 return &pve->pve_pte;
596 }
597
598 static inline struct pv_entry *
599 pvpte_to_pve(struct pv_pte *pvpte)
600 {
601 struct pv_entry *pve = (void *)pvpte;
602
603 KASSERT(pve_to_pvpte(pve) == pvpte);
604 return pve;
605 }
606
607 /*
608 * Return true if the pmap page has an embedded PV entry.
609 */
610 static inline bool
611 pv_pte_embedded(struct pmap_page *pp)
612 {
613
614 KASSERT(mutex_owned(&pp->pp_lock));
615 return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
616 }
617
618 /*
619 * pv_pte_first, pv_pte_next: PV list iterator.
620 */
621 static inline struct pv_pte *
622 pv_pte_first(struct pmap_page *pp)
623 {
624
625 KASSERT(mutex_owned(&pp->pp_lock));
626 if (pv_pte_embedded(pp)) {
627 return &pp->pp_pte;
628 }
629 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
630 }
631
632 static inline struct pv_pte *
633 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
634 {
635
636 KASSERT(mutex_owned(&pp->pp_lock));
637 KASSERT(pvpte != NULL);
638 if (pvpte == &pp->pp_pte) {
639 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
640 }
641 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
642 }
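/*
 * Usage sketch (illustrative): every <PMAP,VA> mapping of a pv-tracked page
 * is visited with the page's pp_lock held, e.g.:
 *
 *	struct pv_pte *pvpte;
 *
 *	mutex_spin_enter(&pp->pp_lock);
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	    pvpte = pv_pte_next(pp, pvpte)) {
 *		... look at pvpte->pte_ptp and pvpte->pte_va ...
 *	}
 *	mutex_spin_exit(&pp->pp_lock);
 */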
643
644 static inline uint8_t
645 pmap_pte_to_pp_attrs(pt_entry_t pte)
646 {
647 uint8_t ret = 0;
648 if (pte & PTE_D)
649 ret |= PP_ATTRS_D;
650 if (pte & PTE_A)
651 ret |= PP_ATTRS_A;
652 if (pte & PTE_W)
653 ret |= PP_ATTRS_W;
654 return ret;
655 }
656
657 static inline pt_entry_t
658 pmap_pp_attrs_to_pte(uint8_t attrs)
659 {
660 pt_entry_t pte = 0;
661 if (attrs & PP_ATTRS_D)
662 pte |= PTE_D;
663 if (attrs & PP_ATTRS_A)
664 pte |= PTE_A;
665 if (attrs & PP_ATTRS_W)
666 pte |= PTE_W;
667 return pte;
668 }
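/*
 * Example (illustrative): a PTE with PTE_D and PTE_A set converts to
 * PP_ATTRS_D | PP_ATTRS_A, and pmap_pp_attrs_to_pte() maps that back to
 * PTE_D | PTE_A; the two helpers are exact inverses on the three tracked
 * bits (D, A and W).
 */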
669
670 /*
671 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
672 * of course the kernel is always loaded
673 */
674 bool
675 pmap_is_curpmap(struct pmap *pmap)
676 {
677 return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
678 }
679
680 inline void
681 pmap_reference(struct pmap *pmap)
682 {
683
684 atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
685 }
686
687 /*
688 * rbtree: compare two nodes.
689 */
690 static int
691 pmap_compare_nodes(void *context, const void *n1, const void *n2)
692 {
693 const struct pv_entry *pve1 = n1;
694 const struct pv_entry *pve2 = n2;
695
696 KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
697
698 if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
699 return -1;
700 }
701 if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
702 return 1;
703 }
704 return 0;
705 }
706
707 /*
708 * rbtree: compare a node and a key.
709 */
710 static int
711 pmap_compare_key(void *context, const void *n, const void *k)
712 {
713 const struct pv_entry *pve = n;
714 const vaddr_t key = (vaddr_t)k;
715
716 if (pve->pve_pte.pte_va < key) {
717 return -1;
718 }
719 if (pve->pve_pte.pte_va > key) {
720 return 1;
721 }
722 return 0;
723 }
724
725 /*
726 * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
727 */
728 static inline void
729 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
730 {
731 vaddr_t *min = (vaddr_t *)&ptp->uanon;
732
733 if (va < *min) {
734 *min = va;
735 }
736 }
737
738 /*
739 * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
740 */
741 static inline void
742 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
743 {
744 vaddr_t sclip;
745
746 if (ptp == NULL) {
747 return;
748 }
749
750 sclip = (vaddr_t)ptp->uanon;
751 sclip = (*startva < sclip ? sclip : *startva);
752 *pte += (sclip - *startva) / PAGE_SIZE;
753 *startva = sclip;
754 }
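/*
 * Worked example (illustrative): if the minimum VA recorded in ptp->uanon
 * is 0x5000 and the caller asks to start removing at *startva == 0x2000,
 * the clip advances *startva to 0x5000 and *pte by
 * (0x5000 - 0x2000) / PAGE_SIZE == 3 slots (4 KB pages), skipping PTEs that
 * are known to be empty.
 */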
755
756 /*
757 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
758 *
759 * there are several pmaps involved. some or all of them might be the same.
760 *
761 * - the pmap given by the first argument
762 * our caller wants to access this pmap's PTEs.
763 *
764 * - pmap_kernel()
765 * the kernel pmap. note that it only contains the kernel part
766 * of the address space which is shared by any pmap. ie. any
767 * pmap can be used instead of pmap_kernel() for our purpose.
768 *
769 * - ci->ci_pmap
770 * pmap currently loaded on the cpu.
771 *
772 * - vm_map_pmap(&curproc->p_vmspace->vm_map)
773 * current process' pmap.
774 *
775 * => caller must lock pmap first (if not the kernel pmap)
776 * => must be undone with pmap_unmap_ptes before returning
777 * => disables kernel preemption
778 */
779 void
780 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
781 pd_entry_t * const **pdeppp)
782 {
783 struct pmap *curpmap;
784 struct cpu_info *ci;
785 lwp_t *l;
786
787 kpreempt_disable();
788
789 /* The kernel's pmap is always accessible. */
790 if (pmap == pmap_kernel()) {
791 *pmap2 = NULL;
792 *ptepp = PTE_BASE;
793 *pdeppp = normal_pdes;
794 return;
795 }
796
797 KASSERT(mutex_owned(&pmap->pm_lock));
798
799 l = curlwp;
800 ci = l->l_cpu;
801 curpmap = ci->ci_pmap;
802 if (pmap == curpmap) {
803 /*
804 * Already on the CPU: make it valid. This is very
805 * often the case during exit(), when we have switched
806 * to the kernel pmap in order to destroy a user pmap.
807 */
808 if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
809 pmap_reactivate(pmap);
810 }
811 *pmap2 = NULL;
812 } else {
813 /*
814 * Toss current pmap from CPU and install new pmap, but keep
815 * a reference to the old one. Dropping the reference can block
816 * as it needs to take locks, so defer that to
817 * pmap_unmap_ptes().
818 */
819 pmap_reference(pmap);
820 pmap_load1(l, pmap, curpmap);
821 *pmap2 = curpmap;
822 }
823 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
824 #ifdef DIAGNOSTIC
825 pmap->pm_ncsw = lwp_pctr();
826 #endif
827 *ptepp = PTE_BASE;
828
829 #if defined(XENPV) && defined(__x86_64__)
830 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
831 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
832 *pdeppp = ci->ci_normal_pdes;
833 #else
834 *pdeppp = normal_pdes;
835 #endif
836 }
837
838 /*
839 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
840 *
841 * => we cannot tolerate context switches while mapped in: assert this.
842 * => reenables kernel preemption.
843 * => does not unlock pmap.
844 */
845 void
846 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
847 {
848 struct cpu_info *ci;
849 struct pmap *mypmap;
850 struct lwp *l;
851
852 KASSERT(kpreempt_disabled());
853
854 /* The kernel's pmap is always accessible. */
855 if (pmap == pmap_kernel()) {
856 kpreempt_enable();
857 return;
858 }
859
860 l = curlwp;
861 ci = l->l_cpu;
862
863 KASSERT(mutex_owned(&pmap->pm_lock));
864 KASSERT(pmap->pm_ncsw == lwp_pctr());
865
866 #if defined(XENPV) && defined(__x86_64__)
867 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
868 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
869 #endif
870
871 /* If not our own pmap, mark whatever's on the CPU now as lazy. */
872 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
873 mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
874 if (ci->ci_pmap == mypmap) {
875 ci->ci_want_pmapload = 0;
876 } else {
877 ci->ci_want_pmapload = (mypmap != pmap_kernel());
878 ci->ci_tlbstate = TLBSTATE_LAZY;
879 }
880
881 /* Now safe to re-enable preemption. */
882 kpreempt_enable();
883
884 /* Toss reference to other pmap taken earlier. */
885 if (pmap2 != NULL) {
886 pmap_destroy(pmap2);
887 }
888 }
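/*
 * Usage sketch for the pair above (illustrative only, non-kernel pmap):
 *
 *	struct pmap *pmap2;
 *	pd_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	mutex_enter(&pmap->pm_lock);
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... access the PTEs via ptes[pl1_i(va)], the PDEs via pdes ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	mutex_exit(&pmap->pm_lock);
 */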
889
890 inline static void
891 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
892 {
893
894 #if !defined(__x86_64__)
895 if (curproc == NULL || curproc->p_vmspace == NULL ||
896 pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
897 return;
898
899 if ((opte ^ npte) & PTE_X)
900 pmap_update_pg(va);
901
902 /*
903 * Executability was removed on the last executable change.
904 * Reset the code segment to something conservative and
905 * let the trap handler deal with setting the right limit.
906 * We can't do that because of locking constraints on the vm map.
907 */
908
909 if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
910 struct trapframe *tf = curlwp->l_md.md_regs;
911
912 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
913 pm->pm_hiexec = I386_MAX_EXE_ADDR;
914 }
915 #endif /* !defined(__x86_64__) */
916 }
917
918 #if !defined(__x86_64__)
919 /*
920 * Fixup the code segment to cover all potential executable mappings.
921 * returns 0 if no changes to the code segment were made.
922 */
923 int
924 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
925 {
926 struct vm_map_entry *ent;
927 struct pmap *pm = vm_map_pmap(map);
928 vaddr_t va = 0;
929
930 vm_map_lock_read(map);
931 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
932 /*
933 * This entry has greater va than the entries before.
934 * We need to make it point to the last page, not past it.
935 */
936 if (ent->protection & VM_PROT_EXECUTE)
937 va = trunc_page(ent->end) - PAGE_SIZE;
938 }
939 vm_map_unlock_read(map);
940 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
941 return 0;
942
943 pm->pm_hiexec = va;
944 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
945 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
946 } else {
947 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
948 return 0;
949 }
950 return 1;
951 }
952 #endif /* !defined(__x86_64__) */
953
954 void
955 pat_init(struct cpu_info *ci)
956 {
957 #ifndef XENPV
958 uint64_t pat;
959
960 if (!(ci->ci_feat_val[0] & CPUID_PAT))
961 return;
962
963 /* We change WT to WC. Leave all other entries at their default values. */
964 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
965 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
966 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
967 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
968
969 wrmsr(MSR_CR_PAT, pat);
970 cpu_pat_enabled = true;
971 #endif
972 }
973
974 static pt_entry_t
975 pmap_pat_flags(u_int flags)
976 {
977 u_int cacheflags = (flags & PMAP_CACHE_MASK);
978
979 if (!cpu_pat_enabled) {
980 switch (cacheflags) {
981 case PMAP_NOCACHE:
982 case PMAP_NOCACHE_OVR:
983 /* results in PGC_UCMINUS on cpus which have
984 * the cpuid PAT but PAT "disabled"
985 */
986 return PTE_PCD;
987 default:
988 return 0;
989 }
990 }
991
992 switch (cacheflags) {
993 case PMAP_NOCACHE:
994 return PGC_UC;
995 case PMAP_WRITE_COMBINE:
996 return PGC_WC;
997 case PMAP_WRITE_BACK:
998 return PGC_WB;
999 case PMAP_NOCACHE_OVR:
1000 return PGC_UCMINUS;
1001 }
1002
1003 return 0;
1004 }
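/*
 * Example (illustrative): a caller such as pmap_kenter_pa() passing
 * PMAP_WRITE_COMBINE gets PGC_WC merged into the new PTE once pat_init()
 * has programmed the PAT; on a CPU without a (working) PAT the same request
 * falls back to 0, i.e. the default write-back policy, while PMAP_NOCACHE
 * and PMAP_NOCACHE_OVR fall back to PTE_PCD.
 */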
1005
1006 /*
1007 * p m a p k e n t e r f u n c t i o n s
1008 *
1009 * functions to quickly enter/remove pages from the kernel address
1010 * space. pmap_kremove is exported to MI kernel. we make use of
1011 * the recursive PTE mappings.
1012 */
1013
1014 /*
1015 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1016 *
1017 * => no need to lock anything, assume va is already allocated
1018 * => should be faster than normal pmap enter function
1019 */
1020 void
1021 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1022 {
1023 pt_entry_t *pte, opte, npte;
1024
1025 KASSERT(!(prot & ~VM_PROT_ALL));
1026
1027 if (va < VM_MIN_KERNEL_ADDRESS)
1028 pte = vtopte(va);
1029 else
1030 pte = kvtopte(va);
1031 #if defined(XENPV) && defined(DOM0OPS)
1032 if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1033 #ifdef DEBUG
1034 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
1035 " outside range\n", __func__, pa, va);
1036 #endif /* DEBUG */
1037 npte = pa;
1038 } else
1039 #endif /* XENPV && DOM0OPS */
1040 npte = pmap_pa2pte(pa);
1041 npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
1042 npte |= pmap_pat_flags(flags);
1043 opte = pmap_pte_testset(pte, npte); /* zap! */
1044
1045 /*
1046 * XXX: make sure we are not dealing with a large page, since the only
1047 * large pages created are for the kernel image, and they should never
1048 * be kentered.
1049 */
1050 KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
1051
1052 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
1053 /* This should not happen. */
1054 printf_nolog("%s: mapping already present\n", __func__);
1055 kpreempt_disable();
1056 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1057 kpreempt_enable();
1058 }
1059 }
1060
1061 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1062
1063 #if defined(__x86_64__)
1064 /*
1065 * Change protection for a virtual address. Local for a CPU only, don't
1066 * care about TLB shootdowns.
1067 *
1068 * => must be called with preemption disabled
1069 */
1070 void
1071 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1072 {
1073 pt_entry_t *pte, opte, npte;
1074
1075 KASSERT(kpreempt_disabled());
1076
1077 if (va < VM_MIN_KERNEL_ADDRESS)
1078 pte = vtopte(va);
1079 else
1080 pte = kvtopte(va);
1081
1082 npte = opte = *pte;
1083
1084 if ((prot & VM_PROT_WRITE) != 0)
1085 npte |= PTE_W;
1086 else
1087 npte &= ~(PTE_W|PTE_D);
1088
1089 if (opte != npte) {
1090 pmap_pte_set(pte, npte);
1091 pmap_pte_flush();
1092 invlpg(va);
1093 }
1094 }
1095 #endif /* defined(__x86_64__) */
1096
1097 /*
1098 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1099 *
1100 * => no need to lock anything
1101 * => caller must dispose of any vm_page mapped in the va range
1102 * => note: not an inline function
1103 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1104 * => we assume kernel only unmaps valid addresses and thus don't bother
1105 * checking the valid bit before doing TLB flushing
1106 * => must be followed by call to pmap_update() before reuse of page
1107 */
1108 static void
1109 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1110 {
1111 pt_entry_t *pte, opte;
1112 vaddr_t va, eva;
1113
1114 eva = sva + len;
1115
1116 kpreempt_disable();
1117 for (va = sva; va < eva; va += PAGE_SIZE) {
1118 pte = kvtopte(va);
1119 opte = pmap_pte_testset(pte, 0); /* zap! */
1120 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
1121 pmap_tlb_shootdown(pmap_kernel(), va, opte,
1122 TLBSHOOT_KREMOVE);
1123 }
1124 KASSERTMSG((opte & PTE_PS) == 0,
1125 "va %#" PRIxVADDR " is a large page", va);
1126 KASSERTMSG((opte & PTE_PVLIST) == 0,
1127 "va %#" PRIxVADDR " is a pv tracked page", va);
1128 }
1129 if (localonly) {
1130 tlbflushg();
1131 }
1132 kpreempt_enable();
1133 }
1134
1135 void
1136 pmap_kremove(vaddr_t sva, vsize_t len)
1137 {
1138
1139 pmap_kremove1(sva, len, false);
1140 }
1141
1142 /*
1143 * pmap_kremove_local: like pmap_kremove(), but only worry about
1144 * TLB invalidations on the current CPU. this is only intended
1145 * for use while writing kernel crash dumps, either after panic
1146 * or via reboot -d.
1147 */
1148 void
1149 pmap_kremove_local(vaddr_t sva, vsize_t len)
1150 {
1151
1152 pmap_kremove1(sva, len, true);
1153 }
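/*
 * Usage sketch (illustrative) for the kenter/kremove pair: a caller that
 * temporarily maps a page it owns might do
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... use the mapping at va ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 *
 * where the final pmap_update() is the call required above before the page
 * may be reused.
 */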
1154
1155 /*
1156 * p m a p i n i t f u n c t i o n s
1157 *
1158 * pmap_bootstrap and pmap_init are called during system startup
1159 * to init the pmap module. pmap_bootstrap() does a low level
1160 * init just to get things rolling. pmap_init() finishes the job.
1161 */
1162
1163 /*
1164 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1165 * This function is to be used before any VM system has been set up.
1166 *
1167 * The va is taken from virtual_avail.
1168 */
1169 static vaddr_t
1170 pmap_bootstrap_valloc(size_t npages)
1171 {
1172 vaddr_t va = virtual_avail;
1173 virtual_avail += npages * PAGE_SIZE;
1174 return va;
1175 }
1176
1177 /*
1178 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1179 * This function is to be used before any VM system has been set up.
1180 *
1181 * The pa is taken from avail_start.
1182 */
1183 static paddr_t
1184 pmap_bootstrap_palloc(size_t npages)
1185 {
1186 paddr_t pa = avail_start;
1187 avail_start += npages * PAGE_SIZE;
1188 return pa;
1189 }
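/*
 * Usage example (as in pmap_bootstrap() below): bootstrap-time structures
 * take one page of VA and one page of PA from these bump allocators, e.g.
 *
 *	idt_vaddr = pmap_bootstrap_valloc(1);
 *	idt_paddr = pmap_bootstrap_palloc(1);
 *
 * Nothing is ever returned; the allocators simply advance virtual_avail
 * and avail_start.
 */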
1190
1191 /*
1192 * pmap_bootstrap: get the system in a state where it can run with VM properly
1193 * enabled (called before main()). The VM system is fully init'd later.
1194 *
1195 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1196 * kernel, and nkpde PTP's for the kernel.
1197 * => kva_start is the first free virtual address in kernel space.
1198 */
1199 void
1200 pmap_bootstrap(vaddr_t kva_start)
1201 {
1202 struct pmap *kpm;
1203 int i;
1204 vaddr_t kva;
1205
1206 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
1207
1208 /*
1209 * Set up our local static global vars that keep track of the usage of
1210 * KVM before kernel_map is set up.
1211 */
1212 virtual_avail = kva_start; /* first free KVA */
1213 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */
1214
1215 /*
1216 * Set up protection_codes: we need to be able to convert from a MI
1217 * protection code (some combo of VM_PROT...) to something we can jam
1218 * into a x86 PTE.
1219 */
1220 protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1221 protection_codes[VM_PROT_EXECUTE] = PTE_X;
1222 protection_codes[VM_PROT_READ] = pmap_pg_nx;
1223 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
1224 protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
1225 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
1226 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
1227 protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
1228
1229 /*
1230 * Now we init the kernel's pmap.
1231 *
1232 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1233 * the pm_obj contains the list of active PTPs.
1234 */
1235 kpm = pmap_kernel();
1236 mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1237 rw_init(&kpm->pm_dummy_lock);
1238 for (i = 0; i < PTP_LEVELS - 1; i++) {
1239 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
1240 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
1241 kpm->pm_ptphint[i] = NULL;
1242 }
1243 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
1244
1245 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1246 for (i = 0; i < PDP_SIZE; i++)
1247 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1248
1249 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1250 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1251
1252 kcpuset_create(&kpm->pm_cpus, true);
1253 kcpuset_create(&kpm->pm_kernel_cpus, true);
1254
1255 kpm->pm_ldt = NULL;
1256 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1257
1258 /*
1259 * the above is just a rough estimate and not critical to the proper
1260 * operation of the system.
1261 */
1262
1263 #if !defined(XENPV)
1264 /*
1265 * Begin to enable global TLB entries if they are supported: add PTE_G
1266 * attribute to already mapped kernel pages. Do that only if SVS is
1267 * disabled.
1268 *
1269 * The G bit has no effect until the CR4_PGE bit is set in CR4, which
1270 * happens later in cpu_init().
1271 */
1272 #ifdef SVS
1273 if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
1274 #else
1275 if (cpu_feature[0] & CPUID_PGE) {
1276 #endif
1277 pmap_pg_g = PTE_G;
1278 pmap_remap_global();
1279 }
1280 #endif
1281
1282 #ifndef XENPV
1283 /*
1284 * Enable large pages if they are supported.
1285 */
1286 if (cpu_feature[0] & CPUID_PSE) {
1287 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */
1288 pmap_largepages = 1; /* enable software */
1289
1290 /*
1291 * The TLB must be flushed after enabling large pages on Pentium
1292 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1293 * Software Developer's Manual, Volume 3: System Programming".
1294 */
1295 tlbflushg();
1296
1297 /* Remap the kernel. */
1298 pmap_remap_largepages();
1299 }
1300 pmap_init_lapic();
1301 #endif /* !XENPV */
1302
1303 #ifdef __HAVE_PCPU_AREA
1304 pmap_init_pcpu();
1305 #endif
1306
1307 #ifdef __HAVE_DIRECT_MAP
1308 pmap_init_directmap(kpm);
1309 #else
1310 pmap_vpage_cpualloc(&cpu_info_primary);
1311
1312 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1313 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1314 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1315 } else { /* amd64 */
1316 /*
1317 * zero_pte is stuck at the end of mapped space for the kernel
1318 * image (disjunct from kva space). This is done so that it
1319 * can safely be used in pmap_growkernel (pmap_get_physpage),
1320 * when it's called for the first time.
1321 * XXXfvdl fix this for MULTIPROCESSOR later.
1322 */
1323 #ifdef XENPV
1324 /* early_zerop initialized in xen_locore() */
1325 #else
1326 early_zerop = (void *)bootspace.spareva;
1327 #endif
1328 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1329 }
1330 #endif
1331
1332 #if defined(XENPV) && defined(__x86_64__)
1333 extern vaddr_t xen_dummy_page;
1334 paddr_t xen_dummy_user_pgd;
1335
1336 /*
1337 * We want a dummy page directory for Xen: when deactivating a pmap,
1338 * Xen will still consider it active. So we set user PGD to this one
1339 * to lift all protection on the now inactive page tables set.
1340 */
1341 xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1342
1343 /* Zero fill it, the less checks in Xen it requires the better */
1344 memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1345 /* Mark read-only */
1346 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1347 pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
1348 UVMF_INVLPG);
1349 /* Pin as L4 */
1350 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1351 #endif
1352
1353 /*
1354 * Allocate space for the Interrupt Descriptor Table (IDT),
1355 * Global Descriptor Table (GDT), and Local Descriptor Table
1356 * (LDT).
1357 *
1358 * Currently there is an initial temporary GDT allocated on the
1359 * stack by the caller of init386/init_x86_64, which is (among
1360 * other things) needed on i386 for %fs-relative addressing for
1361 * CPU-local data (CPUVAR(...), curcpu(), curlwp). This
1362 * initial temporary GDT will be popped off the stack before we
1363 * can enter main, so we need to make sure there is space for a
1364 * second temporary GDT to continue existing when we enter main
1365 * before we allocate space for the permanent GDT with
1366 * uvm_km(9) in gdt_init via cpu_startup and switch to that.
1367 */
1368 idt_vaddr = pmap_bootstrap_valloc(1);
1369 idt_paddr = pmap_bootstrap_palloc(1);
1370
1371 gdt_vaddr = pmap_bootstrap_valloc(1);
1372 gdt_paddr = pmap_bootstrap_palloc(1);
1373
1374 #ifdef __HAVE_PCPU_AREA
1375 ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1376 #else
1377 ldt_vaddr = pmap_bootstrap_valloc(1);
1378 #endif
1379 ldt_paddr = pmap_bootstrap_palloc(1);
1380
1381 #if !defined(__x86_64__)
1382 /* pentium f00f bug stuff */
1383 pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1384 #endif
1385
1386 #if defined(XENPVHVM)
1387 /* XXX: move to hypervisor.c with appropriate API adjustments */
1388 extern paddr_t HYPERVISOR_shared_info_pa;
1389 extern volatile struct xencons_interface *xencons_interface; /* XXX */
1390 extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
1391
1392 if (vm_guest != VM_GUEST_XENPVH) {
1393 HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
1394 HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
1395 }
1396 xencons_interface = (void *) pmap_bootstrap_valloc(1);
1397 xenstore_interface = (void *) pmap_bootstrap_valloc(1);
1398 #endif
1399 /*
1400 * Now we reserve some VM for mapping pages when doing a crash dump.
1401 */
1402 virtual_avail = reserve_dumppages(virtual_avail);
1403
1404 /*
1405 * Init the global lock and global list.
1406 */
1407 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1408 LIST_INIT(&pmaps);
1409
1410 /*
1411 * Ensure the TLB is sync'd with reality by flushing it...
1412 */
1413 tlbflushg();
1414
1415 /*
1416 * Calculate pmap_maxkvaddr from nkptp[].
1417 */
1418 kva = VM_MIN_KERNEL_ADDRESS;
1419 for (i = PTP_LEVELS - 1; i >= 1; i--) {
1420 kva += nkptp[i] * nbpd[i];
1421 }
1422 pmap_maxkvaddr = kva;
1423 }
1424
1425 #ifndef XENPV
1426 static void
1427 pmap_init_lapic(void)
1428 {
1429 /*
1430 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1431 * x86 implementation relies a lot on this address to be valid; so just
1432 * allocate a fake physical page that will be kentered into
1433 * local_apic_va by machdep.
1434 *
1435 * If the LAPIC is present, the va will be remapped somewhere else
1436 * later in lapic_map.
1437 */
1438 local_apic_va = pmap_bootstrap_valloc(1);
1439 local_apic_pa = pmap_bootstrap_palloc(1);
1440 }
1441 #endif
1442
1443 #ifdef __x86_64__
1444 static size_t
1445 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1446 {
1447 size_t npages;
1448 npages = (roundup(endva, pgsz) / pgsz) -
1449 (rounddown(startva, pgsz) / pgsz);
1450 return npages;
1451 }
1452 #endif
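/*
 * Worked example (illustrative, 2MB entries): for startva = 0x1ff000,
 * endva = 0x201000 and pgsz = NBPD_L2 (0x200000), rounddown(startva) / pgsz
 * is 0 and roundup(endva) / pgsz is 2, so two L2 entries are needed even
 * though the range itself covers only two 4 KB pages.
 */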
1453
1454 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
1455 static inline void
1456 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
1457 {
1458 size_t sslot = slotspace.area[type].sslot;
1459 size_t nslot = slotspace.area[type].nslot;
1460
1461 memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
1462 }
1463 #endif
1464
1465 #ifdef __x86_64__
1466 /*
1467 * Randomize the location of an area. We count the holes in the VM space. We
1468 * randomly select one hole, and then randomly select an area within that hole.
1469 * Finally we update the associated entry in the slotspace structure.
1470 */
1471 vaddr_t
1472 slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
1473 vaddr_t randva)
1474 {
1475 struct {
1476 int start;
1477 int end;
1478 } holes[SLSPACE_NAREAS+1];
1479 size_t i, nholes, hole;
1480 size_t startsl, endsl, nslots, winsize;
1481 vaddr_t startva, va;
1482
1483 sz = roundup(sz, align);
1484
1485 /*
1486 * Take one more slot with +NBPD_L4, because we may end up choosing
1487 * an area that crosses slots:
1488 * +------+------+------+
1489 * | Slot | Slot | Slot |
1490 * +------+------+------+
1491 * [Chosen Area]
1492 * And in that case we must take into account the additional slot
1493 * consumed.
1494 */
1495 nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
1496
1497 /* Get the holes. */
1498 nholes = 0;
1499 size_t curslot = 0 + 256; /* end of SLAREA_USER */
1500 while (1) {
1501 /*
1502 * Find the first occupied slot after the current one.
1503 * The area between the two is a hole.
1504 */
1505 size_t minsslot = 512;
1506 size_t minnslot = 0;
1507 for (i = 0; i < SLSPACE_NAREAS; i++) {
1508 if (!slotspace.area[i].active)
1509 continue;
1510 if (slotspace.area[i].sslot >= curslot &&
1511 slotspace.area[i].sslot < minsslot) {
1512 minsslot = slotspace.area[i].sslot;
1513 minnslot = slotspace.area[i].nslot;
1514 }
1515 }
1516
1517 /* No hole anymore, stop here. */
1518 if (minsslot == 512) {
1519 break;
1520 }
1521
1522 /* Register the hole. */
1523 if (minsslot - curslot >= nslots) {
1524 holes[nholes].start = curslot;
1525 holes[nholes].end = minsslot;
1526 nholes++;
1527 }
1528
1529 /* Skip that hole, and iterate again. */
1530 curslot = minsslot + minnslot;
1531 }
1532
1533 if (nholes == 0) {
1534 panic("%s: impossible", __func__);
1535 }
1536
1537 /* Select a hole. */
1538 hole = randhole;
1539 #ifdef NO_X86_ASLR
1540 hole = 0;
1541 #endif
1542 hole %= nholes;
1543 startsl = holes[hole].start;
1544 endsl = holes[hole].end;
1545 startva = VA_SIGN_NEG(startsl * NBPD_L4);
1546
1547 /* Select an area within the hole. */
1548 va = randva;
1549 #ifdef NO_X86_ASLR
1550 va = 0;
1551 #endif
1552 winsize = ((endsl - startsl) * NBPD_L4) - sz;
1553 va %= winsize;
1554 va = rounddown(va, align);
1555 va += startva;
1556
1557 /* Update the entry. */
1558 slotspace.area[type].sslot = pl4_i(va);
1559 slotspace.area[type].nslot =
1560 pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
1561 slotspace.area[type].active = true;
1562
1563 return va;
1564 }
1565 #endif
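/*
 * Usage sketch (illustrative; mirrors pmap_init_directmap() below): the
 * caller picks the random inputs first and then asks for an L2-aligned
 * area of the required size:
 *
 *	randomize_hole(&randhole, &randva);
 *	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
 *	    randhole, randva);
 *
 * With NO_X86_ASLR defined both random inputs are ignored and the area is
 * placed at the bottom of the first suitable hole.
 */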
1566
1567 #ifdef __HAVE_PCPU_AREA
1568 static void
1569 pmap_init_pcpu(void)
1570 {
1571 const vaddr_t startva = PMAP_PCPU_BASE;
1572 size_t nL4e, nL3e, nL2e, nL1e;
1573 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1574 paddr_t pa;
1575 vaddr_t endva;
1576 vaddr_t tmpva;
1577 pt_entry_t *pte;
1578 size_t size;
1579 int i;
1580
1581 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1582
1583 size = sizeof(struct pcpu_area);
1584
1585 endva = startva + size;
1586
1587 /* We will use this temporary va. */
1588 tmpva = bootspace.spareva;
1589 pte = PTE_BASE + pl1_i(tmpva);
1590
1591 /* Build L4 */
1592 L4e_idx = pl4_i(startva);
1593 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1594 KASSERT(nL4e == 1);
1595 for (i = 0; i < nL4e; i++) {
1596 KASSERT(L4_BASE[L4e_idx+i] == 0);
1597
1598 pa = pmap_bootstrap_palloc(1);
1599 *pte = (pa & PTE_FRAME) | pteflags;
1600 pmap_update_pg(tmpva);
1601 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1602
1603 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1604 }
1605
1606 /* Build L3 */
1607 L3e_idx = pl3_i(startva);
1608 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1609 for (i = 0; i < nL3e; i++) {
1610 KASSERT(L3_BASE[L3e_idx+i] == 0);
1611
1612 pa = pmap_bootstrap_palloc(1);
1613 *pte = (pa & PTE_FRAME) | pteflags;
1614 pmap_update_pg(tmpva);
1615 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1616
1617 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1618 }
1619
1620 /* Build L2 */
1621 L2e_idx = pl2_i(startva);
1622 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1623 for (i = 0; i < nL2e; i++) {
1624
1625 KASSERT(L2_BASE[L2e_idx+i] == 0);
1626
1627 pa = pmap_bootstrap_palloc(1);
1628 *pte = (pa & PTE_FRAME) | pteflags;
1629 pmap_update_pg(tmpva);
1630 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1631
1632 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
1633 }
1634
1635 /* Build L1 */
1636 L1e_idx = pl1_i(startva);
1637 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1638 for (i = 0; i < nL1e; i++) {
1639 /*
1640 * Nothing to do, the PTEs will be entered via
1641 * pmap_kenter_pa.
1642 */
1643 KASSERT(L1_BASE[L1e_idx+i] == 0);
1644 }
1645
1646 *pte = 0;
1647 pmap_update_pg(tmpva);
1648
1649 pcpuarea = (struct pcpu_area *)startva;
1650
1651 tlbflush();
1652 }
1653 #endif
1654
1655 #ifdef __HAVE_DIRECT_MAP
1656 static void
1657 randomize_hole(size_t *randholep, vaddr_t *randvap)
1658 {
1659 struct nist_hash_drbg drbg;
1660 uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES];
1661 const char p[] = "x86/directmap";
1662 int error;
1663
1664 entropy_extract(seed, sizeof(seed), 0);
1665
1666 error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed),
1667 /*nonce*/NULL, 0,
1668 /*personalization*/p, strlen(p));
1669 KASSERTMSG(error == 0, "error=%d", error);
1670
1671 error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep),
1672 /*additional*/NULL, 0);
1673 KASSERTMSG(error == 0, "error=%d", error);
1674
1675 error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap),
1676 /*additional*/NULL, 0);
1677 KASSERTMSG(error == 0, "error=%d", error);
1678
1679 explicit_memset(seed, 0, sizeof(seed));
1680 explicit_memset(&drbg, 0, sizeof(drbg));
1681 }
1682
1683 /*
1684 * Create the amd64 direct map. Called only once at boot time. We map all of
1685 * the physical memory contiguously using 2MB large pages, with RW permissions.
1686 * However there is a hole: the kernel is mapped with RO permissions.
1687 */
1688 static void
1689 pmap_init_directmap(struct pmap *kpm)
1690 {
1691 extern phys_ram_seg_t mem_clusters[];
1692 extern int mem_cluster_cnt;
1693
1694 vaddr_t startva;
1695 size_t nL4e, nL3e, nL2e;
1696 size_t L4e_idx, L3e_idx, L2e_idx;
1697 size_t spahole, epahole;
1698 paddr_t lastpa, pa;
1699 vaddr_t endva;
1700 vaddr_t tmpva;
1701 pt_entry_t *pte;
1702 phys_ram_seg_t *mc;
1703 int i;
1704 size_t randhole;
1705 vaddr_t randva;
1706
1707 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1708 const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
1709
1710 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1711
1712 spahole = roundup(bootspace.head.pa, NBPD_L2);
1713 epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1714
1715 /* Get the last physical address available */
1716 lastpa = 0;
1717 for (i = 0; i < mem_cluster_cnt; i++) {
1718 mc = &mem_clusters[i];
1719 lastpa = MAX(lastpa, mc->start + mc->size);
1720 }
1721
1722 /*
1723 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1724 */
1725 if (lastpa > MAXPHYSMEM) {
1726 panic("pmap_init_directmap: lastpa incorrect");
1727 }
1728
1729 randomize_hole(&randhole, &randva);
1730 startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
1731 randhole, randva);
1732 endva = startva + lastpa;
1733
1734 /* We will use this temporary va. */
1735 tmpva = bootspace.spareva;
1736 pte = PTE_BASE + pl1_i(tmpva);
1737
1738 /* Build L4 */
1739 L4e_idx = pl4_i(startva);
1740 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1741 KASSERT(nL4e <= NL4_SLOT_DIRECT);
1742 for (i = 0; i < nL4e; i++) {
1743 KASSERT(L4_BASE[L4e_idx+i] == 0);
1744
1745 pa = pmap_bootstrap_palloc(1);
1746 *pte = (pa & PTE_FRAME) | pteflags;
1747 pmap_update_pg(tmpva);
1748 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1749
1750 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1751 }
1752
1753 /* Build L3 */
1754 L3e_idx = pl3_i(startva);
1755 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1756 for (i = 0; i < nL3e; i++) {
1757 KASSERT(L3_BASE[L3e_idx+i] == 0);
1758
1759 pa = pmap_bootstrap_palloc(1);
1760 *pte = (pa & PTE_FRAME) | pteflags;
1761 pmap_update_pg(tmpva);
1762 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1763
1764 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1765 }
1766
1767 /* Build L2 */
1768 L2e_idx = pl2_i(startva);
1769 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1770 for (i = 0; i < nL2e; i++) {
1771 KASSERT(L2_BASE[L2e_idx+i] == 0);
1772
1773 pa = (paddr_t)(i * NBPD_L2);
1774
1775 if (spahole <= pa && pa < epahole) {
1776 L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
1777 PTE_PS | pmap_pg_g;
1778 } else {
1779 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
1780 PTE_PS | pmap_pg_g;
1781 }
1782 }
1783
1784 *pte = 0;
1785 pmap_update_pg(tmpva);
1786
1787 pmap_direct_base = startva;
1788 pmap_direct_end = endva;
1789
1790 tlbflush();
1791 }
1792 #endif /* __HAVE_DIRECT_MAP */
1793
1794 #if !defined(XENPV)
1795 /*
1796 * Remap all of the virtual pages created so far with the PTE_G bit.
1797 */
1798 static void
1799 pmap_remap_global(void)
1800 {
1801 vaddr_t kva, kva_end;
1802 unsigned long p1i;
1803 size_t i;
1804
1805 /* head */
1806 kva = bootspace.head.va;
1807 kva_end = kva + bootspace.head.sz;
1808 for ( ; kva < kva_end; kva += PAGE_SIZE) {
1809 p1i = pl1_i(kva);
1810 if (pmap_valid_entry(PTE_BASE[p1i]))
1811 PTE_BASE[p1i] |= pmap_pg_g;
1812 }
1813
1814 /* kernel segments */
1815 for (i = 0; i < BTSPACE_NSEGS; i++) {
1816 if (bootspace.segs[i].type == BTSEG_NONE) {
1817 continue;
1818 }
1819 kva = bootspace.segs[i].va;
1820 kva_end = kva + bootspace.segs[i].sz;
1821 for ( ; kva < kva_end; kva += PAGE_SIZE) {
1822 p1i = pl1_i(kva);
1823 if (pmap_valid_entry(PTE_BASE[p1i]))
1824 PTE_BASE[p1i] |= pmap_pg_g;
1825 }
1826 }
1827
1828 /* boot space */
1829 kva = bootspace.boot.va;
1830 kva_end = kva + bootspace.boot.sz;
1831 for ( ; kva < kva_end; kva += PAGE_SIZE) {
1832 p1i = pl1_i(kva);
1833 if (pmap_valid_entry(PTE_BASE[p1i]))
1834 PTE_BASE[p1i] |= pmap_pg_g;
1835 }
1836 }
1837 #endif
1838
1839 #ifndef XENPV
1840 /*
1841 * Remap several kernel segments with large pages. We cover as many pages as we
1842 * can. Called only once at boot time, if the CPU supports large pages.
1843 */
1844 static void
1845 pmap_remap_largepages(void)
1846 {
1847 pd_entry_t *pde;
1848 vaddr_t kva, kva_end;
1849 paddr_t pa;
1850 size_t i;
1851
1852 /* Remap the kernel text using large pages. */
1853 for (i = 0; i < BTSPACE_NSEGS; i++) {
1854 if (bootspace.segs[i].type != BTSEG_TEXT) {
1855 continue;
1856 }
1857 kva = roundup(bootspace.segs[i].va, NBPD_L2);
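/* If roundup() wrapped around, there is nothing we can remap. */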
1858 if (kva < bootspace.segs[i].va) {
1859 continue;
1860 }
1861 kva_end = rounddown(bootspace.segs[i].va +
1862 bootspace.segs[i].sz, NBPD_L2);
1863 pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1864 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1865 pde = &L2_BASE[pl2_i(kva)];
1866 *pde = pa | pmap_pg_g | PTE_PS | PTE_P;
1867 tlbflushg();
1868 }
1869 }
1870
1871 /* Remap the kernel rodata using large pages. */
1872 for (i = 0; i < BTSPACE_NSEGS; i++) {
1873 if (bootspace.segs[i].type != BTSEG_RODATA) {
1874 continue;
1875 }
1876 kva = roundup(bootspace.segs[i].va, NBPD_L2);
1877 if (kva < bootspace.segs[i].va) {
1878 continue;
1879 }
1880 kva_end = rounddown(bootspace.segs[i].va +
1881 bootspace.segs[i].sz, NBPD_L2);
1882 pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1883 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1884 pde = &L2_BASE[pl2_i(kva)];
1885 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
1886 tlbflushg();
1887 }
1888 }
1889
1890 /* Remap the kernel data+bss using large pages. */
1891 for (i = 0; i < BTSPACE_NSEGS; i++) {
1892 if (bootspace.segs[i].type != BTSEG_DATA) {
1893 continue;
1894 }
1895 kva = roundup(bootspace.segs[i].va, NBPD_L2);
1896 if (kva < bootspace.segs[i].va) {
1897 continue;
1898 }
1899 kva_end = rounddown(bootspace.segs[i].va +
1900 bootspace.segs[i].sz, NBPD_L2);
1901 pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1902 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1903 pde = &L2_BASE[pl2_i(kva)];
1904 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
1905 tlbflushg();
1906 }
1907 }
1908 }
1909 #endif /* !XENPV */
1910
1911 /*
1912 * pmap_init: called from uvm_init, our job is to get the pmap system ready
1913 * to manage mappings.
1914 */
1915 void
1916 pmap_init(void)
1917 {
1918 int flags;
1919
1920 /*
1921 * initialize caches.
1922 */
1923
1924 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1925 0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1926
1927 #ifdef XENPV
1928 /*
1929 * pool_cache(9) should not touch cached objects, since they
1930 * are pinned on xen and R/O for the domU
1931 */
1932 flags = PR_NOTOUCH;
1933 #else
1934 flags = 0;
1935 #endif
1936
1937 #ifdef PAE
1938 pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1939 "pdppl", &pmap_pdp_allocator, IPL_NONE);
1940 #else
1941 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1942 "pdppl", NULL, IPL_NONE);
1943 #endif
1944 pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1945 0, 0, "pvpage", &pool_allocator_kmem,
1946 IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1947
1948 pmap_tlb_init();
1949
1950 /* XXX: Needed here because cpu_hatch() is only run on secondary CPUs. */
1951 pmap_tlb_cpu_init(curcpu());
1952
1953 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1954 NULL, "x86", "io bitmap copy");
1955 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1956 NULL, "x86", "ldt sync");
1957
1958 /*
1959 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1960 * to hang a tree of pv_entry records. Dynamically allocated
1961 * pv_entry lists are not heavily used in the kernel's pmap (the
1962 * usual case is embedded), so cop out and use a single RB tree
1963 * to cover them.
1964 */
1965 rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1966
1967 /*
1968 * done: pmap module is up (and ready for business)
1969 */
1970
1971 pmap_initialized = true;
1972 }
1973
1974 #ifndef XENPV
1975 /*
1976 * pmap_cpu_init_late: perform late per-CPU initialization.
1977 */
1978 void
1979 pmap_cpu_init_late(struct cpu_info *ci)
1980 {
1981 /*
1982 * The BP already has its own PD page, allocated during early
1983 * MD startup.
1984 */
1985 if (ci == &cpu_info_primary)
1986 return;
1987 #ifdef PAE
1988 cpu_alloc_l3_page(ci);
1989 #endif
1990 }
1991 #endif
1992
1993 #ifndef __HAVE_DIRECT_MAP
1994 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1995 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1996
1997 static void
1998 pmap_vpage_cpualloc(struct cpu_info *ci)
1999 {
2000 bool primary = (ci == &cpu_info_primary);
2001 size_t i, npages;
2002 vaddr_t vabase;
2003 vsize_t vrange;
2004
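/*
* The PTEs for npages pages fill exactly one cache line.  Align the VA
* range so that this CPU's vpage PTEs occupy a single cache line and do
* not share it with another CPU's (see the CTASSERTs above and the
* KASSERT below).
*/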
2005 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
2006 KASSERT(npages >= VPAGE_MAX);
2007 vrange = npages * PAGE_SIZE;
2008
2009 if (primary) {
2010 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
2011 /* Waste some pages to align properly */
2012 }
2013 /* The base is aligned, allocate the rest (contiguous) */
2014 pmap_bootstrap_valloc(npages - 1);
2015 } else {
2016 vabase = uvm_km_alloc(kernel_map, vrange, vrange,
2017 UVM_KMF_VAONLY);
2018 if (vabase == 0) {
2019 panic("%s: failed to allocate tmp VA for CPU %d\n",
2020 __func__, cpu_index(ci));
2021 }
2022 }
2023
2024 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
2025
2026 for (i = 0; i < VPAGE_MAX; i++) {
2027 ci->vpage[i] = vabase + i * PAGE_SIZE;
2028 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
2029 }
2030 }
2031
2032 void
2033 pmap_vpage_cpu_init(struct cpu_info *ci)
2034 {
2035 if (ci == &cpu_info_primary) {
2036 /* cpu0 already taken care of in pmap_bootstrap */
2037 return;
2038 }
2039
2040 pmap_vpage_cpualloc(ci);
2041 }
2042 #endif
2043
2044 /*
2045 * p v _ e n t r y f u n c t i o n s
2046 */
2047
2048 /*
2049 * pmap_pvp_ctor: pool_cache constructor for PV pages.
2050 */
2051 static int
2052 pmap_pvp_ctor(void *arg, void *obj, int flags)
2053 {
2054 struct pv_page *pvp = (struct pv_page *)obj;
2055 struct pv_entry *pve = (struct pv_entry *)obj + 1;
2056 struct pv_entry *maxpve = pve + PVE_PER_PVP;
2057
2058 KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
2059 KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
2060
2061 LIST_INIT(&pvp->pvp_pves);
2062 pvp->pvp_nfree = PVE_PER_PVP;
2063 pvp->pvp_pmap = NULL;
2064
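/*
* The pv_page header occupies the first pv_entry-sized slot of the page;
* thread the remaining PVE_PER_PVP slots onto the free list.
*/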
2065 for (; pve < maxpve; pve++) {
2066 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2067 }
2068
2069 return 0;
2070 }
2071
2072 /*
2073 * pmap_pvp_dtor: pool_cache destructor for PV pages.
2074 */
2075 static void
2076 pmap_pvp_dtor(void *arg, void *obj)
2077 {
2078 struct pv_page *pvp __diagused = obj;
2079
2080 KASSERT(pvp->pvp_pmap == NULL);
2081 KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2082 }
2083
2084 /*
2085 * pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
2086 */
2087 static struct pv_entry *
2088 pmap_alloc_pv(struct pmap *pmap)
2089 {
2090 struct pv_entry *pve;
2091 struct pv_page *pvp;
2092
2093 KASSERT(mutex_owned(&pmap->pm_lock));
2094
2095 if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
2096 if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2097 LIST_REMOVE(pvp, pvp_list);
2098 } else {
2099 pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
2100 }
2101 if (__predict_false(pvp == NULL)) {
2102 return NULL;
2103 }
2104 /* full -> part */
2105 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2106 pvp->pvp_pmap = pmap;
2107 }
2108
2109 KASSERT(pvp->pvp_pmap == pmap);
2110 KASSERT(pvp->pvp_nfree > 0);
2111
2112 pve = LIST_FIRST(&pvp->pvp_pves);
2113 LIST_REMOVE(pve, pve_list);
2114 pvp->pvp_nfree--;
2115
2116 if (__predict_false(pvp->pvp_nfree == 0)) {
2117 /* part -> empty */
2118 KASSERT(LIST_EMPTY(&pvp->pvp_pves));
2119 LIST_REMOVE(pvp, pvp_list);
2120 LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
2121 } else {
2122 KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
2123 }
2124
2125 return pve;
2126 }
2127
2128 /*
2129 * pmap_free_pv: delayed free of a PV entry.
2130 */
2131 static void
2132 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
2133 {
2134 struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
2135
2136 KASSERT(mutex_owned(&pmap->pm_lock));
2137 KASSERT(pvp->pvp_pmap == pmap);
2138 KASSERT(pvp->pvp_nfree >= 0);
2139
2140 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2141 pvp->pvp_nfree++;
2142
2143 if (__predict_false(pvp->pvp_nfree == 1)) {
2144 /* empty -> part */
2145 LIST_REMOVE(pvp, pvp_list);
2146 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2147 } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
2148 /* part -> full */
2149 LIST_REMOVE(pvp, pvp_list);
2150 LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
2151 }
2152 }
2153
2154 /*
2155 * pmap_drain_pv: free full PV pages.
2156 */
2157 static void
2158 pmap_drain_pv(struct pmap *pmap)
2159 {
2160 struct pv_page *pvp;
2161
2162 KASSERT(mutex_owned(&pmap->pm_lock));
2163
2164 while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2165 LIST_REMOVE(pvp, pvp_list);
2166 KASSERT(pvp->pvp_pmap == pmap);
2167 KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2168 pvp->pvp_pmap = NULL;
2169 pool_cache_put(&pmap_pvp_cache, pvp);
2170 }
2171 }
2172
2173 /*
2174 * pmap_check_pv: verify that a {PTP, VA} pair is tracked (or not) by the page, as expected
2175 */
2176 static void
2177 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
2178 vaddr_t va, bool tracked)
2179 {
2180 #ifdef DEBUG
2181 struct pv_pte *pvpte;
2182
2183 PMAP_CHECK_PP(pp);
2184
2185 mutex_spin_enter(&pp->pp_lock);
2186 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
2187 if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
2188 break;
2189 }
2190 }
2191 mutex_spin_exit(&pp->pp_lock);
2192
2193 if (pvpte && !tracked) {
2194 panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
2195 } else if (!pvpte && tracked) {
2196 panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
2197 }
2198 #endif
2199 }
2200
2201 /*
2202 * pmap_treelookup_pv: search the PV tree for a dynamic entry
2203 *
2204 * => pmap must be locked
2205 */
2206 static struct pv_entry *
2207 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2208 const rb_tree_t *tree, const vaddr_t va)
2209 {
2210 struct pv_entry *pve;
2211 rb_node_t *node;
2212
2213 /*
2214 * Inlined lookup tailored to exactly what's needed here; it is
2215 * quite a bit faster than using rb_tree_find_node().
2216 */
2217 for (node = tree->rbt_root;;) {
2218 if (__predict_false(RB_SENTINEL_P(node))) {
2219 return NULL;
2220 }
2221 pve = (struct pv_entry *)
2222 ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
2223 if (pve->pve_pte.pte_va == va) {
2224 KASSERT(pve->pve_pte.pte_ptp == ptp);
2225 return pve;
2226 }
2227 node = node->rb_nodes[pve->pve_pte.pte_va < va];
2228 }
2229 }
2230
2231 /*
2232 * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
2233 *
2234 * => a PV entry must be known present (doesn't check for existence)
2235 * => pmap must be locked
2236 */
2237 static struct pv_entry *
2238 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2239 const struct pmap_page * const old_pp, const vaddr_t va)
2240 {
2241 struct pv_entry *pve;
2242 const rb_tree_t *tree;
2243
2244 KASSERT(mutex_owned(&pmap->pm_lock));
2245 KASSERT(ptp != NULL || pmap == pmap_kernel());
2246
2247 /*
2248 * [This mostly deals with the case of process-private pages, i.e.
2249 * anonymous memory allocations or COW.]
2250 *
2251 * If the page is tracked with an embedded entry then the tree
2252 * lookup can be avoided. It's safe to check for this specific
2253 * set of values without pp_lock because both will only ever be
2254 * set together for this pmap.
2256 */
2257 if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
2258 atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
2259 return NULL;
2260 }
2261
2262 /*
2263 * [This mostly deals with shared mappings, for example shared libs
2264 * and executables.]
2265 *
2266 * Optimise for pmap_remove_ptes() which works by ascending scan:
2267 * look at the lowest numbered node in the tree first. The tree is
2268 * known non-empty because of the check above. For short lived
2269 * processes where pmap_remove() isn't used much this gets close to
2270 * a 100% hit rate.
2271 */
2272 tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2273 KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2274 pve = (struct pv_entry *)
2275 ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2276 offsetof(struct pv_entry, pve_rb));
2277 if (__predict_true(pve->pve_pte.pte_va == va)) {
2278 KASSERT(pve->pve_pte.pte_ptp == ptp);
2279 return pve;
2280 }
2281
2282 /* Search the RB tree for the key (uncommon). */
2283 return pmap_treelookup_pv(pmap, ptp, tree, va);
2284 }
2285
2286 /*
2287 * pmap_enter_pv: enter a mapping onto a pmap_page list
2288 *
2289 * => pmap must be locked
2290 * => does NOT insert dynamic entries into the tree (pmap_enter() does that later)
2291 */
2292 static int
2293 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2294 vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2295 bool *samepage, bool *new_embedded, rb_tree_t *tree)
2296 {
2297 struct pv_entry *pve;
2298 int error;
2299
2300 KASSERT(mutex_owned(&pmap->pm_lock));
2301 KASSERT(ptp_to_pmap(ptp) == pmap);
2302 KASSERT(ptp == NULL || ptp->uobject != NULL);
2303 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2304 PMAP_CHECK_PP(pp);
2305
2306 /*
2307 * If entering the same page and it's already tracked with an
2308 * embedded entry, we can avoid the expense below. It's safe
2309 * to check for this very specific set of values without a lock
2310 * because both will only ever be set together for this pmap.
2311 */
2312 if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2313 atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2314 *samepage = true;
2315 pmap_check_pv(pmap, ptp, pp, va, true);
2316 return 0;
2317 }
2318
2319 /*
2320 * Check for an existing dynamic mapping at this address. If it's
2321 * for the same page, then it will be reused and nothing needs to be
2322 * changed.
2323 */
2324 *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2325 if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2326 *samepage = true;
2327 pmap_check_pv(pmap, ptp, pp, va, true);
2328 return 0;
2329 }
2330
2331 /*
2332 * Need to put a new mapping in place. Grab a spare pv_entry in
2333 * case it's needed; won't know for sure until the lock is taken.
2334 */
2335 if (pmap->pm_pve == NULL) {
2336 pmap->pm_pve = pmap_alloc_pv(pmap);
2337 }
2338
2339 error = 0;
2340 pmap_check_pv(pmap, ptp, pp, va, false);
2341 mutex_spin_enter(&pp->pp_lock);
2342 if (!pv_pte_embedded(pp)) {
2343 /*
2344 * Embedded PV tracking available - easy.
2345 */
2346 pp->pp_pte.pte_ptp = ptp;
2347 pp->pp_pte.pte_va = va;
2348 *new_embedded = true;
2349 } else if (__predict_false(pmap->pm_pve == NULL)) {
2350 /*
2351 * No memory.
2352 */
2353 error = ENOMEM;
2354 } else {
2355 /*
2356 * Install new pv_entry on the page.
2357 */
2358 pve = pmap->pm_pve;
2359 pmap->pm_pve = NULL;
2360 *new_pve = pve;
2361 pve->pve_pte.pte_ptp = ptp;
2362 pve->pve_pte.pte_va = va;
2363 pve->pve_pp = pp;
2364 LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
2365 }
2366 mutex_spin_exit(&pp->pp_lock);
2367 if (error == 0) {
2368 pmap_check_pv(pmap, ptp, pp, va, true);
2369 }
2370
2371 return error;
2372 }
2373
2374 /*
2375 * pmap_remove_pv: try to remove a mapping from a pv_list
2376 *
2377 * => pmap must be locked
2378 * => removes dynamic entries from tree and frees them
2379 * => caller should adjust ptp's wire_count and free PTP if needed
2380 */
2381 static void
2382 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2383 vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
2384 {
2385 rb_tree_t *tree = (ptp != NULL ?
2386 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2387
2388 KASSERT(mutex_owned(&pmap->pm_lock));
2389 KASSERT(ptp_to_pmap(ptp) == pmap);
2390 KASSERT(ptp == NULL || ptp->uobject != NULL);
2391 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2392 KASSERT(ptp != NULL || pmap == pmap_kernel());
2393
2394 pmap_check_pv(pmap, ptp, pp, va, true);
2395
2396 if (pve == NULL) {
2397 mutex_spin_enter(&pp->pp_lock);
2398 KASSERT(pp->pp_pte.pte_ptp == ptp);
2399 KASSERT(pp->pp_pte.pte_va == va);
2400 pp->pp_attrs |= oattrs;
2401 pp->pp_pte.pte_ptp = NULL;
2402 pp->pp_pte.pte_va = 0;
2403 mutex_spin_exit(&pp->pp_lock);
2404 } else {
2405 mutex_spin_enter(&pp->pp_lock);
2406 KASSERT(pp->pp_pte.pte_ptp != ptp ||
2407 pp->pp_pte.pte_va != va);
2408 KASSERT(pve->pve_pte.pte_ptp == ptp);
2409 KASSERT(pve->pve_pte.pte_va == va);
2410 KASSERT(pve->pve_pp == pp);
2411 pp->pp_attrs |= oattrs;
2412 LIST_REMOVE(pve, pve_list);
2413 mutex_spin_exit(&pp->pp_lock);
2414
2415 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
2416 rb_tree_remove_node(tree, pve);
2417 #ifdef DIAGNOSTIC
2418 memset(pve, 0, sizeof(*pve));
2419 #endif
2420 pmap_free_pv(pmap, pve);
2421 }
2422
2423 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
2424 pmap_check_pv(pmap, ptp, pp, va, false);
2425 }
2426
2427 /*
2428 * p t p f u n c t i o n s
2429 */
2430
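/*
* pmap_find_ptp: look up the PTP covering va at the given level, trying the
* per-level hint first to avoid the uvm_object lookup.
*/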
2431 static struct vm_page *
2432 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
2433 {
2434 int lidx = level - 1;
2435 off_t off = ptp_va2o(va, level);
2436 struct vm_page *pg;
2437
2438 KASSERT(mutex_owned(&pmap->pm_lock));
2439
2440 if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
2441 KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
2442 pg = pmap->pm_ptphint[lidx];
2443 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2444 return pg;
2445 }
2446 PMAP_DUMMY_LOCK(pmap);
2447 pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
2448 PMAP_DUMMY_UNLOCK(pmap);
2449 if (pg != NULL && __predict_false(pg->wire_count == 0)) {
2450 /* This page is queued to be freed - ignore. */
2451 pg = NULL;
2452 }
2453 if (pg != NULL) {
2454 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2455 }
2456 pmap->pm_ptphint[lidx] = pg;
2457 return pg;
2458 }
2459
2460 static inline void
2461 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2462 {
2463 int lidx;
2464
2465 KASSERT(ptp->wire_count <= 1);
2466 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
2467
2468 lidx = level - 1;
2469 pmap_stats_update(pmap, -ptp->wire_count, 0);
2470 if (pmap->pm_ptphint[lidx] == ptp)
2471 pmap->pm_ptphint[lidx] = NULL;
2472 ptp->wire_count = 0;
2473 ptp->uanon = NULL;
2474 KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
2475
2476 /*
2477 * Enqueue the PTP to be freed by pmap_update(). We can't remove
2478 * the page from the uvm_object, as that can take further locks
2479 * (intolerable right now because the PTEs are likely mapped in).
2480 * Instead mark the PTP as free and if we bump into it again, we'll
2481 * either ignore or reuse (depending on what's useful at the time).
2482 */
2483 LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
2484 }
2485
2486 static void
2487 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2488 pt_entry_t *ptes, pd_entry_t * const *pdes)
2489 {
2490 unsigned long index;
2491 int level;
2492 vaddr_t invaladdr;
2493 pd_entry_t opde;
2494
2495 KASSERT(pmap != pmap_kernel());
2496 KASSERT(mutex_owned(&pmap->pm_lock));
2497 KASSERT(kpreempt_disabled());
2498
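/*
* Walk up the hierarchy: clear the PDE referencing the now-empty PTP,
* shoot down any stale TLB entries for it, and free the page.  Keep
* ascending while each parent PTP becomes empty in turn.
*/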
2499 level = 1;
2500 do {
2501 index = pl_i(va, level + 1);
2502 opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2503
2504 /*
2505 * On Xen-amd64 or SVS, we need to sync the top level page
2506 * directory on each CPU.
2507 */
2508 #if defined(XENPV) && defined(__x86_64__)
2509 if (level == PTP_LEVELS - 1) {
2510 xen_kpm_sync(pmap, index);
2511 }
2512 #elif defined(SVS)
2513 if (svs_enabled && level == PTP_LEVELS - 1 &&
2514 pmap_is_user(pmap)) {
2515 svs_pmap_sync(pmap, index);
2516 }
2517 #endif
2518
2519 invaladdr = level == 1 ? (vaddr_t)ptes :
2520 (vaddr_t)pdes[level - 2];
2521 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2522 opde, TLBSHOOT_FREE_PTP);
2523
2524 #if defined(XENPV)
2525 pmap_tlb_shootnow();
2526 #endif
2527
2528 pmap_freepage(pmap, ptp, level);
2529 if (level < PTP_LEVELS - 1) {
2530 ptp = pmap_find_ptp(pmap, va, level + 1);
2531 ptp->wire_count--;
2532 if (ptp->wire_count > 1)
2533 break;
2534 }
2535 } while (++level < PTP_LEVELS);
2536 pmap_pte_flush();
2537 }
2538
2539 /*
2540 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2541 *
2542 * => pmap should NOT be pmap_kernel()
2543 * => pmap should be locked
2544 * => we are not touching any PTEs yet, so they need not be mapped in
2545 */
2546 static int
2547 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2548 int flags, struct vm_page **resultp)
2549 {
2550 struct vm_page *ptp;
2551 int i, aflags;
2552 struct uvm_object *obj;
2553 voff_t off;
2554
2555 KASSERT(pmap != pmap_kernel());
2556 KASSERT(mutex_owned(&pmap->pm_lock));
2557
2558 /*
2559 * Loop through all page table levels allocating a page
2560 * for any level where we don't already have one.
2561 */
2562 memset(pt, 0, sizeof(*pt));
2563 aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2564 UVM_PGA_ZERO;
2565 for (i = PTP_LEVELS; i > 1; i--) {
2566 obj = &pmap->pm_obj[i - 2];
2567 off = ptp_va2o(va, i - 1);
2568
2569 PMAP_DUMMY_LOCK(pmap);
2570 pt->pg[i] = uvm_pagelookup(obj, off);
2571
2572 if (pt->pg[i] == NULL) {
2573 pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
2574 pt->alloced[i] = (pt->pg[i] != NULL);
2575 } else if (pt->pg[i]->wire_count == 0) {
2576 /* This page was queued to be freed; dequeue it. */
2577 LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
2578 pt->alloced[i] = true;
2579 }
2580 PMAP_DUMMY_UNLOCK(pmap);
2581 if (pt->pg[i] == NULL) {
2582 pmap_unget_ptp(pmap, pt);
2583 return ENOMEM;
2584 } else if (pt->alloced[i]) {
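/* uanon is reused to track the lowest VA entered in this PTP (see pmap_zap_ptp). */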
2585 pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
2586 rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
2587 &pmap_rbtree_ops);
2588 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2589 }
2590 }
2591 ptp = pt->pg[2];
2592 KASSERT(ptp != NULL);
2593 *resultp = ptp;
2594 pmap->pm_ptphint[0] = ptp;
2595 return 0;
2596 }
2597
2598 /*
2599 * pmap_install_ptp: install any freshly allocated PTPs
2600 *
2601 * => pmap should NOT be pmap_kernel()
2602 * => pmap should be locked
2603 * => PTEs must be mapped
2604 * => preemption must be disabled
2605 */
2606 static void
2607 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2608 pd_entry_t * const *pdes)
2609 {
2610 struct vm_page *ptp;
2611 unsigned long index;
2612 pd_entry_t *pva;
2613 paddr_t pa;
2614 int i;
2615
2616 KASSERT(pmap != pmap_kernel());
2617 KASSERT(mutex_owned(&pmap->pm_lock));
2618 KASSERT(kpreempt_disabled());
2619
2620 /*
2621 * Now that we have all the pages looked up or allocated,
2622 * loop through again installing any new ones into the tree.
2623 */
2624 for (i = PTP_LEVELS; i > 1; i--) {
2625 index = pl_i(va, i);
2626 pva = pdes[i - 2];
2627
2628 if (pmap_valid_entry(pva[index])) {
2629 KASSERT(!pt->alloced[i]);
2630 continue;
2631 }
2632
2633 ptp = pt->pg[i];
2634 ptp->flags &= ~PG_BUSY; /* never busy */
2635 ptp->wire_count = 1;
2636 pmap->pm_ptphint[i - 2] = ptp;
2637 pa = VM_PAGE_TO_PHYS(ptp);
2638 pmap_pte_set(&pva[index], (pd_entry_t)
2639 (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
2640
2641 /*
2642 * On Xen-amd64 or SVS, we need to sync the top level page
2643 * directory on each CPU.
2644 */
2645 #if defined(XENPV) && defined(__x86_64__)
2646 if (i == PTP_LEVELS) {
2647 xen_kpm_sync(pmap, index);
2648 }
2649 #elif defined(SVS)
2650 if (svs_enabled && i == PTP_LEVELS &&
2651 pmap_is_user(pmap)) {
2652 svs_pmap_sync(pmap, index);
2653 }
2654 #endif
2655
2656 pmap_pte_flush();
2657 pmap_stats_update(pmap, 1, 0);
2658
2659 /*
2660 * If we're not in the top level, increase the
2661 * wire count of the parent page.
2662 */
2663 if (i < PTP_LEVELS) {
2664 pt->pg[i + 1]->wire_count++;
2665 }
2666 }
2667 }
2668
2669 /*
2670 * pmap_unget_ptp: free unused PTPs
2671 *
2672 * => pmap should NOT be pmap_kernel()
2673 * => pmap should be locked
2674 */
2675 static void
2676 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
2677 {
2678 int i;
2679
2680 KASSERT(pmap != pmap_kernel());
2681 KASSERT(mutex_owned(&pmap->pm_lock));
2682
2683 for (i = PTP_LEVELS; i > 1; i--) {
2684 if (!pt->alloced[i]) {
2685 continue;
2686 }
2687 KASSERT(pt->pg[i]->wire_count == 0);
2688 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2689 pmap_freepage(pmap, pt->pg[i], i - 1);
2690 }
2691 }
2692
2693 /*
2694 * p m a p l i f e c y c l e f u n c t i o n s
2695 */
2696
2697 /*
2698 * pmap_pdp_init: initialize a new PDP.
2699 */
2700 static void
2701 pmap_pdp_init(pd_entry_t *pdir)
2702 {
2703 paddr_t pdirpa = 0;
2704 vaddr_t object;
2705 int i;
2706
2707 #if !defined(XENPV) || !defined(__x86_64__)
2708 int npde;
2709 #endif
2710 #ifdef XENPV
2711 int s;
2712 #endif
2713
2714 memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);
2715
2716 /*
2717 * NOTE: This is all done unlocked, but we will check afterwards
2718 * if we have raced with pmap_growkernel().
2719 */
2720
2721 #if defined(XENPV) && defined(__x86_64__)
2722 /* Fetch the physical address of the page directory */
2723 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2724
2725 /*
2726 * This pdir will NEVER be active in kernel mode, so mark
2727 * recursive entry invalid.
2728 */
2729 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2730
2731 /*
2732 * PDP constructed this way won't be for the kernel, hence we
2733 * don't put kernel mappings on Xen.
2734 *
2735 * But we need to make pmap_create() happy, so put a dummy
2736 * (without PTE_P) value at the right place.
2737 */
2738 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2739 (pd_entry_t)-1 & PTE_FRAME;
2740 #else /* XENPV && __x86_64__*/
2741 object = (vaddr_t)pdir;
2742 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2743 /* Fetch the physical address of the page directory */
2744 (void)pmap_extract(pmap_kernel(), object, &pdirpa);
2745
2746 /* Put in recursive PDE to map the PTEs */
2747 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
2748 pmap_pg_nx;
2749 #ifndef XENPV
2750 pdir[PDIR_SLOT_PTE + i] |= PTE_W;
2751 #endif
2752 }
2753
2754 /* Copy the kernel's top level PDE */
2755 npde = nkptp[PTP_LEVELS - 1];
2756
2757 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2758 npde * sizeof(pd_entry_t));
2759
2760 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2761 int idx = pl_i(KERNBASE, PTP_LEVELS);
2762 pdir[idx] = PDP_BASE[idx];
2763 }
2764
2765 #ifdef __HAVE_PCPU_AREA
2766 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2767 #endif
2768 #ifdef __HAVE_DIRECT_MAP
2769 slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
2770 #endif
2771 #ifdef KASAN
2772 slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
2773 #endif
2774 #ifdef KMSAN
2775 slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
2776 #endif
2777 #endif /* XENPV && __x86_64__*/
2778
2779 #ifdef XENPV
2780 s = splvm();
2781 object = (vaddr_t)pdir;
2782 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2783 VM_PROT_READ);
2784 pmap_update(pmap_kernel());
2785 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2786 /*
2787 * pin as L2/L4 page, we have to do the page with the
2788 * PDIR_SLOT_PTE entries last
2789 */
2790 #ifdef PAE
2791 if (i == l2tol3(PDIR_SLOT_PTE))
2792 continue;
2793 #endif
2794
2795 (void) pmap_extract(pmap_kernel(), object, &pdirpa);
2796 #ifdef __x86_64__
2797 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2798 #else
2799 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2800 #endif
2801 }
2802 #ifdef PAE
2803 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE);
2804 (void)pmap_extract(pmap_kernel(), object, &pdirpa);
2805 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2806 #endif
2807 splx(s);
2808 #endif /* XENPV */
2809 }
2810
2811 /*
2812 * pmap_pdp_fini: destructor for the PDPs.
2813 */
2814 static void
2815 pmap_pdp_fini(pd_entry_t *pdir)
2816 {
2817 #ifdef XENPV
2818 paddr_t pdirpa = 0; /* XXX: GCC */
2819 vaddr_t object = (vaddr_t)pdir;
2820 int i;
2821 int s = splvm();
2822 pt_entry_t *pte;
2823
2824 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2825 /* fetch the physical address of the page directory. */
2826 (void) pmap_extract(pmap_kernel(), object, &pdirpa);
2827 /* unpin page table */
2828 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2829 }
2830 object = (vaddr_t)pdir;
2831 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2832 /* Set page RW again */
2833 pte = kvtopte(object);
2834 pmap_pte_set(pte, *pte | PTE_W);
2835 xen_bcast_invlpg((vaddr_t)object);
2836 }
2837 splx(s);
2838 #endif /* XENPV */
2839 }
2840
2841 #ifdef PAE
2842 static void *
2843 pmap_pdp_alloc(struct pool *pp, int flags)
2844 {
2845 return (void *)uvm_km_alloc(kernel_map,
2846 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2847 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2848 UVM_KMF_WIRED);
2849 }
2850
2851 static void
2852 pmap_pdp_free(struct pool *pp, void *v)
2853 {
2854 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2855 UVM_KMF_WIRED);
2856 }
2857 #endif /* PAE */
2858
2859 /*
2860 * pmap_ctor: constructor for the pmap cache.
2861 */
2862 static int
2863 pmap_ctor(void *arg, void *obj, int flags)
2864 {
2865 struct pmap *pmap = obj;
2866 pt_entry_t p;
2867 int i;
2868
2869 KASSERT((flags & PR_WAITOK) != 0);
2870
2871 mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2872 rw_init(&pmap->pm_dummy_lock);
2873 kcpuset_create(&pmap->pm_cpus, true);
2874 kcpuset_create(&pmap->pm_kernel_cpus, true);
2875 #ifdef XENPV
2876 kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2877 #endif
2878 LIST_INIT(&pmap->pm_gc_ptp);
2879 pmap->pm_pve = NULL;
2880 LIST_INIT(&pmap->pm_pvp_full);
2881 LIST_INIT(&pmap->pm_pvp_part);
2882 LIST_INIT(&pmap->pm_pvp_empty);
2883
2884 /* allocate and init PDP */
2885 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2886
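/*
* Initialize the PDP, then check under pmaps_lock that we did not race
* with pmap_growkernel(): if the last kernel PDE is still zero, the
* kernel page tables grew while we were copying, so start over.
*/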
2887 for (;;) {
2888 pmap_pdp_init(pmap->pm_pdir);
2889 mutex_enter(&pmaps_lock);
2890 p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2891 if (__predict_true(p != 0)) {
2892 break;
2893 }
2894 mutex_exit(&pmaps_lock);
2895 }
2896
2897 for (i = 0; i < PDP_SIZE; i++)
2898 pmap->pm_pdirpa[i] =
2899 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2900
2901 LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2902 mutex_exit(&pmaps_lock);
2903
2904 return 0;
2905 }
2906
2907 /*
2908 * pmap_dtor: destructor for the pmap cache.
2909 */
2910 static void
2911 pmap_dtor(void *arg, void *obj)
2912 {
2913 struct pmap *pmap = obj;
2914
2915 mutex_enter(&pmaps_lock);
2916 LIST_REMOVE(pmap, pm_list);
2917 mutex_exit(&pmaps_lock);
2918
2919 pmap_pdp_fini(pmap->pm_pdir);
2920 pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2921 mutex_destroy(&pmap->pm_lock);
2922 rw_destroy(&pmap->pm_dummy_lock);
2923 kcpuset_destroy(pmap->pm_cpus);
2924 kcpuset_destroy(pmap->pm_kernel_cpus);
2925 #ifdef XENPV
2926 kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2927 #endif
2928 }
2929
2930 /*
2931 * pmap_create: create a pmap object.
2932 */
2933 struct pmap *
2934 pmap_create(void)
2935 {
2936 struct pmap *pmap;
2937 int i;
2938
2939 pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2940
2941 /* init uvm_object */
2942 for (i = 0; i < PTP_LEVELS - 1; i++) {
2943 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
2944 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
2945 pmap->pm_ptphint[i] = NULL;
2946 }
2947 pmap->pm_stats.wired_count = 0;
2948 /* count the PDP allocd below */
2949 pmap->pm_stats.resident_count = PDP_SIZE;
2950 #if !defined(__x86_64__)
2951 pmap->pm_hiexec = 0;
2952 #endif
2953
2954 /* Used by NVMM and Xen */
2955 pmap->pm_enter = NULL;
2956 pmap->pm_extract = NULL;
2957 pmap->pm_remove = NULL;
2958 pmap->pm_sync_pv = NULL;
2959 pmap->pm_pp_remove_ent = NULL;
2960 pmap->pm_write_protect = NULL;
2961 pmap->pm_unwire = NULL;
2962 pmap->pm_tlb_flush = NULL;
2963 pmap->pm_data = NULL;
2964
2965 /* init the LDT */
2966 pmap->pm_ldt = NULL;
2967 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2968
2969 return pmap;
2970 }
2971
2972 /*
2973 * pmap_check_ptps: verify that none of the pmap's page table objects
2974 * have any pages allocated to them.
2975 */
2976 static void
2977 pmap_check_ptps(struct pmap *pmap)
2978 {
2979 int i;
2980
2981 for (i = 0; i < PTP_LEVELS - 1; i++) {
2982 KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
2983 "pmap %p level %d still has %d pages",
2984 pmap, i, (int)pmap->pm_obj[i].uo_npages);
2985 }
2986 }
2987
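/*
* pmap_check_inuse: (DEBUG only) verify that the pmap being destroyed is
* not still in use on any CPU.
*/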
2988 static void
2989 pmap_check_inuse(struct pmap *pmap)
2990 {
2991 #ifdef DEBUG
2992 CPU_INFO_ITERATOR cii;
2993 struct cpu_info *ci;
2994
2995 for (CPU_INFO_FOREACH(cii, ci)) {
2996 if (ci->ci_pmap == pmap)
2997 panic("destroying pmap being used");
2998 #if defined(XENPV) && defined(__x86_64__)
2999 for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
3000 if (pmap->pm_pdir[i] != 0 &&
3001 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
3002 printf("pmap_destroy(%p) pmap_kernel %p "
3003 "curcpu %d cpu %d ci_pmap %p "
3004 "ci->ci_kpm_pdir[%d]=%" PRIx64
3005 " pmap->pm_pdir[%d]=%" PRIx64 "\n",
3006 pmap, pmap_kernel(), curcpu()->ci_index,
3007 ci->ci_index, ci->ci_pmap,
3008 i, ci->ci_kpm_pdir[i],
3009 i, pmap->pm_pdir[i]);
3010 panic("%s: used pmap", __func__);
3011 }
3012 }
3013 #endif
3014 }
3015 #endif /* DEBUG */
3016 }
3017
3018 /*
3019 * pmap_destroy: drop reference count on pmap. free pmap if reference
3020 * count goes to zero.
3021 *
3022 * => we can be called from pmap_unmap_ptes() with a different, unrelated
3023 * pmap's lock held. be careful!
3024 */
3025 void
3026 pmap_destroy(struct pmap *pmap)
3027 {
3028 int i;
3029
3030 /*
3031 * drop reference count and verify not in use.
3032 */
3033
3034 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
3035 return;
3036 }
3037 pmap_check_inuse(pmap);
3038
3039 /*
3040 * handle any deferred frees.
3041 */
3042
3043 mutex_enter(&pmap->pm_lock);
3044 if (pmap->pm_pve != NULL) {
3045 pmap_free_pv(pmap, pmap->pm_pve);
3046 pmap->pm_pve = NULL;
3047 }
3048 pmap_drain_pv(pmap);
3049 mutex_exit(&pmap->pm_lock);
3050 pmap_update(pmap);
3051
3052 /*
3053 * Reference count is zero, free pmap resources and then free pmap.
3054 */
3055
3056 pmap_check_ptps(pmap);
3057 KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
3058
3059 #ifdef USER_LDT
3060 if (pmap->pm_ldt != NULL) {
3061 /*
3062 * No need to switch the LDT; this address space is gone,
3063 * nothing is using it.
3064 *
3065 * No need to lock the pmap for ldt_free (or anything else),
3066 * we're the last one to use it.
3067 */
3068 /* XXXAD can't take cpu_lock here - fix soon. */
3069 mutex_enter(&cpu_lock);
3070 ldt_free(pmap->pm_ldt_sel);
3071 mutex_exit(&cpu_lock);
3072 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
3073 MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3074 }
3075 #endif
3076
3077 for (i = 0; i < PTP_LEVELS - 1; i++) {
3078 uvm_obj_destroy(&pmap->pm_obj[i], false);
3079 }
3080 kcpuset_zero(pmap->pm_cpus);
3081 kcpuset_zero(pmap->pm_kernel_cpus);
3082 #ifdef XENPV
3083 kcpuset_zero(pmap->pm_xen_ptp_cpus);
3084 #endif
3085
3086 KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
3087 KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
3088 KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
3089
3090 pmap_check_ptps(pmap);
3091 if (__predict_false(pmap->pm_enter != NULL)) {
3092 /* XXX make this a different cache */
3093 pool_cache_destruct_object(&pmap_cache, pmap);
3094 } else {
3095 pool_cache_put(&pmap_cache, pmap);
3096 }
3097 }
3098
3099 /*
3100 * pmap_zap_ptp: clear out an entire PTP without modifying PTEs
3101 *
3102 * => caller must hold pmap's lock
3103 * => PTP must be mapped into KVA
3104 * => must be called with kernel preemption disabled
3105 * => does as little work as possible
3106 */
3107 static void
3108 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3109 vaddr_t startva, vaddr_t blkendva)
3110 {
3111 #ifndef XENPV
3112 struct pv_entry *pve;
3113 struct vm_page *pg;
3114 struct pmap_page *pp;
3115 pt_entry_t opte;
3116 rb_tree_t *tree;
3117 vaddr_t va;
3118 int wired;
3119 uint8_t oattrs;
3120 u_int cnt;
3121
3122 KASSERT(mutex_owned(&pmap->pm_lock));
3123 KASSERT(kpreempt_disabled());
3124 KASSERT(pmap != pmap_kernel());
3125 KASSERT(ptp->wire_count > 1);
3126 KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
3127
3128 /*
3129 * Start at the lowest entered VA, and scan until there are no more
3130 * PTEs in the PTPs.
3131 */
3132 tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
3133 pve = RB_TREE_MIN(tree);
3134 wired = 0;
3135 va = (vaddr_t)ptp->uanon;
3136 pte += ((va - startva) >> PAGE_SHIFT);
3137
3138 for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
3139 /*
3140 * No need for an atomic to clear the PTE. Nothing else can
3141 * see the address space any more and speculative access (if
3142 * possible) won't modify. Therefore there's no need to
3143 * track the accessed/dirty bits.
3144 */
3145 opte = *pte;
3146 if (!pmap_valid_entry(opte)) {
3147 continue;
3148 }
3149
3150 /*
3151 * Count the PTE. If it's not for a managed mapping
3152 * there's nothing more to do.
3153 */
3154 cnt--;
3155 wired -= (opte & PTE_WIRED);
3156 if ((opte & PTE_PVLIST) == 0) {
3157 #ifndef DOM0OPS
3158 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3159 "managed page without PTE_PVLIST for %#"
3160 PRIxVADDR, va);
3161 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3162 "pv-tracked page without PTE_PVLIST for %#"
3163 PRIxVADDR, va);
3164 #endif
3165 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
3166 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
3167 va) == NULL);
3168 continue;
3169 }
3170
3171 /*
3172 * "pve" now points to the lowest (by VA) dynamic PV entry
3173 * in the PTP. If it's for this VA, take advantage of it to
3174 * avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB
3175 * tree by skipping to the next VA in the tree whenever
3176 * there is a match here. The tree will be cleared out in
3177 * one pass before return to pmap_remove_all().
3178 */
3179 oattrs = pmap_pte_to_pp_attrs(opte);
3180 if (pve != NULL && pve->pve_pte.pte_va == va) {
3181 pp = pve->pve_pp;
3182 KASSERT(pve->pve_pte.pte_ptp == ptp);
3183 KASSERT(pp->pp_pte.pte_ptp != ptp ||
3184 pp->pp_pte.pte_va != va);
3185 mutex_spin_enter(&pp->pp_lock);
3186 pp->pp_attrs |= oattrs;
3187 LIST_REMOVE(pve, pve_list);
3188 mutex_spin_exit(&pp->pp_lock);
3189
3190 /*
3191 * pve won't be touched again until pmap_drain_pv(),
3192 * so it's still safe to traverse the tree.
3193 */
3194 pmap_free_pv(pmap, pve);
3195 pve = RB_TREE_NEXT(tree, pve);
3196 continue;
3197 }
3198
3199 /*
3200 * No entry in the tree so it must be embedded. Look up the
3201 * page and cancel the embedded entry.
3202 */
3203 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3204 pp = VM_PAGE_TO_PP(pg);
3205 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3206 paddr_t pa = pmap_pte2pa(opte);
3207 panic("%s: PTE_PVLIST with pv-untracked page"
3208 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR
3209 "(%#"PRIxPADDR")", __func__, va, pa, atop(pa));
3210 }
3211 mutex_spin_enter(&pp->pp_lock);
3212 KASSERT(pp->pp_pte.pte_ptp == ptp);
3213 KASSERT(pp->pp_pte.pte_va == va);
3214 pp->pp_attrs |= oattrs;
3215 pp->pp_pte.pte_ptp = NULL;
3216 pp->pp_pte.pte_va = 0;
3217 mutex_spin_exit(&pp->pp_lock);
3218 }
3219
3220 /* PTP now empty - adjust the tree & stats to match. */
3221 pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
3222 ptp->wire_count = 1;
3223 #ifdef DIAGNOSTIC
3224 rb_tree_init(tree, &pmap_rbtree_ops);
3225 #endif
3226 #else /* !XENPV */
3227 /*
3228 * XXXAD For XEN, it's not clear to me that we can do this, because
3229 * I guess the hypervisor keeps track of PTEs too.
3230 */
3231 pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
3232 #endif /* !XENPV */
3233 }
3234
3235 /*
3236 * pmap_remove_all: remove all mappings from pmap in bulk.
3237 *
3238 * Ordinarily when removing mappings it's important to hold the UVM object's
3239 * lock, so that pages do not gain a new identity while retaining stale TLB
3240 * entries (the same lock hold covers both pmap_remove() and pmap_update()).
3241 * Here it's known that the address space is no longer visible to any user
3242 * process, so we don't need to worry about that.
3243 */
3244 bool
3245 pmap_remove_all(struct pmap *pmap)
3246 {
3247 struct vm_page *ptps[32];
3248 vaddr_t va, blkendva;
3249 struct pmap *pmap2;
3250 pt_entry_t *ptes;
3251 pd_entry_t pde __diagused;
3252 pd_entry_t * const *pdes;
3253 int lvl __diagused, i, n;
3254
3255 /* XXX Can't handle EPT just yet. */
3256 if (pmap->pm_remove != NULL) {
3257 return false;
3258 }
3259
3260 for (;;) {
3261 /* Fetch a block of PTPs from tree. */
3262 mutex_enter(&pmap->pm_lock);
3263 n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
3264 (void **)ptps, __arraycount(ptps), false);
3265 if (n == 0) {
3266 mutex_exit(&pmap->pm_lock);
3267 break;
3268 }
3269
3270 /* Remove all mappings in the set of PTPs. */
3271 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3272 for (i = 0; i < n; i++) {
3273 if (ptps[i]->wire_count == 0) {
3274 /* It's dead: pmap_update() will expunge. */
3275 continue;
3276 }
3277
3278 /* Determine range of block. */
3279 va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
3280 blkendva = x86_round_pdr(va + 1);
3281
3282 /* Make sure everything squares up... */
3283 KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
3284 KASSERT(lvl == 1);
3285 KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
3286
3287 /* Zap! */
3288 pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
3289 blkendva);
3290
3291 /* PTP should now be unused - free it. */
3292 KASSERT(ptps[i]->wire_count == 1);
3293 pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
3294 }
3295 pmap_unmap_ptes(pmap, pmap2);
3296 pmap_drain_pv(pmap);
3297 pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
3298 mutex_exit(&pmap->pm_lock);
3299
3300 /* Process deferred frees. */
3301 pmap_update(pmap);
3302
3303 /* A breathing point. */
3304 preempt_point();
3305 }
3306
3307 /* Verify that the pmap is now completely empty. */
3308 pmap_check_ptps(pmap);
3309 KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
3310 "pmap %p not empty", pmap);
3311
3312 return true;
3313 }
3314
3315 #if defined(PMAP_FORK)
3316 /*
3317 * pmap_fork: perform any necessary data structure manipulation when
3318 * a VM space is forked.
3319 */
3320 void
3321 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
3322 {
3323 #ifdef USER_LDT
3324 union descriptor *new_ldt;
3325 int sel;
3326
3327 if (__predict_true(pmap1->pm_ldt == NULL)) {
3328 return;
3329 }
3330
3331 /*
3332 * Copy the LDT into the new process.
3333 *
3334 * Read pmap1's ldt pointer unlocked; if it changes behind our back
3335 * we'll retry. This will starve if there's a stream of LDT changes
3336 * in another thread but that should not happen.
3337 */
3338
3339 retry:
3340 if (pmap1->pm_ldt != NULL) {
3341 /* Allocate space for the new process's LDT */
3342 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
3343 MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
3344 if (new_ldt == NULL) {
3345 printf("WARNING: %s: unable to allocate LDT space\n",
3346 __func__);
3347 return;
3348 }
3349 mutex_enter(&cpu_lock);
3350 /* Get a GDT slot for it */
3351 sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
3352 if (sel == -1) {
3353 mutex_exit(&cpu_lock);
3354 uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3355 MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3356 printf("WARNING: %s: unable to allocate LDT selector\n",
3357 __func__);
3358 return;
3359 }
3360 } else {
3361 /* Wasn't anything there after all. */
3362 new_ldt = NULL;
3363 sel = -1;
3364 mutex_enter(&cpu_lock);
3365 }
3366
3367 /*
3368 * Now that we have cpu_lock, ensure the LDT status is the same.
3369 */
3370 if (pmap1->pm_ldt != NULL) {
3371 if (new_ldt == NULL) {
3372 /* A wild LDT just appeared. */
3373 mutex_exit(&cpu_lock);
3374 goto retry;
3375 }
3376
3377 /* Copy the LDT data and install it in pmap2 */
3378 memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
3379 pmap2->pm_ldt = new_ldt;
3380 pmap2->pm_ldt_sel = sel;
3381 mutex_exit(&cpu_lock);
3382 } else {
3383 if (new_ldt != NULL) {
3384 /* The LDT disappeared, drop what we did. */
3385 ldt_free(sel);
3386 mutex_exit(&cpu_lock);
3387 uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3388 MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3389 return;
3390 }
3391
3392 /* We're good, just leave. */
3393 mutex_exit(&cpu_lock);
3394 }
3395 #endif /* USER_LDT */
3396 }
3397 #endif /* PMAP_FORK */
3398
3399 #ifdef USER_LDT
3400
3401 /*
3402 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap
3403 * is active, reload LDTR.
3404 */
3405 static void
3406 pmap_ldt_xcall(void *arg1, void *arg2)
3407 {
3408 struct pmap *pm;
3409
3410 kpreempt_disable();
3411 pm = arg1;
3412 if (curcpu()->ci_pmap == pm) {
3413 #if defined(SVS)
3414 if (svs_enabled) {
3415 svs_ldt_sync(pm);
3416 } else
3417 #endif
3418 lldt(pm->pm_ldt_sel);
3419 }
3420 kpreempt_enable();
3421 }
3422
3423 /*
3424 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap
3425 * in the new selector on all CPUs.
3426 */
3427 void
3428 pmap_ldt_sync(struct pmap *pm)
3429 {
3430 uint64_t where;
3431
3432 KASSERT(mutex_owned(&cpu_lock));
3433
3434 pmap_ldt_evcnt.ev_count++;
3435 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
3436 xc_wait(where);
3437 }
3438
3439 /*
3440 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
3441 * restore the default.
3442 */
3443 void
3444 pmap_ldt_cleanup(struct lwp *l)
3445 {
3446 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
3447 union descriptor *ldt;
3448 int sel;
3449
3450 if (__predict_true(pmap->pm_ldt == NULL)) {
3451 return;
3452 }
3453
3454 mutex_enter(&cpu_lock);
3455 if (pmap->pm_ldt != NULL) {
3456 sel = pmap->pm_ldt_sel;
3457 ldt = pmap->pm_ldt;
3458 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
3459 pmap->pm_ldt = NULL;
3460 pmap_ldt_sync(pmap);
3461 ldt_free(sel);
3462 uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
3463 UVM_KMF_WIRED);
3464 }
3465 mutex_exit(&cpu_lock);
3466 }
3467 #endif /* USER_LDT */
3468
3469 /*
3470 * pmap_activate: activate a process' pmap
3471 *
3472 * => must be called with kernel preemption disabled
3473 * => if lwp is the curlwp, then set ci_want_pmapload so that
3474 * actual MMU context switch will be done by pmap_load() later
3475 */
3476 void
3477 pmap_activate(struct lwp *l)
3478 {
3479 struct cpu_info *ci;
3480 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3481
3482 KASSERT(kpreempt_disabled());
3483
3484 ci = curcpu();
3485
3486 if (l != ci->ci_curlwp)
3487 return;
3488
3489 KASSERT(ci->ci_want_pmapload == 0);
3490 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
3491
3492 /*
3493 * no need to switch to kernel vmspace because
3494 * it's a subset of any vmspace.
3495 */
3496
3497 if (pmap == pmap_kernel()) {
3498 ci->ci_want_pmapload = 0;
3499 return;
3500 }
3501
3502 ci->ci_want_pmapload = 1;
3503 }
3504
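/*
* KASSERT_PDIRPA: assert that the given pmap's top-level page directory is
* the one currently loaded on this CPU, in whichever form applies to the
* configuration (Xen/amd64, PAE, or native).
*/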
3505 #if defined(XENPV) && defined(__x86_64__)
3506 #define KASSERT_PDIRPA(pmap) \
3507 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
3508 pmap == pmap_kernel())
3509 #elif defined(PAE)
3510 #define KASSERT_PDIRPA(pmap) \
3511 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
3512 #elif !defined(XENPV)
3513 #define KASSERT_PDIRPA(pmap) \
3514 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
3515 #else
3516 #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */
3517 #endif
3518
3519 /*
3520 * pmap_reactivate: try to regain reference to the pmap.
3521 *
3522 * => Must be called with kernel preemption disabled.
3523 */
3524 static void
3525 pmap_reactivate(struct pmap *pmap)
3526 {
3527 struct cpu_info * const ci = curcpu();
3528 const cpuid_t cid = cpu_index(ci);
3529
3530 KASSERT(kpreempt_disabled());
3531 KASSERT_PDIRPA(pmap);
3532
3533 /*
3534 * If we still have a lazy reference to this pmap, we can assume
3535 * that there was no TLB shootdown for this pmap in the meantime.
3536 *
3537 * The order of events here is important as we must synchronize
3538 * with TLB shootdown interrupts. Declare interest in invalidations
3539 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
3540 * change only when the state is TLBSTATE_LAZY.
3541 */
3542
3543 ci->ci_tlbstate = TLBSTATE_VALID;
3544 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3545
3546 if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
3547 /* We have the reference, state is valid. */
3548 } else {
3549 /*
3550 * Must reload the TLB: the pmap has been changed while we
3551 * were deactivated.
3552 */
3553 kcpuset_atomic_set(pmap->pm_cpus, cid);
3554
3555 tlbflush();
3556 }
3557 }
3558
3559 /*
3560 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
3561 * and relevant LDT info.
3562 *
3563 * Ensures that the current process' pmap is loaded on the current CPU's
3564 * MMU and that there are no stale TLB entries.
3565 *
3566 * => The caller should disable kernel preemption or do check-and-retry
3567 * to prevent a preemption from undoing our efforts.
3568 * => This function may block.
3569 */
3570 void
3571 pmap_load(void)
3572 {
3573 struct cpu_info *ci;
3574 struct pmap *pmap, *oldpmap;
3575 struct lwp *l;
3576 uint64_t ncsw;
3577 int ilevel __diagused;
3578 u_long psl __diagused;
3579
3580 kpreempt_disable();
3581 retry:
3582 ci = curcpu();
3583 if (!ci->ci_want_pmapload) {
3584 kpreempt_enable();
3585 return;
3586 }
3587 l = ci->ci_curlwp;
3588 ncsw = l->l_ncsw;
3589 __insn_barrier();
3590
3591 /* should be able to take ipis. */
3592 KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel);
3593 #ifdef XENPV
3594 /* Check to see if interrupts are enabled (i.e. no events are masked) */
3595 KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl);
3596 #else
3597 KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl);
3598 #endif
3599
3600 KASSERT(l != NULL);
3601 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3602 KASSERT(pmap != pmap_kernel());
3603 oldpmap = ci->ci_pmap;
3604
3605 if (pmap == oldpmap) {
3606 pmap_reactivate(pmap);
3607 ci->ci_want_pmapload = 0;
3608 kpreempt_enable();
3609 return;
3610 }
3611
3612 /*
3613 * Acquire a reference to the new pmap and perform the switch.
3614 */
3615
3616 pmap_reference(pmap);
3617 pmap_load1(l, pmap, oldpmap);
3618 ci->ci_want_pmapload = 0;
3619
3620 /*
3621 * we're now running with the new pmap. drop the reference
3622 * to the old pmap. if we block, we need to go around again.
3623 */
3624
3625 pmap_destroy(oldpmap);
3626 __insn_barrier();
3627 if (l->l_ncsw != ncsw) {
3628 goto retry;
3629 }
3630
3631 kpreempt_enable();
3632 }
3633
3634 /*
3635 * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
3636 * pmap_load(). It's critically important that this function does not
3637 * block.
3638 */
3639 static void
3640 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
3641 {
3642 struct cpu_info *ci;
3643 struct pcb *pcb;
3644 cpuid_t cid;
3645
3646 KASSERT(kpreempt_disabled());
3647
3648 pcb = lwp_getpcb(l);
3649 ci = l->l_cpu;
3650 cid = cpu_index(ci);
3651
3652 kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3653 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3654
3655 KASSERT_PDIRPA(oldpmap);
3656 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3657 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3658
3659 /*
3660 * Mark the pmap in use by this CPU. Again, we must synchronize
3661 * with TLB shootdown interrupts, so set the state VALID first,
3662 * then register us for shootdown events on this pmap.
3663 */
3664 ci->ci_tlbstate = TLBSTATE_VALID;
3665 kcpuset_atomic_set(pmap->pm_cpus, cid);
3666 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3667 ci->ci_pmap = pmap;
3668
3669 /*
3670 * update tss. now that we have registered for invalidations
3671 * from other CPUs, we're good to load the page tables.
3672 */
3673 #ifdef PAE
3674 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
3675 #else
3676 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
3677 #endif
3678
3679 #ifdef i386
3680 #ifndef XENPV
3681 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
3682 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
3683 #endif
3684 #endif
3685
3686 #if defined(SVS) && defined(USER_LDT)
3687 if (svs_enabled) {
3688 svs_ldt_sync(pmap);
3689 } else
3690 #endif
3691 lldt(pmap->pm_ldt_sel);
3692
3693 cpu_load_pmap(pmap, oldpmap);
3694 }
3695
3696 /*
3697 * pmap_deactivate: deactivate a process' pmap.
3698 *
3699 * => Must be called with kernel preemption disabled (high IPL is enough).
3700 */
3701 void
3702 pmap_deactivate(struct lwp *l)
3703 {
3704 struct pmap *pmap;
3705 struct cpu_info *ci;
3706
3707 KASSERT(kpreempt_disabled());
3708
3709 if (l != curlwp) {
3710 return;
3711 }
3712
3713 /*
3714 * Wait for pending TLB shootdowns to complete. Necessary because
3715 * TLB shootdown state is per-CPU, and the LWP may be coming off
3716 * the CPU before it has a chance to call pmap_update(), e.g. due
3717 * to kernel preemption or blocking routine in between.
3718 */
3719 pmap_tlb_shootnow();
3720
3721 ci = curcpu();
3722
3723 if (ci->ci_want_pmapload) {
3724 /*
3725 * ci_want_pmapload means that our pmap is not loaded on
3726 * the CPU or TLB might be stale. note that pmap_kernel()
3727 * is always considered loaded.
3728 */
3729 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3730 != pmap_kernel());
3731 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3732 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3733
3734 /*
3735 * userspace has not been touched.
3736 * nothing to do here.
3737 */
3738
3739 ci->ci_want_pmapload = 0;
3740 return;
3741 }
3742
3743 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3744
3745 if (pmap == pmap_kernel()) {
3746 return;
3747 }
3748
3749 KASSERT_PDIRPA(pmap);
3750 KASSERT(ci->ci_pmap == pmap);
3751
3752 /*
3753 * we aren't interested in TLB invalidations for this pmap,
3754 * at least for the time being.
3755 */
3756
3757 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3758 ci->ci_tlbstate = TLBSTATE_LAZY;
3759 }
3760
3761 #ifdef EFI_RUNTIME
3762
3763 extern struct pmap *efi_runtime_pmap;
3764
3765 /*
3766 * pmap_is_user: true if pmap, which must not be the kernel pmap, is
3767 * for an unprivileged user process
3768 */
3769 bool
3770 pmap_is_user(struct pmap *pmap)
3771 {
3772
3773 KASSERT(pmap != pmap_kernel());
3774 return (pmap != efi_runtime_pmap);
3775 }
3776
3777 /*
3778 * pmap_activate_sync: synchronously activate specified pmap.
3779 *
3780 * => Must be called with kernel preemption disabled (high IPL is enough).
3781 * => Must not sleep before pmap_deactivate_sync.
3782 */
3783 void *
3784 pmap_activate_sync(struct pmap *pmap)
3785 {
3786 struct cpu_info *ci = curcpu();
3787 struct pmap *oldpmap = ci->ci_pmap;
3788 unsigned cid = cpu_index(ci);
3789
3790 KASSERT(kpreempt_disabled());
3791 KASSERT(pmap != pmap_kernel());
3792
3793 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3794 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3795
3796 if (oldpmap) {
3797 KASSERT_PDIRPA(oldpmap);
3798 kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3799 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3800 }
3801
3802 ci->ci_tlbstate = TLBSTATE_VALID;
3803 kcpuset_atomic_set(pmap->pm_cpus, cid);
3804 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3805 ci->ci_pmap = pmap;
3806
3807 #if defined(SVS) && defined(USER_LDT)
3808 if (svs_enabled) {
3809 svs_ldt_sync(pmap);
3810 } else
3811 #endif
3812 lldt(pmap->pm_ldt_sel);
3813
3814 cpu_load_pmap(pmap, oldpmap);
3815
3816 return oldpmap;
3817 }
3818
3819 /*
3820 * pmap_deactivate_sync: synchronously deactivate specified pmap and
3821 * restore whatever was active before pmap_activate_sync.
3822 *
3823 * => Must be called with kernel preemption disabled (high IPL is enough).
3824 * => Must not have slept since pmap_activate_sync.
3825 */
3826 void
3827 pmap_deactivate_sync(struct pmap *pmap, void *cookie)
3828 {
3829 struct cpu_info *ci = curcpu();
3830 struct pmap *oldpmap = cookie;
3831 unsigned cid = cpu_index(ci);
3832
3833 KASSERT(kpreempt_disabled());
3834 KASSERT(pmap != pmap_kernel());
3835 KASSERT(ci->ci_pmap == pmap);
3836
3837 KASSERT_PDIRPA(pmap);
3838
3839 KASSERT(kcpuset_isset(pmap->pm_cpus, cid));
3840 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3841
3842 pmap_tlb_shootnow();
3843
3844 kcpuset_atomic_clear(pmap->pm_cpus, cid);
3845 kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid);
3846
3847 ci->ci_tlbstate = TLBSTATE_VALID;
3848 ci->ci_pmap = oldpmap;
3849 if (oldpmap) {
3850 kcpuset_atomic_set(oldpmap->pm_cpus, cid);
3851 kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid);
3852 #if defined(SVS) && defined(USER_LDT)
3853 if (svs_enabled) {
3854 svs_ldt_sync(oldpmap);
3855 } else
3856 #endif
3857 lldt(oldpmap->pm_ldt_sel);
3858 cpu_load_pmap(oldpmap, pmap);
3859 } else {
3860 lcr3(pmap_pdirpa(pmap_kernel(), 0));
3861 }
3862 }
3863
3864 #endif /* EFI_RUNTIME */
3865
3866 /*
3867 * some misc. functions
3868 */
3869
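/*
 * pmap_pdes_valid: walk the page directories mapping 'va' from the top
 * level down.
 *
 * => returns false, with *lastlvl set to the level of the missing entry,
 *    if an intermediate PDE is not present
 * => returns true with *lastlvl == 1 for a normal leaf mapping, or with
 *    *lastlvl set to the level at which a large page (PTE_PS) was found
 * => if lastpde != NULL, the last PDE examined is returned in *lastpde
 */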
3870 bool
3871 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
3872 int *lastlvl)
3873 {
3874 unsigned long index;
3875 pd_entry_t pde;
3876 int i;
3877
3878 for (i = PTP_LEVELS; i > 1; i--) {
3879 index = pl_i(va, i);
3880 pde = pdes[i - 2][index];
3881 if ((pde & PTE_P) == 0) {
3882 *lastlvl = i;
3883 return false;
3884 }
3885 if (pde & PTE_PS)
3886 break;
3887 }
3888 if (lastpde != NULL)
3889 *lastpde = pde;
3890 *lastlvl = i;
3891 return true;
3892 }
3893
3894 /*
3895 * pmap_extract: extract a PA for the given VA
3896 */
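/*
 * A minimal usage sketch: translate a kernel virtual address, e.g.
 *
 *	paddr_t pa;
 *
 *	if (pmap_extract(pmap_kernel(), va, &pa))
 *		... use pa ...
 *
 * Returns false when no mapping is present for 'va'.
 */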
3897 bool
3898 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3899 {
3900 pt_entry_t *ptes, pte;
3901 pd_entry_t pde;
3902 pd_entry_t * const *pdes;
3903 struct pmap *pmap2;
3904 paddr_t pa;
3905 bool rv;
3906 int lvl;
3907
3908 if (__predict_false(pmap->pm_extract != NULL)) {
3909 return (*pmap->pm_extract)(pmap, va, pap);
3910 }
3911
3912 #ifdef __HAVE_DIRECT_MAP
3913 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3914 if (pap != NULL) {
3915 *pap = PMAP_DIRECT_UNMAP(va);
3916 }
3917 return true;
3918 }
3919 #endif
3920
3921 rv = false;
3922 pa = 0;
3923
3924 if (pmap != pmap_kernel()) {
3925 mutex_enter(&pmap->pm_lock);
3926 }
3927 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3928 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
3929 if (lvl == 2) {
3930 pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
3931 rv = true;
3932 } else {
3933 KASSERT(lvl == 1);
3934 pte = ptes[pl1_i(va)];
3935 if (__predict_true((pte & PTE_P) != 0)) {
3936 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3937 rv = true;
3938 }
3939 }
3940 }
3941 pmap_unmap_ptes(pmap, pmap2);
3942 if (pmap != pmap_kernel()) {
3943 mutex_exit(&pmap->pm_lock);
3944 }
3945 if (pap != NULL) {
3946 *pap = pa;
3947 }
3948
3949 return rv;
3950 }
3951
3952 /*
3953 * vtophys: virtual address to physical address. For use by
3954 * machine-dependent code only.
3955 */
3956 paddr_t
3957 vtophys(vaddr_t va)
3958 {
3959 paddr_t pa;
3960
	if (pmap_extract(pmap_kernel(), va, &pa))
3962 return pa;
3963 return 0;
3964 }
3965
3966 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3967
3968 #ifdef XENPV
3969 /*
3970 * vtomach: virtual address to machine address. For use by
3971 * machine-dependent code only.
3972 */
3973 paddr_t
3974 vtomach(vaddr_t va)
3975 {
3976 paddr_t pa;
3977
	if (pmap_extract_ma(pmap_kernel(), va, &pa))
3979 return pa;
3980 return 0;
3981 }
3982 #endif
3983
3984 /*
3985 * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3986 * determine the bounds of the kernel virtual address space.
3987 */
3988 void
3989 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3990 {
3991 *startp = virtual_avail;
3992 *endp = virtual_end;
3993 }
3994
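/*
 * pmap_zero_page: zero the given physical page.
 *
 * => with a direct map the page is zeroed through it; otherwise it is
 *    temporarily entered at this CPU's VPAGE_ZER window (or handed to
 *    the hypervisor on XENPV 3.4+) with kernel preemption disabled
 */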
3995 void
3996 pmap_zero_page(paddr_t pa)
3997 {
3998 #if defined(__HAVE_DIRECT_MAP)
3999 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
4000 #else
4001 #if defined(XENPV)
4002 if (XEN_VERSION_SUPPORTED(3, 4)) {
4003 xen_pagezero(pa);
4004 return;
4005 }
4006 #endif
4007 struct cpu_info *ci;
4008 pt_entry_t *zpte;
4009 vaddr_t zerova;
4010
4011 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
4012
4013 kpreempt_disable();
4014
4015 ci = curcpu();
4016 zerova = ci->vpage[VPAGE_ZER];
4017 zpte = ci->vpage_pte[VPAGE_ZER];
4018
4019 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
4020
4021 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
4022 pmap_pte_flush();
4023 pmap_update_pg(zerova); /* flush TLB */
4024
4025 memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);
4026
4027 #if defined(DIAGNOSTIC) || defined(XENPV)
4028 pmap_pte_set(zpte, 0); /* zap ! */
4029 pmap_pte_flush();
4030 #endif
4031
4032 kpreempt_enable();
4033 #endif /* defined(__HAVE_DIRECT_MAP) */
4034 }
4035
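/*
 * pmap_copy_page: copy the contents of one physical page to another.
 *
 * => with a direct map both pages are accessed through it; otherwise
 *    they are temporarily entered at this CPU's VPAGE_SRC/VPAGE_DST
 *    windows (or copied by the hypervisor on XENPV 3.4+)
 */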
4036 void
4037 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
4038 {
4039 #if defined(__HAVE_DIRECT_MAP)
4040 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
4041 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
4042
4043 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
4044 #else
4045 #if defined(XENPV)
4046 if (XEN_VERSION_SUPPORTED(3, 4)) {
4047 xen_copy_page(srcpa, dstpa);
4048 return;
4049 }
4050 #endif
4051 struct cpu_info *ci;
4052 pt_entry_t *srcpte, *dstpte;
4053 vaddr_t srcva, dstva;
4054
4055 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
4056
4057 kpreempt_disable();
4058
4059 ci = curcpu();
4060 srcva = ci->vpage[VPAGE_SRC];
4061 dstva = ci->vpage[VPAGE_DST];
4062 srcpte = ci->vpage_pte[VPAGE_SRC];
4063 dstpte = ci->vpage_pte[VPAGE_DST];
4064
4065 KASSERT(*srcpte == 0 && *dstpte == 0);
4066
4067 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
4068 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
4069 pmap_pte_flush();
4070 pmap_update_pg(srcva);
4071 pmap_update_pg(dstva);
4072
4073 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
4074
4075 #if defined(DIAGNOSTIC) || defined(XENPV)
4076 pmap_pte_set(srcpte, 0);
4077 pmap_pte_set(dstpte, 0);
4078 pmap_pte_flush();
4079 #endif
4080
4081 kpreempt_enable();
4082 #endif /* defined(__HAVE_DIRECT_MAP) */
4083 }
4084
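/*
 * pmap_map_ptp: temporarily map a PTP so that its PTEs can be accessed.
 *
 * => with a direct map this is only an address computation; otherwise
 *    the PTP is entered at this CPU's VPAGE_PTP window
 * => without a direct map, must be called with kernel preemption disabled
 */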
4085 static pt_entry_t *
4086 pmap_map_ptp(struct vm_page *ptp)
4087 {
4088 #ifdef __HAVE_DIRECT_MAP
4089 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
4090 #else
4091 struct cpu_info *ci;
4092 pt_entry_t *ptppte;
4093 vaddr_t ptpva;
4094
4095 KASSERT(kpreempt_disabled());
4096
4097 #ifndef XENPV
4098 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
4099 #else
4100 const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
4101 #endif
4102
4103 ci = curcpu();
4104 ptpva = ci->vpage[VPAGE_PTP];
4105 ptppte = ci->vpage_pte[VPAGE_PTP];
4106
4107 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
4108
4109 pmap_pte_flush();
4110 pmap_update_pg(ptpva);
4111
4112 return (pt_entry_t *)ptpva;
4113 #endif
4114 }
4115
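/*
 * pmap_unmap_ptp: undo pmap_map_ptp.
 *
 * => only does work when DIAGNOSTIC or XENPV requires the VPAGE_PTP
 *    window to be torn down; with a direct map there is nothing to undo
 */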
4116 static void
4117 pmap_unmap_ptp(void)
4118 {
4119 #ifndef __HAVE_DIRECT_MAP
4120 #if defined(DIAGNOSTIC) || defined(XENPV)
4121 struct cpu_info *ci;
4122 pt_entry_t *pte;
4123
4124 KASSERT(kpreempt_disabled());
4125
4126 ci = curcpu();
4127 pte = ci->vpage_pte[VPAGE_PTP];
4128
4129 if (*pte != 0) {
4130 pmap_pte_set(pte, 0);
4131 pmap_pte_flush();
4132 }
4133 #endif
4134 #endif
4135 }
4136
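/*
 * pmap_map_pte: return a pointer to the PTE mapping 'va' in 'pmap'.
 *
 * => if 'pmap' is the current pmap, the recursive PTE window (PTE_BASE)
 *    is used directly; otherwise the PTP holding the PTE is temporarily
 *    mapped via pmap_map_ptp()
 * => must be called with kernel preemption disabled
 */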
4137 static pt_entry_t *
4138 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
4139 {
4140
4141 KASSERT(kpreempt_disabled());
4142 if (pmap_is_curpmap(pmap)) {
4143 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
4144 }
4145 KASSERT(ptp != NULL);
4146 return pmap_map_ptp(ptp) + pl1_pi(va);
4147 }
4148
4149 static void
4150 pmap_unmap_pte(void)
4151 {
4152
4153 KASSERT(kpreempt_disabled());
4154
4155 pmap_unmap_ptp();
4156 }
4157
4158 /*
4159 * p m a p r e m o v e f u n c t i o n s
4160 *
4161 * functions that remove mappings
4162 */
4163
4164 /*
4165 * pmap_remove_ptes: remove PTEs from a PTP
4166 *
4167 * => caller must hold pmap's lock
4168 * => PTP must be mapped into KVA
4169 * => PTP should be null if pmap == pmap_kernel()
4170 * => must be called with kernel preemption disabled
 * => issues TLB shootdowns via pmap_remove_pte() where needed
4172 */
4173 static void
4174 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
4175 vaddr_t startva, vaddr_t endva)
4176 {
4177 pt_entry_t *pte = (pt_entry_t *)ptpva;
4178
4179 KASSERT(mutex_owned(&pmap->pm_lock));
4180 KASSERT(kpreempt_disabled());
4181
4182 /*
4183 * mappings are very often sparse, so clip the given range to the
4184 * range of PTEs that are known present in the PTP.
4185 */
4186 pmap_ptp_range_clip(ptp, &startva, &pte);
4187
4188 /*
4189 * note that ptpva points to the PTE that maps startva. this may
4190 * or may not be the first PTE in the PTP.
4191 *
4192 * we loop through the PTP while there are still PTEs to look at
4193 * and the wire_count is greater than 1 (because we use the wire_count
4194 * to keep track of the number of real PTEs in the PTP).
4195 */
4196 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
4197 (void)pmap_remove_pte(pmap, ptp, pte, startva);
4198 startva += PAGE_SIZE;
4199 pte++;
4200 }
4201 }
4202
4203 /*
4204 * pmap_remove_pte: remove a single PTE from a PTP.
4205 *
4206 * => caller must hold pmap's lock
4207 * => PTP must be mapped into KVA
4208 * => PTP should be null if pmap == pmap_kernel()
4209 * => returns true if we removed a mapping
4210 * => must be called with kernel preemption disabled
4211 */
4212 static bool
4213 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
4214 vaddr_t va)
4215 {
4216 struct pv_entry *pve;
4217 struct vm_page *pg;
4218 struct pmap_page *pp;
4219 pt_entry_t opte;
4220
4221 KASSERT(mutex_owned(&pmap->pm_lock));
4222 KASSERT(kpreempt_disabled());
4223
4224 if (!pmap_valid_entry(*pte)) {
4225 /* VA not mapped. */
4226 return false;
4227 }
4228
4229 /* Atomically save the old PTE and zap it. */
4230 opte = pmap_pte_testset(pte, 0);
4231 if (!pmap_valid_entry(opte)) {
4232 return false;
4233 }
4234
4235 pmap_exec_account(pmap, va, opte, 0);
4236 pmap_stats_update_bypte(pmap, 0, opte);
4237
4238 if (ptp) {
		/*
		 * Dropping a PTE. If this empties the PTP it is about to
		 * be freed, so force a TLB shootdown below by treating
		 * the PTE as accessed.
		 */
4242 ptp->wire_count--;
4243 if (ptp->wire_count <= 1) {
4244 opte |= PTE_A;
4245 }
4246 }
4247
4248 if ((opte & PTE_A) != 0) {
4249 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
4250 }
4251
4252 /*
4253 * If we are not on a pv list - we are done.
4254 */
4255 if ((opte & PTE_PVLIST) == 0) {
4256 #ifndef DOM0OPS
4257 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
4258 "managed page without PTE_PVLIST for %#"PRIxVADDR, va);
4259 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
4260 "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
4261 #endif
4262 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
4263 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
4264 return true;
4265 }
4266
4267 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4268 pp = VM_PAGE_TO_PP(pg);
4269 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
4270 paddr_t pa = pmap_pte2pa(opte);
4271 panic("%s: PTE_PVLIST with pv-untracked page"
4272 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
4273 __func__, va, pa, atop(pa));
4274 }
4275
4276 /* Sync R/M bits. */
4277 pve = pmap_lookup_pv(pmap, ptp, pp, va);
4278 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
4279 return true;
4280 }
4281
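/*
 * pmap_remove_locked: remove mappings in the range [sva, eva).
 *
 * => caller must hold pmap's lock
 * => used by pmap_remove() and, for grant mappings, by pmap_remove_gnt()
 */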
4282 static void
4283 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4284 {
4285 pt_entry_t *ptes;
4286 pd_entry_t pde;
4287 pd_entry_t * const *pdes;
4288 bool result;
4289 vaddr_t blkendva, va = sva;
4290 struct vm_page *ptp;
4291 struct pmap *pmap2;
4292 int lvl;
4293
4294 KASSERT(mutex_owned(&pmap->pm_lock));
4295
4296 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4297
4298 /*
4299 * removing one page? take shortcut function.
4300 */
4301
4302 if (va + PAGE_SIZE == eva) {
4303 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4304 KASSERT(lvl == 1);
4305
4306 /* Get PTP if non-kernel mapping. */
4307 if (pmap != pmap_kernel()) {
4308 ptp = pmap_find_ptp(pmap, va, 1);
4309 KASSERTMSG(ptp != NULL,
4310 "%s: unmanaged PTP detected", __func__);
4311 } else {
4312 /* Never free kernel PTPs. */
4313 ptp = NULL;
4314 }
4315
4316 result = pmap_remove_pte(pmap, ptp,
4317 &ptes[pl1_i(va)], va);
4318
4319 /*
4320 * if mapping removed and the PTP is no longer
4321 * being used, free it!
4322 */
4323
4324 if (result && ptp && ptp->wire_count <= 1)
4325 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4326 }
4327 } else for (/* null */ ; va < eva ; va = blkendva) {
4328 /* determine range of block */
4329 blkendva = x86_round_pdr(va+1);
4330 if (blkendva > eva)
4331 blkendva = eva;
4332
4333 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4334 /* Skip a range corresponding to an invalid pde. */
4335 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
4336 continue;
4337 }
4338 KASSERT(lvl == 1);
4339
4340 /* Get PTP if non-kernel mapping. */
4341 if (pmap != pmap_kernel()) {
4342 ptp = pmap_find_ptp(pmap, va, 1);
4343 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
4344 __func__);
4345 } else {
4346 /* Never free kernel PTPs. */
4347 ptp = NULL;
4348 }
4349
4350 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
4351 blkendva);
4352
4353 /* If PTP is no longer being used, free it. */
4354 if (ptp && ptp->wire_count <= 1) {
4355 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4356 }
4357 }
4358 pmap_unmap_ptes(pmap, pmap2);
4359 pmap_drain_pv(pmap);
4360 }
4361
4362 /*
4363 * pmap_remove: mapping removal function.
4364 *
4365 * => caller should not be holding any pmap locks
4366 */
4367 void
4368 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4369 {
4370 if (__predict_false(pmap->pm_remove != NULL)) {
4371 (*pmap->pm_remove)(pmap, sva, eva);
4372 return;
4373 }
4374
4375 mutex_enter(&pmap->pm_lock);
4376 pmap_remove_locked(pmap, sva, eva);
4377 mutex_exit(&pmap->pm_lock);
4378 }
4379
4380 /*
4381 * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
4382 *
4383 * => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
4384 * => Caller should disable kernel preemption.
4385 * => issues tlb shootdowns if necessary.
4386 */
4387 static int
4388 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
4389 pt_entry_t *optep)
4390 {
4391 struct pmap *pmap;
4392 struct vm_page *ptp;
4393 vaddr_t va;
4394 pt_entry_t *ptep;
4395 pt_entry_t opte;
4396 pt_entry_t npte;
4397 pt_entry_t expect;
4398 bool need_shootdown;
4399
4400 ptp = pvpte->pte_ptp;
4401 va = pvpte->pte_va;
4402 KASSERT(ptp == NULL || ptp->uobject != NULL);
4403 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
4404 pmap = ptp_to_pmap(ptp);
4405 KASSERT(kpreempt_disabled());
4406
4407 if (__predict_false(pmap->pm_sync_pv != NULL)) {
4408 return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
4409 optep);
4410 }
4411
4412 expect = pmap_pa2pte(pa) | PTE_P;
4413
4414 if (clearbits != ~0) {
4415 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
4416 clearbits = pmap_pp_attrs_to_pte(clearbits);
4417 }
4418
4419 ptep = pmap_map_pte(pmap, ptp, va);
4420 do {
4421 opte = *ptep;
4422 KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
4423 KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
4424 KASSERT(opte == 0 || (opte & PTE_P) != 0);
4425 if ((opte & (PTE_FRAME | PTE_P)) != expect) {
			/*
			 * We lost a race with a V->P operation like
			 * pmap_remove(). Wait for the competitor to
			 * finish reflecting the PTE bits into pp_attrs.
			 */
4431 pmap_unmap_pte();
4432 return EAGAIN;
4433 }
4434
4435 /*
4436 * Check if there's anything to do on this PTE.
4437 */
4438 if ((opte & clearbits) == 0) {
4439 need_shootdown = false;
4440 break;
4441 }
4442
		/*
		 * We need a shootdown if the PTE has been cached in the
		 * TLB (PTE_A set), unless we are clearing only PTE_W and
		 * the entry was never cached as writable (PTE_D clear).
		 */
4448 need_shootdown = (opte & PTE_A) != 0 &&
4449 !(clearbits == PTE_W && (opte & PTE_D) == 0);
4450
4451 npte = opte & ~clearbits;
4452
4453 /*
4454 * If we need a shootdown anyway, clear PTE_A and PTE_D.
4455 */
4456 if (need_shootdown) {
4457 npte &= ~(PTE_A | PTE_D);
4458 }
4459 KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
4460 KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
4461 KASSERT(npte == 0 || (opte & PTE_P) != 0);
4462 } while (pmap_pte_cas(ptep, opte, npte) != opte);
4463
4464 if (need_shootdown) {
4465 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
4466 }
4467 pmap_unmap_pte();
4468
4469 *oattrs = pmap_pte_to_pp_attrs(opte);
4470 if (optep != NULL)
4471 *optep = opte;
4472 return 0;
4473 }
4474
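/*
 * pmap_pp_remove_ent: finish removing a single mapping found via the
 * PV list: update the stats for the old PTE and drop the PTP's wire
 * count, freeing the PTP once it no longer holds any mappings.
 *
 * => caller must hold pmap's lock
 */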
4475 static void
4476 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
4477 vaddr_t va)
4478 {
4479 struct pmap *pmap2;
4480 pt_entry_t *ptes;
4481 pd_entry_t * const *pdes;
4482
4483 KASSERT(mutex_owned(&pmap->pm_lock));
4484
4485 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4486 pmap_stats_update_bypte(pmap, 0, opte);
4487 ptp->wire_count--;
4488 if (ptp->wire_count <= 1) {
4489 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4490 }
4491 pmap_unmap_ptes(pmap, pmap2);
4492 }
4493
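/*
 * pmap_pp_remove: remove every mapping of the physical page described
 * by 'pp' (physical address 'pa').
 *
 * => common backend for pmap_page_remove() and pmap_pv_remove()
 */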
4494 static void
4495 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
4496 {
4497 struct pv_pte *pvpte;
4498 struct vm_page *ptp;
4499 uintptr_t sum;
4500 uint8_t oattrs;
4501 bool locked;
4502
4503 /*
	 * Do an unlocked check to see if the page has no mappings, e.g.
	 * when pmap_remove_all() was called before amap_wipeout() for a
	 * process-private amap - a common case. The page being removed
	 * must be on its way out, so we don't have to worry about
	 * concurrent attempts to enter it (otherwise the caller either
	 * doesn't care or has screwed up).
4509 */
4510 sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
4511 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
4512 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
4513 if (sum == 0) {
4514 return;
4515 }
4516
4517 kpreempt_disable();
4518 for (;;) {
4519 struct pmap *pmap;
4520 struct pv_entry *pve;
4521 pt_entry_t opte;
4522 vaddr_t va;
4523
4524 mutex_spin_enter(&pp->pp_lock);
4525 if ((pvpte = pv_pte_first(pp)) == NULL) {
4526 mutex_spin_exit(&pp->pp_lock);
4527 break;
4528 }
4529
4530 /*
4531 * Add a reference to the pmap before clearing the pte.
4532 * Otherwise the pmap can disappear behind us.
4533 */
4534 ptp = pvpte->pte_ptp;
4535 pmap = ptp_to_pmap(ptp);
4536 KASSERT(pmap->pm_obj[0].uo_refs > 0);
4537 if (ptp != NULL) {
4538 pmap_reference(pmap);
4539 }
4540
4541 /*
4542 * Now try to lock it. We need a direct handoff between
4543 * pp_lock and pm_lock to know the pv_entry is kept intact
4544 * and kept associated with this pmap. If that can't be
4545 * had, wait for the pmap's lock to become free and then
4546 * retry.
4547 */
4548 locked = mutex_tryenter(&pmap->pm_lock);
4549 mutex_spin_exit(&pp->pp_lock);
4550 if (!locked) {
4551 mutex_enter(&pmap->pm_lock);
4552 /* nothing, just wait for it */
4553 mutex_exit(&pmap->pm_lock);
4554 if (ptp != NULL) {
4555 pmap_destroy(pmap);
4556 }
4557 continue;
4558 }
4559 va = pvpte->pte_va;
4560
4561 KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
4562 "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4563 KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
4564 "va %lx pmap %p ptp %p is free", va, pmap, ptp);
4565 KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
4566 "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4567
4568 #ifdef DEBUG
4569 pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
4570 rb_tree_t *tree = (ptp != NULL ?
4571 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
4572 pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4573 if (pve == NULL) {
4574 KASSERTMSG(&pp->pp_pte == pvpte,
4575 "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
4576 va, pmap, ptp, pvpte, pve);
4577 } else {
4578 KASSERTMSG(&pve->pve_pte == pvpte,
4579 "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
4580 va, pmap, ptp, pvpte, pve);
4581 }
4582 #endif
4583
4584 if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
4585 panic("pmap_pp_remove: mapping not present");
4586 }
4587
4588 pve = pmap_lookup_pv(pmap, ptp, pp, va);
4589 pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
4590
4591 /* Update the PTP reference count. Free if last reference. */
4592 if (ptp != NULL) {
4593 KASSERT(pmap != pmap_kernel());
4594 pmap_tlb_shootnow();
4595 if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
4596 (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
4597 } else {
4598 pmap_pp_remove_ent(pmap, ptp, opte, va);
4599 }
4600 } else {
4601 KASSERT(pmap == pmap_kernel());
4602 pmap_stats_update_bypte(pmap, 0, opte);
4603 }
4604 pmap_tlb_shootnow();
4605 pmap_drain_pv(pmap);
4606 mutex_exit(&pmap->pm_lock);
4607 if (ptp != NULL) {
4608 pmap_destroy(pmap);
4609 }
4610 }
4611 kpreempt_enable();
4612 }
4613
4614 /*
4615 * pmap_page_remove: remove a managed vm_page from all pmaps that map it
4616 *
4617 * => R/M bits are sync'd back to attrs
4618 */
4619 void
4620 pmap_page_remove(struct vm_page *pg)
4621 {
4622 struct pmap_page *pp;
4623 paddr_t pa;
4624
4625 pp = VM_PAGE_TO_PP(pg);
4626 pa = VM_PAGE_TO_PHYS(pg);
4627 pmap_pp_remove(pp, pa);
4628 }
4629
4630 /*
4631 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4632 * that map it
4633 */
4634 void
4635 pmap_pv_remove(paddr_t pa)
4636 {
4637 struct pmap_page *pp;
4638
4639 pp = pmap_pv_tracked(pa);
4640 if (pp == NULL)
4641 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4642 pmap_pp_remove(pp, pa);
4643 }
4644
4645 /*
4646 * p m a p a t t r i b u t e f u n c t i o n s
4647 * functions that test/change managed page's attributes
4648 * since a page can be mapped multiple times we must check each PTE that
4649 * maps it by going down the pv lists.
4650 */
4651
4652 /*
4653 * pmap_test_attrs: test a page's attributes
4654 */
4655 bool
4656 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4657 {
4658 struct pmap_page *pp;
4659 struct pv_pte *pvpte;
4660 struct pmap *pmap;
4661 uint8_t oattrs;
4662 u_int result;
4663 paddr_t pa;
4664
4665 pp = VM_PAGE_TO_PP(pg);
4666 if ((pp->pp_attrs & testbits) != 0) {
4667 return true;
4668 }
4669 pa = VM_PAGE_TO_PHYS(pg);
4670 startover:
4671 mutex_spin_enter(&pp->pp_lock);
4672 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4673 if ((pp->pp_attrs & testbits) != 0) {
4674 break;
4675 }
4676 if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4677 /*
4678 * raced with a V->P operation. wait for the other
4679 * side to finish by acquiring pmap's lock. if no
4680 * wait, updates to pp_attrs by the other side may
4681 * go unseen.
4682 */
4683 pmap = ptp_to_pmap(pvpte->pte_ptp);
4684 pmap_reference(pmap);
4685 mutex_spin_exit(&pp->pp_lock);
4686 mutex_enter(&pmap->pm_lock);
4687 /* nothing. */
4688 mutex_exit(&pmap->pm_lock);
4689 pmap_destroy(pmap);
4690 goto startover;
4691 }
4692 pp->pp_attrs |= oattrs;
4693 }
4694 result = pp->pp_attrs & testbits;
4695 mutex_spin_exit(&pp->pp_lock);
4696
4697 /*
	 * note that we exit the loop early, with pvpte non-NULL, once
	 * the bits we are testing for have been found in pp_attrs.
4700 */
4701
4702 return result != 0;
4703 }
4704
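/*
 * pmap_pp_clear_attrs: clear the requested attribute bits in every
 * mapping of the page and in pp_attrs.
 *
 * => returns true if any of the bits were previously set
 */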
4705 static bool
4706 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4707 {
4708 struct pv_pte *pvpte;
4709 struct pmap *pmap;
4710 uint8_t oattrs;
4711 u_int result;
4712
4713 startover:
4714 mutex_spin_enter(&pp->pp_lock);
4715 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4716 if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4717 /*
4718 * raced with a V->P operation. wait for the other
4719 * side to finish by acquiring pmap's lock. it is
4720 * probably unmapping the page, and it will be gone
4721 * when the loop is restarted.
4722 */
4723 pmap = ptp_to_pmap(pvpte->pte_ptp);
4724 pmap_reference(pmap);
4725 mutex_spin_exit(&pp->pp_lock);
4726 mutex_enter(&pmap->pm_lock);
4727 /* nothing. */
4728 mutex_exit(&pmap->pm_lock);
4729 pmap_destroy(pmap);
4730 goto startover;
4731 }
4732 pp->pp_attrs |= oattrs;
4733 }
4734 result = pp->pp_attrs & clearbits;
4735 pp->pp_attrs &= ~clearbits;
4736 pmap_tlb_shootnow();
4737 mutex_spin_exit(&pp->pp_lock);
4738
4739 return result != 0;
4740 }
4741
4742 /*
4743 * pmap_clear_attrs: clear the specified attribute for a page.
4744 *
4745 * => we return true if we cleared one of the bits we were asked to
4746 */
4747 bool
4748 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4749 {
4750 struct pmap_page *pp;
4751 paddr_t pa;
4752
4753 pp = VM_PAGE_TO_PP(pg);
4754 pa = VM_PAGE_TO_PHYS(pg);
4755
4756 /*
4757 * If this is a new page, assert it has no mappings and simply zap
4758 * the stored attributes without taking any locks.
4759 */
4760 if ((pg->flags & PG_FAKE) != 0) {
4761 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
4762 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
4763 KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
4764 atomic_store_relaxed(&pp->pp_attrs, 0);
4765 return false;
4766 } else {
4767 return pmap_pp_clear_attrs(pp, pa, clearbits);
4768 }
4769 }
4770
4771 /*
4772 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4773 * pv-tracked page.
4774 */
4775 bool
4776 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4777 {
4778 struct pmap_page *pp;
4779
4780 pp = pmap_pv_tracked(pa);
4781 if (pp == NULL)
4782 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4783
4784 return pmap_pp_clear_attrs(pp, pa, clearbits);
4785 }
4786
4787 /*
4788 * p m a p p r o t e c t i o n f u n c t i o n s
4789 */
4790
4791 /*
4792 * pmap_page_protect: change the protection of all recorded mappings
4793 * of a managed page
4794 *
4795 * => NOTE: this is an inline function in pmap.h
4796 */
4797
4798 /* see pmap.h */
4799
4800 /*
4801 * pmap_pv_protect: change the protection of all recorded mappings
4802 * of an unmanaged pv-tracked page
4803 *
4804 * => NOTE: this is an inline function in pmap.h
4805 */
4806
4807 /* see pmap.h */
4808
4809 /*
 * pmap_protect: set the protection of the pages in a pmap
4811 *
4812 * => NOTE: this is an inline function in pmap.h
4813 */
4814
4815 /* see pmap.h */
4816
4817 /*
4818 * pmap_write_protect: write-protect pages in a pmap.
4819 *
4820 * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4821 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4822 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
 * present the page will still be considered a kernel page, and the privilege
4824 * separation will be enforced correctly.
4825 */
4826 void
4827 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4828 {
4829 pt_entry_t bit_rem, bit_put;
4830 pt_entry_t *ptes;
4831 pt_entry_t * const *pdes;
4832 struct pmap *pmap2;
4833 vaddr_t blockend, va;
4834 int lvl, i;
4835
4836 if (__predict_false(pmap->pm_write_protect != NULL)) {
4837 (*pmap->pm_write_protect)(pmap, sva, eva, prot);
4838 return;
4839 }
4840
4841 bit_rem = 0;
4842 if (!(prot & VM_PROT_WRITE))
4843 bit_rem = PTE_W;
4844
4845 bit_put = 0;
4846 if (!(prot & VM_PROT_EXECUTE))
4847 bit_put = pmap_pg_nx;
4848
4849 sva &= ~PAGE_MASK;
4850 eva &= ~PAGE_MASK;
4851
4852 /*
4853 * Acquire pmap. No need to lock the kernel pmap as we won't
4854 * be touching PV entries nor stats and kernel PDEs aren't
4855 * freed.
4856 */
4857 if (pmap != pmap_kernel()) {
4858 mutex_enter(&pmap->pm_lock);
4859 }
4860 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4861
4862 for (va = sva ; va < eva; va = blockend) {
4863 pt_entry_t *spte, *epte;
4864
4865 blockend = x86_round_pdr(va + 1);
4866 if (blockend > eva)
4867 blockend = eva;
4868
4869 /* Is it a valid block? */
4870 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4871 continue;
4872 }
4873 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4874 KASSERT(lvl == 1);
4875
4876 spte = &ptes[pl1_i(va)];
4877 epte = &ptes[pl1_i(blockend)];
4878
4879 for (i = 0; spte < epte; spte++, i++) {
4880 pt_entry_t opte, npte;
4881
4882 do {
4883 opte = *spte;
4884 if (!pmap_valid_entry(opte)) {
4885 goto next;
4886 }
4887 npte = (opte & ~bit_rem) | bit_put;
4888 } while (pmap_pte_cas(spte, opte, npte) != opte);
4889
4890 if ((opte & PTE_D) != 0) {
4891 vaddr_t tva = va + x86_ptob(i);
4892 pmap_tlb_shootdown(pmap, tva, opte,
4893 TLBSHOOT_WRITE_PROTECT);
4894 }
4895 next:;
4896 }
4897 }
4898
4899 /* Release pmap. */
4900 pmap_unmap_ptes(pmap, pmap2);
4901 if (pmap != pmap_kernel()) {
4902 mutex_exit(&pmap->pm_lock);
4903 }
4904 }
4905
4906 /*
4907 * pmap_unwire: clear the wired bit in the PTE.
4908 *
4909 * => Mapping should already be present.
4910 */
4911 void
4912 pmap_unwire(struct pmap *pmap, vaddr_t va)
4913 {
4914 pt_entry_t *ptes, *ptep, opte;
4915 pd_entry_t * const *pdes;
4916 struct pmap *pmap2;
4917 int lvl;
4918
4919 if (__predict_false(pmap->pm_unwire != NULL)) {
4920 (*pmap->pm_unwire)(pmap, va);
4921 return;
4922 }
4923
4924 /*
4925 * Acquire pmap. Need to lock the kernel pmap only to protect the
4926 * statistics.
4927 */
4928 mutex_enter(&pmap->pm_lock);
4929 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4930
4931 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4932 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4933 }
4934 KASSERT(lvl == 1);
4935
4936 ptep = &ptes[pl1_i(va)];
4937 opte = *ptep;
4938 KASSERT(pmap_valid_entry(opte));
4939
4940 if (opte & PTE_WIRED) {
4941 pt_entry_t npte = opte & ~PTE_WIRED;
4942
4943 opte = pmap_pte_testset(ptep, npte);
4944 pmap_stats_update_bypte(pmap, npte, opte);
4945 } else {
4946 printf("%s: wiring for pmap %p va %#" PRIxVADDR
4947 " did not change!\n", __func__, pmap, va);
4948 }
4949
4950 /* Release pmap. */
4951 pmap_unmap_ptes(pmap, pmap2);
4952 mutex_exit(&pmap->pm_lock);
4953 }
4954
4955 /*
4956 * pmap_copy: copy mappings from one pmap to another
4957 *
4958 * => optional function
4959 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4960 */
4961
4962 /*
4963 * defined as macro in pmap.h
4964 */
4965
4966 __strict_weak_alias(pmap_enter, pmap_enter_default);
4967
4968 int
4969 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4970 u_int flags)
4971 {
4972 if (__predict_false(pmap->pm_enter != NULL)) {
4973 return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
4974 }
4975
4976 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4977 }
4978
4979 /*
4980 * pmap_enter: enter a mapping into a pmap
4981 *
4982 * => must be done "now" ... no lazy-evaluation
4983 */
4984 int
4985 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4986 vm_prot_t prot, u_int flags, int domid)
4987 {
4988 pt_entry_t *ptes, opte, npte;
4989 pt_entry_t *ptep;
4990 pd_entry_t * const *pdes;
4991 struct vm_page *ptp;
4992 struct vm_page *new_pg, *old_pg;
4993 struct pmap_page *new_pp, *old_pp;
4994 struct pv_entry *old_pve, *new_pve;
4995 bool wired = (flags & PMAP_WIRED) != 0;
4996 struct pmap *pmap2;
4997 struct pmap_ptparray pt;
4998 int error;
4999 bool getptp, samepage, new_embedded;
5000 rb_tree_t *tree;
5001
5002 KASSERT(pmap_initialized);
5003 KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5004 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5005 PRIxVADDR " over PDP!", __func__, va);
5006 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
5007 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
5008 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
5009
5010 #ifdef XENPV
5011 KASSERT(domid == DOMID_SELF || pa == 0);
5012 #endif
5013
5014 npte = ma | protection_codes[prot] | PTE_P;
5015 npte |= pmap_pat_flags(flags);
5016 if (wired)
5017 npte |= PTE_WIRED;
5018 if (va < VM_MAXUSER_ADDRESS) {
5019 KASSERTMSG(pmap != pmap_kernel(),
5020 "entering user va %#"PRIxVADDR" into kernel pmap",
5021 va);
5022 if (pmap_is_user(pmap))
5023 npte |= PTE_U;
5024 }
5025
5026 if (pmap == pmap_kernel())
5027 npte |= pmap_pg_g;
5028 if (flags & VM_PROT_ALL) {
5029 npte |= PTE_A;
5030 if (flags & VM_PROT_WRITE) {
5031 KASSERT((npte & PTE_W) != 0);
5032 npte |= PTE_D;
5033 }
5034 }
5035
5036 #ifdef XENPV
5037 if (domid != DOMID_SELF)
5038 new_pg = NULL;
5039 else
5040 #endif
5041 new_pg = PHYS_TO_VM_PAGE(pa);
5042
5043 if (new_pg != NULL) {
5044 /* This is a managed page */
5045 npte |= PTE_PVLIST;
5046 new_pp = VM_PAGE_TO_PP(new_pg);
5047 PMAP_CHECK_PP(new_pp);
5048 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
5049 /* This is an unmanaged pv-tracked page */
5050 npte |= PTE_PVLIST;
5051 PMAP_CHECK_PP(new_pp);
5052 } else {
5053 new_pp = NULL;
5054 }
5055
5056 /* Begin by locking the pmap. */
5057 mutex_enter(&pmap->pm_lock);
5058
5059 /* Look up the PTP. Allocate if none present. */
5060 ptp = NULL;
5061 getptp = false;
5062 if (pmap != pmap_kernel()) {
5063 ptp = pmap_find_ptp(pmap, va, 1);
5064 if (ptp == NULL) {
5065 getptp = true;
5066 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
5067 if (error != 0) {
5068 if (flags & PMAP_CANFAIL) {
5069 mutex_exit(&pmap->pm_lock);
5070 return error;
5071 }
5072 panic("%s: get ptp failed, error=%d", __func__,
5073 error);
5074 }
5075 }
5076 tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5077 } else {
5078 /* Embedded PV entries rely on this. */
5079 KASSERT(va != 0);
5080 tree = &pmap_kernel_rb;
5081 }
5082
5083 /*
5084 * Look up the old PV entry at this VA (if any), and insert a new PV
5085 * entry if required for the new mapping. Temporarily track the old
5086 * and new mappings concurrently. Only after the old mapping is
5087 * evicted from the pmap will we remove its PV entry. Otherwise,
5088 * our picture of modified/accessed state for either page could get
5089 * out of sync (we need any P->V operation for either page to stall
5090 * on pmap->pm_lock until done here).
5091 */
5092 new_pve = NULL;
5093 old_pve = NULL;
5094 samepage = false;
5095 new_embedded = false;
5096
5097 if (new_pp != NULL) {
5098 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
5099 &old_pve, &samepage, &new_embedded, tree);
5100
5101 /*
5102 * If a new pv_entry was needed and none was available, we
5103 * can go no further.
5104 */
5105 if (error != 0) {
5106 if (flags & PMAP_CANFAIL) {
5107 if (getptp) {
5108 pmap_unget_ptp(pmap, &pt);
5109 }
5110 mutex_exit(&pmap->pm_lock);
5111 return error;
5112 }
5113 panic("%s: alloc pve failed", __func__);
5114 }
5115 } else {
5116 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5117 }
5118
5119 /* Map PTEs into address space. */
5120 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5121
5122 /* Install any newly allocated PTPs. */
5123 if (getptp) {
5124 pmap_install_ptp(pmap, &pt, va, pdes);
5125 }
5126
5127 /* Check if there is an existing mapping. */
5128 ptep = &ptes[pl1_i(va)];
5129 opte = *ptep;
5130 bool have_oldpa = pmap_valid_entry(opte);
5131 paddr_t oldpa = pmap_pte2pa(opte);
5132
5133 /*
5134 * Update the pte.
5135 */
5136 do {
5137 opte = *ptep;
5138
5139 /*
5140 * if the same page, inherit PTE_A and PTE_D.
5141 */
5142 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5143 npte |= opte & (PTE_A | PTE_D);
5144 }
5145 #if defined(XENPV)
5146 if (domid != DOMID_SELF) {
5147 /* pmap_pte_cas with error handling */
5148 int s = splvm();
5149 if (opte != *ptep) {
5150 splx(s);
5151 continue;
5152 }
5153 error = xpq_update_foreign(
5154 vtomach((vaddr_t)ptep), npte, domid, flags);
5155 splx(s);
5156 if (error) {
5157 /* Undo pv_entry tracking - oof. */
5158 if (new_pp != NULL) {
5159 mutex_spin_enter(&new_pp->pp_lock);
5160 if (new_pve != NULL) {
5161 LIST_REMOVE(new_pve, pve_list);
5162 KASSERT(pmap->pm_pve == NULL);
5163 pmap->pm_pve = new_pve;
5164 } else if (new_embedded) {
5165 new_pp->pp_pte.pte_ptp = NULL;
5166 new_pp->pp_pte.pte_va = 0;
5167 }
5168 mutex_spin_exit(&new_pp->pp_lock);
5169 }
5170 pmap_unmap_ptes(pmap, pmap2);
5171 /* Free new PTP. */
5172 if (ptp != NULL && ptp->wire_count <= 1) {
5173 pmap_free_ptp(pmap, ptp, va, ptes,
5174 pdes);
5175 }
5176 mutex_exit(&pmap->pm_lock);
5177 return error;
5178 }
5179 break;
5180 }
5181 #endif /* defined(XENPV) */
5182 } while (pmap_pte_cas(ptep, opte, npte) != opte);
5183
5184 /*
5185 * Done with the PTEs: they can now be unmapped.
5186 */
5187 pmap_unmap_ptes(pmap, pmap2);
5188
5189 /*
5190 * Update statistics and PTP's reference count.
5191 */
5192 pmap_stats_update_bypte(pmap, npte, opte);
5193 if (ptp != NULL) {
5194 if (!have_oldpa) {
5195 ptp->wire_count++;
5196 }
5197 /* Remember minimum VA in PTP. */
5198 pmap_ptp_range_set(ptp, va);
5199 }
5200 KASSERT(ptp == NULL || ptp->wire_count > 1);
5201
5202 /*
5203 * If the same page, we can skip pv_entry handling.
5204 */
5205 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5206 KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
5207 if ((npte & PTE_PVLIST) != 0) {
5208 KASSERT(samepage);
5209 pmap_check_pv(pmap, ptp, new_pp, va, true);
5210 }
5211 goto same_pa;
5212 } else if ((npte & PTE_PVLIST) != 0) {
5213 KASSERT(!samepage);
5214 }
5215
5216 /*
5217 * If old page is pv-tracked, remove pv_entry from its list.
5218 */
5219 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5220 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5221 old_pp = VM_PAGE_TO_PP(old_pg);
5222 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5223 panic("%s: PTE_PVLIST with pv-untracked page"
5224 " va = %#"PRIxVADDR
5225 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
5226 __func__, va, oldpa, atop(pa));
5227 }
5228
5229 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5230 pmap_pte_to_pp_attrs(opte));
5231 } else {
5232 KASSERT(old_pve == NULL);
5233 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5234 }
5235
5236 /*
5237 * If new page is dynamically PV tracked, insert to tree.
5238 */
5239 if (new_pve != NULL) {
5240 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5241 old_pve = rb_tree_insert_node(tree, new_pve);
5242 KASSERT(old_pve == new_pve);
5243 pmap_check_pv(pmap, ptp, new_pp, va, true);
5244 }
5245
5246 same_pa:
5247 /*
5248 * shootdown tlb if necessary.
5249 */
5250
5251 if ((~opte & (PTE_P | PTE_A)) == 0 &&
5252 ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
5253 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
5254 }
5255 pmap_drain_pv(pmap);
5256 mutex_exit(&pmap->pm_lock);
5257 return 0;
5258 }
5259
5260 #if defined(XEN) && defined(DOM0OPS)
5261
5262 struct pmap_data_gnt {
5263 SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
5264 vaddr_t pd_gnt_sva;
5265 vaddr_t pd_gnt_eva; /* range covered by this gnt */
5266 int pd_gnt_refs; /* ref counter */
5267 struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
5268 };
5269 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
5270
5271 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
5272
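/*
 * pmap_find_gnt: return the grant record whose range fully covers
 * [sva, eva), or NULL if there is none.
 *
 * => caller must hold pmap's lock
 */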
5273 static struct pmap_data_gnt *
5274 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5275 {
5276 struct pmap_data_gnt_head *headp;
5277 struct pmap_data_gnt *pgnt;
5278
5279 KASSERT(mutex_owned(&pmap->pm_lock));
5280 headp = pmap->pm_data;
5281 KASSERT(headp != NULL);
5282 SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
5283 if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
5284 return pgnt;
5285 /* check that we're not overlapping part of a region */
5286 KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
5287 }
5288 return NULL;
5289 }
5290
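/*
 * pmap_alloc_gnt: record the grant-table map operations covering
 * [sva, sva + nentries * PAGE_SIZE).
 *
 * => installs pmap_remove_gnt as the pmap's remove hook and allocates
 *    the per-pmap list head on first use
 * => if a record already covers this exact range it is left untouched
 * => caller must hold pmap's lock
 */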
5291 static void
5292 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
5293 const struct gnttab_map_grant_ref *ops)
5294 {
5295 struct pmap_data_gnt_head *headp;
5296 struct pmap_data_gnt *pgnt;
5297 vaddr_t eva = sva + nentries * PAGE_SIZE;
5298 KASSERT(mutex_owned(&pmap->pm_lock));
5299 KASSERT(nentries >= 1);
5300 if (pmap->pm_remove == NULL) {
5301 pmap->pm_remove = pmap_remove_gnt;
5302 KASSERT(pmap->pm_data == NULL);
5303 headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
5304 SLIST_INIT(headp);
5305 pmap->pm_data = headp;
5306 } else {
5307 KASSERT(pmap->pm_remove == pmap_remove_gnt);
5308 KASSERT(pmap->pm_data != NULL);
5309 headp = pmap->pm_data;
5310 }
5311
5312 pgnt = pmap_find_gnt(pmap, sva, eva);
5313 if (pgnt != NULL) {
5314 KASSERT(pgnt->pd_gnt_sva == sva);
5315 KASSERT(pgnt->pd_gnt_eva == eva);
5316 return;
5317 }
5318
5319 /* new entry */
5320 pgnt = kmem_alloc(sizeof(*pgnt) +
5321 (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
5322 pgnt->pd_gnt_sva = sva;
5323 pgnt->pd_gnt_eva = eva;
5324 pgnt->pd_gnt_refs = 0;
5325 memcpy(pgnt->pd_gnt_ops, ops,
5326 sizeof(struct gnttab_map_grant_ref) * nentries);
5327 SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
5328 }
5329
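/*
 * pmap_free_gnt: free a grant record whose reference count has dropped
 * to zero; once the last record is gone, the list head is freed and the
 * pmap's remove hook is reset.
 *
 * => caller must hold pmap's lock
 */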
5330 static void
5331 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
5332 {
5333 struct pmap_data_gnt_head *headp = pmap->pm_data;
5334 int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
5335 KASSERT(nentries >= 1);
5336 KASSERT(mutex_owned(&pmap->pm_lock));
5337 KASSERT(pgnt->pd_gnt_refs == 0);
5338 SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
5339 kmem_free(pgnt, sizeof(*pgnt) +
5340 (nentries - 1) * sizeof(struct gnttab_map_grant_ref));
5341 if (SLIST_EMPTY(headp)) {
5342 kmem_free(headp, sizeof(*headp));
5343 pmap->pm_data = NULL;
5344 pmap->pm_remove = NULL;
5345 }
5346 }
5347
5348 /*
5349 * pmap_enter_gnt: enter a grant entry into a pmap
5350 *
5351 * => must be done "now" ... no lazy-evaluation
5352 */
5353 int
5354 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
5355 const struct gnttab_map_grant_ref *oops)
5356 {
5357 struct pmap_data_gnt *pgnt;
5358 pt_entry_t *ptes, opte;
5359 #ifndef XENPV
5360 pt_entry_t npte;
5361 #endif
5362 pt_entry_t *ptep;
5363 pd_entry_t * const *pdes;
5364 struct vm_page *ptp;
5365 struct vm_page *old_pg;
5366 struct pmap_page *old_pp;
5367 struct pv_entry *old_pve;
5368 struct pmap *pmap2;
5369 struct pmap_ptparray pt;
5370 int error;
5371 bool getptp;
5372 rb_tree_t *tree;
5373 struct gnttab_map_grant_ref *op;
5374 int ret;
5375 int idx;
5376
5377 KASSERT(pmap_initialized);
5378 KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5379 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5380 PRIxVADDR " over PDP!", __func__, va);
5381 KASSERT(pmap != pmap_kernel());
5382
5383 /* Begin by locking the pmap. */
5384 mutex_enter(&pmap->pm_lock);
5385 pmap_alloc_gnt(pmap, sva, nentries, oops);
5386
5387 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5388 KASSERT(pgnt != NULL);
5389
5390 /* Look up the PTP. Allocate if none present. */
5391 ptp = NULL;
5392 getptp = false;
5393 ptp = pmap_find_ptp(pmap, va, 1);
5394 if (ptp == NULL) {
5395 getptp = true;
5396 error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
5397 if (error != 0) {
5398 mutex_exit(&pmap->pm_lock);
5399 return error;
5400 }
5401 }
5402 tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5403
5404 /*
5405 * Look up the old PV entry at this VA (if any), and insert a new PV
5406 * entry if required for the new mapping. Temporarily track the old
5407 * and new mappings concurrently. Only after the old mapping is
5408 * evicted from the pmap will we remove its PV entry. Otherwise,
5409 * our picture of modified/accessed state for either page could get
5410 * out of sync (we need any P->V operation for either page to stall
5411 * on pmap->pm_lock until done here).
5412 */
5413 old_pve = NULL;
5414
5415 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5416
5417 /* Map PTEs into address space. */
5418 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5419
5420 /* Install any newly allocated PTPs. */
5421 if (getptp) {
5422 pmap_install_ptp(pmap, &pt, va, pdes);
5423 }
5424
5425 /* Check if there is an existing mapping. */
5426 ptep = &ptes[pl1_i(va)];
5427 opte = *ptep;
5428 bool have_oldpa = pmap_valid_entry(opte);
5429 paddr_t oldpa = pmap_pte2pa(opte);
5430
5431 /*
5432 * Update the pte.
5433 */
5434
5435 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5436 op = &pgnt->pd_gnt_ops[idx];
5437
5438 #ifdef XENPV
5439 KASSERT(op->flags & GNTMAP_contains_pte);
5440 op->host_addr = xpmap_ptetomach(ptep);
5441 #else
5442 KASSERT((op->flags & GNTMAP_contains_pte) == 0);
5443 KASSERT(op->flags != 0);
5444 KASSERT(op->host_addr != 0);
5445 #endif
5446 op->dev_bus_addr = 0;
5447 op->status = GNTST_general_error;
5448 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5449 if (__predict_false(ret)) {
5450 printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5451 __func__, ret);
5452 op->status = GNTST_general_error;
5453 }
5454 for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
5455 kpause("gntmap", false, mstohz(1), NULL);
5456 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5457 if (__predict_false(ret)) {
5458 printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5459 __func__, ret);
5460 op->status = GNTST_general_error;
5461 }
5462 }
5463 if (__predict_false(op->status != GNTST_okay)) {
5464 printf("%s: GNTTABOP_map_grant_ref status: %d\n",
5465 __func__, op->status);
5466 if (have_oldpa) { /* XXX did the pte really change if XENPV ?*/
5467 ptp->wire_count--;
5468 }
5469 } else {
5470 #ifndef XENPV
5471 npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P;
5472 if ((op->flags & GNTMAP_readonly) == 0)
5473 npte |= PTE_W;
5474 do {
5475 opte = *ptep;
5476 } while (pmap_pte_cas(ptep, opte, npte) != opte);
5477 #endif
5478 pgnt->pd_gnt_refs++;
5479 if (!have_oldpa) {
5480 ptp->wire_count++;
5481 }
5482 KASSERT(ptp->wire_count > 1);
5483 /* Remember minimum VA in PTP. */
5484 pmap_ptp_range_set(ptp, va);
5485 }
5486 if (ptp->wire_count <= 1)
5487 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5488
5489 /*
5490 * Done with the PTEs: they can now be unmapped.
5491 */
5492 pmap_unmap_ptes(pmap, pmap2);
5493
5494 /*
5495 * Update statistics and PTP's reference count.
5496 */
5497 pmap_stats_update_bypte(pmap, 0, opte);
5498
5499 /*
5500 * If old page is pv-tracked, remove pv_entry from its list.
5501 */
5502 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5503 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5504 old_pp = VM_PAGE_TO_PP(old_pg);
5505 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5506 panic("%s: PTE_PVLIST with pv-untracked page"
5507 " va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
5508 __func__, va, oldpa);
5509 }
5510
5511 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5512 pmap_pte_to_pp_attrs(opte));
5513 } else {
5514 KASSERT(old_pve == NULL);
5515 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5516 }
5517
5518 pmap_drain_pv(pmap);
5519 mutex_exit(&pmap->pm_lock);
5520 return op->status;
5521 }
5522
5523 /*
5524 * pmap_remove_gnt: grant mapping removal function.
5525 *
5526 * => caller should not be holding any pmap locks
5527 */
5528 static void
5529 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5530 {
5531 struct pmap_data_gnt *pgnt;
5532 pt_entry_t *ptes;
5533 pd_entry_t pde;
5534 pd_entry_t * const *pdes;
5535 struct vm_page *ptp;
5536 struct pmap *pmap2;
5537 vaddr_t va;
5538 int lvl;
5539 int idx;
5540 struct gnttab_map_grant_ref *op;
5541 struct gnttab_unmap_grant_ref unmap_op;
5542 int ret;
5543
5544 KASSERT(pmap != pmap_kernel());
5545 KASSERT(pmap->pm_remove == pmap_remove_gnt);
5546
5547 mutex_enter(&pmap->pm_lock);
5548 for (va = sva; va < eva; va += PAGE_SIZE) {
5549 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5550 if (pgnt == NULL) {
5551 pmap_remove_locked(pmap, sva, eva);
5552 continue;
5553 }
5554
5555 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5556 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
5557 panic("pmap_remove_gnt pdes not valid");
5558 }
5559
5560 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5561 op = &pgnt->pd_gnt_ops[idx];
5562 KASSERT(lvl == 1);
5563
5564 /* Get PTP if non-kernel mapping. */
5565 ptp = pmap_find_ptp(pmap, va, 1);
5566 KASSERTMSG(ptp != NULL,
5567 "%s: unmanaged PTP detected", __func__);
5568
5569 if (op->status == GNTST_okay) {
5570 KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
5571 #ifdef XENPV
5572 unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
5573 #else
5574 unmap_op.host_addr = op->host_addr;
5575 pmap_pte_testset(&ptes[pl1_i(va)], 0);
5576 #endif
5577 unmap_op.handle = op->handle;
5578 unmap_op.dev_bus_addr = 0;
5579 ret = HYPERVISOR_grant_table_op(
5580 GNTTABOP_unmap_grant_ref, &unmap_op, 1);
5581 if (ret) {
5582 printf("%s: GNTTABOP_unmap_grant_ref "
5583 "failed: %d\n", __func__, ret);
5584 }
5585
5586 ptp->wire_count--;
5587 pgnt->pd_gnt_refs--;
5588 }
5589 if (pgnt->pd_gnt_refs == 0) {
5590 pmap_free_gnt(pmap, pgnt);
5591 }
5592 /*
5593 * if mapping removed and the PTP is no longer
5594 * being used, free it!
5595 */
5596
5597 if (ptp->wire_count <= 1)
5598 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5599 pmap_unmap_ptes(pmap, pmap2);
5600 }
5601 mutex_exit(&pmap->pm_lock);
5602 }
5603 #endif /* XEN && DOM0OPS */
5604
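/*
 * pmap_get_physpage: allocate a zeroed physical page for use as a page
 * table page.
 *
 * => before uvm.page_init_done the page comes from uvm_page_physget()
 *    and is zeroed by hand; afterwards it comes from uvm_pagealloc()
 * => updates the kernel pmap's resident count
 */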
5605 paddr_t
5606 pmap_get_physpage(void)
5607 {
5608 struct vm_page *ptp;
5609 struct pmap *kpm = pmap_kernel();
5610 paddr_t pa;
5611
5612 if (!uvm.page_init_done) {
5613 /*
5614 * We're growing the kernel pmap early (from
5615 * uvm_pageboot_alloc()). This case must be
5616 * handled a little differently.
5617 */
5618
5619 if (!uvm_page_physget(&pa))
5620 panic("%s: out of memory", __func__);
5621 #if defined(__HAVE_DIRECT_MAP)
5622 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
5623 #else
5624 #if defined(XENPV)
5625 if (XEN_VERSION_SUPPORTED(3, 4)) {
5626 xen_pagezero(pa);
5627 return pa;
5628 }
5629 #endif
5630 kpreempt_disable();
5631 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
5632 PTE_W | pmap_pg_nx);
5633 pmap_pte_flush();
5634 pmap_update_pg((vaddr_t)early_zerop);
5635 memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
5636 #if defined(DIAGNOSTIC) || defined(XENPV)
5637 pmap_pte_set(early_zero_pte, 0);
5638 pmap_pte_flush();
5639 #endif /* defined(DIAGNOSTIC) */
5640 kpreempt_enable();
5641 #endif /* defined(__HAVE_DIRECT_MAP) */
5642 } else {
5643 /* XXX */
5644 ptp = uvm_pagealloc(NULL, 0, NULL,
5645 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
5646 if (ptp == NULL)
5647 panic("%s: out of memory", __func__);
5648 ptp->flags &= ~PG_BUSY;
5649 ptp->wire_count = 1;
5650 pa = VM_PAGE_TO_PHYS(ptp);
5651 }
5652 pmap_stats_update(kpm, 1, 0);
5653
5654 return pa;
5655 }
5656
5657 /*
5658 * Expand the page tree with the specified amount of PTPs, mapping virtual
5659 * addresses starting at kva. We populate all the levels but the last one
5660 * (L1). The nodes of the tree are created as RW, but the pages covered
5661 * will be kentered in L1, with proper permissions.
5662 *
5663 * Used only by pmap_growkernel.
5664 */
5665 static void
5666 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
5667 {
5668 unsigned long i;
5669 paddr_t pa;
5670 unsigned long index, endindex;
5671 int level;
5672 pd_entry_t *pdep;
5673 #ifdef XENPV
5674 int s = splvm(); /* protect xpq_* */
5675 #endif
5676
5677 for (level = PTP_LEVELS; level > 1; level--) {
5678 if (level == PTP_LEVELS)
5679 pdep = cpm->pm_pdir;
5680 else
5681 pdep = normal_pdes[level - 2];
5682 index = pl_i_roundup(kva, level);
5683 endindex = index + needed_ptps[level - 1] - 1;
5684
5685 for (i = index; i <= endindex; i++) {
5686 pt_entry_t pte;
5687
5688 KASSERT(!pmap_valid_entry(pdep[i]));
5689 pa = pmap_get_physpage();
5690 pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
5691 #ifdef __x86_64__
5692 pte |= pmap_pg_nx;
5693 #endif
5694 pmap_pte_set(&pdep[i], pte);
5695
5696 #ifdef XENPV
5697 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
5698 if (__predict_true(
5699 cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5700 /* update per-cpu PMDs on all cpus */
5701 xen_kpm_sync(pmap_kernel(), i);
5702 } else {
5703 /*
5704 * too early; update primary CPU
5705 * PMD only (without locks)
5706 */
5707 #ifdef __x86_64__
5708 pd_entry_t *cpu_pdep =
5709 &cpu_info_primary.ci_kpm_pdir[i];
5710 #else
5711 pd_entry_t *cpu_pdep =
5712 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
5713 #endif
5714 pmap_pte_set(cpu_pdep, pte);
5715 }
5716 }
5717 #endif
5718
5719 KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
5720 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
5721 nkptp[level - 1]++;
5722 }
5723 pmap_pte_flush();
5724 }
5725 #ifdef XENPV
5726 splx(s);
5727 #endif
5728 }
5729
5730 /*
5731 * pmap_growkernel: increase usage of KVM space.
5732 *
5733 * => we allocate new PTPs for the kernel and install them in all
5734 * the pmaps on the system.
5735 */
5736 vaddr_t
5737 pmap_growkernel(vaddr_t maxkvaddr)
5738 {
5739 struct pmap *kpm = pmap_kernel();
5740 struct pmap *cpm;
5741 #if !defined(XENPV) || !defined(__x86_64__)
5742 struct pmap *pm;
5743 long old;
5744 #endif
5745 int s, i;
5746 long needed_kptp[PTP_LEVELS], target_nptp;
5747 bool invalidate = false;
5748
5749 s = splvm(); /* to be safe */
5750 mutex_enter(&kpm->pm_lock);
5751
5752 if (maxkvaddr <= pmap_maxkvaddr) {
5753 mutex_exit(&kpm->pm_lock);
5754 splx(s);
5755 return pmap_maxkvaddr;
5756 }
5757
5758 maxkvaddr = x86_round_pdr(maxkvaddr);
5759 #if !defined(XENPV) || !defined(__x86_64__)
5760 old = nkptp[PTP_LEVELS - 1];
5761 #endif
5762
5763 /* Initialize needed_kptp. */
5764 for (i = PTP_LEVELS - 1; i >= 1; i--) {
5765 target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
5766 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
5767
5768 if (target_nptp > nkptpmax[i])
5769 panic("out of KVA space");
5770 KASSERT(target_nptp >= nkptp[i]);
5771 needed_kptp[i] = target_nptp - nkptp[i];
5772 }
5773
5774 #ifdef XENPV
5775 /* only pmap_kernel(), or the per-cpu map, has kernel entries */
5776 cpm = kpm;
5777 #else
5778 /* Get the current pmap */
5779 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5780 cpm = curcpu()->ci_pmap;
5781 } else {
5782 cpm = kpm;
5783 }
5784 #endif
5785
5786 kasan_shadow_map((void *)pmap_maxkvaddr,
5787 (size_t)(maxkvaddr - pmap_maxkvaddr));
5788 kmsan_shadow_map((void *)pmap_maxkvaddr,
5789 (size_t)(maxkvaddr - pmap_maxkvaddr));
5790
5791 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
5792
5793 /*
5794 * If the number of top level entries changed, update all pmaps.
5795 */
5796 if (needed_kptp[PTP_LEVELS - 1] != 0) {
5797 #ifdef XENPV
5798 #ifdef __x86_64__
5799 /* nothing, kernel entries are never entered in user pmap */
5800 #else
5801 int pdkidx;
5802
5803 mutex_enter(&pmaps_lock);
5804 LIST_FOREACH(pm, &pmaps, pm_list) {
5805 for (pdkidx = PDIR_SLOT_KERN + old;
5806 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
5807 pdkidx++) {
5808 pmap_pte_set(&pm->pm_pdir[pdkidx],
5809 kpm->pm_pdir[pdkidx]);
5810 }
5811 pmap_pte_flush();
5812 }
5813 mutex_exit(&pmaps_lock);
5814 #endif /* __x86_64__ */
5815 #else /* XENPV */
5816 size_t newpdes;
5817 newpdes = nkptp[PTP_LEVELS - 1] - old;
5818 if (cpm != kpm) {
5819 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
5820 &cpm->pm_pdir[PDIR_SLOT_KERN + old],
5821 newpdes * sizeof(pd_entry_t));
5822 }
5823
5824 mutex_enter(&pmaps_lock);
5825 LIST_FOREACH(pm, &pmaps, pm_list) {
5826 if (__predict_false(pm->pm_enter != NULL)) {
5827 /*
5828 * Not a native pmap, the kernel is not mapped,
5829 * so nothing to synchronize.
5830 */
5831 continue;
5832 }
5833 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
5834 &kpm->pm_pdir[PDIR_SLOT_KERN + old],
5835 newpdes * sizeof(pd_entry_t));
5836 }
5837 mutex_exit(&pmaps_lock);
5838 #endif
5839 invalidate = true;
5840 }
5841 pmap_maxkvaddr = maxkvaddr;
5842 mutex_exit(&kpm->pm_lock);
5843 splx(s);
5844
5845 if (invalidate && pmap_initialized) {
5846 /* Invalidate the pmap cache. */
5847 pool_cache_invalidate(&pmap_cache);
5848 }
5849
5850 return maxkvaddr;
5851 }
5852
5853 #ifdef DEBUG
5854 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
5855
5856 /*
5857 * pmap_dump: dump all the mappings from a pmap
5858 *
5859 * => caller should not be holding any pmap locks
5860 */
5861 void
5862 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5863 {
5864 pt_entry_t *ptes, *pte;
5865 pd_entry_t * const *pdes;
5866 struct pmap *pmap2;
5867 vaddr_t blkendva;
5868 int lvl;
5869
5870 /*
	 * If the end is out of range, truncate it.
	 * If end == start, extend to the maximum.
5873 */
5874
5875 if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
5876 eva = VM_MAXUSER_ADDRESS;
5877
5878 mutex_enter(&pmap->pm_lock);
5879 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5880
5881 /*
	 * dumping a range of pages: we dump in PTP-sized blocks
5883 */
5884
5885 for (/* null */ ; sva < eva ; sva = blkendva) {
5886
5887 /* determine range of block */
5888 blkendva = x86_round_pdr(sva+1);
5889 if (blkendva > eva)
5890 blkendva = eva;
5891
5892 /* valid block? */
5893 if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
5894 continue;
5895 KASSERT(lvl == 1);
5896
5897 pte = &ptes[pl1_i(sva)];
5898 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
5899 if (!pmap_valid_entry(*pte))
5900 continue;
5901 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
5902 " (pte=%#" PRIxPADDR ")\n",
5903 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
5904 }
5905 }
5906 pmap_unmap_ptes(pmap, pmap2);
5907 mutex_exit(&pmap->pm_lock);
5908 }
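/*
 * Minimal usage sketch for the DEBUG-only dumper above; "p" is an
 * assumed struct proc pointer.  Passing eva <= sva dumps the whole
 * user address range, per the truncation rule at the top.
 *
 *	pmap_dump(vm_map_pmap(&p->p_vmspace->vm_map), 0, 0);
 */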
5909 #endif
5910
5911 /*
5912 * pmap_update: process deferred invalidations and frees.
5913 */
5914 void
5915 pmap_update(struct pmap *pmap)
5916 {
5917 struct pmap_page *pp;
5918 struct vm_page *ptp;
5919
5920 /*
5921 * Initiate any pending TLB shootdowns. Wait for them to
5922 * complete before returning control to the caller.
5923 */
5924 kpreempt_disable();
5925 pmap_tlb_shootnow();
5926 kpreempt_enable();
5927
5928 /*
5929 * Now that shootdowns are complete, process deferred frees. This
5930 * is an unlocked check, but is safe as we're only interested in
5931 * work done in this LWP - we won't get a false negative.
5932 */
5933 if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
5934 return;
5935 }
5936
5937 mutex_enter(&pmap->pm_lock);
5938 while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
5939 KASSERT(ptp->wire_count == 0);
5940 KASSERT(ptp->uanon == NULL);
5941 LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
5942 pp = VM_PAGE_TO_PP(ptp);
5943 LIST_INIT(&pp->pp_pvlist);
5944 pp->pp_attrs = 0;
5945 pp->pp_pte.pte_ptp = NULL;
5946 pp->pp_pte.pte_va = 0;
5947 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
5948
5949 /*
5950 * XXX Hack to avoid extra locking, and lock
5951 * assertions in uvm_pagefree(). Despite uobject
5952 * being set, this isn't a managed page.
5953 */
5954 PMAP_DUMMY_LOCK(pmap);
5955 uvm_pagerealloc(ptp, NULL, 0);
5956 PMAP_DUMMY_UNLOCK(pmap);
5957 uvm_pagefree(ptp);
5958 }
5959 mutex_exit(&pmap->pm_lock);
5960 }
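/*
 * Typical caller pattern (sketch): batch the VA changes, then flush
 * once.  "pm", "sva" and "eva" stand in for the caller's own state.
 *
 *	pmap_remove(pm, sva, eva);
 *	pmap_update(pm);
 */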
5961
5962 #if PTP_LEVELS > 4
5963 #error "Unsupported number of page table levels"
5964 #endif
5965
5966 paddr_t
5967 pmap_init_tmp_pgtbl(paddr_t pg)
5968 {
5969 static bool maps_loaded;
5970 static const paddr_t x86_tmp_pml_paddr[] = {
5971 4 * PAGE_SIZE, /* L1 */
5972 5 * PAGE_SIZE, /* L2 */
5973 6 * PAGE_SIZE, /* L3 */
5974 7 * PAGE_SIZE /* L4 */
5975 };
5976 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
5977
5978 pd_entry_t *tmp_pml, *kernel_pml;
5979
5980 int level;
5981
5982 if (!maps_loaded) {
5983 for (level = 0; level < PTP_LEVELS; ++level) {
5984 x86_tmp_pml_vaddr[level] =
5985 uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
5986 UVM_KMF_VAONLY);
5987
5988 if (x86_tmp_pml_vaddr[level] == 0)
5989 panic("mapping of real mode PML failed");
5990 pmap_kenter_pa(x86_tmp_pml_vaddr[level],
5991 x86_tmp_pml_paddr[level],
5992 VM_PROT_READ | VM_PROT_WRITE, 0);
5993 }
5994 pmap_update(pmap_kernel());
5995 maps_loaded = true;
5996 }
5997
5998 /* Zero levels 1-3 */
5999 for (level = 0; level < PTP_LEVELS - 1; ++level) {
6000 tmp_pml = (void *)x86_tmp_pml_vaddr[level];
6001 memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
6002 }
6003
6004 /* Copy PML4 */
6005 kernel_pml = pmap_kernel()->pm_pdir;
6006 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
6007 memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
6008
6009 #ifdef PAE
6010 /*
6011 * Use the last 4 entries of the L2 page as L3 PD entries. These
6012 * last entries are unlikely to be used for temporary mappings.
6013 * 508: maps 0->1GB (userland)
6014 * 509: unused
6015 * 510: unused
6016 * 511: maps 3->4GB (kernel)
6017 */
6018 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
6019 tmp_pml[509] = 0;
6020 tmp_pml[510] = 0;
6021 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
6022 #endif
6023
6024 for (level = PTP_LEVELS - 1; level > 0; --level) {
6025 tmp_pml = (void *)x86_tmp_pml_vaddr[level];
6026
6027 tmp_pml[pl_i(pg, level + 1)] =
6028 (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
6029 }
6030
6031 tmp_pml = (void *)x86_tmp_pml_vaddr[0];
6032 tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
6033
6034 #ifdef PAE
6035 /* Return the PA of the L3 page (entry 508 of the L2 page) */
6036 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
6037 #endif
6038
6039 return x86_tmp_pml_paddr[PTP_LEVELS - 1];
6040 }
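/*
 * Usage sketch (hedged): a caller preparing a low/real-mode trampoline
 * identity-maps its page and loads the returned physical address as
 * the temporary %cr3 (or PDPT pointer on PAE).  "trampoline_pa" and
 * "tmp_cr3" are hypothetical names.
 *
 *	paddr_t tmp_cr3 = pmap_init_tmp_pgtbl(trampoline_pa);
 *	(stash tmp_cr3 where the trampoline code can load it)
 */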
6041
6042 u_int
6043 x86_mmap_flags(paddr_t mdpgno)
6044 {
6045 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
6046 u_int pflag = 0;
6047
6048 if (nflag & X86_MMAP_FLAG_PREFETCH)
6049 pflag |= PMAP_WRITE_COMBINE;
6050
6051 return pflag;
6052 }
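/*
 * Encoding sketch for the decoder above (hedged): a driver's d_mmap
 * routine can request write-combining by tagging the returned page
 * number, assuming the page number stays clear of the flag bits.
 * "pa" is a hypothetical physical address.
 *
 *	return x86_btop(pa) |
 *	    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
 */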
6053
6054 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
6055
6056 /*
6057 * -----------------------------------------------------------------------------
6058 * *****************************************************************************
6059 * *****************************************************************************
6060 * *****************************************************************************
6061 * *****************************************************************************
6062 * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
6063 * *****************************************************************************
6064 * *****************************************************************************
6065 * *****************************************************************************
6066 * *****************************************************************************
6067 * -----------------------------------------------------------------------------
6068 *
6069 * These functions are invoked as callbacks from the code above. Unlike the
6070 * native pmap, EPT has no recursive slot, so pmap_map_ptes() cannot be used.
6071 * Instead, we use the direct map and walk down the tree manually.
6073 *
6074 * Apart from that, the logic is mostly the same as native. Once a pmap has
6075 * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
6076 * After that we're good, and the callbacks will handle the translations
6077 * for us.
6078 *
6079 * -----------------------------------------------------------------------------
6080 */
6081
6082 /* Hardware bits. */
6083 #define EPT_R __BIT(0) /* read */
6084 #define EPT_W __BIT(1) /* write */
6085 #define EPT_X __BIT(2) /* execute */
6086 #define EPT_T __BITS(5,3) /* type */
6087 #define TYPE_UC 0
6088 #define TYPE_WC 1
6089 #define TYPE_WT 4
6090 #define TYPE_WP 5
6091 #define TYPE_WB 6
6092 #define EPT_NOPAT __BIT(6)
6093 #define EPT_L __BIT(7) /* large */
6094 #define EPT_A __BIT(8) /* accessed */
6095 #define EPT_D __BIT(9) /* dirty */
6096 /* Software bits. */
6097 #define EPT_PVLIST __BIT(60)
6098 #define EPT_WIRED __BIT(61)
6099
6100 #define pmap_ept_valid_entry(pte) ((pte) & EPT_R)
6101
6102 bool pmap_ept_has_ad __read_mostly;
6103
6104 static inline void
6105 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
6106 {
6107 int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
6108 int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
6109
6110 KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6111 KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6112
6113 pmap_stats_update(pmap, resid_diff, wired_diff);
6114 }
6115
6116 static pt_entry_t
6117 pmap_ept_type(u_int flags)
6118 {
6119 u_int cacheflags = (flags & PMAP_CACHE_MASK);
6120 pt_entry_t ret;
6121
6122 switch (cacheflags) {
6123 case PMAP_NOCACHE:
6124 case PMAP_NOCACHE_OVR:
6125 ret = __SHIFTIN(TYPE_UC, EPT_T);
6126 break;
6127 case PMAP_WRITE_COMBINE:
6128 ret = __SHIFTIN(TYPE_WC, EPT_T);
6129 break;
6130 case PMAP_WRITE_BACK:
6131 default:
6132 ret = __SHIFTIN(TYPE_WB, EPT_T);
6133 break;
6134 }
6135
6136 ret |= EPT_NOPAT;
6137 return ret;
6138 }
6139
6140 static inline pt_entry_t
6141 pmap_ept_prot(vm_prot_t prot)
6142 {
6143 pt_entry_t res = 0;
6144
6145 if (prot & VM_PROT_READ)
6146 res |= EPT_R;
6147 if (prot & VM_PROT_WRITE)
6148 res |= EPT_W;
6149 if (prot & VM_PROT_EXECUTE)
6150 res |= EPT_X;
6151
6152 return res;
6153 }
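/*
 * Composition sketch: a leaf EPT PTE is assembled the same way
 * pmap_ept_enter() does below, e.g. for a writable, write-back
 * mapping of a placeholder address "pa":
 *
 *	pt_entry_t pte = pa |
 *	    pmap_ept_prot(VM_PROT_READ | VM_PROT_WRITE) |
 *	    pmap_ept_type(PMAP_WRITE_BACK);
 */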
6154
6155 static inline uint8_t
6156 pmap_ept_to_pp_attrs(pt_entry_t ept)
6157 {
6158 uint8_t ret = 0;
6159 if (pmap_ept_has_ad) {
6160 if (ept & EPT_D)
6161 ret |= PP_ATTRS_D;
6162 if (ept & EPT_A)
6163 ret |= PP_ATTRS_A;
6164 } else {
6165 ret |= (PP_ATTRS_D|PP_ATTRS_A);
6166 }
6167 if (ept & EPT_W)
6168 ret |= PP_ATTRS_W;
6169 return ret;
6170 }
6171
6172 static inline pt_entry_t
6173 pmap_pp_attrs_to_ept(uint8_t attrs)
6174 {
6175 pt_entry_t ept = 0;
6176 if (attrs & PP_ATTRS_D)
6177 ept |= EPT_D;
6178 if (attrs & PP_ATTRS_A)
6179 ept |= EPT_A;
6180 if (attrs & PP_ATTRS_W)
6181 ept |= EPT_W;
6182 return ept;
6183 }
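/*
 * Round-trip sketch: when the CPU has EPT A/D support, converting an
 * entry's attribute bits to pp attrs and back preserves exactly D, A
 * and W; without A/D support the page is conservatively reported as
 * both referenced and modified.
 *
 *	KASSERT(pmap_pp_attrs_to_ept(pmap_ept_to_pp_attrs(pte)) ==
 *	    (pte & (EPT_D | EPT_A | EPT_W)));
 */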
6184
6185 /*
6186 * Helper for pmap_ept_free_ptp.
6187 * tree[0] = &L2[L2idx]
6188 * tree[1] = &L3[L3idx]
6189 * tree[2] = &L4[L4idx]
6190 */
6191 static void
6192 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
6193 {
6194 pt_entry_t *pteva;
6195 paddr_t ptepa;
6196 int i, index;
6197
6198 ptepa = pmap->pm_pdirpa[0];
6199 for (i = PTP_LEVELS; i > 1; i--) {
6200 index = pl_pi(va, i);
6201 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6202 KASSERT(pmap_ept_valid_entry(pteva[index]));
6203 tree[i - 2] = &pteva[index];
6204 ptepa = pmap_pte2pa(pteva[index]);
6205 }
6206 }
6207
6208 static void
6209 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
6210 {
6211 pd_entry_t *tree[3];
6212 int level;
6213
6214 KASSERT(pmap != pmap_kernel());
6215 KASSERT(mutex_owned(&pmap->pm_lock));
6216 KASSERT(kpreempt_disabled());
6217
6218 pmap_ept_get_tree(pmap, va, tree);
6219
6220 level = 1;
6221 do {
6222 (void)pmap_pte_testset(tree[level - 1], 0);
6223
6224 pmap_freepage(pmap, ptp, level);
6225 if (level < PTP_LEVELS - 1) {
6226 ptp = pmap_find_ptp(pmap, va, level + 1);
6227 ptp->wire_count--;
6228 if (ptp->wire_count > 1)
6229 break;
6230 }
6231 } while (++level < PTP_LEVELS);
6232 pmap_pte_flush();
6233 }
6234
6235 /* Install the PTPs (L4->L3->L2) that pmap_get_ptp() looked up or allocated. */
6236 static void
6237 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
6238 {
6239 struct vm_page *ptp;
6240 unsigned long index;
6241 pd_entry_t *pteva;
6242 paddr_t ptepa;
6243 int i;
6244
6245 KASSERT(pmap != pmap_kernel());
6246 KASSERT(mutex_owned(&pmap->pm_lock));
6247 KASSERT(kpreempt_disabled());
6248
6249 /*
6250 * Now that we have all the pages looked up or allocated,
6251 * loop through again installing any new ones into the tree.
6252 */
6253 ptepa = pmap->pm_pdirpa[0];
6254 for (i = PTP_LEVELS; i > 1; i--) {
6255 index = pl_pi(va, i);
6256 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6257
6258 if (pmap_ept_valid_entry(pteva[index])) {
6259 KASSERT(!pt->alloced[i]);
6260 ptepa = pmap_pte2pa(pteva[index]);
6261 continue;
6262 }
6263
6264 ptp = pt->pg[i];
6265 ptp->flags &= ~PG_BUSY; /* never busy */
6266 ptp->wire_count = 1;
6267 pmap->pm_ptphint[i - 2] = ptp;
6268 ptepa = VM_PAGE_TO_PHYS(ptp);
6269 pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
6270
6271 pmap_pte_flush();
6272 pmap_stats_update(pmap, 1, 0);
6273
6274 /*
6275 * If we're not in the top level, increase the
6276 * wire count of the parent page.
6277 */
6278 if (i < PTP_LEVELS) {
6279 pt->pg[i + 1]->wire_count++;
6280 }
6281 }
6282 }
6283
6284 static int
6285 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
6286 u_int flags)
6287 {
6288 pt_entry_t *ptes, opte, npte;
6289 pt_entry_t *ptep;
6290 struct vm_page *ptp;
6291 struct vm_page *new_pg, *old_pg;
6292 struct pmap_page *new_pp, *old_pp;
6293 struct pv_entry *old_pve, *new_pve;
6294 bool wired = (flags & PMAP_WIRED) != 0;
6295 bool accessed;
6296 struct pmap_ptparray pt;
6297 int error;
6298 bool getptp, samepage, new_embedded;
6299 rb_tree_t *tree;
6300
6301 KASSERT(pmap_initialized);
6302 KASSERT(va < VM_MAXUSER_ADDRESS);
6303
6304 npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
6305
6306 if (wired)
6307 npte |= EPT_WIRED;
6308 if (flags & VM_PROT_ALL) {
6309 npte |= EPT_A;
6310 if (flags & VM_PROT_WRITE) {
6311 KASSERT((npte & EPT_W) != 0);
6312 npte |= EPT_D;
6313 }
6314 }
6315
6316 new_pg = PHYS_TO_VM_PAGE(pa);
6317 if (new_pg != NULL) {
6318 /* This is a managed page */
6319 npte |= EPT_PVLIST;
6320 new_pp = VM_PAGE_TO_PP(new_pg);
6321 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
6322 /* This is an unmanaged pv-tracked page */
6323 npte |= EPT_PVLIST;
6324 } else {
6325 new_pp = NULL;
6326 }
6327
6328 /* Begin by locking the pmap. */
6329 mutex_enter(&pmap->pm_lock);
6330
6331 /* Look up the PTP. Allocate if none present. */
6332 ptp = NULL;
6333 getptp = false;
6334 if (pmap != pmap_kernel()) {
6335 ptp = pmap_find_ptp(pmap, va, 1);
6336 if (ptp == NULL) {
6337 getptp = true;
6338 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
6339 if (error != 0) {
6340 if (flags & PMAP_CANFAIL) {
6341 mutex_exit(&pmap->pm_lock);
6342 return error;
6343 }
6344 panic("%s: get ptp failed, error=%d", __func__,
6345 error);
6346 }
6347 }
6348 tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
6349 } else {
6350 /* Embedded PV entries rely on this. */
6351 KASSERT(va != 0);
6352 tree = &pmap_kernel_rb;
6353 }
6354
6355 /*
6356 * Look up the old PV entry at this VA (if any), and insert a new PV
6357 * entry if required for the new mapping. Temporarily track the old
6358 * and new mappings concurrently. Only after the old mapping is
6359 * evicted from the pmap will we remove its PV entry. Otherwise,
6360 * our picture of modified/accessed state for either page could get
6361 * out of sync (we need any P->V operation for either page to stall
6362 * on pmap->pm_lock until done here).
6363 */
6364 new_pve = NULL;
6365 old_pve = NULL;
6366 samepage = false;
6367 new_embedded = false;
6368
6369 if (new_pp != NULL) {
6370 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
6371 &old_pve, &samepage, &new_embedded, tree);
6372
6373 /*
6374 * If a new pv_entry was needed and none was available, we
6375 * can go no further.
6376 */
6377 if (error != 0) {
6378 if (flags & PMAP_CANFAIL) {
6379 if (getptp) {
6380 pmap_unget_ptp(pmap, &pt);
6381 }
6382 mutex_exit(&pmap->pm_lock);
6383 return error;
6384 }
6385 panic("%s: alloc pve failed", __func__);
6386 }
6387 } else {
6388 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
6389 }
6390
6391 /* Map PTEs into address space. */
6392 kpreempt_disable();
6393
6394 /* Install any newly allocated PTPs. */
6395 if (getptp) {
6396 pmap_ept_install_ptp(pmap, &pt, va);
6397 }
6398
6399 /* Check if there is an existing mapping. */
6400 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
6401 ptep = &ptes[pl1_pi(va)];
6402 opte = *ptep;
6403 bool have_oldpa = pmap_ept_valid_entry(opte);
6404 paddr_t oldpa = pmap_pte2pa(opte);
6405
6406 /*
6407 * Update the pte.
6408 */
6409 do {
6410 opte = *ptep;
6411
6412 /*
6413 * if the same page, inherit EPT_A and EPT_D.
6414 */
6415 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6416 npte |= opte & (EPT_A | EPT_D);
6417 }
6418 } while (pmap_pte_cas(ptep, opte, npte) != opte);
6419
6420 /*
6421 * Done with the PTEs: they can now be unmapped.
6422 */
6423 kpreempt_enable();
6424
6425 /*
6426 * Update statistics and PTP's reference count.
6427 */
6428 pmap_ept_stats_update_bypte(pmap, npte, opte);
6429 if (ptp != NULL) {
6430 if (!have_oldpa) {
6431 ptp->wire_count++;
6432 }
6433 /* Remember minimum VA in PTP. */
6434 pmap_ptp_range_set(ptp, va);
6435 }
6436 KASSERT(ptp == NULL || ptp->wire_count > 1);
6437
6438 /*
6439 * If the same page, we can skip pv_entry handling.
6440 */
6441 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6442 KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
6443 if ((npte & EPT_PVLIST) != 0) {
6444 KASSERT(samepage);
6445 pmap_check_pv(pmap, ptp, new_pp, va, true);
6446 }
6447 goto same_pa;
6448 } else if ((npte & EPT_PVLIST) != 0) {
6449 KASSERT(!samepage);
6450 }
6451
6452 /*
6453 * If old page is pv-tracked, remove pv_entry from its list.
6454 */
6455 if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
6456 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
6457 old_pp = VM_PAGE_TO_PP(old_pg);
6458 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
6459 panic("%s: EPT_PVLIST with pv-untracked page"
6460 " va = %#"PRIxVADDR
6461 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
6462 __func__, va, oldpa, atop(oldpa));
6463 }
6464
6465 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
6466 pmap_ept_to_pp_attrs(opte));
6467 } else {
6468 KASSERT(old_pve == NULL);
6469 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6470 }
6471
6472 /*
6473 * If new page is dynamically PV tracked, insert to tree.
6474 */
6475 if (new_pve != NULL) {
6476 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6477 old_pve = rb_tree_insert_node(tree, new_pve);
6478 KASSERT(old_pve == new_pve);
6479 pmap_check_pv(pmap, ptp, new_pp, va, true);
6480 }
6481
6482 same_pa:
6483 /*
6484 * shootdown tlb if necessary.
6485 */
6486
6487 if (pmap_ept_has_ad) {
6488 accessed = (~opte & (EPT_R | EPT_A)) == 0;
6489 } else {
6490 accessed = (opte & EPT_R) != 0;
6491 }
6492 if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
6493 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
6494 }
6495 pmap_drain_pv(pmap);
6496 mutex_exit(&pmap->pm_lock);
6497 return 0;
6498 }
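/*
 * Dispatch sketch (hedged): pmap_ept_enter() is never called directly;
 * it is reached through the pm_enter hook installed by
 * pmap_ept_transform(), roughly:
 *
 *	if (__predict_false(pmap->pm_enter != NULL))
 *		return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
 */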
6499
6500 /* Walks down to L2; returns 0 (with the L2 PDE in *lastpde) if valid, else the first invalid level. */
6501 static int
6502 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
6503 {
6504 pt_entry_t *pteva;
6505 paddr_t ptepa;
6506 int i, index;
6507
6508 KASSERT(mutex_owned(&pmap->pm_lock));
6509
6510 ptepa = pmap->pm_pdirpa[0];
6511 for (i = PTP_LEVELS; i > 1; i--) {
6512 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6513 index = pl_pi(va, i);
6514 if (!pmap_ept_valid_entry(pteva[index]))
6515 return i;
6516 ptepa = pmap_pte2pa(pteva[index]);
6517 }
6518 if (lastpde != NULL) {
6519 *lastpde = pteva[index];
6520 }
6521
6522 return 0;
6523 }
6524
6525 static bool
6526 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
6527 {
6528 pt_entry_t *ptes, pte;
6529 pd_entry_t pde;
6530 paddr_t ptppa, pa;
6531 bool rv;
6532
6533 #ifdef __HAVE_DIRECT_MAP
6534 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
6535 if (pap != NULL) {
6536 *pap = PMAP_DIRECT_UNMAP(va);
6537 }
6538 return true;
6539 }
6540 #endif
6541
6542 rv = false;
6543 pa = 0;
6544
6545 mutex_enter(&pmap->pm_lock);
6546 kpreempt_disable();
6547
6548 if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
6549 ptppa = pmap_pte2pa(pde);
6550 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6551 pte = ptes[pl1_pi(va)];
6552 if (__predict_true((pte & EPT_R) != 0)) {
6553 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
6554 rv = true;
6555 }
6556 }
6557
6558 kpreempt_enable();
6559 mutex_exit(&pmap->pm_lock);
6560
6561 if (pap != NULL) {
6562 *pap = pa;
6563 }
6564 return rv;
6565 }
6566
6567 static bool
6568 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
6569 vaddr_t va)
6570 {
6571 struct pv_entry *pve;
6572 struct vm_page *pg;
6573 struct pmap_page *pp;
6574 pt_entry_t opte;
6575 bool accessed;
6576
6577 KASSERT(pmap != pmap_kernel());
6578 KASSERT(mutex_owned(&pmap->pm_lock));
6579 KASSERT(kpreempt_disabled());
6580
6581 if (!pmap_ept_valid_entry(*pte)) {
6582 /* VA not mapped. */
6583 return false;
6584 }
6585
6586 /* Atomically save the old PTE and zap it. */
6587 opte = pmap_pte_testset(pte, 0);
6588 if (!pmap_ept_valid_entry(opte)) {
6589 return false;
6590 }
6591
6592 pmap_ept_stats_update_bypte(pmap, 0, opte);
6593
6594 if (ptp) {
6595 /*
6596 * Dropping a PTE. If this empties the PTP, force a shootdown so the PDE is flushed too.
6597 */
6598 ptp->wire_count--;
6599 if (ptp->wire_count <= 1) {
6600 opte |= EPT_A;
6601 }
6602 }
6603
6604 if (pmap_ept_has_ad) {
6605 accessed = (opte & EPT_A) != 0;
6606 } else {
6607 accessed = true;
6608 }
6609 if (accessed) {
6610 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
6611 }
6612
6613 /*
6614 * If we are not on a pv list, we are done.
6615 */
6616 if ((opte & EPT_PVLIST) == 0) {
6617 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
6618 "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
6619 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
6620 "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
6621 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
6622 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
6623 return true;
6624 }
6625
6626 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
6627 pp = VM_PAGE_TO_PP(pg);
6628 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
6629 paddr_t pa = pmap_pte2pa(opte);
6630 panic("%s: EPT_PVLIST with pv-untracked page"
6631 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
6632 __func__, va, pa, atop(pa));
6633 }
6634
6635 /* Sync R/M bits. */
6636 pve = pmap_lookup_pv(pmap, ptp, pp, va);
6637 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
6638 return true;
6639 }
6640
6641 static void
6642 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
6643 vaddr_t startva, vaddr_t endva)
6644 {
6645 pt_entry_t *pte = (pt_entry_t *)ptpva;
6646
6647 KASSERT(pmap != pmap_kernel());
6648 KASSERT(mutex_owned(&pmap->pm_lock));
6649 KASSERT(kpreempt_disabled());
6650
6651 /*
6652 * mappings are very often sparse, so clip the given range to the
6653 * range of PTEs that are known present in the PTP.
6654 */
6655 pmap_ptp_range_clip(ptp, &startva, &pte);
6656
6657 /*
6658 * note that ptpva points to the PTE that maps startva. this may
6659 * or may not be the first PTE in the PTP.
6660 *
6661 * we loop through the PTP while there are still PTEs to look at
6662 * and the wire_count is greater than 1 (because we use the wire_count
6663 * to keep track of the number of real PTEs in the PTP).
6664 */
6665 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
6666 (void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
6667 startva += PAGE_SIZE;
6668 pte++;
6669 }
6670 }
6671
6672 static void
6673 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
6674 {
6675 pt_entry_t *ptes;
6676 pd_entry_t pde;
6677 paddr_t ptppa;
6678 vaddr_t blkendva, va = sva;
6679 struct vm_page *ptp;
6680
6681 mutex_enter(&pmap->pm_lock);
6682 kpreempt_disable();
6683
6684 for (/* null */ ; va < eva ; va = blkendva) {
6685 int lvl;
6686
6687 /* determine range of block */
6688 blkendva = x86_round_pdr(va+1);
6689 if (blkendva > eva)
6690 blkendva = eva;
6691
6692 lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
6693 if (lvl != 0) {
6694 /* Skip a range corresponding to an invalid pde. */
6695 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
6696 continue;
6697 }
6698
6699 /* PA of the PTP */
6700 ptppa = pmap_pte2pa(pde);
6701
6702 ptp = pmap_find_ptp(pmap, va, 1);
6703 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
6704 __func__);
6705
6706 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6707
6708 pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
6709 blkendva);
6710
6711 /* If PTP is no longer being used, free it. */
6712 if (ptp && ptp->wire_count <= 1) {
6713 pmap_ept_free_ptp(pmap, ptp, va);
6714 }
6715 }
6716
6717 kpreempt_enable();
6718 pmap_drain_pv(pmap);
6719 mutex_exit(&pmap->pm_lock);
6720 }
6721
6722 static int
6723 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
6724 uint8_t *oattrs, pt_entry_t *optep)
6725 {
6726 struct pmap *pmap;
6727 pt_entry_t *ptep;
6728 pt_entry_t opte;
6729 pt_entry_t npte;
6730 pt_entry_t expect;
6731 bool need_shootdown;
6732
6733 expect = pmap_pa2pte(pa) | EPT_R;
6734 pmap = ptp_to_pmap(ptp);
6735
6736 if (clearbits != ~0) {
6737 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
6738 clearbits = pmap_pp_attrs_to_ept(clearbits);
6739 }
6740
6741 ptep = pmap_map_pte(pmap, ptp, va);
6742 do {
6743 opte = *ptep;
6744 KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
6745 KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
6746 KASSERT(opte == 0 || (opte & EPT_R) != 0);
6747 if ((opte & (PTE_FRAME | EPT_R)) != expect) {
6748 /*
6749 * We lost a race with a V->P operation like
6750 * pmap_remove(). Wait for the competitor to
6751 * finish reflecting the PTE bits into mp_attrs.
6752 */
6753 pmap_unmap_pte();
6754 return EAGAIN;
6755 }
6756
6757 /*
6758 * Check if there's anything to do on this PTE.
6759 */
6760 if ((opte & clearbits) == 0) {
6761 need_shootdown = false;
6762 break;
6763 }
6764
6765 /*
6766 * We need a shootdown if the PTE is cached (EPT_A),
6767 * unless we are only clearing the EPT_W bit and it
6768 * isn't cached as RW (EPT_D).
6769 */
6770 if (pmap_ept_has_ad) {
6771 need_shootdown = (opte & EPT_A) != 0 &&
6772 !(clearbits == EPT_W && (opte & EPT_D) == 0);
6773 } else {
6774 need_shootdown = true;
6775 }
6776
6777 npte = opte & ~clearbits;
6778
6779 /*
6780 * If we need a shootdown anyway, clear EPT_A and EPT_D.
6781 */
6782 if (need_shootdown) {
6783 npte &= ~(EPT_A | EPT_D);
6784 }
6785 KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
6786 KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
6787 KASSERT(npte == 0 || (opte & EPT_R) != 0);
6788 } while (pmap_pte_cas(ptep, opte, npte) != opte);
6789
6790 if (need_shootdown) {
6791 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
6792 }
6793 pmap_unmap_pte();
6794
6795 *oattrs = pmap_ept_to_pp_attrs(opte);
6796 if (optep != NULL)
6797 *optep = opte;
6798 return 0;
6799 }
6800
6801 static void
6802 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
6803 vaddr_t va)
6804 {
6805
6806 KASSERT(mutex_owned(&pmap->pm_lock));
6807
6808 pmap_ept_stats_update_bypte(pmap, 0, opte);
6809 ptp->wire_count--;
6810 if (ptp->wire_count <= 1) {
6811 pmap_ept_free_ptp(pmap, ptp, va);
6812 }
6813 }
6814
6815 static void
6816 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
6817 {
6818 pt_entry_t bit_rem;
6819 pt_entry_t *ptes, *spte;
6820 pt_entry_t opte, npte;
6821 pd_entry_t pde;
6822 paddr_t ptppa;
6823 vaddr_t va;
6824 bool modified;
6825
6826 bit_rem = 0;
6827 if (!(prot & VM_PROT_WRITE))
6828 bit_rem = EPT_W;
6829
6830 sva &= PTE_FRAME;
6831 eva &= PTE_FRAME;
6832
6833 /* Acquire pmap. */
6834 mutex_enter(&pmap->pm_lock);
6835 kpreempt_disable();
6836
6837 for (va = sva; va < eva; va += PAGE_SIZE) {
6838 if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6839 continue;
6840 }
6841
6842 ptppa = pmap_pte2pa(pde);
6843 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6844 spte = &ptes[pl1_pi(va)];
6845
6846 do {
6847 opte = *spte;
6848 if (!pmap_ept_valid_entry(opte)) {
6849 goto next;
6850 }
6851 npte = (opte & ~bit_rem);
6852 } while (pmap_pte_cas(spte, opte, npte) != opte);
6853
6854 if (pmap_ept_has_ad) {
6855 modified = (opte & EPT_D) != 0;
6856 } else {
6857 modified = true;
6858 }
6859 if (modified) {
6860 vaddr_t tva = x86_ptob(spte - ptes);
6861 pmap_tlb_shootdown(pmap, tva, 0,
6862 TLBSHOOT_WRITE_PROTECT);
6863 }
6864 next:;
6865 }
6866
6867 kpreempt_enable();
6868 mutex_exit(&pmap->pm_lock);
6869 }
6870
6871 static void
6872 pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
6873 {
6874 pt_entry_t *ptes, *ptep, opte;
6875 pd_entry_t pde;
6876 paddr_t ptppa;
6877
6878 /* Acquire pmap. */
6879 mutex_enter(&pmap->pm_lock);
6880 kpreempt_disable();
6881
6882 if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6883 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
6884 }
6885
6886 ptppa = pmap_pte2pa(pde);
6887 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6888 ptep = &ptes[pl1_pi(va)];
6889 opte = *ptep;
6890 KASSERT(pmap_ept_valid_entry(opte));
6891
6892 if (opte & EPT_WIRED) {
6893 pt_entry_t npte = opte & ~EPT_WIRED;
6894
6895 opte = pmap_pte_testset(ptep, npte);
6896 pmap_ept_stats_update_bypte(pmap, npte, opte);
6897 } else {
6898 printf("%s: wiring for pmap %p va %#" PRIxVADDR
6899 "did not change!\n", __func__, pmap, va);
6900 }
6901
6902 /* Release pmap. */
6903 kpreempt_enable();
6904 mutex_exit(&pmap->pm_lock);
6905 }
6906
6907 /* -------------------------------------------------------------------------- */
6908
6909 void
6910 pmap_ept_transform(struct pmap *pmap)
6911 {
6912 pmap->pm_enter = pmap_ept_enter;
6913 pmap->pm_extract = pmap_ept_extract;
6914 pmap->pm_remove = pmap_ept_remove;
6915 pmap->pm_sync_pv = pmap_ept_sync_pv;
6916 pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6917 pmap->pm_write_protect = pmap_ept_write_protect;
6918 pmap->pm_unwire = pmap_ept_unwire;
6919
6920 memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
6921 }
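/*
 * Consumer sketch (hedged, not a copy of the NVMM code): the guest
 * physical address space is backed by an ordinary pmap, which is then
 * switched over to the EPT callbacks.  "guest_vmspace" is an assumed,
 * already-created vmspace.
 *
 *	struct pmap *pm = vm_map_pmap(&guest_vmspace->vm_map);
 *	pmap_ept_transform(pm);
 */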
6922
6923 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
6924