1 /* $OpenBSD: pmap.c,v 1.178 2024/11/02 07:58:58 mpi Exp $ */
2 /* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */
3
4 /*
5 * Copyright (c) 1997 Charles D. Cranor and Washington University.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * Copyright 2001 (c) Wasabi Systems, Inc.
31 * All rights reserved.
32 *
33 * Written by Frank van der Linden for Wasabi Systems, Inc.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 * must display the following acknowledgement:
45 * This product includes software developed for the NetBSD Project by
46 * Wasabi Systems, Inc.
47 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
48 * or promote products derived from this software without specific prior
49 * written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
53 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
54 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
55 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
56 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
57 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
58 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
59 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
60 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
61 * POSSIBILITY OF SUCH DAMAGE.
62 */
63
64 /*
65 * This is the i386 pmap modified and generalized to support x86-64
66 * as well. The idea is to hide the upper N levels of the page tables
67 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
68 * is mostly untouched, except that it uses some more generalized
69 * macros and interfaces.
70 *
71 * This pmap has been tested on the i386 as well, and it can be easily
72 * adapted to PAE.
73 *
74 * fvdl@wasabisystems.com 18-Jun-2001
75 */
76
77 /*
78 * pmap.c: i386 pmap module rewrite
79 * Chuck Cranor <chuck@ccrc.wustl.edu>
80 * 11-Aug-97
81 *
82 * history of this pmap module: in addition to my own input, i used
83 * the following references for this rewrite of the i386 pmap:
84 *
85 * [1] the NetBSD i386 pmap. this pmap appears to be based on the
86 * BSD hp300 pmap done by Mike Hibler at University of Utah.
87 * it was then ported to the i386 by William Jolitz of UUNET
88 * Technologies, Inc. Then Charles M. Hannum of the NetBSD
89 * project fixed some bugs and provided some speed ups.
90 *
91 * [2] the FreeBSD i386 pmap. this pmap seems to be the
92 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
93 * and David Greenman.
94 *
95 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated
96 * between several processors. the VAX version was done by
97 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386
98 * version was done by Lance Berc, Mike Kupfer, Bob Baron,
99 * David Golub, and Richard Draves. the alpha version was
100 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou
101 * (NetBSD/alpha).
102 */
103
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/atomic.h>
107 #include <sys/proc.h>
108 #include <sys/pool.h>
109 #include <sys/user.h>
110 #include <sys/mutex.h>
111
112 #include <uvm/uvm.h>
113
114 #include <machine/cpu.h>
115 #ifdef MULTIPROCESSOR
116 #include <machine/i82489reg.h>
117 #include <machine/i82489var.h>
118 #endif
119
120 #include "vmm.h"
121
122 #if NVMM > 0
123 #include <machine/vmmvar.h>
124 #endif /* NVMM > 0 */
125
126 #include "acpi.h"
127
128 /* #define PMAP_DEBUG */
129
130 #ifdef PMAP_DEBUG
131 #define DPRINTF(x...) do { printf(x); } while(0)
132 #else
133 #define DPRINTF(x...)
134 #endif /* PMAP_DEBUG */
135
136
137 /*
138 * general info:
139 *
140 * - for an explanation of how the i386 MMU hardware works see
141 * the comments in <machine/pte.h>.
142 *
143 * - for an explanation of the general memory structure used by
144 * this pmap (including the recursive mapping), see the comments
145 * in <machine/pmap.h>.
146 *
147 * this file contains the code for the "pmap module." the module's
148 * job is to manage the hardware's virtual to physical address mappings.
149 * note that there are two levels of mapping in the VM system:
150 *
151 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
152 * to map ranges of virtual address space to objects/files. for
153 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
154 * to the file /bin/ls starting at offset zero." note that
155 * the upper layer mapping is not concerned with how individual
156 * vm_pages are mapped.
157 *
158 * [2] the lower layer of the VM system (the pmap) maintains the mappings
159 * from virtual addresses. it is concerned with which vm_page is
160 * mapped where. for example, when you run /bin/ls and start
161 * at page 0x1000 the fault routine may lookup the correct page
162 * of the /bin/ls file and then ask the pmap layer to establish
163 * a mapping for it.
164 *
165 * note that information in the lower layer of the VM system can be
166 * thrown away since it can easily be reconstructed from the info
167 * in the upper layer.
168 *
169 * data structures we use include:
170 * - struct pmap: describes the address space of one process
171 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
172 * - struct pg_to_free: a list of virtual addresses whose mappings
173 * have been changed. used for TLB flushing.
174 */
175
176 /*
177 * memory allocation
178 *
179 * - there are three data structures that we must dynamically allocate:
180 *
181 * [A] new process' page directory page (PDP)
182 * - plan 1: done at pmap_create() we use
183 * pool_get(&pmap_pmap_pool, PR_WAITOK) to do this allocation.
184 *
185 * if we are low in free physical memory then we sleep in
186 * pool_get() -- in this case this is ok since we are creating
187 * a new pmap and should not be holding any locks.
188 *
189 * XXX: the fork code currently has no way to return an "out of
190 * memory, try again" error code since uvm_fork [fka vm_fork]
191 * is a void function.
192 *
193 * [B] new page tables pages (PTP)
194 * call uvm_pagealloc()
195 * => success: zero page, add to pm_pdir
196 * => failure: we are out of free vm_pages, let pmap_enter()
197 * tell UVM about it.
198 *
199 * note: for kernel PTPs, we start with NKPTP of them. as we map
200 * kernel memory (at uvm_map time) we check to see if we've grown
201 * the kernel pmap. if so, we call the optional function
202 * pmap_growkernel() to grow the kernel PTPs in advance.
203 *
204 * [C] pv_entry structures
205 * - try to allocate one from the pool.
206 * If we fail, we simply let pmap_enter() tell UVM about it.
207 */
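
/*
 * Illustrative sketch (comment only, not compiled): the three
 * allocation paths described above, roughly as they appear later in
 * this file.  The PR_NOWAIT flag shown for [C] is indicative only;
 * see the pv_entry allocation sites for the exact call.
 *
 *	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);	   [A] may sleep
 *	ptp = uvm_pagealloc(obj, ptp_va2o(va, level), NULL,
 *	    UVM_PGA_USERESERVE | UVM_PGA_ZERO);		   [B] may fail
 *	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);	   [C] may fail
 */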
208
209 long nkptp[] = NKPTP_INITIALIZER;
210
211 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
212 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
213 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
214 const long nbpd[] = NBPD_INITIALIZER;
215 pd_entry_t *const normal_pdes[] = PDES_INITIALIZER;
216
217 #define pmap_pte_set(p, n) atomic_swap_64(p, n)
218 #define pmap_pte_clearbits(p, b) x86_atomic_clearbits_u64(p, b)
219 #define pmap_pte_setbits(p, b) x86_atomic_setbits_u64(p, b)
220
221 /*
222 * global data structures
223 */
224
225 struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */
226
227 /*
228 * pg_nx: NX PTE bit (if CPU supports)
229 * pg_g_kern: PG_G if global pages should be used in kernel mappings,
230 * 0 otherwise (for insecure CPUs)
231 */
232 pt_entry_t pg_nx = 0;
233 pt_entry_t pg_g_kern = 0;
234
235 /* pg_xo: XO PTE bits, set to PKU key1 (if cpu supports PKU) */
236 pt_entry_t pg_xo;
237
238 /* pg_crypt, pg_frame, pg_lgframe: will be derived from CPUID */
239 pt_entry_t pg_crypt = 0;
240 pt_entry_t pg_frame = PG_FRAME;
241 pt_entry_t pg_lgframe = PG_LGFRAME;
242
243 /*
244 * pmap_pg_wc: if our processor supports PAT then we set this
245 * to be the pte bits for Write Combining. Else we fall back to
246 * UC- so mtrrs can override the cacheability.
247 */
248 int pmap_pg_wc = PG_UCMINUS;
249
250 /*
251 * pmap_use_pcid: nonzero if PCID use is enabled (currently we require INVPCID)
252 *
253 * The next three are zero unless and until PCID support is enabled so code
254 * can just 'or' them in as needed without tests.
255 * cr3_pcid: CR3_REUSE_PCID
256 * cr3_pcid_proc and cr3_pcid_temp: PCID_PROC and PCID_TEMP
257 */
258 #if PCID_KERN != 0
259 # error "pmap.c assumes PCID_KERN is zero"
260 #endif
261 int pmap_use_pcid;
262 static u_int cr3_pcid_proc;
263 static u_int cr3_pcid_temp;
264 /* these two are accessed from locore.o */
265 paddr_t cr3_reuse_pcid;
266 paddr_t cr3_pcid_proc_intel;
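
/*
 * Example of the "just 'or' them in" pattern mentioned above (cf.
 * pmap_map_ptes() and pmap_activate() below): with PCID disabled all
 * of these variables are zero, so the same expressions are correct on
 * both kinds of CPU.  The second line is the user-pmap case.
 *
 *	lcr3(pmap->pm_pdirpa | cr3_pcid_temp);
 *	pcb->pcb_cr3 = pmap->pm_pdirpa | cr3_pcid_proc;
 */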
267
268 /*
269 * other data structures
270 */
271
272 pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */
273 int pmap_initialized = 0; /* pmap_init done yet? */
274
275 /*
276 * pv management structures.
277 */
278 struct pool pmap_pv_pool;
279
280 /*
281 * linked list of all non-kernel pmaps
282 */
283
284 struct pmap_head pmaps;
285 struct mutex pmaps_lock = MUTEX_INITIALIZER(IPL_VM);
286
287 /*
288 * pool that pmap structures are allocated from
289 */
290
291 struct pool pmap_pmap_pool;
292
293 /*
294 * When we're freeing a ptp, we need to delay the freeing until all
295 * tlb shootdown has been done. This is the list of the to-be-freed pages.
296 */
297 TAILQ_HEAD(pg_to_free, vm_page);
298
299 /*
300 * pool that PDPs are allocated from
301 */
302
303 struct pool pmap_pdp_pool;
304 void pmap_pdp_ctor(pd_entry_t *);
305 void pmap_pdp_ctor_intel(pd_entry_t *);
306
307 extern vaddr_t msgbuf_vaddr;
308 extern paddr_t msgbuf_paddr;
309
310 extern vaddr_t idt_vaddr; /* we allocate IDT early */
311 extern paddr_t idt_paddr;
312
313 extern vaddr_t lo32_vaddr;
314 extern vaddr_t lo32_paddr;
315
316 vaddr_t virtual_avail;
317 extern int end;
318
319 /*
320 * local prototypes
321 */
322
323 void pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *,
324 vaddr_t, struct vm_page *);
325 struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t);
326 struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
327 int pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs);
328 void pmap_free_ptp(struct pmap *, struct vm_page *,
329 vaddr_t, struct pg_to_free *);
330 void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *);
331 #ifdef MULTIPROCESSOR
332 static int pmap_is_active(struct pmap *, struct cpu_info *);
333 #endif
334 paddr_t pmap_map_ptes(struct pmap *);
335 struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t);
336 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
337 #if NVMM > 0
338 void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
339 void pmap_do_remove_ept(struct pmap *, vaddr_t);
340 int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
341 void pmap_shootept(struct pmap *, int);
342 #endif /* NVMM > 0 */
343 int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
344 vaddr_t, int, struct pv_entry **);
345 void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t,
346 vaddr_t, vaddr_t, int, struct pv_entry **);
347 #define PMAP_REMOVE_ALL 0 /* remove all mappings */
348 #define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */
349
350 void pmap_unmap_ptes(struct pmap *, paddr_t);
351 int pmap_get_physpage(vaddr_t, int, paddr_t *);
352 int pmap_pdes_valid(vaddr_t, pd_entry_t *);
353 void pmap_alloc_level(vaddr_t, int, long *);
354
355 static inline
356 void pmap_sync_flags_pte(struct vm_page *, u_long);
357
358 void pmap_tlb_shootpage(struct pmap *, vaddr_t, int);
359 void pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t, int);
360 void pmap_tlb_shoottlb(struct pmap *, int);
361 #ifdef MULTIPROCESSOR
362 void pmap_tlb_shootwait(void);
363 #else
364 #define pmap_tlb_shootwait() do { } while (0)
365 #endif
366
367 /*
368 * p m a p i n l i n e h e l p e r f u n c t i o n s
369 */
370
371 /*
372 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
373 * of course the kernel is always loaded
374 */
375
376 static inline int
377 pmap_is_curpmap(struct pmap *pmap)
378 {
379 return((pmap == pmap_kernel()) ||
380 (pmap->pm_pdirpa == (rcr3() & CR3_PADDR)));
381 }
382
383 /*
384 * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
385 */
386
387 #ifdef MULTIPROCESSOR
388 static inline int
389 pmap_is_active(struct pmap *pmap, struct cpu_info *ci)
390 {
391 return (pmap == pmap_kernel() || pmap == ci->ci_proc_pmap
392 #if NVMM > 0
393 || (pmap_is_ept(pmap) && pmap == ci->ci_ept_pmap)
394 #endif /* NVMM > 0 */
395 );
396 }
397 #endif
398
399 static inline u_int
400 pmap_pte2flags(u_long pte)
401 {
402 return (((pte & PG_U) ? PG_PMAP_REF : 0) |
403 ((pte & PG_M) ? PG_PMAP_MOD : 0));
404 }
405
406 static inline void
407 pmap_sync_flags_pte(struct vm_page *pg, u_long pte)
408 {
409 if (pte & (PG_U|PG_M)) {
410 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
411 }
412 }
413
414 /*
415 * pmap_map_ptes: map a pmap's PTEs into KVM
416 *
417 * This should not be done for EPT pmaps
418 */
419 paddr_t
420 pmap_map_ptes(struct pmap *pmap)
421 {
422 paddr_t cr3;
423
424 KASSERT(!pmap_is_ept(pmap));
425
426 /* the kernel's pmap is always accessible */
427 if (pmap == pmap_kernel())
428 return 0;
429
430 /*
431 * Lock the target map before switching to its page tables to
432 * guarantee other CPUs have finished changing the tables before
433 * we potentially start caching table and TLB entries.
434 */
435 mtx_enter(&pmap->pm_mtx);
436
437 cr3 = rcr3();
438 KASSERT((cr3 & CR3_PCID) == PCID_KERN ||
439 (cr3 & CR3_PCID) == PCID_PROC);
440 if (pmap->pm_pdirpa == (cr3 & CR3_PADDR))
441 cr3 = 0;
442 else {
443 cr3 |= cr3_reuse_pcid;
444 lcr3(pmap->pm_pdirpa | cr3_pcid_temp);
445 }
446
447 return cr3;
448 }
449
450 void
451 pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
452 {
453 if (pmap != pmap_kernel())
454 mtx_leave(&pmap->pm_mtx);
455
456 if (save_cr3 != 0)
457 lcr3(save_cr3);
458 }
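
/*
 * Usage sketch: pmap_map_ptes() and pmap_unmap_ptes() are always used
 * as a bracketing pair around accesses through the recursive PTE
 * mapping, e.g. (cf. pmap_do_remove() below):
 *
 *	scr3 = pmap_map_ptes(pmap);	   locks pmap, may switch %cr3
 *	shootself = (scr3 == 0);	   0: pmap was already current
 *	... walk PTE_BASE[] / normal_pdes[] ...
 *	pmap_unmap_ptes(pmap, scr3);	   unlocks, restores %cr3
 */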
459
460 int
461 pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs)
462 {
463 u_long mask, shift;
464 pd_entry_t pde;
465 paddr_t pdpa;
466 int lev;
467
468 pdpa = pm->pm_pdirpa;
469 shift = L4_SHIFT;
470 mask = L4_MASK;
471 for (lev = PTP_LEVELS; lev > 0; lev--) {
472 *pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa);
473 *offs = (VA_SIGN_POS(va) & mask) >> shift;
474 pde = (*pd)[*offs];
475
476 /* Large pages are different, break early if we run into one. */
477 if ((pde & (PG_PS|PG_V)) != PG_V)
478 return (lev - 1);
479
480 pdpa = ((*pd)[*offs] & pg_frame);
481 /* 4096/8 == 512 == 2^9 entries per level */
482 shift -= 9;
483 mask >>= 9;
484 }
485
486 return (0);
487 }
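
/*
 * Usage sketch (cf. pmap_extract() below): a return value of 0 means
 * the walk reached the PTE level and ptes[offs] is the 4K PTE; a
 * nonzero return is the level at which the walk stopped because of a
 * hole or a large page.
 *
 *	level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
 *	pte = ptes[offs];
 *	if (level == 0 && pmap_valid_entry(pte))
 *		pa = (pte & pg_frame) | (va & PAGE_MASK);
 */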
488
489 /*
490 * p m a p k e n t e r f u n c t i o n s
491 *
492 * functions to quickly enter/remove pages from the kernel address
493 * space. pmap_kremove is exported to MI kernel. we make use of
494 * the recursive PTE mappings.
495 */
496
497 /*
498 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
499 *
500 * => no need to lock anything, assume va is already allocated
501 * => should be faster than normal pmap enter function
502 */
503
504 void
505 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
506 {
507 pt_entry_t *pte, opte, npte;
508
509 pte = kvtopte(va);
510
511 npte = (pa & PMAP_PA_MASK) | ((prot & PROT_WRITE) ? PG_RW : PG_RO) |
512 ((pa & PMAP_NOCACHE) ? PG_N : 0) |
513 ((pa & PMAP_WC) ? pmap_pg_wc : 0) | PG_V |
514 ((pa & PMAP_NOCRYPT) ? 0 : pg_crypt);
515
516 /* special 1:1 mappings in the first 2MB must not be global */
517 if (va >= (vaddr_t)NBPD_L2)
518 npte |= pg_g_kern;
519
520 if (!(prot & PROT_EXEC))
521 npte |= pg_nx;
522 opte = pmap_pte_set(pte, npte);
523 #ifdef LARGEPAGES
524 /* XXX For now... */
525 if (opte & PG_PS)
526 panic("%s: PG_PS", __func__);
527 #endif
528 if (pmap_valid_entry(opte)) {
529 if ((pa & PMAP_NOCACHE && (opte & PG_N) == 0) ||
530 (pa & PMAP_NOCRYPT))
531 wbinvd_on_all_cpus();
532 /* This shouldn't happen */
533 pmap_tlb_shootpage(pmap_kernel(), va, 1);
534 pmap_tlb_shootwait();
535 }
536 }
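
/*
 * Example (hypothetical driver code): callers encode caching/crypto
 * attributes in otherwise unused bits of 'pa', e.g. mapping a device
 * register page uncached and unmapping it again later:
 *
 *	pmap_kenter_pa(va, pa | PMAP_NOCACHE, PROT_READ | PROT_WRITE);
 *	...
 *	pmap_kremove(va, PAGE_SIZE);
 */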
537
538 /*
539 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
540 *
541 * => no need to lock anything
542 * => caller must dispose of any vm_page mapped in the va range
543 * => note: not an inline function
544 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
545 * => we assume kernel only unmaps valid addresses and thus don't bother
546 * checking the valid bit before doing TLB flushing
547 */
548
549 void
550 pmap_kremove(vaddr_t sva, vsize_t len)
551 {
552 pt_entry_t *pte, opte;
553 vaddr_t va, eva;
554
555 eva = sva + len;
556
557 for (va = sva; va != eva; va += PAGE_SIZE) {
558 pte = kvtopte(va);
559
560 opte = pmap_pte_set(pte, 0);
561 #ifdef LARGEPAGES
562 KASSERT((opte & PG_PS) == 0);
563 #endif
564 KASSERT((opte & PG_PVLIST) == 0);
565 }
566
567 pmap_tlb_shootrange(pmap_kernel(), sva, eva, 1);
568 pmap_tlb_shootwait();
569 }
570
571 /*
572 * pmap_set_pml4_early
573 *
574 * Utility function to map 2GB of 2MB pages to 'pa'. The VA that is assigned
575 * is the pml4 entry for 'early mappings' (see pmap.h). This function is used
576 * by display drivers that need to map their framebuffers early, before the
577 * pmap is fully initialized (eg, to show panic messages).
578 *
579 * Users of this function must call pmap_clear_pml4_early to remove the
580 * mapping when finished.
581 *
582 * Parameters:
583 * pa: phys addr to map
584 *
585 * Return value:
586 * VA mapping to 'pa'. This mapping is 2GB in size and starts at the base
587 * of the 2MB region containing 'va'.
588 */
589 vaddr_t
590 pmap_set_pml4_early(paddr_t pa)
591 {
592 extern paddr_t early_pte_pages;
593 pt_entry_t *pml4e, *pte;
594 int i, j, off;
595 paddr_t curpa;
596 vaddr_t va;
597
598 pml4e = (pt_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
599 pml4e[PDIR_SLOT_EARLY] = (pd_entry_t)early_pte_pages | PG_V | PG_RW |
600 pg_crypt;
601
602 off = pa & PAGE_MASK_L2;
603 curpa = pa & L2_FRAME;
604
605 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
606 memset(pte, 0, 3 * NBPG);
607
608 pte[0] = (early_pte_pages + NBPG) | PG_V | PG_RW | pg_crypt;
609 pte[1] = (early_pte_pages + 2 * NBPG) | PG_V | PG_RW | pg_crypt;
610
611 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages + NBPG);
612 for (i = 0; i < 2; i++) {
613 /* 2 early pages of mappings */
614 for (j = 0; j < 512; j++) {
615 /* j[0..511] : 2MB mappings per page */
616 pte[(i * 512) + j] = curpa | PG_V | PG_RW | PG_PS |
617 pg_crypt;
618 curpa += (2 * 1024 * 1024);
619 }
620 }
621
622 va = (vaddr_t)((PDIR_SLOT_EARLY * 512ULL) << L3_SHIFT) + off;
623 return VA_SIGN_NEG(va);
624 }
625
626 /*
627 * pmap_clear_pml4_early
628 *
629 * Clears the mapping previously established with pmap_set_pml4_early.
630 */
631 void
632 pmap_clear_pml4_early(void)
633 {
634 extern paddr_t early_pte_pages;
635 pt_entry_t *pml4e, *pte;
636
637 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
638 memset(pte, 0, 3 * NBPG);
639
640 pml4e = (pd_entry_t *)pmap_kernel()->pm_pdir;
641 pml4e[PDIR_SLOT_EARLY] = 0;
642 tlbflush();
643 }
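
/*
 * Usage sketch ('fb_pa' is hypothetical): an early display driver maps
 * its framebuffer, draws, and then removes the temporary mapping:
 *
 *	va = pmap_set_pml4_early(fb_pa);
 *	... write to (void *)va ...
 *	pmap_clear_pml4_early();
 */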
644
645 /*
646 * p m a p i n i t f u n c t i o n s
647 *
648 * pmap_bootstrap and pmap_init are called during system startup
649 * to init the pmap module. pmap_bootstrap() does a low level
650 * init just to get things rolling. pmap_init() finishes the job.
651 */
652
653 /*
654 * pmap_bootstrap: get the system in a state where it can run with VM
655 * properly enabled (called before main()). the VM system is
656 * fully init'd later...
657 */
658
659 paddr_t
660 pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
661 {
662 vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS;
663 struct pmap *kpm;
664 int curslot, i, j, p;
665 long ndmpdp;
666 paddr_t dmpd, dmpdp, start_cur, cur_pa;
667 vaddr_t kva, kva_end;
668 pt_entry_t *pml3, *pml2;
669
670 KASSERT(((0x1000ULL | pg_crypt) & pg_frame) == 0x1000ULL);
671
672 /*
673 * define the boundaries of the managed kernel virtual address
674 * space.
675 */
676
677 virtual_avail = kva_start; /* first free KVA */
678
679 /*
680 * If PKU is available, initialize PROT_EXEC entry correctly,
681 * and enable the feature before it gets used
682 * XXX Some Hypervisors forget to save/restore PKU
683 */
684 if (cpuid_level >= 0x7) {
685 uint32_t ecx, dummy;
686
687 CPUID_LEAF(0x7, 0, dummy, dummy, ecx, dummy);
688 if (ecx & SEFF0ECX_PKU) {
689 lcr4(rcr4() | CR4_PKE);
690 pg_xo = PG_XO;
691 }
692 }
693
694 /*
695 * set up protection_codes: we need to be able to convert from
696 * a MI protection code (some combo of VM_PROT...) to something
697 * we can jam into a i386 PTE.
698 */
699
700 protection_codes[PROT_NONE] = pg_nx; /* --- */
701 protection_codes[PROT_EXEC] = pg_xo; /* --x */
702 protection_codes[PROT_READ] = PG_RO | pg_nx; /* -r- */
703 protection_codes[PROT_READ | PROT_EXEC] = PG_RO; /* -rx */
704 protection_codes[PROT_WRITE] = PG_RW | pg_nx; /* w-- */
705 protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW; /* w-x */
706 protection_codes[PROT_WRITE | PROT_READ] = PG_RW | pg_nx; /* wr- */
707 protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW; /* wrx */
708
709 /*
710 * now we init the kernel's pmap
711 *
712 * the kernel pmap's pm_obj is not used for much. however, in
713 * user pmaps the pm_obj contains the list of active PTPs.
714 * the pm_obj currently does not have a pager.
715 */
716
717 kpm = pmap_kernel();
718 for (i = 0; i < PTP_LEVELS - 1; i++) {
719 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1);
720 kpm->pm_ptphint[i] = NULL;
721 }
722 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
723 kpm->pm_pdir = (pd_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
724 kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
725 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
726 atop(kva_start - VM_MIN_KERNEL_ADDRESS);
727 /*
728 * the above is just a rough estimate and not critical to the proper
729 * operation of the system.
730 */
731
732 kpm->pm_type = PMAP_TYPE_NORMAL;
733
734 curpcb->pcb_pmap = kpm; /* proc0's pcb */
735
736 /*
737 * Configure and enable PCID use if supported.
738 * Currently we require INVPCID support.
739 */
740 if ((cpu_ecxfeature & CPUIDECX_PCID) && cpuid_level >= 0x07) {
741 uint32_t ebx, dummy;
742 CPUID_LEAF(0x7, 0, dummy, ebx, dummy, dummy);
743 if (ebx & SEFF0EBX_INVPCID) {
744 pmap_use_pcid = 1;
745 /*
746 * We cannot use global mappings because
747 * invpcid function 0 does not invalidate global
748 * mappings. The hardware can cache kernel
749 * mappings based on PCID_KERN, i.e. there is no
750 * need for global mappings.
751 */
752 pg_g_kern = 0;
753 lcr4( rcr4() | CR4_PCIDE );
754 cr3_pcid_proc = PCID_PROC;
755 cr3_pcid_temp = PCID_TEMP;
756 cr3_reuse_pcid = CR3_REUSE_PCID;
757 cr3_pcid_proc_intel = PCID_PROC_INTEL;
758 }
759 }
760
761 /*
762 * Add PG_G attribute to already mapped kernel pages. pg_g_kern
763 * is calculated in locore0.S and may be set to:
764 *
765 * 0 if this CPU does not safely support global pages in the kernel
766 * (Intel/Meltdown)
767 * PG_G if this CPU does safely support global pages in the kernel
768 * (AMD)
769 */
770 #if KERNBASE == VM_MIN_KERNEL_ADDRESS
771 for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
772 #else
773 kva_end = roundup((vaddr_t)&end, PAGE_SIZE);
774 for (kva = KERNBASE; kva < kva_end ;
775 #endif
776 kva += PAGE_SIZE) {
777 unsigned long p1i = pl1_i(kva);
778 if (pmap_valid_entry(PTE_BASE[p1i]))
779 PTE_BASE[p1i] |= pg_g_kern;
780 }
781
782 /*
783 * Map the direct map. The first 4GB were mapped in locore, here
784 * we map the rest if it exists. We actually use the direct map
785 * here to set up the page tables, we're assuming that we're still
786 * operating in the lower 4GB of memory.
787 *
788 * Map (up to) the first 512GB of physical memory first. This part
789 * is handled differently than physical memory > 512GB since we have
790 * already mapped part of this range in locore0.
791 */
792 ndmpdp = (max_pa + NBPD_L3 - 1) >> L3_SHIFT;
793 if (ndmpdp < NDML2_ENTRIES)
794 ndmpdp = NDML2_ENTRIES; /* At least 4GB */
795 if (ndmpdp > 512)
796 ndmpdp = 512; /* At most 512GB */
797
798 dmpdp = kpm->pm_pdir[PDIR_SLOT_DIRECT] & pg_frame;
799
800 dmpd = first_avail; first_avail += ndmpdp * PAGE_SIZE;
801
802 for (i = NDML2_ENTRIES; i < NPDPG * ndmpdp; i++) {
803 paddr_t pdp;
804 vaddr_t va;
805
806 pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
807 va = PMAP_DIRECT_MAP(pdp);
808
809 *((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT);
810 *((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | pg_g_kern | PG_U |
811 PG_M | pg_nx | pg_crypt;
812 }
813
814 for (i = NDML2_ENTRIES; i < ndmpdp; i++) {
815 paddr_t pdp;
816 vaddr_t va;
817
818 pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
819 va = PMAP_DIRECT_MAP(pdp);
820
821 *((pd_entry_t *)va) = dmpd + (i << PAGE_SHIFT);
822 *((pd_entry_t *)va) |= PG_RW | PG_V | PG_U | PG_M | pg_nx |
823 pg_crypt;
824 }
825
826 kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_V | PG_KW | PG_U |
827 PG_M | pg_nx | pg_crypt;
828
829 /* Map any remaining physical memory > 512GB */
830 for (curslot = 1 ; curslot < NUM_L4_SLOT_DIRECT ; curslot++) {
831 /*
832 * The current range starts at PA (curslot) * 512GB
833 */
834 start_cur = (paddr_t)(curslot * NBPD_L4);
835 if (max_pa > start_cur) {
836 /* Next 512GB, new PML4e and L3(512GB) page */
837 dmpd = first_avail; first_avail += PAGE_SIZE;
838 pml3 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
839 kpm->pm_pdir[PDIR_SLOT_DIRECT + curslot] = dmpd |
840 PG_KW | PG_V | PG_U | PG_M | pg_nx | pg_crypt;
841
842 /* Calculate full 1GB pages in this 512GB region */
843 p = ((max_pa - start_cur) >> L3_SHIFT);
844
845 /* Check if a partial (<1GB) page remains */
846 if (max_pa & L2_MASK)
847 p++;
848
849 /*
850 * Handle the case where this range is full and there
851 * is still more memory after (p would be > 512).
852 */
853 if (p > NPDPG)
854 p = NPDPG;
855
856 /* Allocate 'p' L2(1GB) pages and populate */
857 for (i = 0; i < p; i++) {
858 dmpd = first_avail; first_avail += PAGE_SIZE;
859 pml2 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
860 pml3[i] = dmpd |
861 PG_RW | PG_V | PG_U | PG_M | pg_nx |
862 pg_crypt;
863
864 cur_pa = start_cur + (i << L3_SHIFT);
865 j = 0;
866
867 while (cur_pa < max_pa && j < NPDPG) {
868 pml2[j] = curslot * NBPD_L4 +
869 (uint64_t)i * NBPD_L3 +
870 (uint64_t)j * NBPD_L2;
871 pml2[j] |= PG_RW | PG_V | pg_g_kern |
872 PG_U | PG_M | pg_nx | PG_PS |
873 pg_crypt;
874 cur_pa += NBPD_L2;
875 j++;
876 }
877 }
878 }
879 }
880
881 tlbflush();
882
883 msgbuf_vaddr = virtual_avail;
884 virtual_avail += round_page(MSGBUFSIZE);
885
886 idt_vaddr = virtual_avail;
887 virtual_avail += 2 * PAGE_SIZE;
888 idt_paddr = first_avail; /* steal a page */
889 first_avail += 2 * PAGE_SIZE;
890
891 #if defined(MULTIPROCESSOR) || \
892 (NACPI > 0 && !defined(SMALL_KERNEL))
893 /*
894 * Grab a page below 4G for things that need it (i.e.
895 * having an initial %cr3 for the MP trampoline).
896 */
897 lo32_vaddr = virtual_avail;
898 virtual_avail += PAGE_SIZE;
899 lo32_paddr = first_avail;
900 first_avail += PAGE_SIZE;
901 #endif
902
903 /*
904 * init the global lists.
905 */
906 LIST_INIT(&pmaps);
907
908 /*
909 * initialize the pmap pools.
910 */
911
912 pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_VM, 0,
913 "pmappl", NULL);
914 pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, IPL_VM, 0,
915 "pvpl", &pool_allocator_single);
916 pool_sethiwat(&pmap_pv_pool, 32 * 1024);
917
918 /*
919 * initialize the PDE pool.
920 */
921
922 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_VM, 0,
923 "pdppl", &pool_allocator_single);
924
925 kpm->pm_pdir_intel = NULL;
926 kpm->pm_pdirpa_intel = 0;
927
928 /*
929 * ensure the TLB is sync'd with reality by flushing it...
930 */
931
932 tlbflush();
933
934 return first_avail;
935 }
936
937 void
938 pmap_init_percpu(void)
939 {
940 pool_cache_init(&pmap_pv_pool);
941 }
942
943 /*
944 * pmap_randomize
945 *
946 * Randomizes the location of the kernel pmap
947 */
948 void
949 pmap_randomize(void)
950 {
951 pd_entry_t *pml4va, *oldpml4va;
952 paddr_t pml4pa;
953 int i;
954
955 pml4va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait);
956 if (pml4va == NULL)
957 panic("%s: km_alloc failed", __func__);
958
959 /* Copy old PML4 page to new one */
960 oldpml4va = pmap_kernel()->pm_pdir;
961 memcpy(pml4va, oldpml4va, PAGE_SIZE);
962
963 /* Switch to new PML4 */
964 pmap_extract(pmap_kernel(), (vaddr_t)pml4va, &pml4pa);
965 lcr3(pml4pa);
966
967 /* Fixup pmap_kernel and proc0's %cr3 */
968 pmap_kernel()->pm_pdirpa = pml4pa;
969 pmap_kernel()->pm_pdir = pml4va;
970 proc0.p_addr->u_pcb.pcb_cr3 = pml4pa;
971
972 /* Fixup recursive PTE PML4E slot. We are only changing the PA */
973 pml4va[PDIR_SLOT_PTE] = pml4pa | (pml4va[PDIR_SLOT_PTE] & ~pg_frame);
974
975 for (i = 0; i < NPDPG; i++) {
976 /* PTE slot already handled earlier */
977 if (i == PDIR_SLOT_PTE)
978 continue;
979
980 if (pml4va[i] & pg_frame)
981 pmap_randomize_level(&pml4va[i], 3);
982 }
983
984 /* Wipe out bootstrap PML4 */
985 memset(oldpml4va, 0, PAGE_SIZE);
986 tlbflush();
987 }
988
989 void
990 pmap_randomize_level(pd_entry_t *pde, int level)
991 {
992 pd_entry_t *new_pd_va;
993 paddr_t old_pd_pa, new_pd_pa;
994 vaddr_t old_pd_va;
995 struct vm_page *pg;
996 int i;
997
998 if (level == 0)
999 return;
1000
1001 if (level < PTP_LEVELS - 1 && (*pde & PG_PS))
1002 return;
1003
1004 new_pd_va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait);
1005 if (new_pd_va == NULL)
1006 panic("%s: cannot allocate page for L%d page directory",
1007 __func__, level);
1008
1009 old_pd_pa = *pde & pg_frame;
1010 old_pd_va = PMAP_DIRECT_MAP(old_pd_pa);
1011 pmap_extract(pmap_kernel(), (vaddr_t)new_pd_va, &new_pd_pa);
1012 memcpy(new_pd_va, (void *)old_pd_va, PAGE_SIZE);
1013 *pde = new_pd_pa | (*pde & ~pg_frame);
1014
1015 tlbflush();
1016 memset((void *)old_pd_va, 0, PAGE_SIZE);
1017
1018 pg = PHYS_TO_VM_PAGE(old_pd_pa);
1019 if (pg != NULL) {
1020 pg->wire_count--;
1021 pmap_kernel()->pm_stats.resident_count--;
1022 if (pg->wire_count <= 1)
1023 uvm_pagefree(pg);
1024 }
1025
1026 for (i = 0; i < NPDPG; i++)
1027 if (new_pd_va[i] & pg_frame)
1028 pmap_randomize_level(&new_pd_va[i], level - 1);
1029 }
1030
1031 /*
1032 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1033 * trampoline code can be entered.
1034 */
1035 paddr_t
1036 pmap_prealloc_lowmem_ptps(paddr_t first_avail)
1037 {
1038 pd_entry_t *pdes;
1039 int level;
1040 paddr_t newp;
1041
1042 pdes = pmap_kernel()->pm_pdir;
1043 level = PTP_LEVELS;
1044 for (;;) {
1045 newp = first_avail; first_avail += PAGE_SIZE;
1046 memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
1047 pdes[pl_i(0, level)] =
1048 (newp & pg_frame) | PG_V | PG_RW | pg_crypt;
1049 level--;
1050 if (level <= 1)
1051 break;
1052 pdes = normal_pdes[level - 2];
1053 }
1054
1055 return first_avail;
1056 }
1057
1058 /*
1059 * pmap_init: no further initialization required on this platform
1060 */
1061 void
1062 pmap_init(void)
1063 {
1064 pmap_initialized = 1;
1065 }
1066
1067 /*
1068 * p v _ e n t r y f u n c t i o n s
1069 */
1070
1071 /*
1072 * main pv_entry manipulation functions:
1073 * pmap_enter_pv: enter a mapping onto a pv list
1074 * pmap_remove_pv: remove a mapping from a pv list
1075 */
1076
1077 /*
1078 * pmap_enter_pv: enter a mapping onto a pv list
1079 *
1080 * => caller should adjust ptp's wire_count before calling
1081 *
1082 * pve: preallocated pve for us to use
1083 * ptp: PTP in pmap that maps this VA
1084 */
1085
1086 void
1087 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap,
1088 vaddr_t va, struct vm_page *ptp)
1089 {
1090 pve->pv_pmap = pmap;
1091 pve->pv_va = va;
1092 pve->pv_ptp = ptp; /* NULL for kernel pmap */
1093 mtx_enter(&pg->mdpage.pv_mtx);
1094 pve->pv_next = pg->mdpage.pv_list; /* add to ... */
1095 pg->mdpage.pv_list = pve; /* ... list */
1096 mtx_leave(&pg->mdpage.pv_mtx);
1097 }
1098
1099 /*
1100 * pmap_remove_pv: try to remove a mapping from a pv_list
1101 *
1102 * => caller should adjust ptp's wire_count and free PTP if needed
1103 * => we return the removed pve
1104 */
1105
1106 struct pv_entry *
1107 pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va)
1108 {
1109 struct pv_entry *pve, **prevptr;
1110
1111 mtx_enter(&pg->mdpage.pv_mtx);
1112 prevptr = &pg->mdpage.pv_list;
1113 while ((pve = *prevptr) != NULL) {
1114 if (pve->pv_pmap == pmap && pve->pv_va == va) { /* match? */
1115 *prevptr = pve->pv_next; /* remove it! */
1116 break;
1117 }
1118 prevptr = &pve->pv_next; /* previous pointer */
1119 }
1120 mtx_leave(&pg->mdpage.pv_mtx);
1121 return(pve); /* return removed pve */
1122 }
1123
1124 /*
1125 * p t p f u n c t i o n s
1126 */
1127
1128 struct vm_page *
1129 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1130 {
1131 int lidx = level - 1;
1132 struct vm_page *pg;
1133
1134 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1135 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx]))
1136 return (pmap->pm_ptphint[lidx]);
1137
1138 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1139
1140 return pg;
1141 }
1142
1143 void
1144 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level,
1145 struct pg_to_free *pagelist)
1146 {
1147 int lidx;
1148 struct uvm_object *obj;
1149
1150 lidx = level - 1;
1151
1152 obj = &pmap->pm_obj[lidx];
1153 pmap->pm_stats.resident_count--;
1154 if (pmap->pm_ptphint[lidx] == ptp)
1155 pmap->pm_ptphint[lidx] = RBT_ROOT(uvm_objtree, &obj->memt);
1156 ptp->wire_count = 0;
1157 uvm_pagerealloc(ptp, NULL, 0);
1158 TAILQ_INSERT_TAIL(pagelist, ptp, pageq);
1159 }
1160
1161 void
1162 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1163 struct pg_to_free *pagelist)
1164 {
1165 unsigned long index;
1166 int level;
1167 vaddr_t invaladdr;
1168
1169 level = 1;
1170 do {
1171 pmap_freepage(pmap, ptp, level, pagelist);
1172 index = pl_i(va, level + 1);
1173 pmap_pte_set(&normal_pdes[level - 1][index], 0);
1174 if (level == PTP_LEVELS - 1 && pmap->pm_pdir_intel != NULL) {
1175 /* Zap special meltdown PML4e */
1176 pmap_pte_set(&pmap->pm_pdir_intel[index], 0);
1177 DPRINTF("%s: cleared meltdown PML4e @ index %lu "
1178 "(va range start 0x%llx)\n", __func__, index,
1179 (uint64_t)(index << L4_SHIFT));
1180 }
1181 invaladdr = level == 1 ? (vaddr_t)PTE_BASE :
1182 (vaddr_t)normal_pdes[level - 2];
1183 pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE,
1184 pmap_is_curpmap(curpcb->pcb_pmap));
1185 if (level < PTP_LEVELS - 1) {
1186 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1187 ptp->wire_count--;
1188 if (ptp->wire_count > 1)
1189 break;
1190 }
1191 } while (++level < PTP_LEVELS);
1192 }
1193
1194 /*
1195 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1196 *
1197 * => pmap should NOT be pmap_kernel()
1198 */
1199
1200 struct vm_page *
1201 pmap_get_ptp(struct pmap *pmap, vaddr_t va)
1202 {
1203 struct vm_page *ptp, *pptp;
1204 int i;
1205 unsigned long index;
1206 pd_entry_t *pva, *pva_intel;
1207 paddr_t ppa, pa;
1208 struct uvm_object *obj;
1209
1210 ptp = NULL;
1211 pa = (paddr_t)-1;
1212
1213 /*
1214 * Loop through all page table levels seeing if we need to
1215 * add a new page to that level.
1216 */
1217 for (i = PTP_LEVELS; i > 1; i--) {
1218 /*
1219 * Save values from previous round.
1220 */
1221 pptp = ptp;
1222 ppa = pa;
1223
1224 index = pl_i(va, i);
1225 pva = normal_pdes[i - 2];
1226
1227 if (pmap_valid_entry(pva[index])) {
1228 ppa = pva[index] & pg_frame;
1229 ptp = NULL;
1230 continue;
1231 }
1232
1233 obj = &pmap->pm_obj[i-2];
1234 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1235 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1236
1237 if (ptp == NULL)
1238 return NULL;
1239
1240 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
1241 ptp->wire_count = 1;
1242 pmap->pm_ptphint[i - 2] = ptp;
1243 pa = VM_PAGE_TO_PHYS(ptp);
1244 pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V | pg_crypt);
1245
1246 /*
1247 * Meltdown Special case - if we are adding a new PML4e for
1248 * usermode addresses, just copy the PML4e to the U-K page
1249 * table.
1250 */
1251 if (pmap->pm_pdir_intel != NULL && i == PTP_LEVELS &&
1252 va < VM_MAXUSER_ADDRESS) {
1253 pva_intel = pmap->pm_pdir_intel;
1254 pva_intel[index] = pva[index];
1255 DPRINTF("%s: copying usermode PML4e (content=0x%llx) "
1256 "from 0x%llx -> 0x%llx\n", __func__, pva[index],
1257 (uint64_t)&pva[index], (uint64_t)&pva_intel[index]);
1258 }
1259
1260 pmap->pm_stats.resident_count++;
1261 /*
1262 * If we're not in the top level, increase the
1263 * wire count of the parent page.
1264 */
1265 if (i < PTP_LEVELS) {
1266 if (pptp == NULL)
1267 pptp = pmap_find_ptp(pmap, va, ppa, i);
1268 #ifdef DIAGNOSTIC
1269 if (pptp == NULL)
1270 panic("%s: pde page disappeared", __func__);
1271 #endif
1272 pptp->wire_count++;
1273 }
1274 }
1275
1276 /*
1277 * ptp is not NULL if we just allocated a new ptp. If it's
1278 * still NULL, we must look up the existing one.
1279 */
1280 if (ptp == NULL) {
1281 ptp = pmap_find_ptp(pmap, va, ppa, 1);
1282 #ifdef DIAGNOSTIC
1283 if (ptp == NULL) {
1284 printf("va %lx ppa %lx\n", (unsigned long)va,
1285 (unsigned long)ppa);
1286 panic("%s: unmanaged user PTP", __func__);
1287 }
1288 #endif
1289 }
1290
1291 pmap->pm_ptphint[0] = ptp;
1292 return(ptp);
1293 }
1294
1295 /*
1296 * p m a p l i f e c y c l e f u n c t i o n s
1297 */
1298
1299 /*
1300 * pmap_pdp_ctor: constructor for the PDP cache.
1301 */
1302
1303 void
1304 pmap_pdp_ctor(pd_entry_t *pdir)
1305 {
1306 paddr_t pdirpa;
1307 int npde, i;
1308 struct pmap *kpm = pmap_kernel();
1309
1310 /* fetch the physical address of the page directory. */
1311 (void) pmap_extract(kpm, (vaddr_t) pdir, &pdirpa);
1312
1313 /* zero init area */
1314 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
1315
1316 /* put in recursive PDE to map the PTEs */
1317 pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW | pg_nx | pg_crypt;
1318
1319 npde = nkptp[PTP_LEVELS - 1];
1320
1321 /* put in kernel VM PDEs */
1322 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
1323 npde * sizeof(pd_entry_t));
1324
1325 /* zero the rest */
1326 memset(&pdir[PDIR_SLOT_KERN + npde], 0,
1327 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
1328
1329 for (i = 0; i < NUM_L4_SLOT_DIRECT; i++)
1330 pdir[PDIR_SLOT_DIRECT + i] = kpm->pm_pdir[PDIR_SLOT_DIRECT + i];
1331
1332 #if VM_MIN_KERNEL_ADDRESS != KERNBASE
1333 pdir[pl4_pi(KERNBASE)] = PDP_BASE[pl4_pi(KERNBASE)];
1334 #endif
1335 }
1336
1337 void
1338 pmap_pdp_ctor_intel(pd_entry_t *pdir)
1339 {
1340 struct pmap *kpm = pmap_kernel();
1341
1342 /* Copy PML4es from pmap_kernel's U-K view */
1343 memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE);
1344 }
1345
1346 /*
1347 * pmap_create: create a pmap
1348 *
1349 * => note: old pmap interface took a "size" arg which allowed for
1350 * the creation of "software only" pmaps (not in bsd).
1351 */
1352
1353 struct pmap *
1354 pmap_create(void)
1355 {
1356 struct pmap *pmap;
1357 int i;
1358
1359 pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
1360
1361 mtx_init(&pmap->pm_mtx, IPL_VM);
1362
1363 /* init uvm_object */
1364 for (i = 0; i < PTP_LEVELS - 1; i++) {
1365 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, 1);
1366 pmap->pm_ptphint[i] = NULL;
1367 }
1368 pmap->pm_stats.wired_count = 0;
1369 pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */
1370 pmap->pm_type = PMAP_TYPE_NORMAL;
1371 pmap->eptp = 0;
1372
1373 /* allocate PDP */
1374
1375 /*
1376 * note that there is no need to splvm to protect us from
1377 * malloc since malloc allocates out of a submap and we should
1378 * have already allocated kernel PTPs to cover the range...
1379 */
1380
1381 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
1382 pmap_pdp_ctor(pmap->pm_pdir);
1383
1384 pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & pg_frame;
1385
1386 /*
1387 * Intel CPUs need a special page table to be used during usermode
1388 * execution, one that lacks all kernel mappings.
1389 */
1390 if (cpu_meltdown) {
1391 pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK);
1392 pmap_pdp_ctor_intel(pmap->pm_pdir_intel);
1393 pmap->pm_stats.resident_count++;
1394 if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel,
1395 &pmap->pm_pdirpa_intel))
1396 panic("%s: unknown PA mapping for meltdown PML4",
1397 __func__);
1398 } else {
1399 pmap->pm_pdir_intel = NULL;
1400 pmap->pm_pdirpa_intel = 0;
1401 }
1402
1403 mtx_enter(&pmaps_lock);
1404 LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1405 mtx_leave(&pmaps_lock);
1406 return (pmap);
1407 }
1408
1409 /*
1410 * pmap_destroy: drop reference count on pmap. free pmap if
1411 * reference count goes to zero.
1412 */
1413
1414 void
1415 pmap_destroy(struct pmap *pmap)
1416 {
1417 struct vm_page *pg;
1418 int refs;
1419 int i;
1420
1421 /*
1422 * drop reference count
1423 */
1424
1425 refs = atomic_dec_int_nv(&pmap->pm_obj[0].uo_refs);
1426 if (refs > 0) {
1427 return;
1428 }
1429
1430 /*
1431 * remove it from global list of pmaps
1432 */
1433 mtx_enter(&pmaps_lock);
1434 LIST_REMOVE(pmap, pm_list);
1435 mtx_leave(&pmaps_lock);
1436
1437 /*
1438 * free any remaining PTPs
1439 */
1440
1441 for (i = 0; i < PTP_LEVELS - 1; i++) {
1442 while ((pg = RBT_ROOT(uvm_objtree,
1443 &pmap->pm_obj[i].memt)) != NULL) {
1444 KASSERT((pg->pg_flags & PG_BUSY) == 0);
1445
1446 pg->wire_count = 0;
1447 pmap->pm_stats.resident_count--;
1448
1449 uvm_pagefree(pg);
1450 }
1451 }
1452
1453 pool_put(&pmap_pdp_pool, pmap->pm_pdir);
1454
1455 if (pmap->pm_pdir_intel != NULL) {
1456 pmap->pm_stats.resident_count--;
1457 pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
1458 }
1459
1460 pool_put(&pmap_pmap_pool, pmap);
1461 }
1462
1463 /*
1464 * Add a reference to the specified pmap.
1465 */
1466
1467 void
1468 pmap_reference(struct pmap *pmap)
1469 {
1470 atomic_inc_int(&pmap->pm_obj[0].uo_refs);
1471 }
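
/*
 * Lifecycle sketch: a pmap is reference counted via pm_obj[0].uo_refs.
 * Code that needs to keep a pmap alive across a lock dance takes an
 * extra reference and drops it again, e.g. pmap_page_remove() below:
 *
 *	pmap_reference(pm);
 *	... work on pm without the pv list lock held ...
 *	pmap_destroy(pm);	   dropping the last reference frees it
 */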
1472
1473 /*
1474 * pmap_activate: activate a process' pmap (fill in %cr3)
1475 *
1476 * => called from cpu_fork() and when switching pmaps during exec
1477 * => if p is the curproc, then load it into the MMU
1478 */
1479
1480 void
1481 pmap_activate(struct proc *p)
1482 {
1483 struct pcb *pcb = &p->p_addr->u_pcb;
1484 struct pmap *pmap = p->p_vmspace->vm_map.pmap;
1485
1486 pcb->pcb_pmap = pmap;
1487 pcb->pcb_cr3 = pmap->pm_pdirpa;
1488 pcb->pcb_cr3 |= (pmap != pmap_kernel()) ? cr3_pcid_proc :
1489 (PCID_KERN | cr3_reuse_pcid);
1490
1491 if (p != curproc)
1492 return;
1493
1494 if ((p->p_flag & P_SYSTEM) == 0) {
1495 struct cpu_info *self = curcpu();
1496
1497 /* mark the pmap in use by this processor */
1498 self->ci_proc_pmap = pmap;
1499
1500 /* in case we return to userspace without context switching */
1501 if (cpu_meltdown) {
1502 self->ci_kern_cr3 = pcb->pcb_cr3 | cr3_reuse_pcid;
1503 self->ci_user_cr3 = pmap->pm_pdirpa_intel |
1504 cr3_pcid_proc_intel;
1505 }
1506 }
1507
1508 lcr3(pcb->pcb_cr3);
1509 }
1510
1511 /*
1512 * pmap_deactivate: deactivate a process' pmap
1513 */
1514
1515 void
1516 pmap_deactivate(struct proc *p)
1517 {
1518 if ((p->p_flag & P_SYSTEM) == 0) {
1519 struct cpu_info *self = curcpu();
1520
1521 /*
1522 * mark the pmap no longer in use by this processor.
1523 */
1524 KASSERT(self->ci_proc_pmap == p->p_vmspace->vm_map.pmap);
1525 self->ci_proc_pmap = NULL;
1526 }
1527 }
1528
1529 /*
1530 * end of lifecycle functions
1531 */
1532
1533 /*
1534 * some misc. functions
1535 */
1536
1537 int
1538 pmap_pdes_valid(vaddr_t va, pd_entry_t *lastpde)
1539 {
1540 int i;
1541 unsigned long index;
1542 pd_entry_t pde;
1543
1544 for (i = PTP_LEVELS; i > 1; i--) {
1545 index = pl_i(va, i);
1546 pde = normal_pdes[i - 2][index];
1547 if (!pmap_valid_entry(pde))
1548 return 0;
1549 }
1550 if (lastpde != NULL)
1551 *lastpde = pde;
1552 return 1;
1553 }
1554
1555 /*
1556 * pmap_extract: extract a PA for the given VA
1557 */
1558
1559 int
1560 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1561 {
1562 pt_entry_t *ptes, pte;
1563 int level, offs;
1564
1565 if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE &&
1566 va < PMAP_DIRECT_END) {
1567 *pap = va - PMAP_DIRECT_BASE;
1568 return 1;
1569 }
1570
1571 if (pmap != pmap_kernel())
1572 mtx_enter(&pmap->pm_mtx);
1573
1574 level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
1575 pte = ptes[offs];
1576
1577 if (pmap != pmap_kernel())
1578 mtx_leave(&pmap->pm_mtx);
1579
1580 if (__predict_true(level == 0 && pmap_valid_entry(pte))) {
1581 if (pap != NULL)
1582 *pap = (pte & pg_frame) | (va & PAGE_MASK);
1583 return 1;
1584 }
1585 if (level == 1 && (pte & (PG_PS|PG_V)) == (PG_PS|PG_V)) {
1586 if (pap != NULL)
1587 *pap = (pte & pg_lgframe) | (va & PAGE_MASK_L2);
1588 return 1;
1589 }
1590
1591 return 0;
1592 }
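
/*
 * Usage sketch: pmap_extract() is the generic KVA-to-PA lookup used
 * elsewhere in this file, e.g. pmap_randomize() translating the VA of
 * a freshly allocated PML4 page (it returns 0 if no mapping exists):
 *
 *	pmap_extract(pmap_kernel(), (vaddr_t)pml4va, &pml4pa);
 */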
1593
1594 /*
1595 * pmap_zero_page: zero a page
1596 */
1597
1598 void
1599 pmap_zero_page(struct vm_page *pg)
1600 {
1601 pagezero(pmap_map_direct(pg));
1602 }
1603
1604 /*
1605 * pmap_flush_cache: flush the cache for a virtual address.
1606 */
1607 void
1608 pmap_flush_cache(vaddr_t addr, vsize_t len)
1609 {
1610 vaddr_t i;
1611
1612 if (curcpu()->ci_cflushsz == 0) {
1613 wbinvd_on_all_cpus();
1614 return;
1615 }
1616
1617 /* all cpus that have clflush also have mfence. */
1618 mfence();
1619 for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz)
1620 clflush(i);
1621 mfence();
1622 }
1623
1624 /*
1625 * pmap_copy_page: copy a page
1626 */
1627
1628 void
1629 pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg)
1630 {
1631 vaddr_t srcva = pmap_map_direct(srcpg);
1632 vaddr_t dstva = pmap_map_direct(dstpg);
1633
1634 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
1635 }
1636
1637 /*
1638 * p m a p r e m o v e f u n c t i o n s
1639 *
1640 * functions that remove mappings
1641 */
1642
1643 /*
1644 * pmap_remove_ptes: remove PTEs from a PTP
1645 *
1646 * => PTP must be mapped into KVA
1647 * => PTP should be null if pmap == pmap_kernel()
1648 */
1649
1650 void
1651 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1652 vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
1653 {
1654 struct pv_entry *pve;
1655 pt_entry_t *pte = (pt_entry_t *) ptpva;
1656 struct vm_page *pg;
1657 pt_entry_t opte;
1658
1659 /*
1660 * note that ptpva points to the PTE that maps startva. this may
1661 * or may not be the first PTE in the PTP.
1662 *
1663 * we loop through the PTP while there are still PTEs to look at
1664 * and the wire_count is greater than 1 (because we use the wire_count
1665 * to keep track of the number of real PTEs in the PTP).
1666 */
1667
1668 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
1669 ; pte++, startva += PAGE_SIZE) {
1670 if (!pmap_valid_entry(*pte))
1671 continue; /* VA not mapped */
1672 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
1673 continue;
1674 }
1675
1676 /* atomically save the old PTE and zap! it */
1677 opte = pmap_pte_set(pte, 0);
1678
1679 if (opte & PG_W)
1680 pmap->pm_stats.wired_count--;
1681 pmap->pm_stats.resident_count--;
1682
1683 if (ptp != NULL)
1684 ptp->wire_count--; /* dropping a PTE */
1685
1686 pg = PHYS_TO_VM_PAGE(opte & pg_frame);
1687
1688 /*
1689 * if we are not on a pv list we are done.
1690 */
1691
1692 if ((opte & PG_PVLIST) == 0) {
1693 #ifdef DIAGNOSTIC
1694 if (pg != NULL)
1695 panic("%s: managed page without PG_PVLIST: "
1696 "va 0x%lx, opte 0x%llx", __func__,
1697 startva, opte);
1698 #endif
1699 continue;
1700 }
1701
1702 #ifdef DIAGNOSTIC
1703 if (pg == NULL)
1704 panic("%s: unmanaged page marked PG_PVLIST: "
1705 "va 0x%lx, opte 0x%llx", __func__,
1706 startva, opte);
1707 #endif
1708
1709 /* sync R/M bits */
1710 pmap_sync_flags_pte(pg, opte);
1711 pve = pmap_remove_pv(pg, pmap, startva);
1712 if (pve != NULL) {
1713 pve->pv_next = *free_pvs;
1714 *free_pvs = pve;
1715 }
1716
1717 /* end of "for" loop: time for next pte */
1718 }
1719 }
1720
1721 /*
1722 * pmap_remove_pte: remove a single PTE from a PTP
1723 *
1724 * => PTP must be mapped into KVA
1725 * => PTP should be null if pmap == pmap_kernel()
1726 * => returns true if we removed a mapping
1727 */
1728
1729 int
1730 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
1731 vaddr_t va, int flags, struct pv_entry **free_pvs)
1732 {
1733 struct pv_entry *pve;
1734 struct vm_page *pg;
1735 pt_entry_t opte;
1736
1737 if (!pmap_valid_entry(*pte))
1738 return 0; /* VA not mapped */
1739 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
1740 return 0;
1741 }
1742
1743 /* atomically save the old PTE and zap! it */
1744 opte = pmap_pte_set(pte, 0);
1745
1746 if (opte & PG_W)
1747 pmap->pm_stats.wired_count--;
1748 pmap->pm_stats.resident_count--;
1749
1750 if (ptp != NULL)
1751 ptp->wire_count--; /* dropping a PTE */
1752
1753 pg = PHYS_TO_VM_PAGE(opte & pg_frame);
1754
1755 /*
1756 * if we are not on a pv list we are done.
1757 */
1758 if ((opte & PG_PVLIST) == 0) {
1759 #ifdef DIAGNOSTIC
1760 if (pg != NULL)
1761 panic("%s: managed page without PG_PVLIST: "
1762 "va 0x%lx, opte 0x%llx", __func__, va, opte);
1763 #endif
1764 return 1;
1765 }
1766
1767 #ifdef DIAGNOSTIC
1768 if (pg == NULL)
1769 panic("%s: unmanaged page marked PG_PVLIST: "
1770 "va 0x%lx, opte 0x%llx", __func__, va, opte);
1771 #endif
1772
1773 /* sync R/M bits */
1774 pmap_sync_flags_pte(pg, opte);
1775 pve = pmap_remove_pv(pg, pmap, va);
1776 if (pve != NULL) {
1777 pve->pv_next = *free_pvs;
1778 *free_pvs = pve;
1779 }
1780
1781 return 1;
1782 }
1783
1784 /*
1785 * pmap_remove: top level mapping removal function
1786 *
1787 * => caller should not be holding any pmap locks
1788 */
1789
1790 void
1791 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
1792 {
1793 #if NVMM > 0
1794 if (pmap_is_ept(pmap))
1795 pmap_remove_ept(pmap, sva, eva);
1796 else
1797 #endif /* NVMM > 0 */
1798 pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
1799 }
1800
1801 /*
1802 * pmap_do_remove: mapping removal guts
1803 *
1804 * => caller should not be holding any pmap locks
1805 */
1806
1807 void
1808 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
1809 {
1810 pd_entry_t pde;
1811 int result;
1812 paddr_t ptppa;
1813 vaddr_t blkendva;
1814 struct vm_page *ptp;
1815 struct pv_entry *pve;
1816 struct pv_entry *free_pvs = NULL;
1817 vaddr_t va;
1818 int shootall = 0, shootself;
1819 struct pg_to_free empty_ptps;
1820 paddr_t scr3;
1821
1822 TAILQ_INIT(&empty_ptps);
1823
1824 scr3 = pmap_map_ptes(pmap);
1825 shootself = (scr3 == 0);
1826
1827 /*
1828 * removing one page? take shortcut function.
1829 */
1830
1831 if (sva + PAGE_SIZE == eva) {
1832 if (pmap_pdes_valid(sva, &pde)) {
1833
1834 /* PA of the PTP */
1835 ptppa = pde & pg_frame;
1836
1837 /* get PTP if non-kernel mapping */
1838
1839 if (pmap == pmap_kernel()) {
1840 /* we never free kernel PTPs */
1841 ptp = NULL;
1842 } else {
1843 ptp = pmap_find_ptp(pmap, sva, ptppa, 1);
1844 #ifdef DIAGNOSTIC
1845 if (ptp == NULL)
1846 panic("%s: unmanaged PTP detected "
1847 "in shortcut path", __func__);
1848 #endif
1849 }
1850
1851 /* do it! */
1852 result = pmap_remove_pte(pmap, ptp,
1853 &PTE_BASE[pl1_i(sva)], sva, flags, &free_pvs);
1854
1855 /*
1856 * if mapping removed and the PTP is no longer
1857 * being used, free it!
1858 */
1859
1860 if (result && ptp && ptp->wire_count <= 1)
1861 pmap_free_ptp(pmap, ptp, sva, &empty_ptps);
1862 pmap_tlb_shootpage(pmap, sva, shootself);
1863 pmap_unmap_ptes(pmap, scr3);
1864 pmap_tlb_shootwait();
1865 } else {
1866 pmap_unmap_ptes(pmap, scr3);
1867 }
1868
1869 goto cleanup;
1870 }
1871
1872 if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS)
1873 shootall = 1;
1874
1875 for (va = sva; va < eva; va = blkendva) {
1876 /* determine range of block */
1877 blkendva = x86_round_pdr(va + 1);
1878 if (blkendva > eva)
1879 blkendva = eva;
1880
1881 /*
1882 * XXXCDC: our PTE mappings should never be removed
1883 * with pmap_remove! if we allow this (and why would
1884 * we?) then we end up freeing the pmap's page
1885 * directory page (PDP) before we are finished using
1886 * it when we hit it in the recursive mapping. this
1887 * is BAD.
1888 *
1889 * long term solution is to move the PTEs out of user
1890 * address space and into kernel address space (up
1891 * with APTE). then we can set VM_MAXUSER_ADDRESS to
1892 * be VM_MAX_ADDRESS.
1893 */
1894
1895 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
1896 /* XXXCDC: ugly hack to avoid freeing PDP here */
1897 continue;
1898
1899 if (!pmap_pdes_valid(va, &pde))
1900 continue;
1901
1902 /* PA of the PTP */
1903 ptppa = pde & pg_frame;
1904
1905 /* get PTP if non-kernel mapping */
1906 if (pmap == pmap_kernel()) {
1907 /* we never free kernel PTPs */
1908 ptp = NULL;
1909 } else {
1910 ptp = pmap_find_ptp(pmap, va, ptppa, 1);
1911 #ifdef DIAGNOSTIC
1912 if (ptp == NULL)
1913 panic("%s: unmanaged PTP detected", __func__);
1914 #endif
1915 }
1916 pmap_remove_ptes(pmap, ptp, (vaddr_t)&PTE_BASE[pl1_i(va)],
1917 va, blkendva, flags, &free_pvs);
1918
1919 /* if PTP is no longer being used, free it! */
1920 if (ptp && ptp->wire_count <= 1) {
1921 pmap_free_ptp(pmap, ptp, va, &empty_ptps);
1922 }
1923 }
1924
1925 if (shootall)
1926 pmap_tlb_shoottlb(pmap, shootself);
1927 else
1928 pmap_tlb_shootrange(pmap, sva, eva, shootself);
1929
1930 pmap_unmap_ptes(pmap, scr3);
1931 pmap_tlb_shootwait();
1932
1933 cleanup:
1934 while ((pve = free_pvs) != NULL) {
1935 free_pvs = pve->pv_next;
1936 pool_put(&pmap_pv_pool, pve);
1937 }
1938
1939 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1940 TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1941 uvm_pagefree(ptp);
1942 }
1943 }
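/*
 * Illustrative note (editor's sketch, not part of the original source):
 * a caller tearing down a single page mapping would typically invoke
 *
 *      pmap_remove(pmap, va, va + PAGE_SIZE);
 *
 * which takes the single-page shortcut at the top of pmap_do_remove()
 * instead of walking the per-PDE loop that handles larger ranges.
 */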
1944
1945 /*
1946 * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1947 *
1948 * => R/M bits are sync'd back to attrs
1949 */
1950
1951 void
1952 pmap_page_remove(struct vm_page *pg)
1953 {
1954 struct pv_entry *pve;
1955 struct pmap *pm;
1956 pt_entry_t opte;
1957 #ifdef DIAGNOSTIC
1958 pd_entry_t pde;
1959 #endif
1960 struct pg_to_free empty_ptps;
1961 struct vm_page *ptp;
1962 paddr_t scr3;
1963 int shootself;
1964
1965 TAILQ_INIT(&empty_ptps);
1966
1967 mtx_enter(&pg->mdpage.pv_mtx);
1968 while ((pve = pg->mdpage.pv_list) != NULL) {
1969 pmap_reference(pve->pv_pmap);
1970 pm = pve->pv_pmap;
1971 mtx_leave(&pg->mdpage.pv_mtx);
1972
1973 /* XXX use direct map? */
1974 scr3 = pmap_map_ptes(pm); /* locks pmap */
1975 shootself = (scr3 == 0);
1976
1977 /*
1978 * We dropped the pvlist lock before grabbing the pmap
1979 * lock to avoid lock ordering problems. This means
1980 * we have to check the pvlist again since somebody
1981 * else might have modified it. All we care about is
1982 * that the pvlist entry matches the pmap we just
1983 * locked. If it doesn't, unlock the pmap and try
1984 * again.
1985 */
1986 mtx_enter(&pg->mdpage.pv_mtx);
1987 if ((pve = pg->mdpage.pv_list) == NULL ||
1988 pve->pv_pmap != pm) {
1989 mtx_leave(&pg->mdpage.pv_mtx);
1990 pmap_unmap_ptes(pm, scr3); /* unlocks pmap */
1991 pmap_destroy(pm);
1992 mtx_enter(&pg->mdpage.pv_mtx);
1993 continue;
1994 }
1995
1996 pg->mdpage.pv_list = pve->pv_next;
1997 mtx_leave(&pg->mdpage.pv_mtx);
1998
1999 #ifdef DIAGNOSTIC
2000 if (pve->pv_ptp != NULL && pmap_pdes_valid(pve->pv_va, &pde) &&
2001 (pde & pg_frame) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
2002 printf("%s: pg=%p: va=%lx, pv_ptp=%p\n", __func__,
2003 pg, pve->pv_va, pve->pv_ptp);
2004 printf("%s: PTP's phys addr: "
2005 "actual=%lx, recorded=%lx\n", __func__,
2006 (unsigned long)(pde & pg_frame),
2007 VM_PAGE_TO_PHYS(pve->pv_ptp));
2008 panic("%s: mapped managed page has "
2009 "invalid pv_ptp field", __func__);
2010 }
2011 #endif
2012
2013 /* atomically save the old PTE and zap it */
2014 opte = pmap_pte_set(&PTE_BASE[pl1_i(pve->pv_va)], 0);
2015
2016 if (opte & PG_W)
2017 pve->pv_pmap->pm_stats.wired_count--;
2018 pve->pv_pmap->pm_stats.resident_count--;
2019
2020 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, shootself);
2021
2022 pmap_sync_flags_pte(pg, opte);
2023
2024 /* update the PTP reference count. free if last reference. */
2025 if (pve->pv_ptp != NULL) {
2026 pve->pv_ptp->wire_count--;
2027 if (pve->pv_ptp->wire_count <= 1) {
2028 pmap_free_ptp(pve->pv_pmap, pve->pv_ptp,
2029 pve->pv_va, &empty_ptps);
2030 }
2031 }
2032 pmap_unmap_ptes(pve->pv_pmap, scr3); /* unlocks pmap */
2033 pmap_destroy(pve->pv_pmap);
2034 pool_put(&pmap_pv_pool, pve);
2035 mtx_enter(&pg->mdpage.pv_mtx);
2036 }
2037 mtx_leave(&pg->mdpage.pv_mtx);
2038
2039 pmap_tlb_shootwait();
2040
2041 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
2042 TAILQ_REMOVE(&empty_ptps, ptp, pageq);
2043 uvm_pagefree(ptp);
2044 }
2045 }
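/*
 * Editor's sketch of the locking dance above (illustrative only):
 *
 *      mtx_enter(&pg->mdpage.pv_mtx);
 *      pmap_reference(pve->pv_pmap);   take a ref so the pmap can't go away
 *      mtx_leave(&pg->mdpage.pv_mtx);  drop pv lock before locking the pmap
 *      scr3 = pmap_map_ptes(pm);       locks the pmap
 *      mtx_enter(&pg->mdpage.pv_mtx);  re-take and re-check the list head
 *
 * the re-check is what makes dropping the pv lock safe: if another CPU
 * removed or replaced the head entry in the meantime, we simply unlock
 * the pmap and retry with the new head.
 */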
2046
2047 /*
2048 * p m a p a t t r i b u t e f u n c t i o n s
2049 * functions that test/change a managed page's attributes.
2050 * since a page can be mapped multiple times, we must check each PTE that
2051 * maps it by going down the pv lists.
2052 */
2053
2054 /*
2055 * pmap_test_attrs: test a page's attributes
2056 */
2057
2058 int
2059 pmap_test_attrs(struct vm_page *pg, unsigned int testbits)
2060 {
2061 struct pv_entry *pve;
2062 pt_entry_t *ptes;
2063 int level, offs;
2064 u_long mybits, testflags;
2065
2066 testflags = pmap_pte2flags(testbits);
2067
2068 if (pg->pg_flags & testflags)
2069 return 1;
2070
2071 mybits = 0;
2072 mtx_enter(&pg->mdpage.pv_mtx);
2073 for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
2074 pve = pve->pv_next) {
2075 level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
2076 &offs);
2077 mybits |= (ptes[offs] & testbits);
2078 }
2079 mtx_leave(&pg->mdpage.pv_mtx);
2080
2081 if (mybits == 0)
2082 return 0;
2083
2084 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
2085
2086 return 1;
2087 }
2088
2089 /*
2090 * pmap_clear_attrs: change a page's attributes
2091 *
2092 * => we return 1 if we cleared one of the bits we were asked to
2093 */
2094
2095 int
2096 pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits)
2097 {
2098 struct pv_entry *pve;
2099 pt_entry_t *ptes, opte;
2100 u_long clearflags;
2101 int result, level, offs;
2102
2103 clearflags = pmap_pte2flags(clearbits);
2104
2105 result = pg->pg_flags & clearflags;
2106 if (result)
2107 atomic_clearbits_int(&pg->pg_flags, clearflags);
2108
2109 mtx_enter(&pg->mdpage.pv_mtx);
2110 for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
2111 level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
2112 &offs);
2113 opte = ptes[offs];
2114 if (opte & clearbits) {
2115 result = 1;
2116 pmap_pte_clearbits(&ptes[offs], (opte & clearbits));
2117 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va,
2118 pmap_is_curpmap(pve->pv_pmap));
2119 }
2120 }
2121 mtx_leave(&pg->mdpage.pv_mtx);
2122
2123 pmap_tlb_shootwait();
2124
2125 return (result != 0);
2126 }
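/*
 * Illustrative examples (editor's note; the real wrappers live in
 * pmap.h and may differ slightly): the MI "modified"/"referenced"
 * queries are thin wrappers around the two functions above, e.g.
 *
 *      pmap_test_attrs(pg, PG_M);      has the page been written?
 *      pmap_clear_attrs(pg, PG_M);     clear dirty state everywhere
 *      pmap_test_attrs(pg, PG_U);      has the page been referenced?
 *
 * pmap_clear_attrs() also shoots down the TLB entry on every CPU that
 * might still cache the old PTE, so a cleared bit cannot silently
 * reappear from a stale TLB entry.
 */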
2127
2128 /*
2129 * p m a p p r o t e c t i o n f u n c t i o n s
2130 */
2131
2132 /*
2133 * pmap_page_protect: change the protection of all recorded mappings
2134 * of a managed page
2135 *
2136 * => NOTE: this is an inline function in pmap.h
2137 */
2138
2139 /* see pmap.h */
2140
2141 /*
2142 * pmap_protect: set the protection of the pages in a pmap
2143 *
2144 * => NOTE: this is an inline function in pmap.h
2145 */
2146
2147 /* see pmap.h */
2148
2149 /*
2150 * pmap_write_protect: write-protect pages in a pmap
2151 */
2152
2153 void
2154 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
2155 {
2156 pt_entry_t *spte, *epte;
2157 pt_entry_t clear = 0, set = 0;
2158 vaddr_t blockend;
2159 int shootall = 0, shootself;
2160 vaddr_t va;
2161 paddr_t scr3;
2162
2163 scr3 = pmap_map_ptes(pmap);
2164 shootself = (scr3 == 0);
2165
2166 /* should be ok, but just in case ... */
2167 sva &= PG_FRAME;
2168 eva &= PG_FRAME;
2169
2170 if (!(prot & PROT_READ))
2171 set |= pg_xo;
2172 if (!(prot & PROT_WRITE))
2173 clear = PG_RW;
2174 if (!(prot & PROT_EXEC))
2175 set |= pg_nx;
2176
2177 if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS)
2178 shootall = 1;
2179
2180 for (va = sva; va < eva ; va = blockend) {
2181 blockend = (va & L2_FRAME) + NBPD_L2;
2182 if (blockend > eva)
2183 blockend = eva;
2184
2185 /*
2186 * XXXCDC: our PTE mappings should never be write-protected!
2187 *
2188 * long term solution is to move the PTEs out of user
2189 * address space and into kernel address space (up
2190 * with APTE). then we can set VM_MAXUSER_ADDRESS to
2191 * be VM_MAX_ADDRESS.
2192 */
2193
2194 /* XXXCDC: ugly hack to avoid freeing PDP here */
2195 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
2196 continue;
2197
2198 /* empty block? */
2199 if (!pmap_pdes_valid(va, NULL))
2200 continue;
2201
2202 #ifdef DIAGNOSTIC
2203 if (va >= VM_MAXUSER_ADDRESS && va < VM_MAX_ADDRESS)
2204 panic("%s: PTE space", __func__);
2205 #endif
2206
2207 spte = &PTE_BASE[pl1_i(va)];
2208 epte = &PTE_BASE[pl1_i(blockend)];
2209
2210 for (/*null */; spte < epte ; spte++) {
2211 if (!pmap_valid_entry(*spte))
2212 continue;
2213 pmap_pte_clearbits(spte, clear);
2214 pmap_pte_setbits(spte, set);
2215 }
2216 }
2217
2218 if (shootall)
2219 pmap_tlb_shoottlb(pmap, shootself);
2220 else
2221 pmap_tlb_shootrange(pmap, sva, eva, shootself);
2222
2223 pmap_unmap_ptes(pmap, scr3);
2224 pmap_tlb_shootwait();
2225 }
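/*
 * Editor's note (illustrative only): a typical caller is uvm when it
 * needs to downgrade a range, e.g. making a region read-only ahead of
 * copy-on-write handling:
 *
 *      pmap_write_protect(pmap, sva, eva, PROT_READ);
 *
 * which clears PG_RW (and sets pg_nx, since PROT_EXEC is absent) on
 * every valid PTE in [sva, eva) and then flushes the affected TLB
 * entries, either per-range or with a full flush for large ranges.
 */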
2226
2227 /*
2228 * end of protection functions
2229 */
2230
2231 /*
2232 * pmap_unwire: clear the wired bit in the PTE
2233 *
2234 * => mapping should already be in map
2235 */
2236
2237 void
2238 pmap_unwire(struct pmap *pmap, vaddr_t va)
2239 {
2240 pt_entry_t *ptes;
2241 int level, offs;
2242
2243 level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
2244
2245 if (level == 0) {
2246
2247 #ifdef DIAGNOSTIC
2248 if (!pmap_valid_entry(ptes[offs]))
2249 panic("%s: invalid (unmapped) va 0x%lx", __func__, va);
2250 #endif
2251 if (__predict_true((ptes[offs] & PG_W) != 0)) {
2252 pmap_pte_clearbits(&ptes[offs], PG_W);
2253 pmap->pm_stats.wired_count--;
2254 }
2255 #ifdef DIAGNOSTIC
2256 else {
2257 printf("%s: wiring for pmap %p va 0x%lx "
2258 "didn't change!\n", __func__, pmap, va);
2259 }
2260 #endif
2261 }
2262 #ifdef DIAGNOSTIC
2263 else {
2264 panic("%s: invalid PDE", __func__);
2265 }
2266 #endif
2267 }
2268
2269 void
2270 pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot)
2271 {
2272 uint64_t l4idx, l3idx, l2idx, l1idx;
2273 pd_entry_t *pd, *ptp;
2274 paddr_t npa;
2275 struct pmap *pmap = pmap_kernel();
2276 pt_entry_t *ptes;
2277 int level, offs;
2278
2279 /* If CPU is secure, no need to do anything */
2280 if (!cpu_meltdown)
2281 return;
2282
2283 /* Must be kernel VA */
2284 if (va < VM_MIN_KERNEL_ADDRESS)
2285 panic("%s: invalid special mapping va 0x%lx requested",
2286 __func__, va);
2287
2288 if (pmap->pm_pdir_intel == NULL)
2289 pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool,
2290 PR_WAITOK | PR_ZERO);
2291
2292 l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */
2293 l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
2294 l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */
2295 l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */
2296
2297 DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld "
2298 "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va,
2299 (uint64_t)pa, l4idx, l3idx, l2idx, l1idx);
2300
2301 /* Start at PML4 / top level */
2302 pd = pmap->pm_pdir_intel;
2303
2304 if (pd == NULL)
2305 panic("%s: PML4 not initialized for pmap @ %p", __func__,
2306 pmap);
2307
2308 /* npa = physaddr of PDPT */
2309 npa = pd[l4idx] & PMAP_PA_MASK;
2310
2311 /* Valid PML4e for the 512GB region containing va? */
2312 if (!npa) {
2313 /* No valid PML4E - allocate PDPT page and set PML4E */
2314
2315 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
2316
2317 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2318 panic("%s: can't locate PDPT page", __func__);
2319
2320 pd[l4idx] = (npa | PG_RW | PG_V | pg_crypt);
2321
2322 DPRINTF("%s: allocated new PDPT page at phys 0x%llx, "
2323 "setting PML4e[%lld] = 0x%llx\n", __func__,
2324 (uint64_t)npa, l4idx, pd[l4idx]);
2325 }
2326
2327 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2328 if (pd == NULL)
2329 panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
2330 (uint64_t)npa);
2331
2332 /* npa = physaddr of PD page */
2333 npa = pd[l3idx] & PMAP_PA_MASK;
2334
2335 /* Valid PDPTe for the 1GB region containing va? */
2336 if (!npa) {
2337 /* No valid PDPTe - allocate PD page and set PDPTe */
2338
2339 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
2340
2341 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2342 panic("%s: can't locate PD page", __func__);
2343
2344 pd[l3idx] = (npa | PG_RW | PG_V | pg_crypt);
2345
2346 DPRINTF("%s: allocated new PD page at phys 0x%llx, "
2347 "setting PDPTe[%lld] = 0x%llx\n", __func__,
2348 (uint64_t)npa, l3idx, pd[l3idx]);
2349 }
2350
2351 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2352 if (pd == NULL)
2353 panic("%s: can't locate PD page @ pa=0x%llx", __func__,
2354 (uint64_t)npa);
2355
2356 /* npa = physaddr of PT page */
2357 npa = pd[l2idx] & PMAP_PA_MASK;
2358
2359 /* Valid PDE for the 2MB region containing va? */
2360 if (!npa) {
2361 /* No valid PDE - allocate PT page and set PDE */
2362
2363 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
2364
2365 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2366 panic("%s: can't locate PT page", __func__);
2367
2368 pd[l2idx] = (npa | PG_RW | PG_V | pg_crypt);
2369
2370 DPRINTF("%s: allocated new PT page at phys 0x%llx, "
2371 "setting PDE[%lld] = 0x%llx\n", __func__,
2372 (uint64_t)npa, l2idx, pd[l2idx]);
2373 }
2374
2375 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2376 if (pd == NULL)
2377 panic("%s: can't locate PT page @ pa=0x%llx", __func__,
2378 (uint64_t)npa);
2379
2380 DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot "
2381 "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd,
2382 (uint64_t)prot, (uint64_t)pd[l1idx]);
2383
2384 pd[l1idx] = pa | protection_codes[prot] | PG_V | PG_W | pg_crypt;
2385
2386 /*
2387 * Look up the corresponding U+K entry. If we're installing the
2388 * same PA into the U-K map then set the PG_G bit on both and copy
2389 * the cache-control bits from the U+K entry to the U-K entry.
2390 */
2391 level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
2392 if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) {
2393 if (((pd[l1idx] ^ ptes[offs]) & pg_frame) == 0) {
2394 pd[l1idx] |= PG_G | (ptes[offs] & (PG_N | PG_WT));
2395 ptes[offs] |= PG_G;
2396 } else {
2397 DPRINTF("%s: special diffing mapping at %llx\n",
2398 __func__, (long long)va);
2399 }
2400 } else
2401 DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
2402
2403 DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]);
2404 }
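/*
 * Editor's sketch (not part of the original source): for a kernel va,
 * the four indices extracted above select one entry per paging level:
 *
 *      l4idx = (va & L4_MASK) >> L4_SHIFT;     512GB slot (PML4E)
 *      l3idx = (va & L3_MASK) >> L3_SHIFT;     1GB slot   (PDPTE)
 *      l2idx = (va & L2_MASK) >> L2_SHIFT;     2MB slot   (PDE)
 *      l1idx = (va & L1_MASK) >> L1_SHIFT;     4KB slot   (PTE)
 *
 * pmap_enter_special() walks pm_pdir_intel with these indices,
 * allocating any missing intermediate page on the way down, so the
 * special mapping becomes visible in the Meltdown U-K page tables as
 * well as in the regular U+K tables.
 */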
2405
2406 #if NVMM > 0
2407 /*
2408 * pmap_convert
2409 *
2410 * Converts 'pmap' to the new 'mode'.
2411 *
2412 * Parameters:
2413 * pmap: the pmap to convert
2414 * mode: the new mode (see pmap.h, PMAP_TYPE_xxx)
2415 */
2416 void
2417 pmap_convert(struct pmap *pmap, int mode)
2418 {
2419 pt_entry_t *pte;
2420
2421 mtx_enter(&pmap->pm_mtx);
2422 pmap->pm_type = mode;
2423
2424 if (pmap_is_ept(pmap)) {
2425 /* Clear PML4 */
2426 pte = (pt_entry_t *)pmap->pm_pdir;
2427 memset(pte, 0, PAGE_SIZE);
2428
2429 /* Give back the meltdown pdir */
2430 if (pmap->pm_pdir_intel != NULL) {
2431 pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
2432 pmap->pm_pdir_intel = NULL;
2433 }
2434 }
2435 mtx_leave(&pmap->pm_mtx);
2436 }
2437
2438 void
2439 pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa)
2440 {
2441 vaddr_t v;
2442
2443 mtx_enter(&pmap->pm_mtx);
2444
2445 DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa,
2446 (uint64_t)egpa);
2447 for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE)
2448 pmap_do_remove_ept(pmap, v);
2449
2450 pmap_shootept(pmap, 1);
2451
2452 mtx_leave(&pmap->pm_mtx);
2453
2454 pmap_tlb_shootwait();
2455 }
2456
2457 void
2458 pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
2459 {
2460 uint64_t l4idx, l3idx, l2idx, l1idx;
2461 struct vm_page *pg3, *pg2, *pg1;
2462 paddr_t npa3, npa2, npa1;
2463 pd_entry_t *pd4, *pd3, *pd2, *pd1;
2464 pd_entry_t *pptes;
2465
2466 MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
2467
2468 l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */
2469 l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
2470 l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
2471 l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
2472
2473 /* Start at PML4 / top level */
2474 pd4 = (pd_entry_t *)pmap->pm_pdir;
2475
2476 if (pd4 == NULL)
2477 return;
2478
2479 /* npa3 = physaddr of PDPT */
2480 npa3 = pd4[l4idx] & PMAP_PA_MASK;
2481 if (!npa3)
2482 return;
2483 pd3 = (pd_entry_t *)PMAP_DIRECT_MAP(npa3);
2484 pg3 = PHYS_TO_VM_PAGE(npa3);
2485
2486 /* npa2 = physaddr of PD page */
2487 npa2 = pd3[l3idx] & PMAP_PA_MASK;
2488 if (!npa2)
2489 return;
2490 pd2 = (pd_entry_t *)PMAP_DIRECT_MAP(npa2);
2491 pg2 = PHYS_TO_VM_PAGE(npa2);
2492
2493 /* npa1 = physaddr of PT page */
2494 npa1 = pd2[l2idx] & PMAP_PA_MASK;
2495 if (!npa1)
2496 return;
2497 pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1);
2498 pg1 = PHYS_TO_VM_PAGE(npa1);
2499
2500 if (pd1[l1idx] == 0)
2501 return;
2502
2503 pd1[l1idx] = 0;
2504 pg1->wire_count--;
2505 pmap->pm_stats.resident_count--;
2506
2507 if (pg1->wire_count > 1)
2508 return;
2509
2510 pg1->wire_count = 0;
2511 pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa2);
2512 pptes[l2idx] = 0;
2513 uvm_pagefree(pg1);
2514 pmap->pm_stats.resident_count--;
2515
2516 pg2->wire_count--;
2517 if (pg2->wire_count > 1)
2518 return;
2519
2520 pg2->wire_count = 0;
2521 pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa3);
2522 pptes[l3idx] = 0;
2523 uvm_pagefree(pg2);
2524 pmap->pm_stats.resident_count--;
2525
2526 pg3->wire_count--;
2527 if (pg3->wire_count > 1)
2528 return;
2529
2530 pg3->wire_count = 0;
2531 pptes = pd4;
2532 pptes[l4idx] = 0;
2533 uvm_pagefree(pg3);
2534 pmap->pm_stats.resident_count--;
2535 }
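/*
 * Editor's note (illustrative): the wire_count convention used above is
 * "1 for the page itself plus 1 per valid entry", so a paging structure
 * whose count drops to 1 no longer holds any mappings and can be
 * unlinked from its parent and freed; that is why the teardown cascades
 * upward level by level (PT -> PD -> PDPT).
 */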
2536
2537 int
2538 pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot)
2539 {
2540 uint64_t l4idx, l3idx, l2idx, l1idx;
2541 pd_entry_t *pd, npte;
2542 struct vm_page *ptp, *pptp;
2543 paddr_t npa;
2544 struct uvm_object *obj;
2545 int ret = 0;
2546
2547 if (gpa > MAXDSIZ)
2548 return ENOMEM;
2549
2550 l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */
2551 l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
2552 l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
2553 l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
2554
2555 mtx_enter(&pmap->pm_mtx);
2556
2557 /* Start at PML4 / top level */
2558 pd = (pd_entry_t *)pmap->pm_pdir;
2559
2560 if (pd == NULL) {
2561 ret = ENOMEM;
2562 goto unlock;
2563 }
2564
2565 /* npa = physaddr of PDPT */
2566 npa = pd[l4idx] & PMAP_PA_MASK;
2567
2568 /* Valid PML4e for the 512GB region containing gpa? */
2569 if (!npa) {
2570 /* No valid PML4e - allocate PDPT page and set PML4e */
2571 obj = &pmap->pm_obj[2]; /* PML4 UVM object */
2572 ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 3), NULL,
2573 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2574
2575 if (ptp == NULL) {
2576 ret = ENOMEM;
2577 goto unlock;
2578 }
2579 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
2580
2581 /*
2582 * New PDPT page - we are setting the first entry, so set
2583 * the wired count to 1
2584 */
2585 ptp->wire_count = 1;
2586
2587 /* Calculate phys address of this new PDPT page */
2588 npa = VM_PAGE_TO_PHYS(ptp);
2589
2590 /*
2591 * Higher levels get full perms; specific permissions are
2592 * entered at the lowest level.
2593 */
2594 pd[l4idx] = (npa | EPT_R | EPT_W | EPT_X);
2595
2596 pmap->pm_stats.resident_count++;
2597
2598 pptp = ptp;
2599 } else {
2600 /* Already allocated PML4e */
2601 pptp = PHYS_TO_VM_PAGE(npa);
2602 }
2603
2604 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2605 if (pd == NULL)
2606 panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
2607 (uint64_t)npa);
2608
2609 /* npa = physaddr of PD page */
2610 npa = pd[l3idx] & PMAP_PA_MASK;
2611
2612 /* Valid PDPTe for the 1GB region containing gpa? */
2613 if (!npa) {
2614 /* No valid PDPTe - allocate PD page and set PDPTe */
2615 obj = &pmap->pm_obj[1]; /* PDPT UVM object */
2616 ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 2), NULL,
2617 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2618
2619 if (ptp == NULL) {
2620 ret = ENOMEM;
2621 goto unlock;
2622 }
2623 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
2624
2625 /*
2626 * New PD page - we are setting the first entry, so set
2627 * the wired count to 1
2628 */
2629 ptp->wire_count = 1;
2630 pptp->wire_count++;
2631
2632 npa = VM_PAGE_TO_PHYS(ptp);
2633
2634 /*
2635 * Higher levels get full perms; specific permissions are
2636 * entered at the lowest level.
2637 */
2638 pd[l3idx] = (npa | EPT_R | EPT_W | EPT_X);
2639
2640 pmap->pm_stats.resident_count++;
2641
2642 pptp = ptp;
2643 } else {
2644 /* Already allocated PDPTe */
2645 pptp = PHYS_TO_VM_PAGE(npa);
2646 }
2647
2648 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2649 if (pd == NULL)
2650 panic("%s: can't locate PD page @ pa=0x%llx", __func__,
2651 (uint64_t)npa);
2652
2653 /* npa = physaddr of PT page */
2654 npa = pd[l2idx] & PMAP_PA_MASK;
2655
2656 /* Valid PDE for the 2MB region containing gpa? */
2657 if (!npa) {
2658 /* No valid PDE - allocate PT page and set PDE */
2659 obj = &pmap->pm_obj[0]; /* PDE UVM object */
2660 ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 1), NULL,
2661 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2662
2663 if (ptp == NULL) {
2664 ret = ENOMEM;
2665 goto unlock;
2666 }
2667 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
2668
2669 ptp->wire_count = 1;
2670 pptp->wire_count++;
2671
2672 npa = VM_PAGE_TO_PHYS(ptp);
2673
2674 /*
2675 * Higher levels get full perms; specific permissions are
2676 * entered at the lowest level.
2677 */
2678 pd[l2idx] = (npa | EPT_R | EPT_W | EPT_X);
2679
2680 pmap->pm_stats.resident_count++;
2681
2682 } else {
2683 /* Find final ptp */
2684 ptp = PHYS_TO_VM_PAGE(npa);
2685 if (ptp == NULL)
2686 panic("%s: ptp page vanished?", __func__);
2687 }
2688
2689 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2690 if (pd == NULL)
2691 panic("%s: can't locate PT page @ pa=0x%llx", __func__,
2692 (uint64_t)npa);
2693
2694 npte = hpa | EPT_WB;
2695 if (prot & PROT_READ)
2696 npte |= EPT_R;
2697 if (prot & PROT_WRITE)
2698 npte |= EPT_W;
2699 if (prot & PROT_EXEC)
2700 npte |= EPT_X;
2701
2702 if (pd[l1idx] == 0) {
2703 ptp->wire_count++;
2704 pmap->pm_stats.resident_count++;
2705 } else {
2706 /* XXX flush ept */
2707 }
2708
2709 pd[l1idx] = npte;
2710
2711 unlock:
2712 mtx_leave(&pmap->pm_mtx);
2713
2714 return ret;
2715 }
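/*
 * Editor's sketch (illustrative only): vmm(4) backs guest-physical
 * memory with host pages and installs the translations here, roughly
 *
 *      pmap_enter_ept(pmap, gpa, hpa, PROT_READ | PROT_WRITE);
 *
 * intermediate EPT levels are always created with EPT_R|EPT_W|EPT_X;
 * the requested protection only restricts the leaf entry, and the
 * memory type is fixed to write-back (EPT_WB).
 */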
2716 #endif /* NVMM > 0 */
2717
2718 /*
2719 * pmap_enter: enter a mapping into a pmap
2720 *
2721 * => must be done "now" ... no lazy-evaluation
2722 */
2723
2724 int
2725 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
2726 {
2727 pt_entry_t opte, npte;
2728 struct vm_page *ptp, *pg = NULL;
2729 struct pv_entry *pve, *opve = NULL;
2730 int ptpdelta, wireddelta, resdelta;
2731 int wired = (flags & PMAP_WIRED) != 0;
2732 int crypt = (flags & PMAP_NOCRYPT) == 0;
2733 int nocache = (pa & PMAP_NOCACHE) != 0;
2734 int wc = (pa & PMAP_WC) != 0;
2735 int error, shootself;
2736 paddr_t scr3;
2737
2738 #if NVMM > 0
2739 if (pmap_is_ept(pmap))
2740 return pmap_enter_ept(pmap, va, pa, prot);
2741 #endif /* NVMM > 0 */
2742
2743 KASSERT(!(wc && nocache));
2744 pa &= PMAP_PA_MASK;
2745
2746 #ifdef DIAGNOSTIC
2747 if (va == (vaddr_t) PDP_BASE)
2748 panic("%s: trying to map over PDP!", __func__);
2749
2750 /* sanity check: kernel PTPs should already have been pre-allocated */
2751 if (va >= VM_MIN_KERNEL_ADDRESS &&
2752 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
2753 panic("%s: missing kernel PTP for va %lx!", __func__, va);
2754
2755 #endif
2756
2757 pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
2758 if (pve == NULL) {
2759 if (flags & PMAP_CANFAIL) {
2760 error = ENOMEM;
2761 goto out;
2762 }
2763 panic("%s: no pv entries available", __func__);
2764 }
2765
2766 /*
2767 * map in ptes and get a pointer to our PTP (unless we are the kernel)
2768 */
2769
2770 scr3 = pmap_map_ptes(pmap);
2771 shootself = (scr3 == 0);
2772 if (pmap == pmap_kernel()) {
2773 ptp = NULL;
2774 } else {
2775 ptp = pmap_get_ptp(pmap, va);
2776 if (ptp == NULL) {
2777 if (flags & PMAP_CANFAIL) {
2778 pmap_unmap_ptes(pmap, scr3);
2779 error = ENOMEM;
2780 goto out;
2781 }
2782 panic("%s: get ptp failed", __func__);
2783 }
2784 }
2785 opte = PTE_BASE[pl1_i(va)]; /* old PTE */
2786
2787 /*
2788 * is there currently a valid mapping at our VA?
2789 */
2790
2791 if (pmap_valid_entry(opte)) {
2792 /*
2793 * first, calculate pm_stats updates. resident count will not
2794 * change since we are replacing/changing a valid mapping.
2795 * wired count might change...
2796 */
2797
2798 resdelta = 0;
2799 if (wired && (opte & PG_W) == 0)
2800 wireddelta = 1;
2801 else if (!wired && (opte & PG_W) != 0)
2802 wireddelta = -1;
2803 else
2804 wireddelta = 0;
2805 ptpdelta = 0;
2806
2807 /*
2808 * is the currently mapped PA the same as the one we
2809 * want to map?
2810 */
2811
2812 if ((opte & pg_frame) == pa) {
2813
2814 /* if this is on the PVLIST, sync R/M bit */
2815 if (opte & PG_PVLIST) {
2816 pg = PHYS_TO_VM_PAGE(pa);
2817 #ifdef DIAGNOSTIC
2818 if (pg == NULL)
2819 panic("%s: same pa, PG_PVLIST "
2820 "mapping with unmanaged page: "
2821 "va 0x%lx, opte 0x%llx, pa 0x%lx",
2822 __func__, va, opte, pa);
2823 #endif
2824 pmap_sync_flags_pte(pg, opte);
2825 } else {
2826 #ifdef DIAGNOSTIC
2827 if (PHYS_TO_VM_PAGE(pa) != NULL)
2828 panic("%s: same pa, no PG_PVLIST "
2829 "mapping with managed page: "
2830 "va 0x%lx, opte 0x%llx, pa 0x%lx",
2831 __func__, va, opte, pa);
2832 #endif
2833 }
2834 goto enter_now;
2835 }
2836
2837 /*
2838 * changing PAs: we must remove the old one first
2839 */
2840
2841 /*
2842 * if current mapping is on a pvlist,
2843 * remove it (sync R/M bits)
2844 */
2845
2846 if (opte & PG_PVLIST) {
2847 pg = PHYS_TO_VM_PAGE(opte & pg_frame);
2848 #ifdef DIAGNOSTIC
2849 if (pg == NULL)
2850 panic("%s: PG_PVLIST mapping with unmanaged "
2851 "page: va 0x%lx, opte 0x%llx, pa 0x%lx",
2852 __func__, va, opte, pa);
2853 #endif
2854 pmap_sync_flags_pte(pg, opte);
2855 opve = pmap_remove_pv(pg, pmap, va);
2856 pg = NULL; /* This is not the page we are looking for */
2857 }
2858 } else { /* opte not valid */
2859 resdelta = 1;
2860 if (wired)
2861 wireddelta = 1;
2862 else
2863 wireddelta = 0;
2864 if (ptp != NULL)
2865 ptpdelta = 1;
2866 else
2867 ptpdelta = 0;
2868 }
2869
2870 /*
2871 * pve is either NULL or points to a now-free pv_entry structure
2872 * (the latter case is if we called pmap_remove_pv above).
2873 *
2874 * if this entry is to be on a pvlist, enter it now.
2875 */
2876
2877 if (pmap_initialized)
2878 pg = PHYS_TO_VM_PAGE(pa);
2879
2880 if (pg != NULL) {
2881 pmap_enter_pv(pg, pve, pmap, va, ptp);
2882 pve = NULL;
2883 }
2884
2885 enter_now:
2886 /*
2887 * at this point pg is !NULL if we want the PG_PVLIST bit set
2888 */
2889
2890 pmap->pm_stats.resident_count += resdelta;
2891 pmap->pm_stats.wired_count += wireddelta;
2892 if (ptp != NULL)
2893 ptp->wire_count += ptpdelta;
2894
2895 KASSERT(pg == PHYS_TO_VM_PAGE(pa));
2896
2897 npte = pa | protection_codes[prot] | PG_V;
2898 if (pg != NULL) {
2899 npte |= PG_PVLIST;
2900 /*
2901 * make sure that if the page is write combined all
2902 * instances of pmap_enter make it so.
2903 */
2904 if (pg->pg_flags & PG_PMAP_WC) {
2905 KASSERT(nocache == 0);
2906 wc = 1;
2907 }
2908 }
2909 if (wc)
2910 npte |= pmap_pg_wc;
2911 if (wired)
2912 npte |= PG_W;
2913 if (nocache)
2914 npte |= PG_N;
2915 if (va < VM_MAXUSER_ADDRESS)
2916 npte |= ((flags & PMAP_EFI) ? 0 : PG_u);
2917 else if (va < VM_MAX_ADDRESS)
2918 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */
2919 if (pmap == pmap_kernel())
2920 npte |= pg_g_kern;
2921 if (crypt)
2922 npte |= pg_crypt;
2923
2924 /*
2925 * If the old entry wasn't valid, we can just update it and
2926 * go. If it was valid, and this isn't a read->write
2927 * transition, then we can safely just update it and flush
2928 * any old TLB entries.
2929 *
2930 * If it _was_ valid and this _is_ a read->write transition,
2931 * then this could be a CoW resolution and we need to make
2932 * sure no CPU can see the new writable mapping while another
2933 * still has the old mapping in its TLB, so insert a correct
2934 * but unwritable mapping, flush any old TLB entries, then
2935 * make it writable.
2936 */
2937 if (! pmap_valid_entry(opte)) {
2938 PTE_BASE[pl1_i(va)] = npte;
2939 } else if ((opte | (npte ^ PG_RW)) & PG_RW) {
2940 /* previously writable or not making writable */
2941 PTE_BASE[pl1_i(va)] = npte;
2942 if (nocache && (opte & PG_N) == 0)
2943 wbinvd_on_all_cpus();
2944 pmap_tlb_shootpage(pmap, va, shootself);
2945 } else {
2946 PTE_BASE[pl1_i(va)] = npte ^ PG_RW;
2947 if (nocache && (opte & PG_N) == 0) /* XXX impossible? */
2948 wbinvd_on_all_cpus();
2949 pmap_tlb_shootpage(pmap, va, shootself);
2950 pmap_tlb_shootwait();
2951 PTE_BASE[pl1_i(va)] = npte;
2952 }
2953
2954 pmap_unmap_ptes(pmap, scr3);
2955 pmap_tlb_shootwait();
2956
2957 error = 0;
2958
2959 out:
2960 if (pve != NULL)
2961 pool_put(&pmap_pv_pool, pve);
2962 if (opve != NULL)
2963 pool_put(&pmap_pv_pool, opve);
2964
2965 return error;
2966 }
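/*
 * Editor's sketch of the read->write transition above (illustrative
 * only): when an existing mapping is being made writable, the update
 * is staged so no CPU can observe the writable PTE while another CPU
 * still holds the old translation in its TLB:
 *
 *      PTE_BASE[pl1_i(va)] = npte ^ PG_RW;     correct, but read-only
 *      pmap_tlb_shootpage(pmap, va, shootself);
 *      pmap_tlb_shootwait();                   all stale entries gone
 *      PTE_BASE[pl1_i(va)] = npte;             now flip on PG_RW
 */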
2967
2968 int
2969 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
2970 {
2971 struct vm_page *ptp;
2972 struct pmap *kpm = pmap_kernel();
2973
2974 if (uvm.page_init_done == 0) {
2975 vaddr_t va;
2976
2977 /*
2978 * we're growing the kernel pmap early (from
2979 * uvm_pageboot_alloc()). this case must be
2980 * handled a little differently.
2981 */
2982
2983 va = pmap_steal_memory(PAGE_SIZE, NULL, NULL);
2984 *paddrp = PMAP_DIRECT_UNMAP(va);
2985 } else {
2986 ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
2987 ptp_va2o(va, level), NULL,
2988 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2989 if (ptp == NULL)
2990 panic("%s: out of memory", __func__);
2991 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
2992 ptp->wire_count = 1;
2993 *paddrp = VM_PAGE_TO_PHYS(ptp);
2994 }
2995 kpm->pm_stats.resident_count++;
2996 return 1;
2997 }
2998
2999 /*
3000 * Allocate the amount of specified ptps for a ptp level, and populate
3001 * all levels below accordingly, mapping virtual addresses starting at
3002 * kva.
3003 *
3004 * Used by pmap_growkernel.
3005 */
3006 void
3007 pmap_alloc_level(vaddr_t kva, int lvl, long *needed_ptps)
3008 {
3009 unsigned long i;
3010 vaddr_t va;
3011 paddr_t pa;
3012 unsigned long index, endindex;
3013 int level;
3014 pd_entry_t *pdep;
3015
3016 for (level = lvl; level > 1; level--) {
3017 if (level == PTP_LEVELS)
3018 pdep = pmap_kernel()->pm_pdir;
3019 else
3020 pdep = normal_pdes[level - 2];
3021 va = kva;
3022 index = pl_i(kva, level);
3023 endindex = index + needed_ptps[level - 1];
3024 /*
3025 * XXX special case for first time call.
3026 */
3027 if (nkptp[level - 1] != 0)
3028 index++;
3029 else
3030 endindex--;
3031
3032 for (i = index; i <= endindex; i++) {
3033 pmap_get_physpage(va, level - 1, &pa);
3034 pdep[i] = pa | PG_RW | PG_V | pg_nx | pg_crypt;
3035 nkptp[level - 1]++;
3036 va += nbpd[level - 1];
3037 }
3038 }
3039 }
3040
3041 /*
3042 * pmap_growkernel: increase usage of KVM space
3043 *
3044 * => we allocate new PTPs for the kernel and install them in all
3045 * the pmaps on the system.
3046 */
3047
3048 static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS;
3049
3050 vaddr_t
3051 pmap_growkernel(vaddr_t maxkvaddr)
3052 {
3053 struct pmap *kpm = pmap_kernel(), *pm;
3054 int s, i;
3055 unsigned newpdes;
3056 long needed_kptp[PTP_LEVELS], target_nptp, old;
3057
3058 if (maxkvaddr <= pmap_maxkvaddr)
3059 return pmap_maxkvaddr;
3060
3061 maxkvaddr = x86_round_pdr(maxkvaddr);
3062 old = nkptp[PTP_LEVELS - 1];
3063 /*
3064 * This loop could be optimized more, but pmap_growkernel()
3065 * is called infrequently.
3066 */
3067 for (i = PTP_LEVELS - 1; i >= 1; i--) {
3068 target_nptp = pl_i(maxkvaddr, i + 1) -
3069 pl_i(VM_MIN_KERNEL_ADDRESS, i + 1);
3070 /*
3071 * XXX only need to check toplevel.
3072 */
3073 if (target_nptp > nkptpmax[i])
3074 panic("%s: out of KVA space", __func__);
3075 needed_kptp[i] = target_nptp - nkptp[i] + 1;
3076 }
3077
3078
3079 s = splhigh(); /* to be safe */
3080 pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
3081
3082 /*
3083 * If the number of top level entries changed, update all
3084 * pmaps.
3085 */
3086 if (needed_kptp[PTP_LEVELS - 1] != 0) {
3087 newpdes = nkptp[PTP_LEVELS - 1] - old;
3088 mtx_enter(&pmaps_lock);
3089 LIST_FOREACH(pm, &pmaps, pm_list) {
3090 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
3091 &kpm->pm_pdir[PDIR_SLOT_KERN + old],
3092 newpdes * sizeof (pd_entry_t));
3093 }
3094 mtx_leave(&pmaps_lock);
3095 }
3096 pmap_maxkvaddr = maxkvaddr;
3097 splx(s);
3098
3099 return maxkvaddr;
3100 }
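/*
 * Editor's note (illustrative; the exact caller and variable names are
 * assumptions): pmap_growkernel() is normally driven by uvm when kernel
 * virtual space fills up, along the lines of
 *
 *      if (uvm_maxkaddr < new_end)
 *              uvm_maxkaddr = pmap_growkernel(new_end);
 *
 * the important part is that any new top-level entries created here are
 * copied into every existing pmap, so all page tables keep seeing the
 * same kernel mappings.
 */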
3101
3102 vaddr_t
3103 pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end)
3104 {
3105 int segno;
3106 u_int npg;
3107 vaddr_t va;
3108 paddr_t pa;
3109 struct vm_physseg *seg;
3110
3111 size = round_page(size);
3112 npg = atop(size);
3113
3114 for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) {
3115 if (seg->avail_end - seg->avail_start < npg)
3116 continue;
3117 /*
3118 * We can only steal at an ``unused'' segment boundary,
3119 * i.e. either at the start or at the end.
3120 */
3121 if (seg->avail_start == seg->start ||
3122 seg->avail_end == seg->end)
3123 break;
3124 }
3125 if (segno == vm_nphysseg) {
3126 panic("%s: out of memory", __func__);
3127 } else {
3128 if (seg->avail_start == seg->start) {
3129 pa = ptoa(seg->avail_start);
3130 seg->avail_start += npg;
3131 seg->start += npg;
3132 } else {
3133 pa = ptoa(seg->avail_end) - size;
3134 seg->avail_end -= npg;
3135 seg->end -= npg;
3136 }
3137 /*
3138 * If the whole segment has now been consumed, remove it.
3139 * Note that the crash dump code still knows about it
3140 * and will dump it correctly.
3141 */
3142 if (seg->start == seg->end) {
3143 if (vm_nphysseg-- == 1)
3144 panic("%s: out of memory", __func__);
3145 while (segno < vm_nphysseg) {
3146 seg[0] = seg[1]; /* struct copy */
3147 seg++;
3148 segno++;
3149 }
3150 }
3151
3152 va = PMAP_DIRECT_MAP(pa);
3153 memset((void *)va, 0, size);
3154 }
3155
3156 if (start != NULL)
3157 *start = virtual_avail;
3158 if (end != NULL)
3159 *end = VM_MAX_KERNEL_ADDRESS;
3160
3161 return (va);
3162 }
3163
3164 #ifdef MULTIPROCESSOR
3165 /*
3166 * Locking for tlb shootdown.
3167 *
3168 * We lock by setting tlb_shoot_wait to the number of cpus that will
3169 * receive our tlb shootdown. After sending the IPIs, we don't need to
3170 * worry about locking order or interrupts spinning for the lock because
3171 * the call that grabs the "lock" isn't the one that releases it. And
3172 * there is nothing that can block the IPI that releases the lock.
3173 *
3174 * The functions are organized so that we first count the number of
3175 * cpus we need to send the IPI to, then we grab the counter, then
3176 * we send the IPIs, then we finally do our own shootdown.
3177 *
3178 * Our shootdown is last to make it parallel with the other cpus
3179 * to shorten the spin time.
3180 *
3181 * Notice that we depend on failures to send IPIs only being able to
3182 * happen during boot. If they happen later, the above assumption
3183 * doesn't hold since we can end up in situations where no one will
3184 * release the lock if we get an interrupt in a bad moment.
3185 */
3186 #ifdef MP_LOCKDEBUG
3187 #include <ddb/db_output.h>
3188 extern int __mp_lock_spinout;
3189 #endif
3190
3191 volatile long tlb_shoot_wait __attribute__((section(".kudata")));
3192
3193 volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
3194 volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));
3195 volatile int tlb_shoot_first_pcid __attribute__((section(".kudata")));
3196
3197 #if NVMM > 0
3198 #include <amd64/vmmvar.h>
3199 volatile uint64_t ept_shoot_mode __attribute__((section(".kudata")));
3200 volatile struct vmx_invept_descriptor ept_shoot_vid
3201 __attribute__((section(".kudata")));
3202 #endif /* NVMM > 0 */
3203
3204 /* Obtain the "lock" for TLB shooting */
3205 static inline int
3206 pmap_start_tlb_shoot(long wait, const char *func)
3207 {
3208 int s = splvm();
3209
3210 while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
3211 #ifdef MP_LOCKDEBUG
3212 int nticks = __mp_lock_spinout;
3213 #endif
3214 while (tlb_shoot_wait != 0) {
3215 CPU_BUSY_CYCLE();
3216 #ifdef MP_LOCKDEBUG
3217 if (--nticks <= 0) {
3218 db_printf("%s: spun out", func);
3219 db_enter();
3220 nticks = __mp_lock_spinout;
3221 }
3222 #endif
3223 }
3224 }
3225
3226 return s;
3227 }
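/*
 * Editor's sketch of the shootdown protocol (illustrative only), as
 * used by the pmap_tlb_shoot*() functions below:
 *
 *      wait = number of remote CPUs that must act
 *      pmap_start_tlb_shoot(wait, ...);        grab the counter "lock"
 *      store the parameters in the tlb_shoot_* globals
 *      send the IPI to each CPU in the mask
 *      do the local invalidation in parallel
 *
 * each remote CPU performs its invalidation from the IPI handler and
 * decrements tlb_shoot_wait; pmap_tlb_shootwait() spins until the
 * counter reaches zero again.
 */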
3228
3229 void
3230 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
3231 {
3232 struct cpu_info *ci, *self = curcpu();
3233 CPU_INFO_ITERATOR cii;
3234 long wait = 0;
3235 u_int64_t mask = 0;
3236 int is_kva = va >= VM_MIN_KERNEL_ADDRESS;
3237
3238 CPU_INFO_FOREACH(cii, ci) {
3239 if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
3240 continue;
3241 if (!is_kva && !pmap_is_active(pm, ci))
3242 continue;
3243 mask |= (1ULL << ci->ci_cpuid);
3244 wait++;
3245 }
3246
3247 if (wait > 0) {
3248 int s = pmap_start_tlb_shoot(wait, __func__);
3249
3250 tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
3251 tlb_shoot_addr1 = va;
3252 CPU_INFO_FOREACH(cii, ci) {
3253 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3254 continue;
3255 if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
3256 panic("%s: ipi failed", __func__);
3257 }
3258 splx(s);
3259 }
3260
3261 if (!pmap_use_pcid) {
3262 if (shootself)
3263 pmap_update_pg(va);
3264 } else if (is_kva) {
3265 invpcid(INVPCID_ADDR, PCID_PROC, va);
3266 invpcid(INVPCID_ADDR, PCID_KERN, va);
3267 } else if (shootself) {
3268 invpcid(INVPCID_ADDR, PCID_PROC, va);
3269 if (cpu_meltdown)
3270 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3271 }
3272 }
3273
3274 void
3275 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
3276 {
3277 struct cpu_info *ci, *self = curcpu();
3278 CPU_INFO_ITERATOR cii;
3279 long wait = 0;
3280 u_int64_t mask = 0;
3281 int is_kva = sva >= VM_MIN_KERNEL_ADDRESS;
3282 vaddr_t va;
3283
3284 CPU_INFO_FOREACH(cii, ci) {
3285 if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
3286 continue;
3287 if (!is_kva && !pmap_is_active(pm, ci))
3288 continue;
3289 mask |= (1ULL << ci->ci_cpuid);
3290 wait++;
3291 }
3292
3293 if (wait > 0) {
3294 int s = pmap_start_tlb_shoot(wait, __func__);
3295
3296 tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
3297 tlb_shoot_addr1 = sva;
3298 tlb_shoot_addr2 = eva;
3299 CPU_INFO_FOREACH(cii, ci) {
3300 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3301 continue;
3302 if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
3303 panic("%s: ipi failed", __func__);
3304 }
3305 splx(s);
3306 }
3307
3308 if (!pmap_use_pcid) {
3309 if (shootself) {
3310 for (va = sva; va < eva; va += PAGE_SIZE)
3311 pmap_update_pg(va);
3312 }
3313 } else if (is_kva) {
3314 for (va = sva; va < eva; va += PAGE_SIZE) {
3315 invpcid(INVPCID_ADDR, PCID_PROC, va);
3316 invpcid(INVPCID_ADDR, PCID_KERN, va);
3317 }
3318 } else if (shootself) {
3319 if (cpu_meltdown) {
3320 for (va = sva; va < eva; va += PAGE_SIZE) {
3321 invpcid(INVPCID_ADDR, PCID_PROC, va);
3322 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3323 }
3324 } else {
3325 for (va = sva; va < eva; va += PAGE_SIZE)
3326 invpcid(INVPCID_ADDR, PCID_PROC, va);
3327 }
3328 }
3329 }
3330
3331 void
3332 pmap_tlb_shoottlb(struct pmap *pm, int shootself)
3333 {
3334 struct cpu_info *ci, *self = curcpu();
3335 CPU_INFO_ITERATOR cii;
3336 long wait = 0;
3337 u_int64_t mask = 0;
3338
3339 KASSERT(pm != pmap_kernel());
3340
3341 CPU_INFO_FOREACH(cii, ci) {
3342 if (ci == self || !pmap_is_active(pm, ci) ||
3343 !(ci->ci_flags & CPUF_RUNNING))
3344 continue;
3345 mask |= (1ULL << ci->ci_cpuid);
3346 wait++;
3347 }
3348
3349 if (wait) {
3350 int s = pmap_start_tlb_shoot(wait, __func__);
3351 CPU_INFO_FOREACH(cii, ci) {
3352 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3353 continue;
3354 if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
3355 panic("%s: ipi failed", __func__);
3356 }
3357 splx(s);
3358 }
3359
3360 if (shootself) {
3361 if (!pmap_use_pcid)
3362 tlbflush();
3363 else {
3364 invpcid(INVPCID_PCID, PCID_PROC, 0);
3365 if (cpu_meltdown)
3366 invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
3367 }
3368 }
3369 }
3370
3371 #if NVMM > 0
3372 /*
3373 * pmap_shootept: similar to pmap_tlb_shoottlb, but for remotely invalidating
3374 * EPT using invept.
3375 */
3376 void
3377 pmap_shootept(struct pmap *pm, int shootself)
3378 {
3379 struct cpu_info *ci, *self = curcpu();
3380 struct vmx_invept_descriptor vid;
3381 CPU_INFO_ITERATOR cii;
3382 long wait = 0;
3383 u_int64_t mask = 0;
3384
3385 KASSERT(pmap_is_ept(pm));
3386
3387 CPU_INFO_FOREACH(cii, ci) {
3388 if (ci == self || !pmap_is_active(pm, ci) ||
3389 !(ci->ci_flags & CPUF_RUNNING) ||
3390 !(ci->ci_flags & CPUF_VMM))
3391 continue;
3392 mask |= (1ULL << ci->ci_cpuid);
3393 wait++;
3394 }
3395
3396 if (wait) {
3397 int s = pmap_start_tlb_shoot(wait, __func__);
3398
3399 ept_shoot_mode = self->ci_vmm_cap.vcc_vmx.vmx_invept_mode;
3400 ept_shoot_vid.vid_eptp = pm->eptp;
3401 ept_shoot_vid.vid_reserved = 0;
3402
3403 CPU_INFO_FOREACH(cii, ci) {
3404 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3405 continue;
3406 if (x86_fast_ipi(ci, LAPIC_IPI_INVEPT) != 0)
3407 panic("%s: ipi failed", __func__);
3408 }
3409
3410 splx(s);
3411 }
3412
3413 if (shootself && (self->ci_flags & CPUF_VMM)) {
3414 vid.vid_eptp = pm->eptp;
3415 vid.vid_reserved = 0;
3416 invept(self->ci_vmm_cap.vcc_vmx.vmx_invept_mode, &vid);
3417 }
3418 }
3419 #endif /* NVMM > 0 */
3420
3421 void
3422 pmap_tlb_shootwait(void)
3423 {
3424 #ifdef MP_LOCKDEBUG
3425 int nticks = __mp_lock_spinout;
3426 #endif
3427 while (tlb_shoot_wait != 0) {
3428 CPU_BUSY_CYCLE();
3429 #ifdef MP_LOCKDEBUG
3430 if (--nticks <= 0) {
3431 db_printf("%s: spun out", __func__);
3432 db_enter();
3433 nticks = __mp_lock_spinout;
3434 }
3435 #endif
3436 }
3437 }
3438
3439 #else /* MULTIPROCESSOR */
3440
3441 void
3442 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
3443 {
3444 if (!pmap_use_pcid) {
3445 if (shootself)
3446 pmap_update_pg(va);
3447 } else if (va >= VM_MIN_KERNEL_ADDRESS) {
3448 invpcid(INVPCID_ADDR, PCID_PROC, va);
3449 invpcid(INVPCID_ADDR, PCID_KERN, va);
3450 } else if (shootself) {
3451 invpcid(INVPCID_ADDR, PCID_PROC, va);
3452 if (cpu_meltdown)
3453 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3454 }
3455 }
3456
3457 void
3458 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
3459 {
3460 vaddr_t va;
3461
3462 if (!pmap_use_pcid) {
3463 if (shootself) {
3464 for (va = sva; va < eva; va += PAGE_SIZE)
3465 pmap_update_pg(va);
3466 }
3467 } else if (sva >= VM_MIN_KERNEL_ADDRESS) {
3468 for (va = sva; va < eva; va += PAGE_SIZE) {
3469 invpcid(INVPCID_ADDR, PCID_PROC, va);
3470 invpcid(INVPCID_ADDR, PCID_KERN, va);
3471 }
3472 } else if (shootself) {
3473 if (cpu_meltdown) {
3474 for (va = sva; va < eva; va += PAGE_SIZE) {
3475 invpcid(INVPCID_ADDR, PCID_PROC, va);
3476 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3477 }
3478 } else {
3479 for (va = sva; va < eva; va += PAGE_SIZE)
3480 invpcid(INVPCID_ADDR, PCID_PROC, va);
3481 }
3482 }
3483 }
3484
3485 void
3486 pmap_tlb_shoottlb(struct pmap *pm, int shootself)
3487 {
3488 if (shootself) {
3489 if (!pmap_use_pcid)
3490 tlbflush();
3491 else {
3492 invpcid(INVPCID_PCID, PCID_PROC, 0);
3493 if (cpu_meltdown)
3494 invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
3495 }
3496 }
3497 }
3498
3499 #if NVMM > 0
3500 void
3501 pmap_shootept(struct pmap *pm, int shootself)
3502 {
3503 struct cpu_info *self = curcpu();
3504 struct vmx_invept_descriptor vid;
3505
3506 KASSERT(pmap_is_ept(pm));
3507
3508 if (shootself && (self->ci_flags & CPUF_VMM)) {
3509 vid.vid_eptp = pm->eptp;
3510 vid.vid_reserved = 0;
3511 invept(self->ci_vmm_cap.vcc_vmx.vmx_invept_mode, &vid);
3512 }
3513 }
3514 #endif /* NVMM > 0 */
3515
3516 #endif /* MULTIPROCESSOR */
3517