1 /*	$OpenBSD: pmap.c,v 1.178 2024/11/02 07:58:58 mpi Exp $	*/
2 /*	$NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $	*/
3 
4 /*
5  * Copyright (c) 1997 Charles D. Cranor and Washington University.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright 2001 (c) Wasabi Systems, Inc.
31  * All rights reserved.
32  *
33  * Written by Frank van der Linden for Wasabi Systems, Inc.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgement:
45  *      This product includes software developed for the NetBSD Project by
46  *      Wasabi Systems, Inc.
47  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
48  *    or promote products derived from this software without specific prior
49  *    written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
53  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
54  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
55  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
56  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
57  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
58  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
59  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
60  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
61  * POSSIBILITY OF SUCH DAMAGE.
62  */
63 
64 /*
65  * This is the i386 pmap modified and generalized to support x86-64
66  * as well. The idea is to hide the upper N levels of the page tables
67  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
68  * is mostly untouched, except that it uses some more generalized
69  * macros and interfaces.
70  *
71  * This pmap has been tested on the i386 as well, and it can be easily
72  * adapted to PAE.
73  *
74  * fvdl@wasabisystems.com 18-Jun-2001
75  */
76 
77 /*
78  * pmap.c: i386 pmap module rewrite
79  * Chuck Cranor <chuck@ccrc.wustl.edu>
80  * 11-Aug-97
81  *
82  * history of this pmap module: in addition to my own input, i used
83  *    the following references for this rewrite of the i386 pmap:
84  *
85  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
86  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
87  *     it was then ported to the i386 by William Jolitz of UUNET
88  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
89  *     project fixed some bugs and provided some speed ups.
90  *
91  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
92  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
93  *     and David Greenman.
94  *
95  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
96  *     between several processors.   the VAX version was done by
97  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
98  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
99  *     David Golub, and Richard Draves.    the alpha version was
100  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
101  *     (NetBSD/alpha).
102  */
103 
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/atomic.h>
107 #include <sys/proc.h>
108 #include <sys/pool.h>
109 #include <sys/user.h>
110 #include <sys/mutex.h>
111 
112 #include <uvm/uvm.h>
113 
114 #include <machine/cpu.h>
115 #ifdef MULTIPROCESSOR
116 #include <machine/i82489reg.h>
117 #include <machine/i82489var.h>
118 #endif
119 
120 #include "vmm.h"
121 
122 #if NVMM > 0
123 #include <machine/vmmvar.h>
124 #endif /* NVMM > 0 */
125 
126 #include "acpi.h"
127 
128 /* #define PMAP_DEBUG */
129 
130 #ifdef PMAP_DEBUG
131 #define DPRINTF(x...)   do { printf(x); } while(0)
132 #else
133 #define DPRINTF(x...)
134 #endif /* PMAP_DEBUG */
135 
136 
137 /*
138  * general info:
139  *
140  *  - for an explanation of how the i386 MMU hardware works see
141  *    the comments in <machine/pte.h>.
142  *
143  *  - for an explanation of the general memory structure used by
144  *    this pmap (including the recursive mapping), see the comments
145  *    in <machine/pmap.h>.
146  *
147  * this file contains the code for the "pmap module."   the module's
148  * job is to manage the hardware's virtual to physical address mappings.
149  * note that there are two levels of mapping in the VM system:
150  *
151  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
152  *      to map ranges of virtual address space to objects/files.  for
153  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
154  *      to the file /bin/ls starting at offset zero."   note that
155  *      the upper layer mapping is not concerned with how individual
156  *      vm_pages are mapped.
157  *
158  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
159  *      from virtual addresses.   it is concerned with which vm_page is
160  *      mapped where.   for example, when you run /bin/ls and start
161  *      at page 0x1000 the fault routine may lookup the correct page
162  *      of the /bin/ls file and then ask the pmap layer to establish
163  *      a mapping for it.
164  *
165  * note that information in the lower layer of the VM system can be
166  * thrown away since it can easily be reconstructed from the info
167  * in the upper layer.
168  *
169  * data structures we use include:
170  *  - struct pmap: describes the address space of one process
171  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
172  *  - struct pg_to_free: a list of page table pages (PTPs) queued to be
173  *	freed.   freeing is delayed until TLB shootdown completes.
174  */
175 
176 /*
177  * memory allocation
178  *
179  *  - there are three data structures that we must dynamically allocate:
180  *
181  * [A] new process' page directory page (PDP)
182  *	- plan 1: done at pmap_create() we use
183  *	  pool_get(&pmap_pmap_pool, PR_WAITOK) to do this allocation.
184  *
185  * if we are low in free physical memory then we sleep in
186  * pool_get() -- in this case this is ok since we are creating
187  * a new pmap and should not be holding any locks.
188  *
189  * XXX: the fork code currently has no way to return an "out of
190  * memory, try again" error code since uvm_fork [fka vm_fork]
191  * is a void function.
192  *
193  * [B] new page tables pages (PTP)
194  *	call uvm_pagealloc()
195  *		=> success: zero page, add to pm_pdir
196  *		=> failure: we are out of free vm_pages, let pmap_enter()
197  *		   tell UVM about it.
198  *
199  * note: for kernel PTPs, we start with NKPTP of them.   as we map
200  * kernel memory (at uvm_map time) we check to see if we've grown
201  * the kernel pmap.   if so, we call the optional function
202  * pmap_growkernel() to grow the kernel PTPs in advance.
203  *
204  * [C] pv_entry structures
205  *	- try to allocate one from the pool.
206  *	If we fail, we simply let pmap_enter() tell UVM about it.
207  */
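
/*
 * An illustrative sketch (disabled, not part of the build) of the two
 * allocation styles described above: pmap_create() may sleep for its
 * PDP (PR_WAITOK), while pv_entry allocation on the enter path must not
 * sleep and may fail (PR_NOWAIT).  The function name is made up for the
 * example.
 */
#if 0
static void
pmap_alloc_example(void)
{
	struct pmap *pm;
	struct pv_entry *pve;

	/* plan [A]: creating a pmap may sleep until memory is available */
	pm = pool_get(&pmap_pmap_pool, PR_WAITOK);

	/* plan [C]: the enter path must not sleep, so failure is possible */
	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
	if (pve == NULL) {
		/* pmap_enter() would report the failure to UVM here */
	} else
		pool_put(&pmap_pv_pool, pve);

	pool_put(&pmap_pmap_pool, pm);
}
#endif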
208 
209 long nkptp[] = NKPTP_INITIALIZER;
210 
211 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
212 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
213 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
214 const long nbpd[] = NBPD_INITIALIZER;
215 pd_entry_t *const normal_pdes[] = PDES_INITIALIZER;
216 
217 #define pmap_pte_set(p, n)		atomic_swap_64(p, n)
218 #define pmap_pte_clearbits(p, b)	x86_atomic_clearbits_u64(p, b)
219 #define pmap_pte_setbits(p, b)		x86_atomic_setbits_u64(p, b)
220 
221 /*
222  * global data structures
223  */
224 
225 struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
226 
227 /*
228  * pg_nx: NX PTE bit (if CPU supports)
229  * pg_g_kern: PG_G if global pages should be used in kernel mappings,
230  *	0 otherwise (for insecure CPUs)
231  */
232 pt_entry_t pg_nx = 0;
233 pt_entry_t pg_g_kern = 0;
234 
235 /* pg_xo: XO PTE bits, set to PKU key1 (if cpu supports PKU) */
236 pt_entry_t pg_xo;
237 
238 /* pg_crypt, pg_frame, pg_lgframe: will be derived from CPUID */
239 pt_entry_t pg_crypt = 0;
240 pt_entry_t pg_frame = PG_FRAME;
241 pt_entry_t pg_lgframe = PG_LGFRAME;
242 
243 /*
244  * pmap_pg_wc: if our processor supports PAT then we set this
245  * to be the pte bits for Write Combining. Else we fall back to
246  * UC- so mtrrs can override the cacheability;
247  */
248 int pmap_pg_wc = PG_UCMINUS;
249 
250 /*
251  * pmap_use_pcid: nonzero if PCID use is enabled (currently we require INVPCID)
252  *
253  * The next three are zero unless and until PCID support is enabled so code
254  * can just 'or' them in as needed without tests.
255  * cr3_pcid: CR3_REUSE_PCID
256  * cr3_pcid_proc and cr3_pcid_temp: PCID_PROC and PCID_TEMP
257  */
258 #if PCID_KERN != 0
259 # error "pmap.c assumes PCID_KERN is zero"
260 #endif
261 int pmap_use_pcid;
262 static u_int cr3_pcid_proc;
263 static u_int cr3_pcid_temp;
264 /* these two are accessed from locore.o */
265 paddr_t cr3_reuse_pcid;
266 paddr_t cr3_pcid_proc_intel;
267 
268 /*
269  * other data structures
270  */
271 
272 pt_entry_t protection_codes[8];     /* maps MI prot to i386 prot code */
273 int pmap_initialized = 0;	    /* pmap_init done yet? */
274 
275 /*
276  * pv management structures.
277  */
278 struct pool pmap_pv_pool;
279 
280 /*
281  * linked list of all non-kernel pmaps
282  */
283 
284 struct pmap_head pmaps;
285 struct mutex pmaps_lock = MUTEX_INITIALIZER(IPL_VM);
286 
287 /*
288  * pool that pmap structures are allocated from
289  */
290 
291 struct pool pmap_pmap_pool;
292 
293 /*
294  * When we're freeing a ptp, we need to delay the freeing until all
295  * tlb shootdown has been done. This is the list of the to-be-freed pages.
296  */
297 TAILQ_HEAD(pg_to_free, vm_page);
298 
299 /*
300  * pool that PDPs are allocated from
301  */
302 
303 struct pool pmap_pdp_pool;
304 void pmap_pdp_ctor(pd_entry_t *);
305 void pmap_pdp_ctor_intel(pd_entry_t *);
306 
307 extern vaddr_t msgbuf_vaddr;
308 extern paddr_t msgbuf_paddr;
309 
310 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
311 extern paddr_t idt_paddr;
312 
313 extern vaddr_t lo32_vaddr;
314 extern vaddr_t lo32_paddr;
315 
316 vaddr_t virtual_avail;
317 extern int end;
318 
319 /*
320  * local prototypes
321  */
322 
323 void pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *,
324     vaddr_t, struct vm_page *);
325 struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t);
326 struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
327 int pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs);
328 void pmap_free_ptp(struct pmap *, struct vm_page *,
329     vaddr_t, struct pg_to_free *);
330 void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *);
331 #ifdef MULTIPROCESSOR
332 static int pmap_is_active(struct pmap *, struct cpu_info *);
333 #endif
334 paddr_t pmap_map_ptes(struct pmap *);
335 struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t);
336 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
337 #if NVMM > 0
338 void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
339 void pmap_do_remove_ept(struct pmap *, vaddr_t);
340 int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
341 void pmap_shootept(struct pmap *, int);
342 #endif /* NVMM > 0 */
343 int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
344     vaddr_t, int, struct pv_entry **);
345 void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t,
346     vaddr_t, vaddr_t, int, struct pv_entry **);
347 #define PMAP_REMOVE_ALL		0	/* remove all mappings */
348 #define PMAP_REMOVE_SKIPWIRED	1	/* skip wired mappings */
349 
350 void pmap_unmap_ptes(struct pmap *, paddr_t);
351 int pmap_get_physpage(vaddr_t, int, paddr_t *);
352 int pmap_pdes_valid(vaddr_t, pd_entry_t *);
353 void pmap_alloc_level(vaddr_t, int, long *);
354 
355 static inline
356 void pmap_sync_flags_pte(struct vm_page *, u_long);
357 
358 void pmap_tlb_shootpage(struct pmap *, vaddr_t, int);
359 void pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t, int);
360 void pmap_tlb_shoottlb(struct pmap *, int);
361 #ifdef MULTIPROCESSOR
362 void pmap_tlb_shootwait(void);
363 #else
364 #define	pmap_tlb_shootwait()		do { } while (0)
365 #endif
366 
367 /*
368  * p m a p   i n l i n e   h e l p e r   f u n c t i o n s
369  */
370 
371 /*
372  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
373  *		of course the kernel is always loaded
374  */
375 
376 static inline int
377 pmap_is_curpmap(struct pmap *pmap)
378 {
379 	return((pmap == pmap_kernel()) ||
380 	       (pmap->pm_pdirpa == (rcr3() & CR3_PADDR)));
381 }
382 
383 /*
384  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
385  */
386 
387 #ifdef MULTIPROCESSOR
388 static inline int
389 pmap_is_active(struct pmap *pmap, struct cpu_info *ci)
390 {
391 	return (pmap == pmap_kernel() || pmap == ci->ci_proc_pmap
392 #if NVMM > 0
393 	    || (pmap_is_ept(pmap) && pmap == ci->ci_ept_pmap)
394 #endif /* NVMM > 0 */
395 	    );
396 }
397 #endif
398 
399 static inline u_int
400 pmap_pte2flags(u_long pte)
401 {
402 	return (((pte & PG_U) ? PG_PMAP_REF : 0) |
403 	    ((pte & PG_M) ? PG_PMAP_MOD : 0));
404 }
405 
406 static inline void
407 pmap_sync_flags_pte(struct vm_page *pg, u_long pte)
408 {
409 	if (pte & (PG_U|PG_M)) {
410 		atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
411 	}
412 }
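
/*
 * An illustrative sketch (disabled, not part of the build) of how the
 * hardware accessed/dirty bits in a PTE are folded into the MI page
 * flags by the helpers above.  The PTE value is hypothetical.
 */
#if 0
static void
pmap_flags_example(struct vm_page *pg)
{
	u_long pte = PG_V | PG_RW | PG_U | PG_M;	/* accessed and dirty */

	/* pmap_pte2flags() yields PG_PMAP_REF | PG_PMAP_MOD for this pte */
	pmap_sync_flags_pte(pg, pte);
}
#endif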
413 
414 /*
415  * pmap_map_ptes: map a pmap's PTEs into KVM
416  *
417  * This should not be done for EPT pmaps
418  */
419 paddr_t
420 pmap_map_ptes(struct pmap *pmap)
421 {
422 	paddr_t cr3;
423 
424 	KASSERT(!pmap_is_ept(pmap));
425 
426 	/* the kernel's pmap is always accessible */
427 	if (pmap == pmap_kernel())
428 		return 0;
429 
430 	/*
431 	 * Lock the target map before switching to its page tables to
432 	 * guarantee other CPUs have finished changing the tables before
433 	 * we potentially start caching table and TLB entries.
434 	 */
435 	mtx_enter(&pmap->pm_mtx);
436 
437 	cr3 = rcr3();
438 	KASSERT((cr3 & CR3_PCID) == PCID_KERN ||
439 		(cr3 & CR3_PCID) == PCID_PROC);
440 	if (pmap->pm_pdirpa == (cr3 & CR3_PADDR))
441 		cr3 = 0;
442 	else {
443 		cr3 |= cr3_reuse_pcid;
444 		lcr3(pmap->pm_pdirpa | cr3_pcid_temp);
445 	}
446 
447 	return cr3;
448 }
449 
450 void
451 pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
452 {
453 	if (pmap != pmap_kernel())
454 		mtx_leave(&pmap->pm_mtx);
455 
456 	if (save_cr3 != 0)
457 		lcr3(save_cr3);
458 }
459 
460 int
461 pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs)
462 {
463 	u_long mask, shift;
464 	pd_entry_t pde;
465 	paddr_t pdpa;
466 	int lev;
467 
468 	pdpa = pm->pm_pdirpa;
469 	shift = L4_SHIFT;
470 	mask = L4_MASK;
471 	for (lev = PTP_LEVELS; lev > 0; lev--) {
472 		*pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa);
473 		*offs = (VA_SIGN_POS(va) & mask) >> shift;
474 		pde = (*pd)[*offs];
475 
476 		/* Large pages are different, break early if we run into one. */
477 		if ((pde & (PG_PS|PG_V)) != PG_V)
478 			return (lev - 1);
479 
480 		pdpa = ((*pd)[*offs] & pg_frame);
481 		/* 4096/8 == 512 == 2^9 entries per level */
482 		shift -= 9;
483 		mask >>= 9;
484 	}
485 
486 	return (0);
487 }
488 
489 /*
490  * p m a p   k e n t e r   f u n c t i o n s
491  *
492  * functions to quickly enter/remove pages from the kernel address
493  * space.   pmap_kremove is exported to MI kernel.  we make use of
494  * the recursive PTE mappings.
495  */
496 
497 /*
498  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
499  *
500  * => no need to lock anything, assume va is already allocated
501  * => should be faster than normal pmap enter function
502  */
503 
504 void
505 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
506 {
507 	pt_entry_t *pte, opte, npte;
508 
509 	pte = kvtopte(va);
510 
511 	npte = (pa & PMAP_PA_MASK) | ((prot & PROT_WRITE) ? PG_RW : PG_RO) |
512 	    ((pa & PMAP_NOCACHE) ? PG_N : 0) |
513 	    ((pa & PMAP_WC) ? pmap_pg_wc : 0) | PG_V |
514 	    ((pa & PMAP_NOCRYPT) ? 0 : pg_crypt);
515 
516 	/* special 1:1 mappings in the first 2MB must not be global */
517 	if (va >= (vaddr_t)NBPD_L2)
518 		npte |= pg_g_kern;
519 
520 	if (!(prot & PROT_EXEC))
521 		npte |= pg_nx;
522 	opte = pmap_pte_set(pte, npte);
523 #ifdef LARGEPAGES
524 	/* XXX For now... */
525 	if (opte & PG_PS)
526 		panic("%s: PG_PS", __func__);
527 #endif
528 	if (pmap_valid_entry(opte)) {
529 		if ((pa & PMAP_NOCACHE && (opte & PG_N) == 0) ||
530 		    (pa & PMAP_NOCRYPT))
531 			wbinvd_on_all_cpus();
532 		/* This shouldn't happen */
533 		pmap_tlb_shootpage(pmap_kernel(), va, 1);
534 		pmap_tlb_shootwait();
535 	}
536 }
537 
538 /*
539  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
540  *
541  * => no need to lock anything
542  * => caller must dispose of any vm_page mapped in the va range
543  * => note: not an inline function
544  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
545  * => we assume kernel only unmaps valid addresses and thus don't bother
546  *    checking the valid bit before doing TLB flushing
547  */
548 
549 void
550 pmap_kremove(vaddr_t sva, vsize_t len)
551 {
552 	pt_entry_t *pte, opte;
553 	vaddr_t va, eva;
554 
555 	eva = sva + len;
556 
557 	for (va = sva; va != eva; va += PAGE_SIZE) {
558 		pte = kvtopte(va);
559 
560 		opte = pmap_pte_set(pte, 0);
561 #ifdef LARGEPAGES
562 		KASSERT((opte & PG_PS) == 0);
563 #endif
564 		KASSERT((opte & PG_PVLIST) == 0);
565 	}
566 
567 	pmap_tlb_shootrange(pmap_kernel(), sva, eva, 1);
568 	pmap_tlb_shootwait();
569 }
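
/*
 * An illustrative sketch (disabled, not part of the build) of a typical
 * pmap_kenter_pa()/pmap_kremove() pairing: map one page of KVA to a
 * known physical page, uncached, then tear the mapping down.  'va' and
 * 'pa' are assumed to be a valid page-aligned KVA and PA.
 */
#if 0
static void
pmap_kenter_example(vaddr_t va, paddr_t pa)
{
	/* PMAP_NOCACHE is or'ed into the PA to request an uncached mapping */
	pmap_kenter_pa(va, pa | PMAP_NOCACHE, PROT_READ | PROT_WRITE);

	/* ... use the mapping ... */

	pmap_kremove(va, PAGE_SIZE);
}
#endif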
570 
571 /*
572  * pmap_set_pml4_early
573  *
574  * Utility function to map 2GB of 2MB pages to 'pa'. The VA that is assigned
575  * is the pml4 entry for 'early mappings' (see pmap.h). This function is used
576  * by display drivers that need to map their framebuffers early, before the
577  * pmap is fully initialized (eg, to show panic messages).
578  *
579  * Users of this function must call pmap_clear_pml4_early to remove the
580  * mapping when finished.
581  *
582  * Parameters:
583  *  pa: phys addr to map
584  *
585  * Return value:
586  *  VA mapping to 'pa'. This mapping is 2GB in size and starts at the base
587  *   of the 2MB region containing 'va'.
588  */
589 vaddr_t
590 pmap_set_pml4_early(paddr_t pa)
591 {
592 	extern paddr_t early_pte_pages;
593 	pt_entry_t *pml4e, *pte;
594 	int i, j, off;
595 	paddr_t curpa;
596 	vaddr_t va;
597 
598 	pml4e = (pt_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
599 	pml4e[PDIR_SLOT_EARLY] = (pd_entry_t)early_pte_pages | PG_V | PG_RW |
600 	    pg_crypt;
601 
602 	off = pa & PAGE_MASK_L2;
603 	curpa = pa & L2_FRAME;
604 
605 	pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
606 	memset(pte, 0, 3 * NBPG);
607 
608 	pte[0] = (early_pte_pages + NBPG) | PG_V | PG_RW | pg_crypt;
609 	pte[1] = (early_pte_pages + 2 * NBPG) | PG_V | PG_RW | pg_crypt;
610 
611 	pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages + NBPG);
612 	for (i = 0; i < 2; i++) {
613 		/* 2 early pages of mappings */
614 		for (j = 0; j < 512; j++) {
615 			/* j[0..511] : 2MB mappings per page */
616 			pte[(i * 512) + j] = curpa | PG_V | PG_RW | PG_PS |
617 			    pg_crypt;
618 			curpa += (2 * 1024 * 1024);
619 		}
620 	}
621 
622 	va = (vaddr_t)((PDIR_SLOT_EARLY * 512ULL) << L3_SHIFT) + off;
623 	return VA_SIGN_NEG(va);
624 }
625 
626 /*
627  * pmap_clear_pml4_early
628  *
629  * Clears the mapping previously established with pmap_set_pml4_early.
630  */
631 void
632 pmap_clear_pml4_early(void)
633 {
634 	extern paddr_t early_pte_pages;
635 	pt_entry_t *pml4e, *pte;
636 
637 	pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
638 	memset(pte, 0, 3 * NBPG);
639 
640 	pml4e = (pd_entry_t *)pmap_kernel()->pm_pdir;
641 	pml4e[PDIR_SLOT_EARLY] = 0;
642 	tlbflush();
643 }
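
/*
 * An illustrative sketch (disabled, not part of the build) of how an
 * early display driver would bracket its use of the early 1:1 window
 * set up by pmap_set_pml4_early().  'fb_pa' is a hypothetical
 * framebuffer physical address.
 */
#if 0
static void
pmap_early_map_example(paddr_t fb_pa)
{
	vaddr_t fb_va;

	fb_va = pmap_set_pml4_early(fb_pa);
	/* ... write the panic message through fb_va ... */
	pmap_clear_pml4_early();
}
#endif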
644 
645 /*
646  * p m a p   i n i t   f u n c t i o n s
647  *
648  * pmap_bootstrap and pmap_init are called during system startup
649  * to init the pmap module.   pmap_bootstrap() does a low level
650  * init just to get things rolling.   pmap_init() finishes the job.
651  */
652 
653 /*
654  * pmap_bootstrap: get the system in a state where it can run with VM
655  *	properly enabled (called before main()).   the VM system is
656  *      fully init'd later...
657  */
658 
659 paddr_t
660 pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
661 {
662 	vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS;
663 	struct pmap *kpm;
664 	int curslot, i, j, p;
665 	long ndmpdp;
666 	paddr_t dmpd, dmpdp, start_cur, cur_pa;
667 	vaddr_t kva, kva_end;
668 	pt_entry_t *pml3, *pml2;
669 
670 	KASSERT(((0x1000ULL | pg_crypt) & pg_frame) == 0x1000ULL);
671 
672 	/*
673 	 * define the boundaries of the managed kernel virtual address
674 	 * space.
675 	 */
676 
677 	virtual_avail = kva_start;		/* first free KVA */
678 
679 	/*
680 	 * If PKU is available, initialize PROT_EXEC entry correctly,
681 	 * and enable the feature before it gets used
682 	 * XXX Some Hypervisors forget to save/restore PKU
683 	 */
684 	if (cpuid_level >= 0x7) {
685 		uint32_t ecx, dummy;
686 
687 		CPUID_LEAF(0x7, 0, dummy, dummy, ecx, dummy);
688 		if (ecx & SEFF0ECX_PKU) {
689 			lcr4(rcr4() | CR4_PKE);
690 			pg_xo = PG_XO;
691 		}
692 	}
693 
694 	/*
695 	 * set up protection_codes: we need to be able to convert from
696 	 * a MI protection code (some combo of VM_PROT...) to something
697 	 * we can jam into a i386 PTE.
698 	 */
699 
700 	protection_codes[PROT_NONE] = pg_nx;			/* --- */
701 	protection_codes[PROT_EXEC] = pg_xo;			/* --x */
702 	protection_codes[PROT_READ] = PG_RO | pg_nx;		/* -r- */
703 	protection_codes[PROT_READ | PROT_EXEC] = PG_RO;	/* -rx */
704 	protection_codes[PROT_WRITE] = PG_RW | pg_nx;		/* w-- */
705 	protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW;	/* w-x */
706 	protection_codes[PROT_WRITE | PROT_READ] = PG_RW | pg_nx; /* wr- */
707 	protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW;	/* wrx */
708 
709 	/*
710 	 * now we init the kernel's pmap
711 	 *
712 	 * the kernel pmap's pm_obj is not used for much.   however, in
713 	 * user pmaps the pm_obj contains the list of active PTPs.
714 	 * the pm_obj currently does not have a pager.
715 	 */
716 
717 	kpm = pmap_kernel();
718 	for (i = 0; i < PTP_LEVELS - 1; i++) {
719 		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1);
720 		kpm->pm_ptphint[i] = NULL;
721 	}
722 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
723 	kpm->pm_pdir = (pd_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
724 	kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
725 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
726 		atop(kva_start - VM_MIN_KERNEL_ADDRESS);
727 	/*
728 	 * the above is just a rough estimate and not critical to the proper
729 	 * operation of the system.
730 	 */
731 
732 	kpm->pm_type = PMAP_TYPE_NORMAL;
733 
734 	curpcb->pcb_pmap = kpm;	/* proc0's pcb */
735 
736 	/*
737 	 * Configure and enable PCID use if supported.
738 	 * Currently we require INVPCID support.
739 	 */
740 	if ((cpu_ecxfeature & CPUIDECX_PCID) && cpuid_level >= 0x07) {
741 		uint32_t ebx, dummy;
742 		CPUID_LEAF(0x7, 0, dummy, ebx, dummy, dummy);
743 		if (ebx & SEFF0EBX_INVPCID) {
744 			pmap_use_pcid = 1;
745 			/*
746 			 * We cannot use global mappings because
747 			 * invpcid function 0 does not invalidate global
748 			 * mappings. The hardware can cache kernel
749 			 * mappings based on PCID_KERN, i.e. there is no
750 			 * need for global mappings.
751 			 */
752 			pg_g_kern = 0;
753 			lcr4( rcr4() | CR4_PCIDE );
754 			cr3_pcid_proc = PCID_PROC;
755 			cr3_pcid_temp = PCID_TEMP;
756 			cr3_reuse_pcid = CR3_REUSE_PCID;
757 			cr3_pcid_proc_intel = PCID_PROC_INTEL;
758 		}
759 	}
760 
761 	/*
762 	 * Add PG_G attribute to already mapped kernel pages. pg_g_kern
763 	 * is calculated in locore0.S and may be set to:
764 	 *
765 	 * 0 if this CPU does not safely support global pages in the kernel
766 	 *  (Intel/Meltdown)
767 	 * PG_G if this CPU does safely support global pages in the kernel
768 	 *  (AMD)
769 	 */
770 #if KERNBASE == VM_MIN_KERNEL_ADDRESS
771 	for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
772 #else
773 	kva_end = roundup((vaddr_t)&end, PAGE_SIZE);
774 	for (kva = KERNBASE; kva < kva_end ;
775 #endif
776 	     kva += PAGE_SIZE) {
777 		unsigned long p1i = pl1_i(kva);
778 		if (pmap_valid_entry(PTE_BASE[p1i]))
779 			PTE_BASE[p1i] |= pg_g_kern;
780 	}
781 
782 	/*
783 	 * Map the direct map. The first 4GB were mapped in locore, here
784 	 * we map the rest if it exists. We actually use the direct map
785 	 * here to set up the page tables, we're assuming that we're still
786 	 * operating in the lower 4GB of memory.
787 	 *
788 	 * Map (up to) the first 512GB of physical memory first. This part
789 	 * is handled differently than physical memory > 512GB since we have
790 	 * already mapped part of this range in locore0.
791 	 */
792 	ndmpdp = (max_pa + NBPD_L3 - 1) >> L3_SHIFT;
793 	if (ndmpdp < NDML2_ENTRIES)
794 		ndmpdp = NDML2_ENTRIES;		/* At least 4GB */
795 	if (ndmpdp > 512)
796 		ndmpdp = 512;			/* At most 512GB */
797 
798 	dmpdp = kpm->pm_pdir[PDIR_SLOT_DIRECT] & pg_frame;
799 
800 	dmpd = first_avail; first_avail += ndmpdp * PAGE_SIZE;
801 
802 	for (i = NDML2_ENTRIES; i < NPDPG * ndmpdp; i++) {
803 		paddr_t pdp;
804 		vaddr_t va;
805 
806 		pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
807 		va = PMAP_DIRECT_MAP(pdp);
808 
809 		*((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT);
810 		*((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | pg_g_kern | PG_U |
811 		    PG_M | pg_nx | pg_crypt;
812 	}
813 
814 	for (i = NDML2_ENTRIES; i < ndmpdp; i++) {
815 		paddr_t pdp;
816 		vaddr_t va;
817 
818 		pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
819 		va = PMAP_DIRECT_MAP(pdp);
820 
821 		*((pd_entry_t *)va) = dmpd + (i << PAGE_SHIFT);
822 		*((pd_entry_t *)va) |= PG_RW | PG_V | PG_U | PG_M | pg_nx |
823 		    pg_crypt;
824 	}
825 
826 	kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_V | PG_KW | PG_U |
827 	    PG_M | pg_nx | pg_crypt;
828 
829 	/* Map any remaining physical memory > 512GB */
830 	for (curslot = 1 ; curslot < NUM_L4_SLOT_DIRECT ; curslot++) {
831 		/*
832 		 * Start of current range starts at PA (curslot) * 512GB
833 		 */
834 		start_cur = (paddr_t)(curslot * NBPD_L4);
835 		if (max_pa > start_cur) {
836 			/* Next 512GB, new PML4e and L3(512GB) page */
837 			dmpd = first_avail; first_avail += PAGE_SIZE;
838 			pml3 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
839 			kpm->pm_pdir[PDIR_SLOT_DIRECT + curslot] = dmpd |
840 			    PG_KW | PG_V | PG_U | PG_M | pg_nx | pg_crypt;
841 
842 			/* Calculate full 1GB pages in this 512GB region */
843 			p = ((max_pa - start_cur) >> L3_SHIFT);
844 
845 			/* Check if a partial (<1GB) page remains */
846 			if (max_pa & L2_MASK)
847 				p++;
848 
849 			/*
850 			 * Handle the case where this range is full and there
851 			 * is still more memory after (p would be > 512).
852 			 */
853 			if (p > NPDPG)
854 				p = NPDPG;
855 
856 			/* Allocate 'p' L2(1GB) pages and populate */
857 			for (i = 0; i < p; i++) {
858 				dmpd = first_avail; first_avail += PAGE_SIZE;
859 				pml2 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
860 				pml3[i] = dmpd |
861 				    PG_RW | PG_V | PG_U | PG_M | pg_nx |
862 				    pg_crypt;
863 
864 				cur_pa = start_cur + (i << L3_SHIFT);
865 				j = 0;
866 
867 				while (cur_pa < max_pa && j < NPDPG) {
868 					pml2[j] = curslot * NBPD_L4 +
869 					    (uint64_t)i * NBPD_L3 +
870 					    (uint64_t)j * NBPD_L2;
871 					pml2[j] |= PG_RW | PG_V | pg_g_kern |
872 					    PG_U | PG_M | pg_nx | PG_PS |
873 					    pg_crypt;
874 					cur_pa += NBPD_L2;
875 					j++;
876 				}
877 			}
878 		}
879 	}
880 
881 	tlbflush();
882 
883 	msgbuf_vaddr = virtual_avail;
884 	virtual_avail += round_page(MSGBUFSIZE);
885 
886 	idt_vaddr = virtual_avail;
887 	virtual_avail += 2 * PAGE_SIZE;
888 	idt_paddr = first_avail;			/* steal a page */
889 	first_avail += 2 * PAGE_SIZE;
890 
891 #if defined(MULTIPROCESSOR) || \
892     (NACPI > 0 && !defined(SMALL_KERNEL))
893 	/*
894 	 * Grab a page below 4G for things that need it (i.e.
895 	 * having an initial %cr3 for the MP trampoline).
896 	 */
897 	lo32_vaddr = virtual_avail;
898 	virtual_avail += PAGE_SIZE;
899 	lo32_paddr = first_avail;
900 	first_avail += PAGE_SIZE;
901 #endif
902 
903 	/*
904 	 * init the global lists.
905 	 */
906 	LIST_INIT(&pmaps);
907 
908 	/*
909 	 * initialize the pmap pools.
910 	 */
911 
912 	pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_VM, 0,
913 	    "pmappl", NULL);
914 	pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, IPL_VM, 0,
915 	    "pvpl", &pool_allocator_single);
916 	pool_sethiwat(&pmap_pv_pool, 32 * 1024);
917 
918 	/*
919 	 * initialize the PDE pool.
920 	 */
921 
922 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_VM, 0,
923 	    "pdppl", &pool_allocator_single);
924 
925 	kpm->pm_pdir_intel = NULL;
926 	kpm->pm_pdirpa_intel = 0;
927 
928 	/*
929 	 * ensure the TLB is sync'd with reality by flushing it...
930 	 */
931 
932 	tlbflush();
933 
934 	return first_avail;
935 }
936 
937 void
938 pmap_init_percpu(void)
939 {
940 	pool_cache_init(&pmap_pv_pool);
941 }
942 
943 /*
944  * pmap_randomize
945  *
946  * Randomizes the location of the kernel pmap
947  */
948 void
949 pmap_randomize(void)
950 {
951 	pd_entry_t *pml4va, *oldpml4va;
952 	paddr_t pml4pa;
953 	int i;
954 
955 	pml4va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait);
956 	if (pml4va == NULL)
957 		panic("%s: km_alloc failed", __func__);
958 
959 	/* Copy old PML4 page to new one */
960 	oldpml4va = pmap_kernel()->pm_pdir;
961 	memcpy(pml4va, oldpml4va, PAGE_SIZE);
962 
963 	/* Switch to new PML4 */
964 	pmap_extract(pmap_kernel(), (vaddr_t)pml4va, &pml4pa);
965 	lcr3(pml4pa);
966 
967 	/* Fixup pmap_kernel and proc0's %cr3 */
968 	pmap_kernel()->pm_pdirpa = pml4pa;
969 	pmap_kernel()->pm_pdir = pml4va;
970 	proc0.p_addr->u_pcb.pcb_cr3 = pml4pa;
971 
972 	/* Fixup recursive PTE PML4E slot. We are only changing the PA */
973 	pml4va[PDIR_SLOT_PTE] = pml4pa | (pml4va[PDIR_SLOT_PTE] & ~pg_frame);
974 
975 	for (i = 0; i < NPDPG; i++) {
976 		/* PTE slot already handled earlier */
977 		if (i == PDIR_SLOT_PTE)
978 			continue;
979 
980 		if (pml4va[i] & pg_frame)
981 			pmap_randomize_level(&pml4va[i], 3);
982 	}
983 
984 	/* Wipe out bootstrap PML4 */
985 	memset(oldpml4va, 0, PAGE_SIZE);
986 	tlbflush();
987 }
988 
989 void
990 pmap_randomize_level(pd_entry_t *pde, int level)
991 {
992 	pd_entry_t *new_pd_va;
993 	paddr_t old_pd_pa, new_pd_pa;
994 	vaddr_t old_pd_va;
995 	struct vm_page *pg;
996 	int i;
997 
998 	if (level == 0)
999 		return;
1000 
1001 	if (level < PTP_LEVELS - 1 && (*pde & PG_PS))
1002 		return;
1003 
1004 	new_pd_va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait);
1005 	if (new_pd_va == NULL)
1006 		panic("%s: cannot allocate page for L%d page directory",
1007 		    __func__, level);
1008 
1009 	old_pd_pa = *pde & pg_frame;
1010 	old_pd_va = PMAP_DIRECT_MAP(old_pd_pa);
1011 	pmap_extract(pmap_kernel(), (vaddr_t)new_pd_va, &new_pd_pa);
1012 	memcpy(new_pd_va, (void *)old_pd_va, PAGE_SIZE);
1013 	*pde = new_pd_pa | (*pde & ~pg_frame);
1014 
1015 	tlbflush();
1016 	memset((void *)old_pd_va, 0, PAGE_SIZE);
1017 
1018 	pg = PHYS_TO_VM_PAGE(old_pd_pa);
1019 	if (pg != NULL) {
1020 		pg->wire_count--;
1021 		pmap_kernel()->pm_stats.resident_count--;
1022 		if (pg->wire_count <= 1)
1023 			uvm_pagefree(pg);
1024 	}
1025 
1026 	for (i = 0; i < NPDPG; i++)
1027 		if (new_pd_va[i] & pg_frame)
1028 			pmap_randomize_level(&new_pd_va[i], level - 1);
1029 }
1030 
1031 /*
1032  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1033  * trampoline code can be entered.
1034  */
1035 paddr_t
1036 pmap_prealloc_lowmem_ptps(paddr_t first_avail)
1037 {
1038 	pd_entry_t *pdes;
1039 	int level;
1040 	paddr_t newp;
1041 
1042 	pdes = pmap_kernel()->pm_pdir;
1043 	level = PTP_LEVELS;
1044 	for (;;) {
1045 		newp = first_avail; first_avail += PAGE_SIZE;
1046 		memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
1047 		pdes[pl_i(0, level)] =
1048 		    (newp & pg_frame) | PG_V | PG_RW | pg_crypt;
1049 		level--;
1050 		if (level <= 1)
1051 			break;
1052 		pdes = normal_pdes[level - 2];
1053 	}
1054 
1055 	return first_avail;
1056 }
1057 
1058 /*
1059  * pmap_init: no further initialization required on this platform
1060  */
1061 void
1062 pmap_init(void)
1063 {
1064 	pmap_initialized = 1;
1065 }
1066 
1067 /*
1068  * p v _ e n t r y   f u n c t i o n s
1069  */
1070 
1071 /*
1072  * main pv_entry manipulation functions:
1073  *   pmap_enter_pv: enter a mapping onto a pv list
1074  *   pmap_remove_pv: remove a mapping from a pv list
1075  */
1076 
1077 /*
1078  * pmap_enter_pv: enter a mapping onto a pv list
1079  *
1080  * => caller should adjust ptp's wire_count before calling
1081  *
1082  * pve: preallocated pve for us to use
1083  * ptp: PTP in pmap that maps this VA
1084  */
1085 
1086 void
1087 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap,
1088     vaddr_t va, struct vm_page *ptp)
1089 {
1090 	pve->pv_pmap = pmap;
1091 	pve->pv_va = va;
1092 	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
1093 	mtx_enter(&pg->mdpage.pv_mtx);
1094 	pve->pv_next = pg->mdpage.pv_list;	/* add to ... */
1095 	pg->mdpage.pv_list = pve;		/* ... list */
1096 	mtx_leave(&pg->mdpage.pv_mtx);
1097 }
1098 
1099 /*
1100  * pmap_remove_pv: try to remove a mapping from a pv_list
1101  *
1102  * => caller should adjust ptp's wire_count and free PTP if needed
1103  * => we return the removed pve
1104  */
1105 
1106 struct pv_entry *
1107 pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va)
1108 {
1109 	struct pv_entry *pve, **prevptr;
1110 
1111 	mtx_enter(&pg->mdpage.pv_mtx);
1112 	prevptr = &pg->mdpage.pv_list;
1113 	while ((pve = *prevptr) != NULL) {
1114 		if (pve->pv_pmap == pmap && pve->pv_va == va) {	/* match? */
1115 			*prevptr = pve->pv_next;		/* remove it! */
1116 			break;
1117 		}
1118 		prevptr = &pve->pv_next;		/* previous pointer */
1119 	}
1120 	mtx_leave(&pg->mdpage.pv_mtx);
1121 	return(pve);				/* return removed pve */
1122 }
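
/*
 * An illustrative sketch (disabled, not part of the build) of how the
 * enter/remove paths pair the two helpers above: a pv_entry comes from
 * pmap_pv_pool, is threaded onto the page's list at enter time and is
 * recovered (and eventually freed) at remove time.  All parameters are
 * hypothetical.
 */
#if 0
static void
pmap_pv_example(struct vm_page *pg, struct pmap *pm, vaddr_t va,
    struct vm_page *ptp)
{
	struct pv_entry *pve;

	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
	if (pve == NULL)
		return;
	pmap_enter_pv(pg, pve, pm, va, ptp);

	/* ... later, when the mapping is torn down ... */
	pve = pmap_remove_pv(pg, pm, va);
	if (pve != NULL)
		pool_put(&pmap_pv_pool, pve);
}
#endif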
1123 
1124 /*
1125  * p t p   f u n c t i o n s
1126  */
1127 
1128 struct vm_page *
1129 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1130 {
1131 	int lidx = level - 1;
1132 	struct vm_page *pg;
1133 
1134 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1135 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx]))
1136 		return (pmap->pm_ptphint[lidx]);
1137 
1138 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1139 
1140 	return pg;
1141 }
1142 
1143 void
1144 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level,
1145     struct pg_to_free *pagelist)
1146 {
1147 	int lidx;
1148 	struct uvm_object *obj;
1149 
1150 	lidx = level - 1;
1151 
1152 	obj = &pmap->pm_obj[lidx];
1153 	pmap->pm_stats.resident_count--;
1154 	if (pmap->pm_ptphint[lidx] == ptp)
1155 		pmap->pm_ptphint[lidx] = RBT_ROOT(uvm_objtree, &obj->memt);
1156 	ptp->wire_count = 0;
1157 	uvm_pagerealloc(ptp, NULL, 0);
1158 	TAILQ_INSERT_TAIL(pagelist, ptp, pageq);
1159 }
1160 
1161 void
1162 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1163     struct pg_to_free *pagelist)
1164 {
1165 	unsigned long index;
1166 	int level;
1167 	vaddr_t invaladdr;
1168 
1169 	level = 1;
1170 	do {
1171 		pmap_freepage(pmap, ptp, level, pagelist);
1172 		index = pl_i(va, level + 1);
1173 		pmap_pte_set(&normal_pdes[level - 1][index], 0);
1174 		if (level == PTP_LEVELS - 1 && pmap->pm_pdir_intel != NULL) {
1175 			/* Zap special meltdown PML4e */
1176 			pmap_pte_set(&pmap->pm_pdir_intel[index], 0);
1177 			DPRINTF("%s: cleared meltdown PML4e @ index %lu "
1178 			    "(va range start 0x%llx)\n", __func__, index,
1179 			    (uint64_t)(index << L4_SHIFT));
1180 		}
1181 		invaladdr = level == 1 ? (vaddr_t)PTE_BASE :
1182 		    (vaddr_t)normal_pdes[level - 2];
1183 		pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE,
1184 		    pmap_is_curpmap(curpcb->pcb_pmap));
1185 		if (level < PTP_LEVELS - 1) {
1186 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1187 			ptp->wire_count--;
1188 			if (ptp->wire_count > 1)
1189 				break;
1190 		}
1191 	} while (++level < PTP_LEVELS);
1192 }
1193 
1194 /*
1195  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1196  *
1197  * => pmap should NOT be pmap_kernel()
1198  */
1199 
1200 struct vm_page *
1201 pmap_get_ptp(struct pmap *pmap, vaddr_t va)
1202 {
1203 	struct vm_page *ptp, *pptp;
1204 	int i;
1205 	unsigned long index;
1206 	pd_entry_t *pva, *pva_intel;
1207 	paddr_t ppa, pa;
1208 	struct uvm_object *obj;
1209 
1210 	ptp = NULL;
1211 	pa = (paddr_t)-1;
1212 
1213 	/*
1214 	 * Loop through all page table levels seeing if we need to
1215 	 * add a new page to that level.
1216 	 */
1217 	for (i = PTP_LEVELS; i > 1; i--) {
1218 		/*
1219 		 * Save values from previous round.
1220 		 */
1221 		pptp = ptp;
1222 		ppa = pa;
1223 
1224 		index = pl_i(va, i);
1225 		pva = normal_pdes[i - 2];
1226 
1227 		if (pmap_valid_entry(pva[index])) {
1228 			ppa = pva[index] & pg_frame;
1229 			ptp = NULL;
1230 			continue;
1231 		}
1232 
1233 		obj = &pmap->pm_obj[i-2];
1234 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1235 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1236 
1237 		if (ptp == NULL)
1238 			return NULL;
1239 
1240 		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
1241 		ptp->wire_count = 1;
1242 		pmap->pm_ptphint[i - 2] = ptp;
1243 		pa = VM_PAGE_TO_PHYS(ptp);
1244 		pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V | pg_crypt);
1245 
1246 		/*
1247 		 * Meltdown Special case - if we are adding a new PML4e for
1248 		 * usermode addresses, just copy the PML4e to the U-K page
1249 		 * table.
1250 		 */
1251 		if (pmap->pm_pdir_intel != NULL && i == PTP_LEVELS &&
1252 		    va < VM_MAXUSER_ADDRESS) {
1253 			pva_intel = pmap->pm_pdir_intel;
1254 			pva_intel[index] = pva[index];
1255 			DPRINTF("%s: copying usermode PML4e (content=0x%llx) "
1256 			    "from 0x%llx -> 0x%llx\n", __func__, pva[index],
1257 			    (uint64_t)&pva[index], (uint64_t)&pva_intel[index]);
1258 		}
1259 
1260 		pmap->pm_stats.resident_count++;
1261 		/*
1262 		 * If we're not in the top level, increase the
1263 		 * wire count of the parent page.
1264 		 */
1265 		if (i < PTP_LEVELS) {
1266 			if (pptp == NULL)
1267 				pptp = pmap_find_ptp(pmap, va, ppa, i);
1268 #ifdef DIAGNOSTIC
1269 			if (pptp == NULL)
1270 				panic("%s: pde page disappeared", __func__);
1271 #endif
1272 			pptp->wire_count++;
1273 		}
1274 	}
1275 
1276 	/*
1277 	 * ptp is not NULL if we just allocated a new ptp. If it's
1278 	 * still NULL, we must look up the existing one.
1279 	 */
1280 	if (ptp == NULL) {
1281 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
1282 #ifdef DIAGNOSTIC
1283 		if (ptp == NULL) {
1284 			printf("va %lx ppa %lx\n", (unsigned long)va,
1285 			    (unsigned long)ppa);
1286 			panic("%s: unmanaged user PTP", __func__);
1287 		}
1288 #endif
1289 	}
1290 
1291 	pmap->pm_ptphint[0] = ptp;
1292 	return(ptp);
1293 }
1294 
1295 /*
1296  * p m a p  l i f e c y c l e   f u n c t i o n s
1297  */
1298 
1299 /*
1300  * pmap_pdp_ctor: constructor for the PDP cache.
1301  */
1302 
1303 void
1304 pmap_pdp_ctor(pd_entry_t *pdir)
1305 {
1306 	paddr_t pdirpa;
1307 	int npde, i;
1308 	struct pmap *kpm = pmap_kernel();
1309 
1310 	/* fetch the physical address of the page directory. */
1311 	(void) pmap_extract(kpm, (vaddr_t) pdir, &pdirpa);
1312 
1313 	/* zero init area */
1314 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
1315 
1316 	/* put in recursive PDE to map the PTEs */
1317 	pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW | pg_nx | pg_crypt;
1318 
1319 	npde = nkptp[PTP_LEVELS - 1];
1320 
1321 	/* put in kernel VM PDEs */
1322 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
1323 	    npde * sizeof(pd_entry_t));
1324 
1325 	/* zero the rest */
1326 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
1327 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
1328 
1329 	for (i = 0; i < NUM_L4_SLOT_DIRECT; i++)
1330 		pdir[PDIR_SLOT_DIRECT + i] = kpm->pm_pdir[PDIR_SLOT_DIRECT + i];
1331 
1332 #if VM_MIN_KERNEL_ADDRESS != KERNBASE
1333 	pdir[pl4_pi(KERNBASE)] = PDP_BASE[pl4_pi(KERNBASE)];
1334 #endif
1335 }
1336 
1337 void
1338 pmap_pdp_ctor_intel(pd_entry_t *pdir)
1339 {
1340 	struct pmap *kpm = pmap_kernel();
1341 
1342 	/* Copy PML4es from pmap_kernel's U-K view */
1343 	memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE);
1344 }
1345 
1346 /*
1347  * pmap_create: create a pmap
1348  *
1349  * => note: old pmap interface took a "size" arg which allowed for
1350  *	the creation of "software only" pmaps (not in bsd).
1351  */
1352 
1353 struct pmap *
1354 pmap_create(void)
1355 {
1356 	struct pmap *pmap;
1357 	int i;
1358 
1359 	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
1360 
1361 	mtx_init(&pmap->pm_mtx, IPL_VM);
1362 
1363 	/* init uvm_object */
1364 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1365 		uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, 1);
1366 		pmap->pm_ptphint[i] = NULL;
1367 	}
1368 	pmap->pm_stats.wired_count = 0;
1369 	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
1370 	pmap->pm_type = PMAP_TYPE_NORMAL;
1371 	pmap->eptp = 0;
1372 
1373 	/* allocate PDP */
1374 
1375 	/*
1376 	 * note that there is no need to splvm to protect us from
1377 	 * malloc since malloc allocates out of a submap and we should
1378 	 * have already allocated kernel PTPs to cover the range...
1379 	 */
1380 
1381 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
1382 	pmap_pdp_ctor(pmap->pm_pdir);
1383 
1384 	pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & pg_frame;
1385 
1386 	/*
1387 	 * Intel CPUs need a special page table to be used during usermode
1388 	 * execution, one that lacks all kernel mappings.
1389 	 */
1390 	if (cpu_meltdown) {
1391 		pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK);
1392 		pmap_pdp_ctor_intel(pmap->pm_pdir_intel);
1393 		pmap->pm_stats.resident_count++;
1394 		if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel,
1395 		    &pmap->pm_pdirpa_intel))
1396 			panic("%s: unknown PA mapping for meltdown PML4",
1397 			    __func__);
1398 	} else {
1399 		pmap->pm_pdir_intel = NULL;
1400 		pmap->pm_pdirpa_intel = 0;
1401 	}
1402 
1403 	mtx_enter(&pmaps_lock);
1404 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1405 	mtx_leave(&pmaps_lock);
1406 	return (pmap);
1407 }
1408 
1409 /*
1410  * pmap_destroy: drop reference count on pmap.   free pmap if
1411  *	reference count goes to zero.
1412  */
1413 
1414 void
1415 pmap_destroy(struct pmap *pmap)
1416 {
1417 	struct vm_page *pg;
1418 	int refs;
1419 	int i;
1420 
1421 	/*
1422 	 * drop reference count
1423 	 */
1424 
1425 	refs = atomic_dec_int_nv(&pmap->pm_obj[0].uo_refs);
1426 	if (refs > 0) {
1427 		return;
1428 	}
1429 
1430 	/*
1431 	 * remove it from global list of pmaps
1432 	 */
1433 	mtx_enter(&pmaps_lock);
1434 	LIST_REMOVE(pmap, pm_list);
1435 	mtx_leave(&pmaps_lock);
1436 
1437 	/*
1438 	 * free any remaining PTPs
1439 	 */
1440 
1441 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1442 		while ((pg = RBT_ROOT(uvm_objtree,
1443 		    &pmap->pm_obj[i].memt)) != NULL) {
1444 			KASSERT((pg->pg_flags & PG_BUSY) == 0);
1445 
1446 			pg->wire_count = 0;
1447 			pmap->pm_stats.resident_count--;
1448 
1449 			uvm_pagefree(pg);
1450 		}
1451 	}
1452 
1453 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
1454 
1455 	if (pmap->pm_pdir_intel != NULL) {
1456 		pmap->pm_stats.resident_count--;
1457 		pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
1458 	}
1459 
1460 	pool_put(&pmap_pmap_pool, pmap);
1461 }
1462 
1463 /*
1464  *	Add a reference to the specified pmap.
1465  */
1466 
1467 void
1468 pmap_reference(struct pmap *pmap)
1469 {
1470 	atomic_inc_int(&pmap->pm_obj[0].uo_refs);
1471 }
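
/*
 * An illustrative sketch (disabled, not part of the build) of pmap
 * reference counting: every pmap_reference() must be balanced by a
 * pmap_destroy(), and the pmap is only torn down when the count
 * reaches zero.
 */
#if 0
static void
pmap_lifecycle_example(void)
{
	struct pmap *pm;

	pm = pmap_create();	/* refcount 1 */
	pmap_reference(pm);	/* refcount 2 */
	pmap_destroy(pm);	/* refcount 1, pmap still alive */
	pmap_destroy(pm);	/* refcount 0, PTPs and PDP freed */
}
#endif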
1472 
1473 /*
1474  * pmap_activate: activate a process' pmap (fill in %cr3)
1475  *
1476  * => called from cpu_fork() and when switching pmaps during exec
1477  * => if p is the curproc, then load it into the MMU
1478  */
1479 
1480 void
1481 pmap_activate(struct proc *p)
1482 {
1483 	struct pcb *pcb = &p->p_addr->u_pcb;
1484 	struct pmap *pmap = p->p_vmspace->vm_map.pmap;
1485 
1486 	pcb->pcb_pmap = pmap;
1487 	pcb->pcb_cr3 = pmap->pm_pdirpa;
1488 	pcb->pcb_cr3 |= (pmap != pmap_kernel()) ? cr3_pcid_proc :
1489 	    (PCID_KERN | cr3_reuse_pcid);
1490 
1491 	if (p != curproc)
1492 		return;
1493 
1494 	if ((p->p_flag & P_SYSTEM) == 0) {
1495 		struct cpu_info *self = curcpu();
1496 
1497 		/* mark the pmap in use by this processor */
1498 		self->ci_proc_pmap = pmap;
1499 
1500 		/* in case we return to userspace without context switching */
1501 		if (cpu_meltdown) {
1502 			self->ci_kern_cr3 = pcb->pcb_cr3 | cr3_reuse_pcid;
1503 			self->ci_user_cr3 = pmap->pm_pdirpa_intel |
1504 			    cr3_pcid_proc_intel;
1505 		}
1506 	}
1507 
1508 	lcr3(pcb->pcb_cr3);
1509 }
1510 
1511 /*
1512  * pmap_deactivate: deactivate a process' pmap
1513  */
1514 
1515 void
1516 pmap_deactivate(struct proc *p)
1517 {
1518 	if ((p->p_flag & P_SYSTEM) == 0) {
1519 		struct cpu_info *self = curcpu();
1520 
1521 		/*
1522 		 * mark the pmap no longer in use by this processor.
1523 		 */
1524 		KASSERT(self->ci_proc_pmap == p->p_vmspace->vm_map.pmap);
1525 		self->ci_proc_pmap = NULL;
1526 	}
1527 }
1528 
1529 /*
1530  * end of lifecycle functions
1531  */
1532 
1533 /*
1534  * some misc. functions
1535  */
1536 
1537 int
1538 pmap_pdes_valid(vaddr_t va, pd_entry_t *lastpde)
1539 {
1540 	int i;
1541 	unsigned long index;
1542 	pd_entry_t pde;
1543 
1544 	for (i = PTP_LEVELS; i > 1; i--) {
1545 		index = pl_i(va, i);
1546 		pde = normal_pdes[i - 2][index];
1547 		if (!pmap_valid_entry(pde))
1548 			return 0;
1549 	}
1550 	if (lastpde != NULL)
1551 		*lastpde = pde;
1552 	return 1;
1553 }
1554 
1555 /*
1556  * pmap_extract: extract a PA for the given VA
1557  */
1558 
1559 int
1560 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1561 {
1562 	pt_entry_t *ptes, pte;
1563 	int level, offs;
1564 
1565 	if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE &&
1566 	    va < PMAP_DIRECT_END) {
1567 		*pap = va - PMAP_DIRECT_BASE;
1568 		return 1;
1569 	}
1570 
1571 	if (pmap != pmap_kernel())
1572 		mtx_enter(&pmap->pm_mtx);
1573 
1574 	level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
1575 	pte = ptes[offs];
1576 
1577 	if (pmap != pmap_kernel())
1578 		mtx_leave(&pmap->pm_mtx);
1579 
1580 	if (__predict_true(level == 0 && pmap_valid_entry(pte))) {
1581 		if (pap != NULL)
1582 			*pap = (pte & pg_frame) | (va & PAGE_MASK);
1583 		return 1;
1584 	}
1585 	if (level == 1 && (pte & (PG_PS|PG_V)) == (PG_PS|PG_V)) {
1586 		if (pap != NULL)
1587 			*pap = (pte & pg_lgframe) | (va & PAGE_MASK_L2);
1588 		return 1;
1589 	}
1590 
1591 	return 0;
1592 }
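
/*
 * An illustrative sketch (disabled, not part of the build) of a
 * pmap_extract() caller: translate a VA and be prepared for the lookup
 * to fail if the VA is not mapped.
 */
#if 0
static void
pmap_extract_example(struct pmap *pm, vaddr_t va)
{
	paddr_t pa;

	if (pmap_extract(pm, va, &pa))
		printf("va 0x%lx -> pa 0x%lx\n", va, pa);
	else
		printf("va 0x%lx is not mapped\n", va);
}
#endif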
1593 
1594 /*
1595  * pmap_zero_page: zero a page
1596  */
1597 
1598 void
1599 pmap_zero_page(struct vm_page *pg)
1600 {
1601 	pagezero(pmap_map_direct(pg));
1602 }
1603 
1604 /*
1605  * pmap_flush_cache: flush the cache for a virtual address.
1606  */
1607 void
1608 pmap_flush_cache(vaddr_t addr, vsize_t len)
1609 {
1610 	vaddr_t	i;
1611 
1612 	if (curcpu()->ci_cflushsz == 0) {
1613 		wbinvd_on_all_cpus();
1614 		return;
1615 	}
1616 
1617 	/* all cpus that have clflush also have mfence. */
1618 	mfence();
1619 	for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz)
1620 		clflush(i);
1621 	mfence();
1622 }
1623 
1624 /*
1625  * pmap_copy_page: copy a page
1626  */
1627 
1628 void
1629 pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg)
1630 {
1631 	vaddr_t srcva = pmap_map_direct(srcpg);
1632 	vaddr_t dstva = pmap_map_direct(dstpg);
1633 
1634 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
1635 }
1636 
1637 /*
1638  * p m a p   r e m o v e   f u n c t i o n s
1639  *
1640  * functions that remove mappings
1641  */
1642 
1643 /*
1644  * pmap_remove_ptes: remove PTEs from a PTP
1645  *
1646  * => PTP must be mapped into KVA
1647  * => PTP should be null if pmap == pmap_kernel()
1648  */
1649 
1650 void
1651 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1652     vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
1653 {
1654 	struct pv_entry *pve;
1655 	pt_entry_t *pte = (pt_entry_t *) ptpva;
1656 	struct vm_page *pg;
1657 	pt_entry_t opte;
1658 
1659 	/*
1660 	 * note that ptpva points to the PTE that maps startva.   this may
1661 	 * or may not be the first PTE in the PTP.
1662 	 *
1663 	 * we loop through the PTP while there are still PTEs to look at
1664 	 * and the wire_count is greater than 1 (because we use the wire_count
1665 	 * to keep track of the number of real PTEs in the PTP).
1666 	 */
1667 
1668 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
1669 			     ; pte++, startva += PAGE_SIZE) {
1670 		if (!pmap_valid_entry(*pte))
1671 			continue;			/* VA not mapped */
1672 		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
1673 			continue;
1674 		}
1675 
1676 		/* atomically save the old PTE and zap! it */
1677 		opte = pmap_pte_set(pte, 0);
1678 
1679 		if (opte & PG_W)
1680 			pmap->pm_stats.wired_count--;
1681 		pmap->pm_stats.resident_count--;
1682 
1683 		if (ptp != NULL)
1684 			ptp->wire_count--;		/* dropping a PTE */
1685 
1686 		pg = PHYS_TO_VM_PAGE(opte & pg_frame);
1687 
1688 		/*
1689 		 * if we are not on a pv list we are done.
1690 		 */
1691 
1692 		if ((opte & PG_PVLIST) == 0) {
1693 #ifdef DIAGNOSTIC
1694 			if (pg != NULL)
1695 				panic("%s: managed page without PG_PVLIST: "
1696 				    "va 0x%lx, opte 0x%llx", __func__,
1697 				    startva, opte);
1698 #endif
1699 			continue;
1700 		}
1701 
1702 #ifdef DIAGNOSTIC
1703 		if (pg == NULL)
1704 			panic("%s: unmanaged page marked PG_PVLIST: "
1705 			    "va 0x%lx, opte 0x%llx", __func__,
1706 			    startva, opte);
1707 #endif
1708 
1709 		/* sync R/M bits */
1710 		pmap_sync_flags_pte(pg, opte);
1711 		pve = pmap_remove_pv(pg, pmap, startva);
1712 		if (pve != NULL) {
1713 			pve->pv_next = *free_pvs;
1714 			*free_pvs = pve;
1715 		}
1716 
1717 		/* end of "for" loop: time for next pte */
1718 	}
1719 }
1720 
1721 /*
1722  * pmap_remove_pte: remove a single PTE from a PTP
1723  *
1724  * => PTP must be mapped into KVA
1725  * => PTP should be null if pmap == pmap_kernel()
1726  * => returns true if we removed a mapping
1727  */
1728 
1729 int
1730 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
1731     vaddr_t va, int flags, struct pv_entry **free_pvs)
1732 {
1733 	struct pv_entry *pve;
1734 	struct vm_page *pg;
1735 	pt_entry_t opte;
1736 
1737 	if (!pmap_valid_entry(*pte))
1738 		return 0;		/* VA not mapped */
1739 	if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
1740 		return 0;
1741 	}
1742 
1743 	/* atomically save the old PTE and zap! it */
1744 	opte = pmap_pte_set(pte, 0);
1745 
1746 	if (opte & PG_W)
1747 		pmap->pm_stats.wired_count--;
1748 	pmap->pm_stats.resident_count--;
1749 
1750 	if (ptp != NULL)
1751 		ptp->wire_count--;		/* dropping a PTE */
1752 
1753 	pg = PHYS_TO_VM_PAGE(opte & pg_frame);
1754 
1755 	/*
1756 	 * if we are not on a pv list we are done.
1757 	 */
1758 	if ((opte & PG_PVLIST) == 0) {
1759 #ifdef DIAGNOSTIC
1760 		if (pg != NULL)
1761 			panic("%s: managed page without PG_PVLIST: "
1762 			    "va 0x%lx, opte 0x%llx", __func__, va, opte);
1763 #endif
1764 		return 1;
1765 	}
1766 
1767 #ifdef DIAGNOSTIC
1768 	if (pg == NULL)
1769 		panic("%s: unmanaged page marked PG_PVLIST: "
1770 		    "va 0x%lx, opte 0x%llx", __func__, va, opte);
1771 #endif
1772 
1773 	/* sync R/M bits */
1774 	pmap_sync_flags_pte(pg, opte);
1775 	pve = pmap_remove_pv(pg, pmap, va);
1776 	if (pve != NULL) {
1777 		pve->pv_next = *free_pvs;
1778 		*free_pvs = pve;
1779 	}
1780 
1781 	return 1;
1782 }
1783 
1784 /*
1785  * pmap_remove: top level mapping removal function
1786  *
1787  * => caller should not be holding any pmap locks
1788  */
1789 
1790 void
1791 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
1792 {
1793 #if NVMM > 0
1794 	if (pmap_is_ept(pmap))
1795 		pmap_remove_ept(pmap, sva, eva);
1796 	else
1797 #endif /* NVMM > 0 */
1798 		pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
1799 }
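
/*
 * An illustrative sketch (disabled, not part of the build) of the kind
 * of call the MI unmap path makes: remove every mapping in a
 * page-aligned range and let the pmap decide between per-page and
 * whole-TLB shootdowns internally.
 */
#if 0
static void
pmap_remove_example(struct pmap *pm, vaddr_t va)
{
	pmap_remove(pm, va, va + 16 * PAGE_SIZE);
}
#endif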
1800 
1801 /*
1802  * pmap_do_remove: mapping removal guts
1803  *
1804  * => caller should not be holding any pmap locks
1805  */
1806 
1807 void
1808 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
1809 {
1810 	pd_entry_t pde;
1811 	int result;
1812 	paddr_t ptppa;
1813 	vaddr_t blkendva;
1814 	struct vm_page *ptp;
1815 	struct pv_entry *pve;
1816 	struct pv_entry *free_pvs = NULL;
1817 	vaddr_t va;
1818 	int shootall = 0, shootself;
1819 	struct pg_to_free empty_ptps;
1820 	paddr_t scr3;
1821 
1822 	TAILQ_INIT(&empty_ptps);
1823 
1824 	scr3 = pmap_map_ptes(pmap);
1825 	shootself = (scr3 == 0);
1826 
1827 	/*
1828 	 * removing one page?  take shortcut function.
1829 	 */
1830 
1831 	if (sva + PAGE_SIZE == eva) {
1832 		if (pmap_pdes_valid(sva, &pde)) {
1833 
1834 			/* PA of the PTP */
1835 			ptppa = pde & pg_frame;
1836 
1837 			/* get PTP if non-kernel mapping */
1838 
1839 			if (pmap == pmap_kernel()) {
1840 				/* we never free kernel PTPs */
1841 				ptp = NULL;
1842 			} else {
1843 				ptp = pmap_find_ptp(pmap, sva, ptppa, 1);
1844 #ifdef DIAGNOSTIC
1845 				if (ptp == NULL)
1846 					panic("%s: unmanaged PTP detected "
1847 					    "in shortcut path", __func__);
1848 #endif
1849 			}
1850 
1851 			/* do it! */
1852 			result = pmap_remove_pte(pmap, ptp,
1853 			    &PTE_BASE[pl1_i(sva)], sva, flags, &free_pvs);
1854 
1855 			/*
1856 			 * if mapping removed and the PTP is no longer
1857 			 * being used, free it!
1858 			 */
1859 
1860 			if (result && ptp && ptp->wire_count <= 1)
1861 				pmap_free_ptp(pmap, ptp, sva, &empty_ptps);
1862 			pmap_tlb_shootpage(pmap, sva, shootself);
1863 			pmap_unmap_ptes(pmap, scr3);
1864 			pmap_tlb_shootwait();
1865 		} else {
1866 			pmap_unmap_ptes(pmap, scr3);
1867 		}
1868 
1869 		goto cleanup;
1870 	}
1871 
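	/*
	 * Heuristic: for a user-space range of more than 32 pages it is
	 * cheaper to flush this pmap's entire TLB context than to shoot
	 * down every page individually, so use pmap_tlb_shoottlb() below.
	 */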
1872 	if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS)
1873 		shootall = 1;
1874 
1875 	for (va = sva; va < eva; va = blkendva) {
1876 		/* determine range of block */
1877 		blkendva = x86_round_pdr(va + 1);
1878 		if (blkendva > eva)
1879 			blkendva = eva;
1880 
1881 		/*
1882 		 * XXXCDC: our PTE mappings should never be removed
1883 		 * with pmap_remove!  if we allow this (and why would
1884 		 * we?) then we end up freeing the pmap's page
1885 		 * directory page (PDP) before we are finished using
1886 		 * it when we hit it in the recursive mapping.  this
1887 		 * is BAD.
1888 		 *
1889 		 * the long term solution is to move the PTEs out of user
1890 		 * address space and into kernel address space (up with
1891 		 * APTE).  then we can set VM_MAXUSER_ADDRESS to be
1892 		 * VM_MAX_ADDRESS.
1893 		 */
1894 
1895 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
1896 			/* XXXCDC: ugly hack to avoid freeing PDP here */
1897 			continue;
1898 
1899 		if (!pmap_pdes_valid(va, &pde))
1900 			continue;
1901 
1902 		/* PA of the PTP */
1903 		ptppa = pde & pg_frame;
1904 
1905 		/* get PTP if non-kernel mapping */
1906 		if (pmap == pmap_kernel()) {
1907 			/* we never free kernel PTPs */
1908 			ptp = NULL;
1909 		} else {
1910 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
1911 #ifdef DIAGNOSTIC
1912 			if (ptp == NULL)
1913 				panic("%s: unmanaged PTP detected", __func__);
1914 #endif
1915 		}
1916 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&PTE_BASE[pl1_i(va)],
1917 		    va, blkendva, flags, &free_pvs);
1918 
1919 		/* if PTP is no longer being used, free it! */
1920 		if (ptp && ptp->wire_count <= 1) {
1921 			pmap_free_ptp(pmap, ptp, va, &empty_ptps);
1922 		}
1923 	}
1924 
1925 	if (shootall)
1926 		pmap_tlb_shoottlb(pmap, shootself);
1927 	else
1928 		pmap_tlb_shootrange(pmap, sva, eva, shootself);
1929 
1930 	pmap_unmap_ptes(pmap, scr3);
1931 	pmap_tlb_shootwait();
1932 
1933 cleanup:
1934 	while ((pve = free_pvs) != NULL) {
1935 		free_pvs = pve->pv_next;
1936 		pool_put(&pmap_pv_pool, pve);
1937 	}
1938 
1939 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1940 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1941 		uvm_pagefree(ptp);
1942 	}
1943 }
1944 
1945 /*
1946  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1947  *
1948  * => R/M bits are sync'd back to attrs
1949  */
1950 
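/*
 * note: to avoid lock-ordering problems between the pv list mutex and
 * the pmap lock, each iteration takes a reference on the pmap, drops
 * pv_mtx, locks the pmap via pmap_map_ptes(), and then re-checks that
 * the head of the pv list still belongs to that pmap before zapping
 * the PTE.  Empty PTPs are queued on empty_ptps and freed only after
 * the final TLB shootdown wait.
 */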
1951 void
1952 pmap_page_remove(struct vm_page *pg)
1953 {
1954 	struct pv_entry *pve;
1955 	struct pmap *pm;
1956 	pt_entry_t opte;
1957 #ifdef DIAGNOSTIC
1958 	pd_entry_t pde;
1959 #endif
1960 	struct pg_to_free empty_ptps;
1961 	struct vm_page *ptp;
1962 	paddr_t scr3;
1963 	int shootself;
1964 
1965 	TAILQ_INIT(&empty_ptps);
1966 
1967 	mtx_enter(&pg->mdpage.pv_mtx);
1968 	while ((pve = pg->mdpage.pv_list) != NULL) {
1969 		pmap_reference(pve->pv_pmap);
1970 		pm = pve->pv_pmap;
1971 		mtx_leave(&pg->mdpage.pv_mtx);
1972 
1973 		/* XXX use direct map? */
1974 		scr3 = pmap_map_ptes(pm);	/* locks pmap */
1975 		shootself = (scr3 == 0);
1976 
1977 		/*
1978 		 * We dropped the pvlist lock before grabbing the pmap
1979 		 * lock to avoid lock ordering problems.  This means
1980 		 * we have to check the pvlist again since somebody
1981 		 * else might have modified it.  All we care about is
1982 		 * that the pvlist entry matches the pmap we just
1983 		 * locked.  If it doesn't, unlock the pmap and try
1984 		 * again.
1985 		 */
1986 		mtx_enter(&pg->mdpage.pv_mtx);
1987 		if ((pve = pg->mdpage.pv_list) == NULL ||
1988 		    pve->pv_pmap != pm) {
1989 			mtx_leave(&pg->mdpage.pv_mtx);
1990 			pmap_unmap_ptes(pm, scr3);	/* unlocks pmap */
1991 			pmap_destroy(pm);
1992 			mtx_enter(&pg->mdpage.pv_mtx);
1993 			continue;
1994 		}
1995 
1996 		pg->mdpage.pv_list = pve->pv_next;
1997 		mtx_leave(&pg->mdpage.pv_mtx);
1998 
1999 #ifdef DIAGNOSTIC
2000 		if (pve->pv_ptp != NULL && pmap_pdes_valid(pve->pv_va, &pde) &&
2001 		   (pde & pg_frame) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
2002 			printf("%s: pg=%p: va=%lx, pv_ptp=%p\n", __func__,
2003 			       pg, pve->pv_va, pve->pv_ptp);
2004 			printf("%s: PTP's phys addr: "
2005 			       "actual=%lx, recorded=%lx\n", __func__,
2006 			       (unsigned long)(pde & pg_frame),
2007 				VM_PAGE_TO_PHYS(pve->pv_ptp));
2008 			panic("%s: mapped managed page has "
2009 			      "invalid pv_ptp field", __func__);
2010 		}
2011 #endif
2012 
2013 		/* atomically save the old PTE and zap it */
2014 		opte = pmap_pte_set(&PTE_BASE[pl1_i(pve->pv_va)], 0);
2015 
2016 		if (opte & PG_W)
2017 			pve->pv_pmap->pm_stats.wired_count--;
2018 		pve->pv_pmap->pm_stats.resident_count--;
2019 
2020 		pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, shootself);
2021 
2022 		pmap_sync_flags_pte(pg, opte);
2023 
2024 		/* update the PTP reference count.  free if last reference. */
2025 		if (pve->pv_ptp != NULL) {
2026 			pve->pv_ptp->wire_count--;
2027 			if (pve->pv_ptp->wire_count <= 1) {
2028 				pmap_free_ptp(pve->pv_pmap, pve->pv_ptp,
2029 				    pve->pv_va, &empty_ptps);
2030 			}
2031 		}
2032 		pmap_unmap_ptes(pve->pv_pmap, scr3);	/* unlocks pmap */
2033 		pmap_destroy(pve->pv_pmap);
2034 		pool_put(&pmap_pv_pool, pve);
2035 		mtx_enter(&pg->mdpage.pv_mtx);
2036 	}
2037 	mtx_leave(&pg->mdpage.pv_mtx);
2038 
2039 	pmap_tlb_shootwait();
2040 
2041 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
2042 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
2043 		uvm_pagefree(ptp);
2044 	}
2045 }
2046 
2047 /*
2048  * p m a p   a t t r i b u t e  f u n c t i o n s
2049  * functions that test/change managed page's attributes
2050  * since a page can be mapped multiple times we must check each PTE that
2051  * maps it by going down the pv lists.
2052  */
2053 
2054 /*
2055  * pmap_test_attrs: test a page's attributes
2056  */
2057 
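/*
 * Example (sketch): PG_U and PG_M are the hardware "accessed" and
 * "modified" PTE bits, so a caller can ask something like
 *
 *	if (pmap_test_attrs(pg, PG_M))
 *		... the page was written through some mapping ...
 *
 * The cached bits in pg->pg_flags let repeated queries skip the
 * pv-list walk.
 */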
2058 int
2059 pmap_test_attrs(struct vm_page *pg, unsigned int testbits)
2060 {
2061 	struct pv_entry *pve;
2062 	pt_entry_t *ptes;
2063 	int level, offs;
2064 	u_long mybits, testflags;
2065 
2066 	testflags = pmap_pte2flags(testbits);
2067 
2068 	if (pg->pg_flags & testflags)
2069 		return 1;
2070 
2071 	mybits = 0;
2072 	mtx_enter(&pg->mdpage.pv_mtx);
2073 	for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
2074 	    pve = pve->pv_next) {
2075 		level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
2076 		    &offs);
2077 		mybits |= (ptes[offs] & testbits);
2078 	}
2079 	mtx_leave(&pg->mdpage.pv_mtx);
2080 
2081 	if (mybits == 0)
2082 		return 0;
2083 
2084 	atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
2085 
2086 	return 1;
2087 }
2088 
2089 /*
2090  * pmap_clear_attrs: change a page's attributes
2091  *
2092  * => we return 1 if we cleared one of the bits we were asked to
2093  */
2094 
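/*
 * Sketch: clearing PG_M here is what e.g. the pmap_clear_modify()
 * wrapper in pmap.h boils down to.  Every mapping of the page is
 * visited through its pv list and the VA is shot down on any CPU
 * that currently has the owning pmap loaded.
 */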
2095 int
2096 pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits)
2097 {
2098 	struct pv_entry *pve;
2099 	pt_entry_t *ptes, opte;
2100 	u_long clearflags;
2101 	int result, level, offs;
2102 
2103 	clearflags = pmap_pte2flags(clearbits);
2104 
2105 	result = pg->pg_flags & clearflags;
2106 	if (result)
2107 		atomic_clearbits_int(&pg->pg_flags, clearflags);
2108 
2109 	mtx_enter(&pg->mdpage.pv_mtx);
2110 	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
2111 		level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
2112 		    &offs);
2113 		opte = ptes[offs];
2114 		if (opte & clearbits) {
2115 			result = 1;
2116 			pmap_pte_clearbits(&ptes[offs], (opte & clearbits));
2117 			pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va,
2118 				pmap_is_curpmap(pve->pv_pmap));
2119 		}
2120 	}
2121 	mtx_leave(&pg->mdpage.pv_mtx);
2122 
2123 	pmap_tlb_shootwait();
2124 
2125 	return (result != 0);
2126 }
2127 
2128 /*
2129  * p m a p   p r o t e c t i o n   f u n c t i o n s
2130  */
2131 
2132 /*
2133  * pmap_page_protect: change the protection of all recorded mappings
2134  *	of a managed page
2135  *
2136  * => NOTE: this is an inline function in pmap.h
2137  */
2138 
2139 /* see pmap.h */
2140 
2141 /*
2142  * pmap_protect: set the protection of the pages in a pmap
2143  *
2144  * => NOTE: this is an inline function in pmap.h
2145  */
2146 
2147 /* see pmap.h */
2148 
2149 /*
2150  * pmap_write_protect: write-protect pages in a pmap
2151  */
2152 
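/*
 * note: despite the name this handles any permission reduction on the
 * range: !PROT_READ sets pg_xo, !PROT_WRITE clears PG_RW, !PROT_EXEC
 * sets pg_nx.  Permissions are only ever removed here; granting them
 * again goes through pmap_enter().
 */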
2153 void
2154 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
2155 {
2156 	pt_entry_t *spte, *epte;
2157 	pt_entry_t clear = 0, set = 0;
2158 	vaddr_t blockend;
2159 	int shootall = 0, shootself;
2160 	vaddr_t va;
2161 	paddr_t scr3;
2162 
2163 	scr3 = pmap_map_ptes(pmap);
2164 	shootself = (scr3 == 0);
2165 
2166 	/* should be ok, but just in case ... */
2167 	sva &= PG_FRAME;
2168 	eva &= PG_FRAME;
2169 
2170 	if (!(prot & PROT_READ))
2171 		set |= pg_xo;
2172 	if (!(prot & PROT_WRITE))
2173 		clear = PG_RW;
2174 	if (!(prot & PROT_EXEC))
2175 		set |= pg_nx;
2176 
2177 	if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS)
2178 		shootall = 1;
2179 
2180 	for (va = sva; va < eva ; va = blockend) {
2181 		blockend = (va & L2_FRAME) + NBPD_L2;
2182 		if (blockend > eva)
2183 			blockend = eva;
2184 
2185 		/*
2186 		 * XXXCDC: our PTE mappings should never be write-protected!
2187 		 *
2188 		 * the long term solution is to move the PTEs out of user
2189 		 * address space and into kernel address space (up with
2190 		 * APTE).  then we can set VM_MAXUSER_ADDRESS to be
2191 		 * VM_MAX_ADDRESS.
2192 		 */
2193 
2194 		/* XXXCDC: ugly hack to avoid freeing PDP here */
2195 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
2196 			continue;
2197 
2198 		/* empty block? */
2199 		if (!pmap_pdes_valid(va, NULL))
2200 			continue;
2201 
2202 #ifdef DIAGNOSTIC
2203 		if (va >= VM_MAXUSER_ADDRESS && va < VM_MAX_ADDRESS)
2204 			panic("%s: PTE space", __func__);
2205 #endif
2206 
2207 		spte = &PTE_BASE[pl1_i(va)];
2208 		epte = &PTE_BASE[pl1_i(blockend)];
2209 
2210 		for (/*null */; spte < epte ; spte++) {
2211 			if (!pmap_valid_entry(*spte))
2212 				continue;
2213 			pmap_pte_clearbits(spte, clear);
2214 			pmap_pte_setbits(spte, set);
2215 		}
2216 	}
2217 
2218 	if (shootall)
2219 		pmap_tlb_shoottlb(pmap, shootself);
2220 	else
2221 		pmap_tlb_shootrange(pmap, sva, eva, shootself);
2222 
2223 	pmap_unmap_ptes(pmap, scr3);
2224 	pmap_tlb_shootwait();
2225 }
2226 
2227 /*
2228  * end of protection functions
2229  */
2230 
2231 /*
2232  * pmap_unwire: clear the wired bit in the PTE
2233  *
2234  * => mapping should already be in map
2235  */
2236 
2237 void
2238 pmap_unwire(struct pmap *pmap, vaddr_t va)
2239 {
2240 	pt_entry_t *ptes;
2241 	int level, offs;
2242 
2243 	level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
2244 
2245 	if (level == 0) {
2246 
2247 #ifdef DIAGNOSTIC
2248 		if (!pmap_valid_entry(ptes[offs]))
2249 			panic("%s: invalid (unmapped) va 0x%lx", __func__, va);
2250 #endif
2251 		if (__predict_true((ptes[offs] & PG_W) != 0)) {
2252 			pmap_pte_clearbits(&ptes[offs], PG_W);
2253 			pmap->pm_stats.wired_count--;
2254 		}
2255 #ifdef DIAGNOSTIC
2256 		else {
2257 			printf("%s: wiring for pmap %p va 0x%lx "
2258 			       "didn't change!\n", __func__, pmap, va);
2259 		}
2260 #endif
2261 	}
2262 #ifdef DIAGNOSTIC
2263 	else {
2264 		panic("%s: invalid PDE", __func__);
2265 	}
2266 #endif
2267 }
2268 
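/*
 * pmap_enter_special: enter a kernel mapping into the special U-K
 * ("Intel") page tables kept in pm_pdir_intel for the Meltdown
 * workaround.  This is a no-op on CPUs that do not need the
 * workaround (cpu_meltdown == 0).  Intermediate levels are allocated
 * on demand from pmap_pdp_pool, and when the matching U+K entry maps
 * the same PA, PG_G is set on both entries and the U+K cache-control
 * bits are copied into the new one.
 */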
2269 void
2270 pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot)
2271 {
2272 	uint64_t l4idx, l3idx, l2idx, l1idx;
2273 	pd_entry_t *pd, *ptp;
2274 	paddr_t npa;
2275 	struct pmap *pmap = pmap_kernel();
2276 	pt_entry_t *ptes;
2277 	int level, offs;
2278 
2279 	/* If CPU is secure, no need to do anything */
2280 	if (!cpu_meltdown)
2281 		return;
2282 
2283 	/* Must be kernel VA */
2284 	if (va < VM_MIN_KERNEL_ADDRESS)
2285 		panic("%s: invalid special mapping va 0x%lx requested",
2286 		    __func__, va);
2287 
2288 	if (pmap->pm_pdir_intel == NULL)
2289 		pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool,
2290 		    PR_WAITOK | PR_ZERO);
2291 
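	/*
	 * Split the VA into its four 9-bit table indices (bits 47-39,
	 * 38-30, 29-21 and 20-12); the low 12 bits are the page offset.
	 */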
2292 	l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */
2293 	l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
2294 	l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */
2295 	l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */
2296 
2297 	DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld "
2298 	    "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va,
2299 	    (uint64_t)pa, l4idx, l3idx, l2idx, l1idx);
2300 
2301 	/* Start at PML4 / top level */
2302 	pd = pmap->pm_pdir_intel;
2303 
2304 	if (pd == NULL)
2305 		panic("%s: PML4 not initialized for pmap @ %p", __func__,
2306 		    pmap);
2307 
2308 	/* npa = physaddr of PDPT */
2309 	npa = pd[l4idx] & PMAP_PA_MASK;
2310 
2311 	/* Valid PML4e for the 512GB region containing va? */
2312 	if (!npa) {
2313 		/* No valid PML4E - allocate PDPT page and set PML4E */
2314 
2315 		ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
2316 
2317 		if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2318 			panic("%s: can't locate PDPT page", __func__);
2319 
2320 		pd[l4idx] = (npa | PG_RW | PG_V | pg_crypt);
2321 
2322 		DPRINTF("%s: allocated new PDPT page at phys 0x%llx, "
2323 		    "setting PML4e[%lld] = 0x%llx\n", __func__,
2324 		    (uint64_t)npa, l4idx, pd[l4idx]);
2325 	}
2326 
2327 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2328 	if (pd == NULL)
2329 		panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
2330 		    (uint64_t)npa);
2331 
2332 	/* npa = physaddr of PD page */
2333 	npa = pd[l3idx] & PMAP_PA_MASK;
2334 
2335 	/* Valid PDPTe for the 1GB region containing va? */
2336 	if (!npa) {
2337 		/* No valid PDPTe - allocate PD page and set PDPTe */
2338 
2339 		ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
2340 
2341 		if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2342 			panic("%s: can't locate PD page", __func__);
2343 
2344 		pd[l3idx] = (npa | PG_RW | PG_V | pg_crypt);
2345 
2346 		DPRINTF("%s: allocated new PD page at phys 0x%llx, "
2347 		    "setting PDPTe[%lld] = 0x%llx\n", __func__,
2348 		    (uint64_t)npa, l3idx, pd[l3idx]);
2349 	}
2350 
2351 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2352 	if (pd == NULL)
2353 		panic("%s: can't locate PD page @ pa=0x%llx", __func__,
2354 		    (uint64_t)npa);
2355 
2356 	/* npa = physaddr of PT page */
2357 	npa = pd[l2idx] & PMAP_PA_MASK;
2358 
2359 	/* Valid PDE for the 2MB region containing va? */
2360 	if (!npa) {
2361 		/* No valid PDE - allocate PT page and set PDE */
2362 
2363 		ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
2364 
2365 		if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2366 			panic("%s: can't locate PT page", __func__);
2367 
2368 		pd[l2idx] = (npa | PG_RW | PG_V | pg_crypt);
2369 
2370 		DPRINTF("%s: allocated new PT page at phys 0x%llx, "
2371 		    "setting PDE[%lld] = 0x%llx\n", __func__,
2372 		    (uint64_t)npa, l2idx, pd[l2idx]);
2373 	}
2374 
2375 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2376 	if (pd == NULL)
2377 		panic("%s: can't locate PT page @ pa=0x%llx", __func__,
2378 		    (uint64_t)npa);
2379 
2380 	DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot "
2381 	    "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd,
2382 	    (uint64_t)prot, (uint64_t)pd[l1idx]);
2383 
2384 	pd[l1idx] = pa | protection_codes[prot] | PG_V | PG_W | pg_crypt;
2385 
2386 	/*
2387 	 * Look up the corresponding U+K entry.  If we're installing the
2388 	 * same PA into the U-K map then set the PG_G bit on both and copy
2389 	 * the cache-control bits from the U+K entry to the U-K entry.
2390 	 */
2391 	level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
2392 	if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) {
2393 		if (((pd[l1idx] ^ ptes[offs]) & pg_frame) == 0) {
2394 			pd[l1idx] |= PG_G | (ptes[offs] & (PG_N | PG_WT));
2395 			ptes[offs] |= PG_G;
2396 		} else {
2397 			DPRINTF("%s: special diffing mapping at %llx\n",
2398 			    __func__, (long long)va);
2399 		}
2400 	} else
2401 		DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
2402 
2403 	DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]);
2404 }
2405 
2406 #if NVMM > 0
2407 /*
2408  * pmap_convert
2409  *
2410  * Converts 'pmap' to the new 'mode'.
2411  *
2412  * Parameters:
2413  *  pmap: the pmap to convert
2414  *  mode: the new mode (see pmap.h, PMAP_TYPE_xxx)
2415  */
2416 void
2417 pmap_convert(struct pmap *pmap, int mode)
2418 {
2419 	pt_entry_t *pte;
2420 
2421 	mtx_enter(&pmap->pm_mtx);
2422 	pmap->pm_type = mode;
2423 
2424 	if (pmap_is_ept(pmap)) {
2425 		/* Clear PML4 */
2426 		pte = (pt_entry_t *)pmap->pm_pdir;
2427 		memset(pte, 0, PAGE_SIZE);
2428 
2429 		/* Give back the meltdown pdir */
2430 		if (pmap->pm_pdir_intel != NULL) {
2431 			pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
2432 			pmap->pm_pdir_intel = NULL;
2433 		}
2434 	}
2435 	mtx_leave(&pmap->pm_mtx);
2436 }
2437 
2438 void
2439 pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa)
2440 {
2441 	vaddr_t v;
2442 
2443 	mtx_enter(&pmap->pm_mtx);
2444 
2445 	DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa,
2446 	    (uint64_t)egpa);
2447 	for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE)
2448 		pmap_do_remove_ept(pmap, v);
2449 
2450 	pmap_shootept(pmap, 1);
2451 
2452 	mtx_leave(&pmap->pm_mtx);
2453 
2454 	pmap_tlb_shootwait();
2455 }
2456 
2457 void
2458 pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
2459 {
2460 	uint64_t l4idx, l3idx, l2idx, l1idx;
2461 	struct vm_page *pg3, *pg2, *pg1;
2462 	paddr_t npa3, npa2, npa1;
2463 	pd_entry_t *pd4, *pd3, *pd2, *pd1;
2464 	pd_entry_t *pptes;
2465 
2466 	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
2467 
2468 	l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */
2469 	l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
2470 	l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
2471 	l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
2472 
2473 	/* Start at PML4 / top level */
2474 	pd4 = (pd_entry_t *)pmap->pm_pdir;
2475 
2476 	if (pd4 == NULL)
2477 		return;
2478 
2479 	/* npa3 = physaddr of PDPT */
2480 	npa3 = pd4[l4idx] & PMAP_PA_MASK;
2481 	if (!npa3)
2482 		return;
2483 	pd3 = (pd_entry_t *)PMAP_DIRECT_MAP(npa3);
2484 	pg3 = PHYS_TO_VM_PAGE(npa3);
2485 
2486 	/* npa2 = physaddr of PD page */
2487 	npa2 = pd3[l3idx] & PMAP_PA_MASK;
2488 	if (!npa2)
2489 		return;
2490 	pd2 = (pd_entry_t *)PMAP_DIRECT_MAP(npa2);
2491 	pg2 = PHYS_TO_VM_PAGE(npa2);
2492 
2493 	/* npa1 = physaddr of PT page */
2494 	npa1 = pd2[l2idx] & PMAP_PA_MASK;
2495 	if (!npa1)
2496 		return;
2497 	pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1);
2498 	pg1 = PHYS_TO_VM_PAGE(npa1);
2499 
2500 	if (pd1[l1idx] == 0)
2501 		return;
2502 
2503 	pd1[l1idx] = 0;
2504 	pg1->wire_count--;
2505 	pmap->pm_stats.resident_count--;
2506 
2507 	if (pg1->wire_count > 1)
2508 		return;
2509 
2510 	pg1->wire_count = 0;
2511 	pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa2);
2512 	pptes[l2idx] = 0;
2513 	uvm_pagefree(pg1);
2514 	pmap->pm_stats.resident_count--;
2515 
2516 	pg2->wire_count--;
2517 	if (pg2->wire_count > 1)
2518 		return;
2519 
2520 	pg2->wire_count = 0;
2521 	pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa3);
2522 	pptes[l3idx] = 0;
2523 	uvm_pagefree(pg2);
2524 	pmap->pm_stats.resident_count--;
2525 
2526 	pg3->wire_count--;
2527 	if (pg3->wire_count > 1)
2528 		return;
2529 
2530 	pg3->wire_count = 0;
2531 	pptes = pd4;
2532 	pptes[l4idx] = 0;
2533 	uvm_pagefree(pg3);
2534 	pmap->pm_stats.resident_count--;
2535 }
2536 
2537 int
2538 pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot)
2539 {
2540 	uint64_t l4idx, l3idx, l2idx, l1idx;
2541 	pd_entry_t *pd, npte;
2542 	struct vm_page *ptp, *pptp;
2543 	paddr_t npa;
2544 	struct uvm_object *obj;
2545 	int ret = 0;
2546 
2547 	if (gpa > MAXDSIZ)
2548 		return ENOMEM;
2549 
2550 	l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */
2551 	l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
2552 	l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
2553 	l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
2554 
2555 	mtx_enter(&pmap->pm_mtx);
2556 
2557 	/* Start at PML4 / top level */
2558 	pd = (pd_entry_t *)pmap->pm_pdir;
2559 
2560 	if (pd == NULL) {
2561 		ret = ENOMEM;
2562 		goto unlock;
2563 	}
2564 
2565 	/* npa = physaddr of PDPT */
2566 	npa = pd[l4idx] & PMAP_PA_MASK;
2567 
2568 	/* Valid PML4e for the 512GB region containing gpa? */
2569 	if (!npa) {
2570 		/* No valid PML4e - allocate PDPT page and set PML4e */
2571 		obj = &pmap->pm_obj[2];	/* PML4 UVM object */
2572 		ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 3), NULL,
2573 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2574 
2575 		if (ptp == NULL) {
2576 			ret = ENOMEM;
2577 			goto unlock;
2578 		}
2579 		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
2580 
2581 		/*
2582 		 * New PDPT page - we are setting the first entry, so set
2583 		 * the wired count to 1
2584 		 */
2585 		ptp->wire_count = 1;
2586 
2587 		/* Calculate phys address of this new PDPT page */
2588 		npa = VM_PAGE_TO_PHYS(ptp);
2589 
2590 		/*
2591 		 * Higher levels get full perms; specific permissions are
2592 		 * entered at the lowest level.
2593 		 */
2594 		pd[l4idx] = (npa | EPT_R | EPT_W | EPT_X);
2595 
2596 		pmap->pm_stats.resident_count++;
2597 
2598 		pptp = ptp;
2599 	} else {
2600 		/* Already allocated PML4e */
2601 		pptp = PHYS_TO_VM_PAGE(npa);
2602 	}
2603 
2604 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2605 	if (pd == NULL)
2606 		panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
2607 		    (uint64_t)npa);
2608 
2609 	/* npa = physaddr of PD page */
2610 	npa = pd[l3idx] & PMAP_PA_MASK;
2611 
2612 	/* Valid PDPTe for the 1GB region containing gpa? */
2613 	if (!npa) {
2614 		/* No valid PDPTe - allocate PD page and set PDPTe */
2615 		obj = &pmap->pm_obj[1];	/* PDPT UVM object */
2616 		ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 2), NULL,
2617 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2618 
2619 		if (ptp == NULL) {
2620 			ret = ENOMEM;
2621 			goto unlock;
2622 		}
2623 		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
2624 
2625 		/*
2626 		 * New PD page - we are setting the first entry, so set
2627 		 * the wired count to 1
2628 		 */
2629 		ptp->wire_count = 1;
2630 		pptp->wire_count++;
2631 
2632 		npa = VM_PAGE_TO_PHYS(ptp);
2633 
2634 		/*
2635 		 * Higher levels get full perms; specific permissions are
2636 		 * entered at the lowest level.
2637 		 */
2638 		pd[l3idx] = (npa | EPT_R | EPT_W | EPT_X);
2639 
2640 		pmap->pm_stats.resident_count++;
2641 
2642 		pptp = ptp;
2643 	} else {
2644 		/* Already allocated PDPTe */
2645 		pptp = PHYS_TO_VM_PAGE(npa);
2646 	}
2647 
2648 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2649 	if (pd == NULL)
2650 		panic("%s: can't locate PD page @ pa=0x%llx", __func__,
2651 		    (uint64_t)npa);
2652 
2653 	/* npa = physaddr of PT page */
2654 	npa = pd[l2idx] & PMAP_PA_MASK;
2655 
2656 	/* Valid PDE for the 2MB region containing gpa? */
2657 	if (!npa) {
2658 		/* No valid PDE - allocate PT page and set PDE */
2659 		obj = &pmap->pm_obj[0];	/* PDE UVM object */
2660 		ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 1), NULL,
2661 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2662 
2663 		if (ptp == NULL) {
2664 			ret = ENOMEM;
2665 			goto unlock;
2666 		}
2667 		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
2668 
2669 		ptp->wire_count = 1;
2670 		pptp->wire_count++;
2671 
2672 		npa = VM_PAGE_TO_PHYS(ptp);
2673 
2674 		/*
2675 		 * Higher levels get full perms; specific permissions are
2676 		 * entered at the lowest level.
2677 		 */
2678 		pd[l2idx] = (npa | EPT_R | EPT_W | EPT_X);
2679 
2680 		pmap->pm_stats.resident_count++;
2681 
2682 	} else {
2683 		/* Find final ptp */
2684 		ptp = PHYS_TO_VM_PAGE(npa);
2685 		if (ptp == NULL)
2686 			panic("%s: ptp page vanished?", __func__);
2687 	}
2688 
2689 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2690 	if (pd == NULL)
2691 		panic("%s: can't locate PT page @ pa=0x%llx", __func__,
2692 		    (uint64_t)npa);
2693 
2694 	npte = hpa | EPT_WB;
2695 	if (prot & PROT_READ)
2696 		npte |= EPT_R;
2697 	if (prot & PROT_WRITE)
2698 		npte |= EPT_W;
2699 	if (prot & PROT_EXEC)
2700 		npte |= EPT_X;
2701 
2702 	if (pd[l1idx] == 0) {
2703 		ptp->wire_count++;
2704 		pmap->pm_stats.resident_count++;
2705 	} else {
2706 		/* XXX flush ept */
2707 	}
2708 
2709 	pd[l1idx] = npte;
2710 
2711 unlock:
2712 	mtx_leave(&pmap->pm_mtx);
2713 
2714 	return ret;
2715 }
2716 #endif /* NVMM > 0 */
2717 
2718 /*
2719  * pmap_enter: enter a mapping into a pmap
2720  *
2721  * => must be done "now" ... no lazy-evaluation
2722  */
2723 
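/*
 * Rough flow: pre-allocate a pv entry, map the page tables with
 * pmap_map_ptes() (shootself is set only when no cr3 switch was
 * needed), inspect the old PTE to fix up the resident/wired/PTP
 * counts and the pv list, build the new PTE from protection_codes[]
 * plus the PG_W/PG_N/PG_u/pg_g_kern/pg_crypt modifiers, and install
 * it with the CoW-safe two-step update described further below.
 */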
2724 int
2725 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
2726 {
2727 	pt_entry_t opte, npte;
2728 	struct vm_page *ptp, *pg = NULL;
2729 	struct pv_entry *pve, *opve = NULL;
2730 	int ptpdelta, wireddelta, resdelta;
2731 	int wired = (flags & PMAP_WIRED) != 0;
2732 	int crypt = (flags & PMAP_NOCRYPT) == 0;
2733 	int nocache = (pa & PMAP_NOCACHE) != 0;
2734 	int wc = (pa & PMAP_WC) != 0;
2735 	int error, shootself;
2736 	paddr_t scr3;
2737 
2738 #if NVMM > 0
2739 	if (pmap_is_ept(pmap))
2740 		return pmap_enter_ept(pmap, va, pa, prot);
2741 #endif /* NVMM > 0 */
2742 
2743 	KASSERT(!(wc && nocache));
2744 	pa &= PMAP_PA_MASK;
2745 
2746 #ifdef DIAGNOSTIC
2747 	if (va == (vaddr_t) PDP_BASE)
2748 		panic("%s: trying to map over PDP!", __func__);
2749 
2750 	/* sanity check: kernel PTPs should already have been pre-allocated */
2751 	if (va >= VM_MIN_KERNEL_ADDRESS &&
2752 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
2753 		panic("%s: missing kernel PTP for va %lx!", __func__, va);
2754 
2755 #endif
2756 
2757 	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
2758 	if (pve == NULL) {
2759 		if (flags & PMAP_CANFAIL) {
2760 			error = ENOMEM;
2761 			goto out;
2762 		}
2763 		panic("%s: no pv entries available", __func__);
2764 	}
2765 
2766 	/*
2767 	 * map in ptes and get a pointer to our PTP (unless we are the kernel)
2768 	 */
2769 
2770 	scr3 = pmap_map_ptes(pmap);
2771 	shootself = (scr3 == 0);
2772 	if (pmap == pmap_kernel()) {
2773 		ptp = NULL;
2774 	} else {
2775 		ptp = pmap_get_ptp(pmap, va);
2776 		if (ptp == NULL) {
2777 			if (flags & PMAP_CANFAIL) {
2778 				pmap_unmap_ptes(pmap, scr3);
2779 				error = ENOMEM;
2780 				goto out;
2781 			}
2782 			panic("%s: get ptp failed", __func__);
2783 		}
2784 	}
2785 	opte = PTE_BASE[pl1_i(va)];		/* old PTE */
2786 
2787 	/*
2788 	 * is there currently a valid mapping at our VA?
2789 	 */
2790 
2791 	if (pmap_valid_entry(opte)) {
2792 		/*
2793 		 * first, calculate pm_stats updates.  resident count will not
2794 		 * change since we are replacing/changing a valid mapping.
2795 		 * wired count might change...
2796 		 */
2797 
2798 		resdelta = 0;
2799 		if (wired && (opte & PG_W) == 0)
2800 			wireddelta = 1;
2801 		else if (!wired && (opte & PG_W) != 0)
2802 			wireddelta = -1;
2803 		else
2804 			wireddelta = 0;
2805 		ptpdelta = 0;
2806 
2807 		/*
2808 		 * is the currently mapped PA the same as the one we
2809 		 * want to map?
2810 		 */
2811 
2812 		if ((opte & pg_frame) == pa) {
2813 
2814 			/* if this is on the PVLIST, sync R/M bit */
2815 			if (opte & PG_PVLIST) {
2816 				pg = PHYS_TO_VM_PAGE(pa);
2817 #ifdef DIAGNOSTIC
2818 				if (pg == NULL)
2819 					panic("%s: same pa, PG_PVLIST "
2820 					    "mapping with unmanaged page: "
2821 					    "va 0x%lx, opte 0x%llx, pa 0x%lx",
2822 					    __func__, va, opte, pa);
2823 #endif
2824 				pmap_sync_flags_pte(pg, opte);
2825 			} else {
2826 #ifdef DIAGNOSTIC
2827 				if (PHYS_TO_VM_PAGE(pa) != NULL)
2828 					panic("%s: same pa, no PG_PVLIST "
2829 					    "mapping with managed page: "
2830 					    "va 0x%lx, opte 0x%llx, pa 0x%lx",
2831 					    __func__, va, opte, pa);
2832 #endif
2833 			}
2834 			goto enter_now;
2835 		}
2836 
2837 		/*
2838 		 * changing PAs: we must remove the old one first
2839 		 */
2840 
2841 		/*
2842 		 * if current mapping is on a pvlist,
2843 		 * remove it (sync R/M bits)
2844 		 */
2845 
2846 		if (opte & PG_PVLIST) {
2847 			pg = PHYS_TO_VM_PAGE(opte & pg_frame);
2848 #ifdef DIAGNOSTIC
2849 			if (pg == NULL)
2850 				panic("%s: PG_PVLIST mapping with unmanaged "
2851 				    "page: va 0x%lx, opte 0x%llx, pa 0x%lx",
2852 				    __func__, va, opte, pa);
2853 #endif
2854 			pmap_sync_flags_pte(pg, opte);
2855 			opve = pmap_remove_pv(pg, pmap, va);
2856 			pg = NULL; /* This is not the page we are looking for */
2857 		}
2858 	} else {	/* opte not valid */
2859 		resdelta = 1;
2860 		if (wired)
2861 			wireddelta = 1;
2862 		else
2863 			wireddelta = 0;
2864 		if (ptp != NULL)
2865 			ptpdelta = 1;
2866 		else
2867 			ptpdelta = 0;
2868 	}
2869 
2870 	/*
2871 	 * pve is either NULL or points to a now-free pv_entry structure
2872 	 * (the latter case is if we called pmap_remove_pv above).
2873 	 *
2874 	 * if this entry is to be on a pvlist, enter it now.
2875 	 */
2876 
2877 	if (pmap_initialized)
2878 		pg = PHYS_TO_VM_PAGE(pa);
2879 
2880 	if (pg != NULL) {
2881 		pmap_enter_pv(pg, pve, pmap, va, ptp);
2882 		pve = NULL;
2883 	}
2884 
2885 enter_now:
2886 	/*
2887 	 * at this point pg is !NULL if we want the PG_PVLIST bit set
2888 	 */
2889 
2890 	pmap->pm_stats.resident_count += resdelta;
2891 	pmap->pm_stats.wired_count += wireddelta;
2892 	if (ptp != NULL)
2893 		ptp->wire_count += ptpdelta;
2894 
2895 	KASSERT(pg == PHYS_TO_VM_PAGE(pa));
2896 
2897 	npte = pa | protection_codes[prot] | PG_V;
2898 	if (pg != NULL) {
2899 		npte |= PG_PVLIST;
2900 		/*
2901 		 * make sure that if the page is write combined all
2902 		 * instances of pmap_enter make it so.
2903 		 */
2904 		if (pg->pg_flags & PG_PMAP_WC) {
2905 			KASSERT(nocache == 0);
2906 			wc = 1;
2907 		}
2908 	}
2909 	if (wc)
2910 		npte |= pmap_pg_wc;
2911 	if (wired)
2912 		npte |= PG_W;
2913 	if (nocache)
2914 		npte |= PG_N;
2915 	if (va < VM_MAXUSER_ADDRESS)
2916 		npte |= ((flags & PMAP_EFI) ? 0 : PG_u);
2917 	else if (va < VM_MAX_ADDRESS)
2918 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
2919 	if (pmap == pmap_kernel())
2920 		npte |= pg_g_kern;
2921 	if (crypt)
2922 		npte |= pg_crypt;
2923 
2924 	/*
2925 	 * If the old entry wasn't valid, we can just update it and
2926 	 * go.  If it was valid, and this isn't a read->write
2927 	 * transition, then we can safely just update it and flush
2928 	 * any old TLB entries.
2929 	 *
2930 	 * If it _was_ valid and this _is_ a read->write transition,
2931 	 * then this could be a CoW resolution and we need to make
2932 	 * sure no CPU can see the new writable mapping while another
2933 	 * still has the old mapping in its TLB, so insert a correct
2934 	 * but unwritable mapping, flush any old TLB entries, then
2935 	 * make it writable.
2936 	 */
2937 	if (! pmap_valid_entry(opte)) {
2938 		PTE_BASE[pl1_i(va)] = npte;
2939 	} else if ((opte | (npte ^ PG_RW)) & PG_RW) {
2940 		/* previously writable or not making writable */
2941 		PTE_BASE[pl1_i(va)] = npte;
2942 		if (nocache && (opte & PG_N) == 0)
2943 			wbinvd_on_all_cpus();
2944 		pmap_tlb_shootpage(pmap, va, shootself);
2945 	} else {
2946 		PTE_BASE[pl1_i(va)] = npte ^ PG_RW;
2947 		if (nocache && (opte & PG_N) == 0) /* XXX impossible? */
2948 			wbinvd_on_all_cpus();
2949 		pmap_tlb_shootpage(pmap, va, shootself);
2950 		pmap_tlb_shootwait();
2951 		PTE_BASE[pl1_i(va)] = npte;
2952 	}
2953 
2954 	pmap_unmap_ptes(pmap, scr3);
2955 	pmap_tlb_shootwait();
2956 
2957 	error = 0;
2958 
2959 out:
2960 	if (pve != NULL)
2961 		pool_put(&pmap_pv_pool, pve);
2962 	if (opve != NULL)
2963 		pool_put(&pmap_pv_pool, opve);
2964 
2965 	return error;
2966 }
2967 
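/*
 * pmap_get_physpage: allocate one zeroed, wired physical page to back
 * a kernel PTP at the given level.  Before uvm_page_init() has run,
 * the page is stolen via pmap_steal_memory() instead of coming from
 * uvm_pagealloc().
 */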
2968 int
2969 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
2970 {
2971 	struct vm_page *ptp;
2972 	struct pmap *kpm = pmap_kernel();
2973 
2974 	if (uvm.page_init_done == 0) {
2975 		vaddr_t va;
2976 
2977 		/*
2978 		 * we're growing the kernel pmap early (from
2979 		 * uvm_pageboot_alloc()).  this case must be
2980 		 * handled a little differently.
2981 		 */
2982 
2983 		va = pmap_steal_memory(PAGE_SIZE, NULL, NULL);
2984 		*paddrp = PMAP_DIRECT_UNMAP(va);
2985 	} else {
2986 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
2987 				    ptp_va2o(va, level), NULL,
2988 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2989 		if (ptp == NULL)
2990 			panic("%s: out of memory", __func__);
2991 		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
2992 		ptp->wire_count = 1;
2993 		*paddrp = VM_PAGE_TO_PHYS(ptp);
2994 	}
2995 	kpm->pm_stats.resident_count++;
2996 	return 1;
2997 }
2998 
2999 /*
3000  * Allocate the amount of specified ptps for a ptp level, and populate
3001  * all levels below accordingly, mapping virtual addresses starting at
3002  * kva.
3003  *
3004  * Used by pmap_growkernel.
3005  */
3006 void
3007 pmap_alloc_level(vaddr_t kva, int lvl, long *needed_ptps)
3008 {
3009 	unsigned long i;
3010 	vaddr_t va;
3011 	paddr_t pa;
3012 	unsigned long index, endindex;
3013 	int level;
3014 	pd_entry_t *pdep;
3015 
3016 	for (level = lvl; level > 1; level--) {
3017 		if (level == PTP_LEVELS)
3018 			pdep = pmap_kernel()->pm_pdir;
3019 		else
3020 			pdep = normal_pdes[level - 2];
3021 		va = kva;
3022 		index = pl_i(kva, level);
3023 		endindex = index + needed_ptps[level - 1];
3024 		/*
3025 		 * XXX special case for first time call.
3026 		 */
3027 		if (nkptp[level - 1] != 0)
3028 			index++;
3029 		else
3030 			endindex--;
3031 
3032 		for (i = index; i <= endindex; i++) {
3033 			pmap_get_physpage(va, level - 1, &pa);
3034 			pdep[i] = pa | PG_RW | PG_V | pg_nx | pg_crypt;
3035 			nkptp[level - 1]++;
3036 			va += nbpd[level - 1];
3037 		}
3038 	}
3039 }
3040 
3041 /*
3042  * pmap_growkernel: increase usage of KVM space
3043  *
3044  * => we allocate new PTPs for the kernel and install them in all
3045  *	the pmaps on the system.
3046  */
3047 
3048 static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS;
3049 
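/*
 * Sketch of the update: pmap_alloc_level() wires new kernel PTPs for
 * the range above the old pmap_maxkvaddr; then, if the number of
 * top-level (PML4) kernel entries grew, the new entries are copied
 * into every pmap on the pmaps list, since all address spaces share
 * the kernel page tables:
 *
 *	memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
 *	    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
 *	    newpdes * sizeof(pd_entry_t));
 */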
3050 vaddr_t
3051 pmap_growkernel(vaddr_t maxkvaddr)
3052 {
3053 	struct pmap *kpm = pmap_kernel(), *pm;
3054 	int s, i;
3055 	unsigned newpdes;
3056 	long needed_kptp[PTP_LEVELS], target_nptp, old;
3057 
3058 	if (maxkvaddr <= pmap_maxkvaddr)
3059 		return pmap_maxkvaddr;
3060 
3061 	maxkvaddr = x86_round_pdr(maxkvaddr);
3062 	old = nkptp[PTP_LEVELS - 1];
3063 	/*
3064 	 * This loop could be optimized more, but pmap_growkernel()
3065 	 * is called infrequently.
3066 	 */
3067 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
3068 		target_nptp = pl_i(maxkvaddr, i + 1) -
3069 		    pl_i(VM_MIN_KERNEL_ADDRESS, i + 1);
3070 		/*
3071 		 * XXX only need to check toplevel.
3072 		 */
3073 		if (target_nptp > nkptpmax[i])
3074 			panic("%s: out of KVA space", __func__);
3075 		needed_kptp[i] = target_nptp - nkptp[i] + 1;
3076 	}
3077 
3078 
3079 	s = splhigh();	/* to be safe */
3080 	pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
3081 
3082 	/*
3083 	 * If the number of top level entries changed, update all
3084 	 * pmaps.
3085 	 */
3086 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
3087 		newpdes = nkptp[PTP_LEVELS - 1] - old;
3088 		mtx_enter(&pmaps_lock);
3089 		LIST_FOREACH(pm, &pmaps, pm_list) {
3090 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
3091 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
3092 			       newpdes * sizeof (pd_entry_t));
3093 		}
3094 		mtx_leave(&pmaps_lock);
3095 	}
3096 	pmap_maxkvaddr = maxkvaddr;
3097 	splx(s);
3098 
3099 	return maxkvaddr;
3100 }
3101 
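/*
 * pmap_steal_memory: early-boot allocator.  Carves whole pages off the
 * start or end of a physical segment (the only places that can be
 * "unused"), removes the segment if it becomes empty, and returns a
 * zeroed direct-map address for the stolen range.
 */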
3102 vaddr_t
3103 pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end)
3104 {
3105 	int segno;
3106 	u_int npg;
3107 	vaddr_t va;
3108 	paddr_t pa;
3109 	struct vm_physseg *seg;
3110 
3111 	size = round_page(size);
3112 	npg = atop(size);
3113 
3114 	for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) {
3115 		if (seg->avail_end - seg->avail_start < npg)
3116 			continue;
3117 		/*
3118 		 * We can only steal at an ``unused'' segment boundary,
3119 		 * i.e. either at the start or at the end.
3120 		 */
3121 		if (seg->avail_start == seg->start ||
3122 		    seg->avail_end == seg->end)
3123 			break;
3124 	}
3125 	if (segno == vm_nphysseg) {
3126 		panic("%s: out of memory", __func__);
3127 	} else {
3128 		if (seg->avail_start == seg->start) {
3129 			pa = ptoa(seg->avail_start);
3130 			seg->avail_start += npg;
3131 			seg->start += npg;
3132 		} else {
3133 			pa = ptoa(seg->avail_end) - size;
3134 			seg->avail_end -= npg;
3135 			seg->end -= npg;
3136 		}
3137 		/*
3138 		 * If the segment has now been entirely consumed, remove it.
3139 		 * Note that the crash dump code still knows about it
3140 		 * and will dump it correctly.
3141 		 */
3142 		if (seg->start == seg->end) {
3143 			if (vm_nphysseg-- == 1)
3144 				panic("%s: out of memory", __func__);
3145 			while (segno < vm_nphysseg) {
3146 				seg[0] = seg[1]; /* struct copy */
3147 				seg++;
3148 				segno++;
3149 			}
3150 		}
3151 
3152 		va = PMAP_DIRECT_MAP(pa);
3153 		memset((void *)va, 0, size);
3154 	}
3155 
3156 	if (start != NULL)
3157 		*start = virtual_avail;
3158 	if (end != NULL)
3159 		*end = VM_MAX_KERNEL_ADDRESS;
3160 
3161 	return (va);
3162 }
3163 
3164 #ifdef MULTIPROCESSOR
3165 /*
3166  * Locking for tlb shootdown.
3167  *
3168  * We lock by setting tlb_shoot_wait to the number of cpus that will
3169  * receive our tlb shootdown. After sending the IPIs, we don't need to
3170  * worry about locking order or interrupts spinning for the lock because
3171  * the call that grabs the "lock" isn't the one that releases it. And
3172  * there is nothing that can block the IPI that releases the lock.
3173  *
3174  * The functions are organized so that we first count the number of
3175  * cpus we need to send the IPI to, then we grab the counter, then
3176  * we send the IPIs, then we finally do our own shootdown.
3177  *
3178  * Our shootdown is last to make it parallel with the other cpus
3179  * to shorten the spin time.
3180  *
3181  * Notice that we depend on failures to send IPIs only being able to
3182  * happen during boot. If they happen later, the above assumption
3183  * doesn't hold since we can end up in situations where no one will
3184  * release the lock if we get an interrupt in a bad moment.
3185  */
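/*
 * Protocol sketch (sender side):
 *
 *	wait = number of remote CPUs that need the flush;
 *	s = pmap_start_tlb_shoot(wait, __func__);
 *		(atomically moves tlb_shoot_wait from 0 to wait)
 *	tlb_shoot_addr1 = va;	(plus addr2/first_pcid as needed)
 *	send LAPIC_IPI_INVLPG / INVLRANGE / INVLTLB to the chosen CPUs;
 *	splx(s);
 *	do the local invalidation;
 *	pmap_tlb_shootwait();
 *		(spins until the IPI handlers have counted tlb_shoot_wait
 *		 back down to zero)
 */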
3186 #ifdef MP_LOCKDEBUG
3187 #include <ddb/db_output.h>
3188 extern int __mp_lock_spinout;
3189 #endif
3190 
3191 volatile long tlb_shoot_wait __attribute__((section(".kudata")));
3192 
3193 volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
3194 volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));
3195 volatile int tlb_shoot_first_pcid __attribute__((section(".kudata")));
3196 
3197 #if NVMM > 0
3198 #include <amd64/vmmvar.h>
3199 volatile uint64_t ept_shoot_mode __attribute__((section(".kudata")));
3200 volatile struct vmx_invept_descriptor ept_shoot_vid
3201     __attribute__((section(".kudata")));
3202 #endif /* NVMM > 0 */
3203 
3204 /* Obtain the "lock" for TLB shooting */
3205 static inline int
3206 pmap_start_tlb_shoot(long wait, const char *func)
3207 {
3208 	int s = splvm();
3209 
3210 	while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
3211 #ifdef MP_LOCKDEBUG
3212 		int nticks = __mp_lock_spinout;
3213 #endif
3214 		while (tlb_shoot_wait != 0) {
3215 			CPU_BUSY_CYCLE();
3216 #ifdef MP_LOCKDEBUG
3217 			if (--nticks <= 0) {
3218 				db_printf("%s: spun out", func);
3219 				db_enter();
3220 				nticks = __mp_lock_spinout;
3221 			}
3222 #endif
3223 		}
3224 	}
3225 
3226 	return s;
3227 }
3228 
3229 void
3230 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
3231 {
3232 	struct cpu_info *ci, *self = curcpu();
3233 	CPU_INFO_ITERATOR cii;
3234 	long wait = 0;
3235 	u_int64_t mask = 0;
3236 	int is_kva = va >= VM_MIN_KERNEL_ADDRESS;
3237 
3238 	CPU_INFO_FOREACH(cii, ci) {
3239 		if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
3240 			continue;
3241 		if (!is_kva && !pmap_is_active(pm, ci))
3242 			continue;
3243 		mask |= (1ULL << ci->ci_cpuid);
3244 		wait++;
3245 	}
3246 
3247 	if (wait > 0) {
3248 		int s = pmap_start_tlb_shoot(wait, __func__);
3249 
3250 		tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
3251 		tlb_shoot_addr1 = va;
3252 		CPU_INFO_FOREACH(cii, ci) {
3253 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3254 				continue;
3255 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
3256 				panic("%s: ipi failed", __func__);
3257 		}
3258 		splx(s);
3259 	}
3260 
3261 	if (!pmap_use_pcid) {
3262 		if (shootself)
3263 			pmap_update_pg(va);
3264 	} else if (is_kva) {
3265 		invpcid(INVPCID_ADDR, PCID_PROC, va);
3266 		invpcid(INVPCID_ADDR, PCID_KERN, va);
3267 	} else if (shootself) {
3268 		invpcid(INVPCID_ADDR, PCID_PROC, va);
3269 		if (cpu_meltdown)
3270 			invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3271 	}
3272 }
3273 
3274 void
3275 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
3276 {
3277 	struct cpu_info *ci, *self = curcpu();
3278 	CPU_INFO_ITERATOR cii;
3279 	long wait = 0;
3280 	u_int64_t mask = 0;
3281 	int is_kva = sva >= VM_MIN_KERNEL_ADDRESS;
3282 	vaddr_t va;
3283 
3284 	CPU_INFO_FOREACH(cii, ci) {
3285 		if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
3286 			continue;
3287 		if (!is_kva && !pmap_is_active(pm, ci))
3288 			continue;
3289 		mask |= (1ULL << ci->ci_cpuid);
3290 		wait++;
3291 	}
3292 
3293 	if (wait > 0) {
3294 		int s = pmap_start_tlb_shoot(wait, __func__);
3295 
3296 		tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
3297 		tlb_shoot_addr1 = sva;
3298 		tlb_shoot_addr2 = eva;
3299 		CPU_INFO_FOREACH(cii, ci) {
3300 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3301 				continue;
3302 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
3303 				panic("%s: ipi failed", __func__);
3304 		}
3305 		splx(s);
3306 	}
3307 
3308 	if (!pmap_use_pcid) {
3309 		if (shootself) {
3310 			for (va = sva; va < eva; va += PAGE_SIZE)
3311 				pmap_update_pg(va);
3312 		}
3313 	} else if (is_kva) {
3314 		for (va = sva; va < eva; va += PAGE_SIZE) {
3315 			invpcid(INVPCID_ADDR, PCID_PROC, va);
3316 			invpcid(INVPCID_ADDR, PCID_KERN, va);
3317 		}
3318 	} else if (shootself) {
3319 		if (cpu_meltdown) {
3320 			for (va = sva; va < eva; va += PAGE_SIZE) {
3321 				invpcid(INVPCID_ADDR, PCID_PROC, va);
3322 				invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3323 			}
3324 		} else {
3325 			for (va = sva; va < eva; va += PAGE_SIZE)
3326 				invpcid(INVPCID_ADDR, PCID_PROC, va);
3327 		}
3328 	}
3329 }
3330 
3331 void
3332 pmap_tlb_shoottlb(struct pmap *pm, int shootself)
3333 {
3334 	struct cpu_info *ci, *self = curcpu();
3335 	CPU_INFO_ITERATOR cii;
3336 	long wait = 0;
3337 	u_int64_t mask = 0;
3338 
3339 	KASSERT(pm != pmap_kernel());
3340 
3341 	CPU_INFO_FOREACH(cii, ci) {
3342 		if (ci == self || !pmap_is_active(pm, ci) ||
3343 		    !(ci->ci_flags & CPUF_RUNNING))
3344 			continue;
3345 		mask |= (1ULL << ci->ci_cpuid);
3346 		wait++;
3347 	}
3348 
3349 	if (wait) {
3350 		int s = pmap_start_tlb_shoot(wait, __func__);
3351 		CPU_INFO_FOREACH(cii, ci) {
3352 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3353 				continue;
3354 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
3355 				panic("%s: ipi failed", __func__);
3356 		}
3357 		splx(s);
3358 	}
3359 
3360 	if (shootself) {
3361 		if (!pmap_use_pcid)
3362 			tlbflush();
3363 		else {
3364 			invpcid(INVPCID_PCID, PCID_PROC, 0);
3365 			if (cpu_meltdown)
3366 				invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
3367 		}
3368 	}
3369 }
3370 
3371 #if NVMM > 0
3372 /*
3373  * pmap_shootept: similar to pmap_tlb_shoottlb, but for remotely invalidating
3374  * EPT using invept.
3375  */
3376 void
3377 pmap_shootept(struct pmap *pm, int shootself)
3378 {
3379 	struct cpu_info *ci, *self = curcpu();
3380 	struct vmx_invept_descriptor vid;
3381 	CPU_INFO_ITERATOR cii;
3382 	long wait = 0;
3383 	u_int64_t mask = 0;
3384 
3385 	KASSERT(pmap_is_ept(pm));
3386 
3387 	CPU_INFO_FOREACH(cii, ci) {
3388 		if (ci == self || !pmap_is_active(pm, ci) ||
3389 		    !(ci->ci_flags & CPUF_RUNNING) ||
3390 		    !(ci->ci_flags & CPUF_VMM))
3391 			continue;
3392 		mask |= (1ULL << ci->ci_cpuid);
3393 		wait++;
3394 	}
3395 
3396 	if (wait) {
3397 		int s = pmap_start_tlb_shoot(wait, __func__);
3398 
3399 		ept_shoot_mode = self->ci_vmm_cap.vcc_vmx.vmx_invept_mode;
3400 		ept_shoot_vid.vid_eptp = pm->eptp;
3401 		ept_shoot_vid.vid_reserved = 0;
3402 
3403 		CPU_INFO_FOREACH(cii, ci) {
3404 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3405 				continue;
3406 			if (x86_fast_ipi(ci, LAPIC_IPI_INVEPT) != 0)
3407 				panic("%s: ipi failed", __func__);
3408 		}
3409 
3410 		splx(s);
3411 	}
3412 
3413 	if (shootself && (self->ci_flags & CPUF_VMM)) {
3414 		vid.vid_eptp = pm->eptp;
3415 		vid.vid_reserved = 0;
3416 		invept(self->ci_vmm_cap.vcc_vmx.vmx_invept_mode, &vid);
3417 	}
3418 }
3419 #endif /* NVMM > 0 */
3420 
3421 void
3422 pmap_tlb_shootwait(void)
3423 {
3424 #ifdef MP_LOCKDEBUG
3425 	int nticks = __mp_lock_spinout;
3426 #endif
3427 	while (tlb_shoot_wait != 0) {
3428 		CPU_BUSY_CYCLE();
3429 #ifdef MP_LOCKDEBUG
3430 		if (--nticks <= 0) {
3431 			db_printf("%s: spun out", __func__);
3432 			db_enter();
3433 			nticks = __mp_lock_spinout;
3434 		}
3435 #endif
3436 	}
3437 }
3438 
3439 #else /* MULTIPROCESSOR */
3440 
3441 void
3442 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
3443 {
3444 	if (!pmap_use_pcid) {
3445 		if (shootself)
3446 			pmap_update_pg(va);
3447 	} else if (va >= VM_MIN_KERNEL_ADDRESS) {
3448 		invpcid(INVPCID_ADDR, PCID_PROC, va);
3449 		invpcid(INVPCID_ADDR, PCID_KERN, va);
3450 	} else if (shootself) {
3451 		invpcid(INVPCID_ADDR, PCID_PROC, va);
3452 		if (cpu_meltdown)
3453 			invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3454 	}
3455 }
3456 
3457 void
3458 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
3459 {
3460 	vaddr_t va;
3461 
3462 	if (!pmap_use_pcid) {
3463 		if (shootself) {
3464 			for (va = sva; va < eva; va += PAGE_SIZE)
3465 				pmap_update_pg(va);
3466 		}
3467 	} else if (sva >= VM_MIN_KERNEL_ADDRESS) {
3468 		for (va = sva; va < eva; va += PAGE_SIZE) {
3469 			invpcid(INVPCID_ADDR, PCID_PROC, va);
3470 			invpcid(INVPCID_ADDR, PCID_KERN, va);
3471 		}
3472 	} else if (shootself) {
3473 		if (cpu_meltdown) {
3474 			for (va = sva; va < eva; va += PAGE_SIZE) {
3475 				invpcid(INVPCID_ADDR, PCID_PROC, va);
3476 				invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3477 			}
3478 		} else {
3479 			for (va = sva; va < eva; va += PAGE_SIZE)
3480 				invpcid(INVPCID_ADDR, PCID_PROC, va);
3481 		}
3482 	}
3483 }
3484 
3485 void
3486 pmap_tlb_shoottlb(struct pmap *pm, int shootself)
3487 {
3488 	if (shootself) {
3489 		if (!pmap_use_pcid)
3490 			tlbflush();
3491 		else {
3492 			invpcid(INVPCID_PCID, PCID_PROC, 0);
3493 			if (cpu_meltdown)
3494 				invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
3495 		}
3496 	}
3497 }
3498 
3499 #if NVMM > 0
3500 void
3501 pmap_shootept(struct pmap *pm, int shootself)
3502 {
3503 	struct cpu_info *self = curcpu();
3504 	struct vmx_invept_descriptor vid;
3505 
3506 	KASSERT(pmap_is_ept(pm));
3507 
3508 	if (shootself && (self->ci_flags & CPUF_VMM)) {
3509 		vid.vid_eptp = pm->eptp;
3510 		vid.vid_reserved = 0;
3511 		invept(self->ci_vmm_cap.vcc_vmx.vmx_invept_mode, &vid);
3512 	}
3513 }
3514 #endif /* NVMM > 0 */
3515 
3516 #endif /* MULTIPROCESSOR */
3517