1 /*	$OpenBSD: pmap.c,v 1.141 2020/12/16 21:11:35 bluhm Exp $	*/
2 /*	$NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $	*/
3 
4 /*
5  * Copyright (c) 1997 Charles D. Cranor and Washington University.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright 2001 (c) Wasabi Systems, Inc.
31  * All rights reserved.
32  *
33  * Written by Frank van der Linden for Wasabi Systems, Inc.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgement:
45  *      This product includes software developed for the NetBSD Project by
46  *      Wasabi Systems, Inc.
47  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
48  *    or promote products derived from this software without specific prior
49  *    written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
53  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
54  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
55  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
56  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
57  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
58  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
59  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
60  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
61  * POSSIBILITY OF SUCH DAMAGE.
62  */
63 
64 /*
65  * This is the i386 pmap modified and generalized to support x86-64
66  * as well. The idea is to hide the upper N levels of the page tables
67  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
68  * is mostly untouched, except that it uses some more generalized
69  * macros and interfaces.
70  *
71  * This pmap has been tested on the i386 as well, and it can be easily
72  * adapted to PAE.
73  *
74  * fvdl@wasabisystems.com 18-Jun-2001
75  */
76 
77 /*
78  * pmap.c: i386 pmap module rewrite
79  * Chuck Cranor <chuck@ccrc.wustl.edu>
80  * 11-Aug-97
81  *
82  * history of this pmap module: in addition to my own input, i used
83  *    the following references for this rewrite of the i386 pmap:
84  *
85  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
86  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
87  *     it was then ported to the i386 by William Jolitz of UUNET
88  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
89  *     project fixed some bugs and provided some speed ups.
90  *
91  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
92  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
93  *     and David Greenman.
94  *
95  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
96  *     between several processors.   the VAX version was done by
97  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
98  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
99  *     David Golub, and Richard Draves.    the alpha version was
100  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
101  *     (NetBSD/alpha).
102  */
103 
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/atomic.h>
107 #include <sys/proc.h>
108 #include <sys/pool.h>
109 #include <sys/user.h>
110 #include <sys/mutex.h>
111 
112 #include <uvm/uvm.h>
113 
114 #include <machine/cpu.h>
115 #ifdef MULTIPROCESSOR
116 #include <machine/i82489reg.h>
117 #include <machine/i82489var.h>
118 #endif
119 
120 #include "vmm.h"
121 
122 #if NVMM > 0
123 #include <machine/vmmvar.h>
124 #endif /* NVMM > 0 */
125 
126 #include "acpi.h"
127 
128 /* #define PMAP_DEBUG */
129 
130 #ifdef PMAP_DEBUG
131 #define DPRINTF(x...)   do { printf(x); } while(0)
132 #else
133 #define DPRINTF(x...)
134 #endif /* PMAP_DEBUG */
135 
136 
137 /*
138  * general info:
139  *
140  *  - for an explanation of how the i386 MMU hardware works see
141  *    the comments in <machine/pte.h>.
142  *
143  *  - for an explanation of the general memory structure used by
144  *    this pmap (including the recursive mapping), see the comments
145  *    in <machine/pmap.h>.
146  *
147  * this file contains the code for the "pmap module."   the module's
148  * job is to manage the hardware's virtual to physical address mappings.
149  * note that there are two levels of mapping in the VM system:
150  *
151  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
152  *      to map ranges of virtual address space to objects/files.  for
153  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
154  *      to the file /bin/ls starting at offset zero."   note that
155  *      the upper layer mapping is not concerned with how individual
156  *      vm_pages are mapped.
157  *
158  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
159  *      from virtual addresses.   it is concerned with which vm_page is
160  *      mapped where.   for example, when you run /bin/ls and start
161  *      at page 0x1000 the fault routine may look up the correct page
162  *      of the /bin/ls file and then ask the pmap layer to establish
163  *      a mapping for it.
164  *
165  * note that information in the lower layer of the VM system can be
166  * thrown away since it can easily be reconstructed from the info
167  * in the upper layer.
168  *
169  * data structures we use include:
170  *  - struct pmap: describes the address space of one process
171  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
172  *  - struct pg_to_free: a list of PTP pages queued to be freed once
173  *	the TLB shootdown that covers them has completed.
174  */
175 
176 /*
177  * memory allocation
178  *
179  *  - there are three data structures that we must dynamically allocate:
180  *
181  * [A] new process' page directory page (PDP)
182  *	- plan 1: done at pmap_create() we use
183  *	  pool_get(&pmap_pmap_pool, PR_WAITOK) to do this allocation.
184  *
185  * if we are low in free physical memory then we sleep in
186  * pool_get() -- in this case this is ok since we are creating
187  * a new pmap and should not be holding any locks.
188  *
189  * XXX: the fork code currently has no way to return an "out of
190  * memory, try again" error code since uvm_fork [fka vm_fork]
191  * is a void function.
192  *
193  * [B] new page tables pages (PTP)
194  *	call uvm_pagealloc()
195  *		=> success: zero page, add to pm_pdir
196  *		=> failure: we are out of free vm_pages, let pmap_enter()
197  *		   tell UVM about it.
198  *
199  * note: for kernel PTPs, we start with NKPTP of them.   as we map
200  * kernel memory (at uvm_map time) we check to see if we've grown
201  * the kernel pmap.   if so, we call the optional function
202  * pmap_growkernel() to grow the kernel PTPs in advance.
203  *
204  * [C] pv_entry structures
205  *	- try to allocate one from the pool.
206  *	If we fail, we simply let pmap_enter() tell UVM about it.
207  */
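/*
 * Illustrative sketch (not part of the build) of the pv_entry strategy in
 * [C] above: grab one from pmap_pv_pool without sleeping and hand any
 * failure back to the caller, roughly the pattern pmap_enter() follows;
 * the PMAP_CANFAIL handling shown is a simplification.
 *
 *	struct pv_entry *pve;
 *
 *	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
 *	if (pve == NULL) {
 *		if (flags & PMAP_CANFAIL)
 *			return ENOMEM;		so UVM can retry later
 *		panic("out of pv entries");
 *	}
 */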
208 
209 long nkptp[] = NKPTP_INITIALIZER;
210 
211 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
212 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
213 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
214 const long nbpd[] = NBPD_INITIALIZER;
215 pd_entry_t *const normal_pdes[] = PDES_INITIALIZER;
216 
217 #define pmap_pte_set(p, n)		atomic_swap_64(p, n)
218 #define pmap_pte_clearbits(p, b)	x86_atomic_clearbits_u64(p, b)
219 #define pmap_pte_setbits(p, b)		x86_atomic_setbits_u64(p, b)
220 
221 /*
222  * global data structures
223  */
224 
225 struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
226 
227 /*
228  * pmap_pg_wc: if our processor supports PAT then we set this
229  * to be the pte bits for Write Combining. Else we fall back to
230  * UC- so MTRRs can override the cacheability.
231  */
232 int pmap_pg_wc = PG_UCMINUS;
233 
234 /*
235  * pmap_use_pcid: nonzero if PCID use is enabled (currently we require INVPCID)
236  *
237  * The next three are zero unless and until PCID support is enabled so code
238  * can just 'or' them in as needed without tests.
239  * cr3_reuse_pcid: CR3_REUSE_PCID
240  * cr3_pcid_proc and cr3_pcid_temp: PCID_PROC and PCID_TEMP
241  */
242 #if PCID_KERN != 0
243 # error "pmap.c assumes PCID_KERN is zero"
244 #endif
245 int pmap_use_pcid;
246 static u_int cr3_pcid_proc;
247 static u_int cr3_pcid_temp;
248 /* these two are accessed from locore.o */
249 paddr_t cr3_reuse_pcid;
250 paddr_t cr3_pcid_proc_intel;
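/*
 * Example of the zero-by-default convention (a condensed form of what
 * pmap_activate() and pmap_map_ptes() below do): the same expression is
 * correct whether or not PCID is in use, because all of these values are
 * zero until pmap_use_pcid is set.
 *
 *	cr3 = pmap->pm_pdirpa;
 *	cr3 |= (pmap != pmap_kernel()) ? cr3_pcid_proc :
 *	    (PCID_KERN | cr3_reuse_pcid);
 */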
251 
252 /*
253  * other data structures
254  */
255 
256 pt_entry_t protection_codes[8];     /* maps MI prot to i386 prot code */
257 int pmap_initialized = 0;	    /* pmap_init done yet? */
258 
259 /*
260  * pv management structures.
261  */
262 struct pool pmap_pv_pool;
263 
264 /*
265  * linked list of all non-kernel pmaps
266  */
267 
268 struct pmap_head pmaps;
269 
270 /*
271  * pool that pmap structures are allocated from
272  */
273 
274 struct pool pmap_pmap_pool;
275 
276 /*
277  * When we're freeing a ptp, we need to delay the freeing until all
278  * tlb shootdown has been done. This is the list of the to-be-freed pages.
279  */
280 TAILQ_HEAD(pg_to_free, vm_page);
281 
282 /*
283  * pool that PDPs are allocated from
284  */
285 
286 struct pool pmap_pdp_pool;
287 void pmap_pdp_ctor(pd_entry_t *);
288 void pmap_pdp_ctor_intel(pd_entry_t *);
289 
290 extern vaddr_t msgbuf_vaddr;
291 extern paddr_t msgbuf_paddr;
292 
293 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
294 extern paddr_t idt_paddr;
295 
296 extern vaddr_t lo32_vaddr;
297 extern vaddr_t lo32_paddr;
298 
299 vaddr_t virtual_avail;
300 extern int end;
301 
302 /*
303  * local prototypes
304  */
305 
306 void pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *,
307     vaddr_t, struct vm_page *);
308 struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t);
309 struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
310 int pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs);
311 void pmap_free_ptp(struct pmap *, struct vm_page *,
312     vaddr_t, struct pg_to_free *);
313 void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *);
314 #ifdef MULTIPROCESSOR
315 static int pmap_is_active(struct pmap *, int);
316 #endif
317 paddr_t pmap_map_ptes(struct pmap *);
318 struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t);
319 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
320 void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
321 void pmap_do_remove_ept(struct pmap *, vaddr_t);
322 int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
323 int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
324     vaddr_t, int, struct pv_entry **);
325 void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t,
326     vaddr_t, vaddr_t, int, struct pv_entry **);
327 #define PMAP_REMOVE_ALL		0	/* remove all mappings */
328 #define PMAP_REMOVE_SKIPWIRED	1	/* skip wired mappings */
329 
330 void pmap_unmap_ptes(struct pmap *, paddr_t);
331 int pmap_get_physpage(vaddr_t, int, paddr_t *);
332 int pmap_pdes_valid(vaddr_t, pd_entry_t *);
333 void pmap_alloc_level(vaddr_t, int, long *);
334 
335 static inline
336 void pmap_sync_flags_pte(struct vm_page *, u_long);
337 
338 void pmap_tlb_shootpage(struct pmap *, vaddr_t, int);
339 void pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t, int);
340 void pmap_tlb_shoottlb(struct pmap *, int);
341 #ifdef MULTIPROCESSOR
342 void pmap_tlb_shootwait(void);
343 #else
344 #define	pmap_tlb_shootwait()
345 #endif
346 
347 /*
348  * p m a p   i n l i n e   h e l p e r   f u n c t i o n s
349  */
350 
351 /*
352  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
353  *		of course the kernel is always loaded
354  */
355 
356 static __inline int
357 pmap_is_curpmap(struct pmap *pmap)
358 {
359 	return((pmap == pmap_kernel()) ||
360 	       (pmap->pm_pdirpa == (rcr3() & CR3_PADDR)));
361 }
362 
363 /*
364  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
365  */
366 
367 #ifdef MULTIPROCESSOR
368 static __inline int
369 pmap_is_active(struct pmap *pmap, int cpu_id)
370 {
371 	return (pmap == pmap_kernel() ||
372 	    (pmap->pm_cpus & (1ULL << cpu_id)) != 0);
373 }
374 #endif
375 
376 static __inline u_int
377 pmap_pte2flags(u_long pte)
378 {
379 	return (((pte & PG_U) ? PG_PMAP_REF : 0) |
380 	    ((pte & PG_M) ? PG_PMAP_MOD : 0));
381 }
382 
383 static inline void
384 pmap_sync_flags_pte(struct vm_page *pg, u_long pte)
385 {
386 	if (pte & (PG_U|PG_M)) {
387 		atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
388 	}
389 }
390 
391 /*
392  * pmap_map_ptes: map a pmap's PTEs into KVM
393  *
394  * This should not be done for EPT pmaps
395  */
396 paddr_t
397 pmap_map_ptes(struct pmap *pmap)
398 {
399 	paddr_t cr3;
400 
401 	KASSERT(pmap->pm_type != PMAP_TYPE_EPT);
402 
403 	/* the kernel's pmap is always accessible */
404 	if (pmap == pmap_kernel())
405 		return 0;
406 
407 	/*
408 	 * Lock the target map before switching to its page tables to
409 	 * guarantee other CPUs have finished changing the tables before
410 	 * we potentially start caching table and TLB entries.
411 	 */
412 	mtx_enter(&pmap->pm_mtx);
413 
414 	cr3 = rcr3();
415 	KASSERT((cr3 & CR3_PCID) == PCID_KERN ||
416 		(cr3 & CR3_PCID) == PCID_PROC);
417 	if (pmap->pm_pdirpa == (cr3 & CR3_PADDR))
418 		cr3 = 0;
419 	else {
420 		cr3 |= cr3_reuse_pcid;
421 		lcr3(pmap->pm_pdirpa | cr3_pcid_temp);
422 	}
423 
424 	return cr3;
425 }
426 
427 void
428 pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
429 {
430 	if (pmap != pmap_kernel())
431 		mtx_leave(&pmap->pm_mtx);
432 
433 	if (save_cr3 != 0)
434 		lcr3(save_cr3);
435 }
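/*
 * Typical calling pattern (a condensed sketch of pmap_do_remove() below):
 *
 *	paddr_t scr3;
 *
 *	scr3 = pmap_map_ptes(pmap);	locks pmap, may switch %cr3
 *	shootself = (scr3 == 0);	zero: pmap was already current
 *	... modify PTEs through PTE_BASE[pl1_i(va)] ...
 *	pmap_tlb_shootpage(pmap, va, shootself);
 *	pmap_unmap_ptes(pmap, scr3);	unlocks pmap, restores %cr3
 *	pmap_tlb_shootwait();
 */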
436 
437 int
438 pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs)
439 {
440 	u_long mask, shift;
441 	pd_entry_t pde;
442 	paddr_t pdpa;
443 	int lev;
444 
445 	pdpa = pm->pm_pdirpa;
446 	shift = L4_SHIFT;
447 	mask = L4_MASK;
448 	for (lev = PTP_LEVELS; lev > 0; lev--) {
449 		*pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa);
450 		*offs = (VA_SIGN_POS(va) & mask) >> shift;
451 		pde = (*pd)[*offs];
452 
453 		/* Large pages are different, break early if we run into one. */
454 		if ((pde & (PG_PS|PG_V)) != PG_V)
455 			return (lev - 1);
456 
457 		pdpa = ((*pd)[*offs] & PG_FRAME);
458 		/* 4096/8 == 512 == 2^9 entries per level */
459 		shift -= 9;
460 		mask >>= 9;
461 	}
462 
463 	return (0);
464 }
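/*
 * Usage sketch (essentially what pmap_extract() below does): a return
 * value of 0 means ptes[offs] is the level 1 PTE for va; a nonzero
 * return is the level at which the walk stopped, e.g. at a large page
 * or a missing PDE.
 *
 *	pt_entry_t *ptes;
 *	int level, offs;
 *
 *	level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
 *	if (level == 0 && pmap_valid_entry(ptes[offs]))
 *		pa = (ptes[offs] & PG_FRAME) | (va & PAGE_MASK);
 */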
465 
466 /*
467  * p m a p   k e n t e r   f u n c t i o n s
468  *
469  * functions to quickly enter/remove pages from the kernel address
470  * space.   pmap_kremove is exported to MI kernel.  we make use of
471  * the recursive PTE mappings.
472  */
473 
474 /*
475  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
476  *
477  * => no need to lock anything, assume va is already allocated
478  * => should be faster than normal pmap enter function
479  */
480 
481 void
482 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
483 {
484 	pt_entry_t *pte, opte, npte;
485 
486 	pte = kvtopte(va);
487 
488 	npte = (pa & PMAP_PA_MASK) | ((prot & PROT_WRITE) ? PG_RW : PG_RO) |
489 	    ((pa & PMAP_NOCACHE) ? PG_N : 0) |
490 	    ((pa & PMAP_WC) ? pmap_pg_wc : 0) | PG_V;
491 
492 	/* special 1:1 mappings in the first 2MB must not be global */
493 	if (va >= (vaddr_t)NBPD_L2)
494 		npte |= pg_g_kern;
495 
496 	if (!(prot & PROT_EXEC))
497 		npte |= pg_nx;
498 	opte = pmap_pte_set(pte, npte);
499 #ifdef LARGEPAGES
500 	/* XXX For now... */
501 	if (opte & PG_PS)
502 		panic("%s: PG_PS", __func__);
503 #endif
504 	if (pmap_valid_entry(opte)) {
505 		if (pa & PMAP_NOCACHE && (opte & PG_N) == 0)
506 			wbinvd_on_all_cpus();
507 		/* Replacing a valid entry shouldn't happen; flush to be safe */
508 		pmap_tlb_shootpage(pmap_kernel(), va, 1);
509 		pmap_tlb_shootwait();
510 	}
511 }
512 
513 /*
514  * pmap_kremove: remove kernel mapping(s) without R/M (pv_entry) tracking
515  *
516  * => no need to lock anything
517  * => caller must dispose of any vm_page mapped in the va range
518  * => note: not an inline function
519  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
520  * => we assume kernel only unmaps valid addresses and thus don't bother
521  *    checking the valid bit before doing TLB flushing
522  */
523 
524 void
525 pmap_kremove(vaddr_t sva, vsize_t len)
526 {
527 	pt_entry_t *pte, opte;
528 	vaddr_t va, eva;
529 
530 	eva = sva + len;
531 
532 	for (va = sva; va != eva; va += PAGE_SIZE) {
533 		pte = kvtopte(va);
534 
535 		opte = pmap_pte_set(pte, 0);
536 #ifdef LARGEPAGES
537 		KASSERT((opte & PG_PS) == 0);
538 #endif
539 		KASSERT((opte & PG_PVLIST) == 0);
540 	}
541 
542 	pmap_tlb_shootrange(pmap_kernel(), sva, eva, 1);
543 	pmap_tlb_shootwait();
544 }
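/*
 * Example (sketch): enter an uncached mapping for a page of device
 * registers at an already-allocated kernel VA, then tear it down again.
 * Note that the PMAP_NOCACHE flag travels in the low bits of 'pa'.
 *
 *	pmap_kenter_pa(va, pa | PMAP_NOCACHE, PROT_READ | PROT_WRITE);
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 */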
545 
546 /*
547  * pmap_set_pml4_early
548  *
549  * Utility function to map 2GB of 2MB pages covering 'pa'. The VA assigned
550  * comes from the pml4 slot reserved for 'early mappings' (see pmap.h). This
551  * function is used by display drivers that need to map their framebuffers
552  * early, before the pmap is fully initialized (e.g. to show panic messages).
553  *
554  * Users of this function must call pmap_clear_pml4_early to remove the
555  * mapping when finished.
556  *
557  * Parameters:
558  *  pa: phys addr to map
559  *
560  * Return value:
561  *  VA mapping to 'pa'. This mapping is 2GB in size and starts at the base
562  *   of the 2MB region containing 'pa'.
563  */
564 vaddr_t
565 pmap_set_pml4_early(paddr_t pa)
566 {
567 	extern paddr_t early_pte_pages;
568 	pt_entry_t *pml4e, *pte;
569 	int i, j, off;
570 	paddr_t curpa;
571 	vaddr_t va;
572 
573 	pml4e = (pt_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
574 	pml4e[PDIR_SLOT_EARLY] = (pd_entry_t)early_pte_pages | PG_V | PG_RW;
575 
576 	off = pa & PAGE_MASK_L2;
577 	curpa = pa & L2_FRAME;
578 
579 	pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
580 	memset(pte, 0, 3 * NBPG);
581 
582 	pte[0] = (early_pte_pages + NBPG) | PG_V | PG_RW;
583 	pte[1] = (early_pte_pages + 2 * NBPG) | PG_V | PG_RW;
584 
585 	pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages + NBPG);
586 	for (i = 0; i < 2; i++) {
587 		/* 2 early pages of mappings */
588 		for (j = 0; j < 512; j++) {
589 			/* j[0..511] : 2MB mappings per page */
590 			pte[(i * 512) + j] = curpa | PG_V | PG_RW | PG_PS;
591 			curpa += (2 * 1024 * 1024);
592 		}
593 	}
594 
595 	va = (vaddr_t)((PDIR_SLOT_EARLY * 512ULL) << L3_SHIFT) + off;
596 	return VA_SIGN_NEG(va);
597 }
598 
599 /*
600  * pmap_clear_pml4_early
601  *
602  * Clears the mapping previously established with pmap_set_pml4_early.
603  */
604 void
605 pmap_clear_pml4_early(void)
606 {
607 	extern paddr_t early_pte_pages;
608 	pt_entry_t *pml4e, *pte;
609 
610 	pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
611 	memset(pte, 0, 3 * NBPG);
612 
613 	pml4e = (pd_entry_t *)pmap_kernel()->pm_pdir;
614 	pml4e[PDIR_SLOT_EARLY] = 0;
615 	tlbflush();
616 }
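/*
 * Example (sketch) of the intended use by an early display driver;
 * 'fb_paddr' is a hypothetical framebuffer physical address:
 *
 *	vaddr_t fb;
 *
 *	fb = pmap_set_pml4_early(fb_paddr);
 *	... draw into (void *)fb ...
 *	pmap_clear_pml4_early();
 */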
617 
618 /*
619  * p m a p   i n i t   f u n c t i o n s
620  *
621  * pmap_bootstrap and pmap_init are called during system startup
622  * to init the pmap module.   pmap_bootstrap() does a low level
623  * init just to get things rolling.   pmap_init() finishes the job.
624  */
625 
626 /*
627  * pmap_bootstrap: get the system in a state where it can run with VM
628  *	properly enabled (called before main()).   the VM system is
629  *      fully init'd later...
630  */
631 
632 paddr_t
633 pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
634 {
635 	vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS;
636 	struct pmap *kpm;
637 	int curslot, i, j, p;
638 	long ndmpdp;
639 	paddr_t dmpd, dmpdp, start_cur, cur_pa;
640 	vaddr_t kva, kva_end;
641 	pt_entry_t *pml3, *pml2;
642 
643 	/*
644 	 * define the boundaries of the managed kernel virtual address
645 	 * space.
646 	 */
647 
648 	virtual_avail = kva_start;		/* first free KVA */
649 
650 	/*
651 	 * set up protection_codes: we need to be able to convert from
652 	 * a MI protection code (some combo of VM_PROT...) to something
653 	 * we can jam into a i386 PTE.
654 	 */
655 
656 	protection_codes[PROT_NONE] = pg_nx;			/* --- */
657 	protection_codes[PROT_EXEC] = PG_RO;			/* --x */
658 	protection_codes[PROT_READ] = PG_RO | pg_nx;		/* -r- */
659 	protection_codes[PROT_READ | PROT_EXEC] = PG_RO;	/* -rx */
660 	protection_codes[PROT_WRITE] = PG_RW | pg_nx;		/* w-- */
661 	protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW;	/* w-x */
662 	protection_codes[PROT_WRITE | PROT_READ] = PG_RW | pg_nx; /* wr- */
663 	protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW;	/* wrx */
664 
665 	/*
666 	 * now we init the kernel's pmap
667 	 *
668 	 * the kernel pmap's pm_obj is not used for much.   however, in
669 	 * user pmaps the pm_obj contains the list of active PTPs.
670 	 * the pm_obj currently does not have a pager.
671 	 */
672 
673 	kpm = pmap_kernel();
674 	for (i = 0; i < PTP_LEVELS - 1; i++) {
675 		uvm_objinit(&kpm->pm_obj[i], NULL, 1);
676 		kpm->pm_ptphint[i] = NULL;
677 	}
678 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
679 	kpm->pm_pdir = (pd_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
680 	kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
681 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
682 		atop(kva_start - VM_MIN_KERNEL_ADDRESS);
683 	/*
684 	 * the above is just a rough estimate and not critical to the proper
685 	 * operation of the system.
686 	 */
687 
688 	kpm->pm_type = PMAP_TYPE_NORMAL;
689 
690 	curpcb->pcb_pmap = kpm;	/* proc0's pcb */
691 
692 	/*
693 	 * Configure and enable PCID use if supported.
694 	 * Currently we require INVPCID support.
695 	 */
696 	if ((cpu_ecxfeature & CPUIDECX_PCID) && cpuid_level >= 0x07) {
697 		uint32_t ebx, dummy;
698 		CPUID_LEAF(0x7, 0, dummy, ebx, dummy, dummy);
699 		if (ebx & SEFF0EBX_INVPCID) {
700 			pmap_use_pcid = 1;
701 			lcr4( rcr4() | CR4_PCIDE );
702 			cr3_pcid_proc = PCID_PROC;
703 			cr3_pcid_temp = PCID_TEMP;
704 			cr3_reuse_pcid = CR3_REUSE_PCID;
705 			cr3_pcid_proc_intel = PCID_PROC_INTEL;
706 		}
707 	}
708 
709 	/*
710 	 * Add PG_G attribute to already mapped kernel pages. pg_g_kern
711 	 * is calculated in locore0.S and may be set to:
712 	 *
713 	 * 0 if this CPU does not safely support global pages in the kernel
714 	 *  (Intel/Meltdown)
715 	 * PG_G if this CPU does safely support global pages in the kernel
716 	 *  (AMD)
717 	 */
718 #if KERNBASE == VM_MIN_KERNEL_ADDRESS
719 	for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
720 #else
721 	kva_end = roundup((vaddr_t)&end, PAGE_SIZE);
722 	for (kva = KERNBASE; kva < kva_end ;
723 #endif
724 	     kva += PAGE_SIZE) {
725 		unsigned long p1i = pl1_i(kva);
726 		if (pmap_valid_entry(PTE_BASE[p1i]))
727 			PTE_BASE[p1i] |= pg_g_kern;
728 	}
729 
730 	/*
731 	 * Map the direct map. The first 4GB were mapped in locore, here
732 	 * we map the rest if it exists. We actually use the direct map
733 	 * here to set up the page tables; we're assuming that we're still
734 	 * operating in the lower 4GB of memory.
735 	 *
736 	 * Map (up to) the first 512GB of physical memory first. This part
737 	 * is handled differently than physical memory > 512GB since we have
738 	 * already mapped part of this range in locore0.
739 	 */
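	/*
	 * Worked example (illustrative): with max_pa = 16GB,
	 * ndmpdp = (16GB + NBPD_L3 - 1) >> L3_SHIFT = 16, i.e. sixteen
	 * 1GB slots in the direct map's L3 page, each backed below by
	 * 512 2MB (NBPD_L2) PG_PS mappings; anything at or below 4GB
	 * is rounded up to NDML2_ENTRIES slots ("at least 4GB").
	 */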
740 	ndmpdp = (max_pa + NBPD_L3 - 1) >> L3_SHIFT;
741 	if (ndmpdp < NDML2_ENTRIES)
742 		ndmpdp = NDML2_ENTRIES;		/* At least 4GB */
743 	if (ndmpdp > 512)
744 		ndmpdp = 512;			/* At most 512GB */
745 
746 	dmpdp = kpm->pm_pdir[PDIR_SLOT_DIRECT] & PG_FRAME;
747 
748 	dmpd = first_avail; first_avail += ndmpdp * PAGE_SIZE;
749 
750 	for (i = NDML2_ENTRIES; i < NPDPG * ndmpdp; i++) {
751 		paddr_t pdp;
752 		vaddr_t va;
753 
754 		pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
755 		va = PMAP_DIRECT_MAP(pdp);
756 
757 		*((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT);
758 		*((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | pg_g_kern | PG_U |
759 		    PG_M | pg_nx;
760 	}
761 
762 	for (i = NDML2_ENTRIES; i < ndmpdp; i++) {
763 		paddr_t pdp;
764 		vaddr_t va;
765 
766 		pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
767 		va = PMAP_DIRECT_MAP(pdp);
768 
769 		*((pd_entry_t *)va) = dmpd + (i << PAGE_SHIFT);
770 		*((pd_entry_t *)va) |= PG_RW | PG_V | PG_U | PG_M | pg_nx;
771 	}
772 
773 	kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_V | PG_KW | PG_U |
774 	    PG_M | pg_nx;
775 
776 	/* Map any remaining physical memory > 512GB */
777 	for (curslot = 1 ; curslot < NUM_L4_SLOT_DIRECT ; curslot++) {
778 		/*
779 		 * Start of current range starts at PA (curslot) * 512GB
780 		 */
781 		start_cur = (paddr_t)(curslot * NBPD_L4);
782 		if (max_pa > start_cur) {
783 			/* Next 512GB, new PML4e and L3(512GB) page */
784 			dmpd = first_avail; first_avail += PAGE_SIZE;
785 			pml3 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
786 			kpm->pm_pdir[PDIR_SLOT_DIRECT + curslot] = dmpd |
787 			    PG_KW | PG_V | PG_U | PG_M | pg_nx;
788 
789 			/* Calculate full 1GB pages in this 512GB region */
790 			p = ((max_pa - start_cur) >> L3_SHIFT);
791 
792 			/* Check if a partial (<1GB) page remains */
793 			if (max_pa & L2_MASK)
794 				p++;
795 
796 			/*
797 			 * Handle the case where this range is full and there
798 			 * is still more memory after (p would be > 512).
799 			 */
800 			if (p > NPDPG)
801 				p = NPDPG;
802 
803 			/* Allocate 'p' L2(1GB) pages and populate */
804 			for (i = 0; i < p; i++) {
805 				dmpd = first_avail; first_avail += PAGE_SIZE;
806 				pml2 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
807 				pml3[i] = dmpd |
808 				    PG_RW | PG_V | PG_U | PG_M | pg_nx;
809 
810 				cur_pa = start_cur + (i << L3_SHIFT);
811 				j = 0;
812 
813 				while (cur_pa < max_pa && j < NPDPG) {
814 					pml2[j] = curslot * NBPD_L4 +
815 					    (uint64_t)i * NBPD_L3 +
816 					    (uint64_t)j * NBPD_L2;
817 					pml2[j] |= PG_RW | PG_V | pg_g_kern |
818 					    PG_U | PG_M | pg_nx | PG_PS;
819 					cur_pa += NBPD_L2;
820 					j++;
821 				}
822 			}
823 		}
824 	}
825 
826 	tlbflush();
827 
828 	msgbuf_vaddr = virtual_avail;
829 	virtual_avail += round_page(MSGBUFSIZE);
830 
831 	idt_vaddr = virtual_avail;
832 	virtual_avail += 2 * PAGE_SIZE;
833 	idt_paddr = first_avail;			/* steal a page */
834 	first_avail += 2 * PAGE_SIZE;
835 
836 #if defined(MULTIPROCESSOR) || \
837     (NACPI > 0 && !defined(SMALL_KERNEL))
838 	/*
839 	 * Grab a page below 4G for things that need it (i.e.
840 	 * having an initial %cr3 for the MP trampoline).
841 	 */
842 	lo32_vaddr = virtual_avail;
843 	virtual_avail += PAGE_SIZE;
844 	lo32_paddr = first_avail;
845 	first_avail += PAGE_SIZE;
846 #endif
847 
848 	/*
849 	 * init the global lists.
850 	 */
851 	LIST_INIT(&pmaps);
852 
853 	/*
854 	 * initialize the pmap pools.
855 	 */
856 
857 	pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_VM, 0,
858 	    "pmappl", NULL);
859 	pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, IPL_VM, 0,
860 	    "pvpl", &pool_allocator_single);
861 	pool_sethiwat(&pmap_pv_pool, 32 * 1024);
862 
863 	/*
864 	 * initialize the PDE pool.
865 	 */
866 
867 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_VM, 0,
868 	    "pdppl", &pool_allocator_single);
869 
870 	kpm->pm_pdir_intel = NULL;
871 	kpm->pm_pdirpa_intel = 0;
872 
873 	/*
874 	 * ensure the TLB is sync'd with reality by flushing it...
875 	 */
876 
877 	tlbflush();
878 
879 	return first_avail;
880 }
881 
882 /*
883  * pmap_randomize
884  *
885  * Randomizes the location of the kernel pmap
886  */
887 void
888 pmap_randomize(void)
889 {
890 	pd_entry_t *pml4va, *oldpml4va;
891 	paddr_t pml4pa;
892 	int i;
893 
894 	pml4va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait);
895 	if (pml4va == NULL)
896 		panic("%s: km_alloc failed", __func__);
897 
898 	/* Copy old PML4 page to new one */
899 	oldpml4va = pmap_kernel()->pm_pdir;
900 	memcpy(pml4va, oldpml4va, PAGE_SIZE);
901 
902 	/* Switch to new PML4 */
903 	pmap_extract(pmap_kernel(), (vaddr_t)pml4va, &pml4pa);
904 	lcr3(pml4pa);
905 
906 	/* Fixup pmap_kernel and proc0's %cr3 */
907 	pmap_kernel()->pm_pdirpa = pml4pa;
908 	pmap_kernel()->pm_pdir = pml4va;
909 	proc0.p_addr->u_pcb.pcb_cr3 = pml4pa;
910 
911 	/* Fixup recursive PTE PML4E slot. We are only changing the PA */
912 	pml4va[PDIR_SLOT_PTE] = pml4pa | (pml4va[PDIR_SLOT_PTE] & ~PG_FRAME);
913 
914 	for (i = 0; i < NPDPG; i++) {
915 		/* PTE slot already handled earlier */
916 		if (i == PDIR_SLOT_PTE)
917 			continue;
918 
919 		if (pml4va[i] & PG_FRAME)
920 			pmap_randomize_level(&pml4va[i], 3);
921 	}
922 
923 	/* Wipe out bootstrap PML4 */
924 	memset(oldpml4va, 0, PAGE_SIZE);
925 	tlbflush();
926 }
927 
928 void
929 pmap_randomize_level(pd_entry_t *pde, int level)
930 {
931 	pd_entry_t *new_pd_va;
932 	paddr_t old_pd_pa, new_pd_pa;
933 	vaddr_t old_pd_va;
934 	struct vm_page *pg;
935 	int i;
936 
937 	if (level == 0)
938 		return;
939 
940 	if (level < PTP_LEVELS - 1 && (*pde & PG_PS))
941 		return;
942 
943 	new_pd_va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait);
944 	if (new_pd_va == NULL)
945 		panic("%s: cannot allocate page for L%d page directory",
946 		    __func__, level);
947 
948 	old_pd_pa = *pde & PG_FRAME;
949 	old_pd_va = PMAP_DIRECT_MAP(old_pd_pa);
950 	pmap_extract(pmap_kernel(), (vaddr_t)new_pd_va, &new_pd_pa);
951 	memcpy(new_pd_va, (void *)old_pd_va, PAGE_SIZE);
952 	*pde = new_pd_pa | (*pde & ~PG_FRAME);
953 
954 	tlbflush();
955 	memset((void *)old_pd_va, 0, PAGE_SIZE);
956 
957 	pg = PHYS_TO_VM_PAGE(old_pd_pa);
958 	if (pg != NULL) {
959 		pg->wire_count--;
960 		pmap_kernel()->pm_stats.resident_count--;
961 		if (pg->wire_count <= 1)
962 			uvm_pagefree(pg);
963 	}
964 
965 	for (i = 0; i < NPDPG; i++)
966 		if (new_pd_va[i] & PG_FRAME)
967 			pmap_randomize_level(&new_pd_va[i], level - 1);
968 }
969 
970 /*
971  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
972  * trampoline code can be entered.
973  */
974 paddr_t
975 pmap_prealloc_lowmem_ptps(paddr_t first_avail)
976 {
977 	pd_entry_t *pdes;
978 	int level;
979 	paddr_t newp;
980 
981 	pdes = pmap_kernel()->pm_pdir;
982 	level = PTP_LEVELS;
983 	for (;;) {
984 		newp = first_avail; first_avail += PAGE_SIZE;
985 		memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
986 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
987 		level--;
988 		if (level <= 1)
989 			break;
990 		pdes = normal_pdes[level - 2];
991 	}
992 
993 	return first_avail;
994 }
995 
996 /*
997  * pmap_init: no further initialization required on this platform
998  */
999 void
1000 pmap_init(void)
1001 {
1002 	pmap_initialized = 1;
1003 }
1004 
1005 /*
1006  * p v _ e n t r y   f u n c t i o n s
1007  */
1008 
1009 /*
1010  * main pv_entry manipulation functions:
1011  *   pmap_enter_pv: enter a mapping onto a pv list
1012  *   pmap_remove_pv: remove a mapping from a pv list
1013  */
1014 
1015 /*
1016  * pmap_enter_pv: enter a mapping onto a pv list
1017  *
1018  * => caller should adjust ptp's wire_count before calling
1019  *
1020  * pve: preallocated pve for us to use
1021  * ptp: PTP in pmap that maps this VA
1022  */
1023 
1024 void
1025 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap,
1026     vaddr_t va, struct vm_page *ptp)
1027 {
1028 	pve->pv_pmap = pmap;
1029 	pve->pv_va = va;
1030 	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
1031 	mtx_enter(&pg->mdpage.pv_mtx);
1032 	pve->pv_next = pg->mdpage.pv_list;	/* add to ... */
1033 	pg->mdpage.pv_list = pve;		/* ... list */
1034 	mtx_leave(&pg->mdpage.pv_mtx);
1035 }
1036 
1037 /*
1038  * pmap_remove_pv: try to remove a mapping from a pv_list
1039  *
1040  * => caller should adjust ptp's wire_count and free PTP if needed
1041  * => we return the removed pve
1042  */
1043 
1044 struct pv_entry *
1045 pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va)
1046 {
1047 	struct pv_entry *pve, **prevptr;
1048 
1049 	mtx_enter(&pg->mdpage.pv_mtx);
1050 	prevptr = &pg->mdpage.pv_list;
1051 	while ((pve = *prevptr) != NULL) {
1052 		if (pve->pv_pmap == pmap && pve->pv_va == va) {	/* match? */
1053 			*prevptr = pve->pv_next;		/* remove it! */
1054 			break;
1055 		}
1056 		prevptr = &pve->pv_next;		/* previous pointer */
1057 	}
1058 	mtx_leave(&pg->mdpage.pv_mtx);
1059 	return(pve);				/* return removed pve */
1060 }
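/*
 * Sketch of how the two functions pair up in a caller (compare
 * pmap_remove_pte() and the cleanup in pmap_do_remove() below): the pve
 * is allocated by the caller, e.g. pmap_enter(), before pmap_enter_pv()
 * and is returned to the pool after pmap_remove_pv().
 *
 *	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
 *	if (pve == NULL)
 *		... bail out, see "memory allocation" above ...
 *	pmap_enter_pv(pg, pve, pmap, va, ptp);
 *	...
 *	pve = pmap_remove_pv(pg, pmap, va);
 *	if (pve != NULL)
 *		pool_put(&pmap_pv_pool, pve);
 */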
1061 
1062 /*
1063  * p t p   f u n c t i o n s
1064  */
1065 
1066 struct vm_page *
1067 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1068 {
1069 	int lidx = level - 1;
1070 	struct vm_page *pg;
1071 
1072 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1073 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx]))
1074 		return (pmap->pm_ptphint[lidx]);
1075 
1076 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1077 
1078 	return pg;
1079 }
1080 
1081 void
1082 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level,
1083     struct pg_to_free *pagelist)
1084 {
1085 	int lidx;
1086 	struct uvm_object *obj;
1087 
1088 	lidx = level - 1;
1089 
1090 	obj = &pmap->pm_obj[lidx];
1091 	pmap->pm_stats.resident_count--;
1092 	if (pmap->pm_ptphint[lidx] == ptp)
1093 		pmap->pm_ptphint[lidx] = RBT_ROOT(uvm_objtree, &obj->memt);
1094 	ptp->wire_count = 0;
1095 	uvm_pagerealloc(ptp, NULL, 0);
1096 	TAILQ_INSERT_TAIL(pagelist, ptp, pageq);
1097 }
1098 
1099 void
1100 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1101     struct pg_to_free *pagelist)
1102 {
1103 	unsigned long index;
1104 	int level;
1105 	vaddr_t invaladdr;
1106 
1107 	level = 1;
1108 	do {
1109 		pmap_freepage(pmap, ptp, level, pagelist);
1110 		index = pl_i(va, level + 1);
1111 		pmap_pte_set(&normal_pdes[level - 1][index], 0);
1112 		if (level == PTP_LEVELS - 1 && pmap->pm_pdir_intel != NULL) {
1113 			/* Zap special meltdown PML4e */
1114 			pmap_pte_set(&pmap->pm_pdir_intel[index], 0);
1115 			DPRINTF("%s: cleared meltdown PML4e @ index %lu "
1116 			    "(va range start 0x%llx)\n", __func__, index,
1117 			    (uint64_t)(index << L4_SHIFT));
1118 		}
1119 		invaladdr = level == 1 ? (vaddr_t)PTE_BASE :
1120 		    (vaddr_t)normal_pdes[level - 2];
1121 		pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE,
1122 		    pmap_is_curpmap(curpcb->pcb_pmap));
1123 		if (level < PTP_LEVELS - 1) {
1124 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1125 			ptp->wire_count--;
1126 			if (ptp->wire_count > 1)
1127 				break;
1128 		}
1129 	} while (++level < PTP_LEVELS);
1130 }
1131 
1132 /*
1133  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1134  *
1135  * => pmap should NOT be pmap_kernel()
1136  */
1137 
1138 struct vm_page *
1139 pmap_get_ptp(struct pmap *pmap, vaddr_t va)
1140 {
1141 	struct vm_page *ptp, *pptp;
1142 	int i;
1143 	unsigned long index;
1144 	pd_entry_t *pva, *pva_intel;
1145 	paddr_t ppa, pa;
1146 	struct uvm_object *obj;
1147 
1148 	ptp = NULL;
1149 	pa = (paddr_t)-1;
1150 
1151 	/*
1152 	 * Loop through all page table levels seeing if we need to
1153 	 * add a new page to that level.
1154 	 */
1155 	for (i = PTP_LEVELS; i > 1; i--) {
1156 		/*
1157 		 * Save values from previous round.
1158 		 */
1159 		pptp = ptp;
1160 		ppa = pa;
1161 
1162 		index = pl_i(va, i);
1163 		pva = normal_pdes[i - 2];
1164 
1165 		if (pmap_valid_entry(pva[index])) {
1166 			ppa = pva[index] & PG_FRAME;
1167 			ptp = NULL;
1168 			continue;
1169 		}
1170 
1171 		obj = &pmap->pm_obj[i-2];
1172 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1173 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1174 
1175 		if (ptp == NULL)
1176 			return NULL;
1177 
1178 		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
1179 		ptp->wire_count = 1;
1180 		pmap->pm_ptphint[i - 2] = ptp;
1181 		pa = VM_PAGE_TO_PHYS(ptp);
1182 		pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V);
1183 
1184 		/*
1185 		 * Meltdown Special case - if we are adding a new PML4e for
1186 		 * usermode addresses, just copy the PML4e to the U-K page
1187 		 * table.
1188 		 */
1189 		if (pmap->pm_pdir_intel != NULL && i == PTP_LEVELS &&
1190 		    va < VM_MAXUSER_ADDRESS) {
1191 			pva_intel = pmap->pm_pdir_intel;
1192 			pva_intel[index] = pva[index];
1193 			DPRINTF("%s: copying usermode PML4e (content=0x%llx) "
1194 			    "from 0x%llx -> 0x%llx\n", __func__, pva[index],
1195 			    (uint64_t)&pva[index], (uint64_t)&pva_intel[index]);
1196 		}
1197 
1198 		pmap->pm_stats.resident_count++;
1199 		/*
1200 		 * If we're not in the top level, increase the
1201 		 * wire count of the parent page.
1202 		 */
1203 		if (i < PTP_LEVELS) {
1204 			if (pptp == NULL)
1205 				pptp = pmap_find_ptp(pmap, va, ppa, i);
1206 #ifdef DIAGNOSTIC
1207 			if (pptp == NULL)
1208 				panic("%s: pde page disappeared", __func__);
1209 #endif
1210 			pptp->wire_count++;
1211 		}
1212 	}
1213 
1214 	/*
1215 	 * ptp is not NULL if we just allocated a new ptp. If it's
1216 	 * still NULL, we must look up the existing one.
1217 	 */
1218 	if (ptp == NULL) {
1219 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
1220 #ifdef DIAGNOSTIC
1221 		if (ptp == NULL) {
1222 			printf("va %lx ppa %lx\n", (unsigned long)va,
1223 			    (unsigned long)ppa);
1224 			panic("%s: unmanaged user PTP", __func__);
1225 		}
1226 #endif
1227 	}
1228 
1229 	pmap->pm_ptphint[0] = ptp;
1230 	return(ptp);
1231 }
1232 
1233 /*
1234  * p m a p  l i f e c y c l e   f u n c t i o n s
1235  */
1236 
1237 /*
1238  * pmap_pdp_ctor: constructor for the PDP cache.
1239  */
1240 
1241 void
1242 pmap_pdp_ctor(pd_entry_t *pdir)
1243 {
1244 	paddr_t pdirpa;
1245 	int npde, i;
1246 	struct pmap *kpm = pmap_kernel();
1247 
1248 	/* fetch the physical address of the page directory. */
1249 	(void) pmap_extract(kpm, (vaddr_t) pdir, &pdirpa);
1250 
1251 	/* zero init area */
1252 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
1253 
1254 	/* put in recursive PDE to map the PTEs */
1255 	pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW | pg_nx;
1256 
1257 	npde = nkptp[PTP_LEVELS - 1];
1258 
1259 	/* put in kernel VM PDEs */
1260 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
1261 	    npde * sizeof(pd_entry_t));
1262 
1263 	/* zero the rest */
1264 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
1265 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
1266 
1267 	for (i = 0; i < NUM_L4_SLOT_DIRECT; i++)
1268 		pdir[PDIR_SLOT_DIRECT + i] = kpm->pm_pdir[PDIR_SLOT_DIRECT + i];
1269 
1270 #if VM_MIN_KERNEL_ADDRESS != KERNBASE
1271 	pdir[pl4_pi(KERNBASE)] = PDP_BASE[pl4_pi(KERNBASE)];
1272 #endif
1273 }
1274 
1275 void
1276 pmap_pdp_ctor_intel(pd_entry_t *pdir)
1277 {
1278 	struct pmap *kpm = pmap_kernel();
1279 
1280 	/* Copy PML4es from pmap_kernel's U-K view */
1281 	memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE);
1282 }
1283 
1284 /*
1285  * pmap_create: create a pmap
1286  *
1287  * => note: old pmap interface took a "size" args which allowed for
1288  *	the creation of "software only" pmaps (not in bsd).
1289  */
1290 
1291 struct pmap *
1292 pmap_create(void)
1293 {
1294 	struct pmap *pmap;
1295 	int i;
1296 
1297 	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
1298 
1299 	mtx_init(&pmap->pm_mtx, IPL_VM);
1300 
1301 	/* init uvm_object */
1302 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1303 		uvm_objinit(&pmap->pm_obj[i], NULL, 1);
1304 		pmap->pm_ptphint[i] = NULL;
1305 	}
1306 	pmap->pm_stats.wired_count = 0;
1307 	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
1308 	pmap->pm_cpus = 0;
1309 	pmap->pm_type = PMAP_TYPE_NORMAL;
1310 
1311 	/* allocate PDP */
1312 
1313 	/*
1314 	 * note that there is no need to splvm to protect us from
1315 	 * malloc since malloc allocates out of a submap and we should
1316 	 * have already allocated kernel PTPs to cover the range...
1317 	 */
1318 
1319 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
1320 	pmap_pdp_ctor(pmap->pm_pdir);
1321 
1322 	pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME;
1323 
1324 	/*
1325 	 * Intel CPUs need a special page table to be used during usermode
1326 	 * execution, one that lacks all kernel mappings.
1327 	 */
1328 	if (cpu_meltdown) {
1329 		pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK);
1330 		pmap_pdp_ctor_intel(pmap->pm_pdir_intel);
1331 		pmap->pm_stats.resident_count++;
1332 		if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel,
1333 		    &pmap->pm_pdirpa_intel))
1334 			panic("%s: unknown PA mapping for meltdown PML4",
1335 			    __func__);
1336 	} else {
1337 		pmap->pm_pdir_intel = NULL;
1338 		pmap->pm_pdirpa_intel = 0;
1339 	}
1340 
1341 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1342 	return (pmap);
1343 }
1344 
1345 /*
1346  * pmap_destroy: drop reference count on pmap.   free pmap if
1347  *	reference count goes to zero.
1348  */
1349 
1350 void
1351 pmap_destroy(struct pmap *pmap)
1352 {
1353 	struct vm_page *pg;
1354 	int refs;
1355 	int i;
1356 
1357 	/*
1358 	 * drop reference count
1359 	 */
1360 
1361 	refs = atomic_dec_int_nv(&pmap->pm_obj[0].uo_refs);
1362 	if (refs > 0) {
1363 		return;
1364 	}
1365 
1366 	/*
1367 	 * reference count is zero, free pmap resources and then free pmap.
1368 	 */
1369 
1370 #ifdef DIAGNOSTIC
1371 	if (__predict_false(pmap->pm_cpus != 0))
1372 		printf("%s: pmap %p cpus=0x%llx\n", __func__,
1373 		    (void *)pmap, pmap->pm_cpus);
1374 #endif
1375 
1376 	/*
1377 	 * remove it from global list of pmaps
1378 	 */
1379 	LIST_REMOVE(pmap, pm_list);
1380 
1381 	/*
1382 	 * free any remaining PTPs
1383 	 */
1384 
1385 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1386 		while ((pg = RBT_ROOT(uvm_objtree,
1387 		    &pmap->pm_obj[i].memt)) != NULL) {
1388 			KASSERT((pg->pg_flags & PG_BUSY) == 0);
1389 
1390 			pg->wire_count = 0;
1391 			pmap->pm_stats.resident_count--;
1392 
1393 			uvm_pagefree(pg);
1394 		}
1395 	}
1396 
1397 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
1398 
1399 	if (pmap->pm_pdir_intel != NULL) {
1400 		pmap->pm_stats.resident_count--;
1401 		pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
1402 	}
1403 
1404 	pool_put(&pmap_pmap_pool, pmap);
1405 }
1406 
1407 /*
1408  *	Add a reference to the specified pmap.
1409  */
1410 
1411 void
1412 pmap_reference(struct pmap *pmap)
1413 {
1414 	atomic_inc_int(&pmap->pm_obj[0].uo_refs);
1415 }
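/*
 * Reference counting sketch: pmap_create() returns a pmap holding one
 * reference (pm_obj[0].uo_refs == 1); every extra user pairs a
 * pmap_reference() with a later pmap_destroy(), and the final
 * pmap_destroy() frees the PTPs, the PDP(s) and the pmap itself.
 *
 *	struct pmap *pm = pmap_create();
 *
 *	pmap_reference(pm);	second reference
 *	pmap_destroy(pm);	back to one, pmap stays alive
 *	pmap_destroy(pm);	last reference: everything freed
 */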
1416 
1417 /*
1418  * pmap_activate: activate a process' pmap (fill in %cr3)
1419  *
1420  * => called from cpu_fork() and when switching pmaps during exec
1421  * => if p is the curproc, then load it into the MMU
1422  */
1423 
1424 void
1425 pmap_activate(struct proc *p)
1426 {
1427 	struct pcb *pcb = &p->p_addr->u_pcb;
1428 	struct pmap *pmap = p->p_vmspace->vm_map.pmap;
1429 
1430 	pcb->pcb_pmap = pmap;
1431 	pcb->pcb_cr3 = pmap->pm_pdirpa;
1432 	pcb->pcb_cr3 |= (pmap != pmap_kernel()) ? cr3_pcid_proc :
1433 	    (PCID_KERN | cr3_reuse_pcid);
1434 
1435 	if (p == curproc) {
1436 		lcr3(pcb->pcb_cr3);
1437 
1438 		/* in case we return to userspace without context switching */
1439 		if (cpu_meltdown) {
1440 			struct cpu_info *self = curcpu();
1441 
1442 			self->ci_kern_cr3 = pcb->pcb_cr3 | cr3_reuse_pcid;
1443 			self->ci_user_cr3 = pmap->pm_pdirpa_intel |
1444 			    cr3_pcid_proc_intel;
1445 		}
1446 
1447 		/*
1448 		 * mark the pmap in use by this processor.
1449 		 */
1450 		x86_atomic_setbits_u64(&pmap->pm_cpus, (1ULL << cpu_number()));
1451 	}
1452 }
1453 
1454 /*
1455  * pmap_deactivate: deactivate a process' pmap
1456  */
1457 
1458 void
1459 pmap_deactivate(struct proc *p)
1460 {
1461 	struct pmap *pmap = p->p_vmspace->vm_map.pmap;
1462 
1463 	/*
1464 	 * mark the pmap no longer in use by this processor.
1465 	 */
1466 	x86_atomic_clearbits_u64(&pmap->pm_cpus, (1ULL << cpu_number()));
1467 }
1468 
1469 /*
1470  * end of lifecycle functions
1471  */
1472 
1473 /*
1474  * some misc. functions
1475  */
1476 
1477 int
1478 pmap_pdes_valid(vaddr_t va, pd_entry_t *lastpde)
1479 {
1480 	int i;
1481 	unsigned long index;
1482 	pd_entry_t pde;
1483 
1484 	for (i = PTP_LEVELS; i > 1; i--) {
1485 		index = pl_i(va, i);
1486 		pde = normal_pdes[i - 2][index];
1487 		if (!pmap_valid_entry(pde))
1488 			return 0;
1489 	}
1490 	if (lastpde != NULL)
1491 		*lastpde = pde;
1492 	return 1;
1493 }
1494 
1495 /*
1496  * pmap_extract: extract a PA for the given VA
1497  */
1498 
1499 int
1500 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1501 {
1502 	pt_entry_t *ptes;
1503 	int level, offs;
1504 
1505 	if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE &&
1506 	    va < PMAP_DIRECT_END) {
1507 		*pap = va - PMAP_DIRECT_BASE;
1508 		return 1;
1509 	}
1510 
1511 	level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
1512 
1513 	if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) {
1514 		if (pap != NULL)
1515 			*pap = (ptes[offs] & PG_FRAME) | (va & PAGE_MASK);
1516 		return 1;
1517 	}
1518 	if (level == 1 && (ptes[offs] & (PG_PS|PG_V)) == (PG_PS|PG_V)) {
1519 		if (pap != NULL)
1520 			*pap = (ptes[offs] & PG_LGFRAME) | (va & PAGE_MASK_L2);
1521 		return 1;
1522 	}
1523 
1524 	return 0;
1525 }
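/*
 * Usage sketch: translate a kernel virtual address to its physical
 * address, much as pmap_randomize() above does for the new PML4 page.
 *
 *	paddr_t pa;
 *
 *	if (pmap_extract(pmap_kernel(), (vaddr_t)ptr, &pa) == 0)
 *		panic("address not mapped");
 */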
1526 
1527 /*
1528  * pmap_zero_page: zero a page
1529  */
1530 
1531 void
1532 pmap_zero_page(struct vm_page *pg)
1533 {
1534 	pagezero(pmap_map_direct(pg));
1535 }
1536 
1537 /*
1538  * pmap_flush_cache: flush the cache for a virtual address.
1539  */
1540 void
1541 pmap_flush_cache(vaddr_t addr, vsize_t len)
1542 {
1543 	vaddr_t	i;
1544 
1545 	if (curcpu()->ci_cflushsz == 0) {
1546 		wbinvd_on_all_cpus();
1547 		return;
1548 	}
1549 
1550 	/* all cpus that have clflush also have mfence. */
1551 	mfence();
1552 	for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz)
1553 		clflush(i);
1554 	mfence();
1555 }
1556 
1557 /*
1558  * pmap_copy_page: copy a page
1559  */
1560 
1561 void
1562 pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg)
1563 {
1564 	vaddr_t srcva = pmap_map_direct(srcpg);
1565 	vaddr_t dstva = pmap_map_direct(dstpg);
1566 
1567 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
1568 }
1569 
1570 /*
1571  * p m a p   r e m o v e   f u n c t i o n s
1572  *
1573  * functions that remove mappings
1574  */
1575 
1576 /*
1577  * pmap_remove_ptes: remove PTEs from a PTP
1578  *
1579  * => must have proper locking on pmap_master_lock
1580  * => PTP must be mapped into KVA
1581  * => PTP should be null if pmap == pmap_kernel()
1582  */
1583 
1584 void
1585 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1586     vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
1587 {
1588 	struct pv_entry *pve;
1589 	pt_entry_t *pte = (pt_entry_t *) ptpva;
1590 	struct vm_page *pg;
1591 	pt_entry_t opte;
1592 
1593 	/*
1594 	 * note that ptpva points to the PTE that maps startva.   this may
1595 	 * or may not be the first PTE in the PTP.
1596 	 *
1597 	 * we loop through the PTP while there are still PTEs to look at
1598 	 * and the wire_count is greater than 1 (because we use the wire_count
1599 	 * to keep track of the number of real PTEs in the PTP).
1600 	 */
1601 
1602 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
1603 			     ; pte++, startva += PAGE_SIZE) {
1604 		if (!pmap_valid_entry(*pte))
1605 			continue;			/* VA not mapped */
1606 		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
1607 			continue;
1608 		}
1609 
1610 		/* atomically save the old PTE and zap! it */
1611 		opte = pmap_pte_set(pte, 0);
1612 
1613 		if (opte & PG_W)
1614 			pmap->pm_stats.wired_count--;
1615 		pmap->pm_stats.resident_count--;
1616 
1617 		if (ptp != NULL)
1618 			ptp->wire_count--;		/* dropping a PTE */
1619 
1620 		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1621 
1622 		/*
1623 		 * if we are not on a pv list we are done.
1624 		 */
1625 
1626 		if ((opte & PG_PVLIST) == 0) {
1627 #ifdef DIAGNOSTIC
1628 			if (pg != NULL)
1629 				panic("%s: managed page without PG_PVLIST: "
1630 				    "va 0x%lx, opte 0x%llx", __func__,
1631 				    startva, opte);
1632 #endif
1633 			continue;
1634 		}
1635 
1636 #ifdef DIAGNOSTIC
1637 		if (pg == NULL)
1638 			panic("%s: unmanaged page marked PG_PVLIST: "
1639 			    "va 0x%lx, opte 0x%llx", __func__,
1640 			    startva, opte);
1641 #endif
1642 
1643 		/* sync R/M bits */
1644 		pmap_sync_flags_pte(pg, opte);
1645 		pve = pmap_remove_pv(pg, pmap, startva);
1646 		if (pve != NULL) {
1647 			pve->pv_next = *free_pvs;
1648 			*free_pvs = pve;
1649 		}
1650 
1651 		/* end of "for" loop: time for next pte */
1652 	}
1653 }
1654 
1655 /*
1656  * pmap_remove_pte: remove a single PTE from a PTP
1657  *
1658  * => must have proper locking on pmap_master_lock
1659  * => PTP must be mapped into KVA
1660  * => PTP should be null if pmap == pmap_kernel()
1661  * => returns true if we removed a mapping
1662  */
1663 
1664 int
1665 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
1666     vaddr_t va, int flags, struct pv_entry **free_pvs)
1667 {
1668 	struct pv_entry *pve;
1669 	struct vm_page *pg;
1670 	pt_entry_t opte;
1671 
1672 	if (!pmap_valid_entry(*pte))
1673 		return 0;		/* VA not mapped */
1674 	if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
1675 		return 0;
1676 	}
1677 
1678 	/* atomically save the old PTE and zap! it */
1679 	opte = pmap_pte_set(pte, 0);
1680 
1681 	if (opte & PG_W)
1682 		pmap->pm_stats.wired_count--;
1683 	pmap->pm_stats.resident_count--;
1684 
1685 	if (ptp != NULL)
1686 		ptp->wire_count--;		/* dropping a PTE */
1687 
1688 	pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1689 
1690 	/*
1691 	 * if we are not on a pv list we are done.
1692 	 */
1693 	if ((opte & PG_PVLIST) == 0) {
1694 #ifdef DIAGNOSTIC
1695 		if (pg != NULL)
1696 			panic("%s: managed page without PG_PVLIST: "
1697 			    "va 0x%lx, opte 0x%llx", __func__, va, opte);
1698 #endif
1699 		return 1;
1700 	}
1701 
1702 #ifdef DIAGNOSTIC
1703 	if (pg == NULL)
1704 		panic("%s: unmanaged page marked PG_PVLIST: "
1705 		    "va 0x%lx, opte 0x%llx", __func__, va, opte);
1706 #endif
1707 
1708 	/* sync R/M bits */
1709 	pmap_sync_flags_pte(pg, opte);
1710 	pve = pmap_remove_pv(pg, pmap, va);
1711 	if (pve != NULL) {
1712 		pve->pv_next = *free_pvs;
1713 		*free_pvs = pve;
1714 	}
1715 
1716 	return 1;
1717 }
1718 
1719 /*
1720  * pmap_remove: top level mapping removal function
1721  *
1722  * => caller should not be holding any pmap locks
1723  */
1724 
1725 void
1726 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
1727 {
1728 	if (pmap->pm_type == PMAP_TYPE_EPT)
1729 		pmap_remove_ept(pmap, sva, eva);
1730 	else
1731 		pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
1732 }
1733 
1734 /*
1735  * pmap_do_remove: mapping removal guts
1736  *
1737  * => caller should not be holding any pmap locks
1738  */
1739 
1740 void
1741 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
1742 {
1743 	pd_entry_t pde;
1744 	int result;
1745 	paddr_t ptppa;
1746 	vaddr_t blkendva;
1747 	struct vm_page *ptp;
1748 	struct pv_entry *pve;
1749 	struct pv_entry *free_pvs = NULL;
1750 	vaddr_t va;
1751 	int shootall = 0, shootself;
1752 	struct pg_to_free empty_ptps;
1753 	paddr_t scr3;
1754 
1755 	TAILQ_INIT(&empty_ptps);
1756 
1757 	scr3 = pmap_map_ptes(pmap);
1758 	shootself = (scr3 == 0);
1759 
1760 	/*
1761 	 * removing one page?  take shortcut function.
1762 	 */
1763 
1764 	if (sva + PAGE_SIZE == eva) {
1765 		if (pmap_pdes_valid(sva, &pde)) {
1766 
1767 			/* PA of the PTP */
1768 			ptppa = pde & PG_FRAME;
1769 
1770 			/* get PTP if non-kernel mapping */
1771 
1772 			if (pmap == pmap_kernel()) {
1773 				/* we never free kernel PTPs */
1774 				ptp = NULL;
1775 			} else {
1776 				ptp = pmap_find_ptp(pmap, sva, ptppa, 1);
1777 #ifdef DIAGNOSTIC
1778 				if (ptp == NULL)
1779 					panic("%s: unmanaged PTP detected",
1780 					      __func__);
1781 #endif
1782 			}
1783 
1784 			/* do it! */
1785 			result = pmap_remove_pte(pmap, ptp,
1786 			    &PTE_BASE[pl1_i(sva)], sva, flags, &free_pvs);
1787 
1788 			/*
1789 			 * if mapping removed and the PTP is no longer
1790 			 * being used, free it!
1791 			 */
1792 
1793 			if (result && ptp && ptp->wire_count <= 1)
1794 				pmap_free_ptp(pmap, ptp, sva, &empty_ptps);
1795 			pmap_tlb_shootpage(pmap, sva, shootself);
1796 			pmap_unmap_ptes(pmap, scr3);
1797 			pmap_tlb_shootwait();
1798 		} else {
1799 			pmap_unmap_ptes(pmap, scr3);
1800 		}
1801 
1802 		goto cleanup;
1803 	}
1804 
1805 	if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS)
1806 		shootall = 1;
1807 
1808 	for (va = sva; va < eva; va = blkendva) {
1809 		/* determine range of block */
1810 		blkendva = x86_round_pdr(va + 1);
1811 		if (blkendva > eva)
1812 			blkendva = eva;
1813 
1814 		/*
1815 		 * XXXCDC: our PTE mappings should never be removed
1816 		 * with pmap_remove!  if we allow this (and why would
1817 		 * we?) then we end up freeing the pmap's page
1818 		 * directory page (PDP) before we are finished using
1819 		 * it when we hit it in the recursive mapping.  this
1820 		 * is BAD.
1821 		 *
1822 		 * long term solution is to move the PTEs out of user
1823 		 * address space and into kernel address space (up
1824 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
1825 		 * be VM_MAX_ADDRESS.
1826 		 */
1827 
1828 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
1829 			/* XXXCDC: ugly hack to avoid freeing PDP here */
1830 			continue;
1831 
1832 		if (!pmap_pdes_valid(va, &pde))
1833 			continue;
1834 
1835 		/* PA of the PTP */
1836 		ptppa = pde & PG_FRAME;
1837 
1838 		/* get PTP if non-kernel mapping */
1839 		if (pmap == pmap_kernel()) {
1840 			/* we never free kernel PTPs */
1841 			ptp = NULL;
1842 		} else {
1843 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
1844 #ifdef DIAGNOSTIC
1845 			if (ptp == NULL)
1846 				panic("%s: unmanaged PTP detected", __func__);
1847 #endif
1848 		}
1849 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&PTE_BASE[pl1_i(va)],
1850 		    va, blkendva, flags, &free_pvs);
1851 
1852 		/* if PTP is no longer being used, free it! */
1853 		if (ptp && ptp->wire_count <= 1) {
1854 			pmap_free_ptp(pmap, ptp, va, &empty_ptps);
1855 		}
1856 	}
1857 
1858 	if (shootall)
1859 		pmap_tlb_shoottlb(pmap, shootself);
1860 	else
1861 		pmap_tlb_shootrange(pmap, sva, eva, shootself);
1862 
1863 	pmap_unmap_ptes(pmap, scr3);
1864 	pmap_tlb_shootwait();
1865 
1866 cleanup:
1867 	while ((pve = free_pvs) != NULL) {
1868 		free_pvs = pve->pv_next;
1869 		pool_put(&pmap_pv_pool, pve);
1870 	}
1871 
1872 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1873 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1874 		uvm_pagefree(ptp);
1875 	}
1876 }
1877 
1878 /*
1879  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1880  *
1881  * => R/M bits are sync'd back to attrs
1882  */
1883 
1884 void
1885 pmap_page_remove(struct vm_page *pg)
1886 {
1887 	struct pv_entry *pve;
1888 	struct pmap *pm;
1889 	pt_entry_t opte;
1890 #ifdef DIAGNOSTIC
1891 	pd_entry_t pde;
1892 #endif
1893 	struct pg_to_free empty_ptps;
1894 	struct vm_page *ptp;
1895 	paddr_t scr3;
1896 	int shootself;
1897 
1898 	TAILQ_INIT(&empty_ptps);
1899 
1900 	mtx_enter(&pg->mdpage.pv_mtx);
1901 	while ((pve = pg->mdpage.pv_list) != NULL) {
1902 		pmap_reference(pve->pv_pmap);
1903 		pm = pve->pv_pmap;
1904 		mtx_leave(&pg->mdpage.pv_mtx);
1905 
1906 		/* XXX use direct map? */
1907 		scr3 = pmap_map_ptes(pm);	/* locks pmap */
1908 		shootself = (scr3 == 0);
1909 
1910 		/*
1911 		 * We dropped the pvlist lock before grabbing the pmap
1912 		 * lock to avoid lock ordering problems.  This means
1913 		 * we have to check the pvlist again since somebody
1914 		 * else might have modified it.  All we care about is
1915 		 * that the pvlist entry matches the pmap we just
1916 		 * locked.  If it doesn't, unlock the pmap and try
1917 		 * again.
1918 		 */
1919 		mtx_enter(&pg->mdpage.pv_mtx);
1920 		if ((pve = pg->mdpage.pv_list) == NULL ||
1921 		    pve->pv_pmap != pm) {
1922 			mtx_leave(&pg->mdpage.pv_mtx);
1923 			pmap_unmap_ptes(pm, scr3);	/* unlocks pmap */
1924 			pmap_destroy(pm);
1925 			mtx_enter(&pg->mdpage.pv_mtx);
1926 			continue;
1927 		}
1928 
1929 		pg->mdpage.pv_list = pve->pv_next;
1930 		mtx_leave(&pg->mdpage.pv_mtx);
1931 
1932 #ifdef DIAGNOSTIC
1933 		if (pve->pv_ptp != NULL && pmap_pdes_valid(pve->pv_va, &pde) &&
1934 		   (pde & PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
1935 			printf("%s: pg=%p: va=%lx, pv_ptp=%p\n", __func__,
1936 			       pg, pve->pv_va, pve->pv_ptp);
1937 			printf("%s: PTP's phys addr: "
1938 			       "actual=%lx, recorded=%lx\n", __func__,
1939 			       (unsigned long)(pde & PG_FRAME),
1940 				VM_PAGE_TO_PHYS(pve->pv_ptp));
1941 			panic("%s: mapped managed page has "
1942 			      "invalid pv_ptp field", __func__);
1943 		}
1944 #endif
1945 
1946 		/* atomically save the old PTE and zap it */
1947 		opte = pmap_pte_set(&PTE_BASE[pl1_i(pve->pv_va)], 0);
1948 
1949 		if (opte & PG_W)
1950 			pve->pv_pmap->pm_stats.wired_count--;
1951 		pve->pv_pmap->pm_stats.resident_count--;
1952 
1953 		pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, shootself);
1954 
1955 		pmap_sync_flags_pte(pg, opte);
1956 
1957 		/* update the PTP reference count.  free if last reference. */
1958 		if (pve->pv_ptp != NULL) {
1959 			pve->pv_ptp->wire_count--;
1960 			if (pve->pv_ptp->wire_count <= 1) {
1961 				pmap_free_ptp(pve->pv_pmap, pve->pv_ptp,
1962 				    pve->pv_va, &empty_ptps);
1963 			}
1964 		}
1965 		pmap_unmap_ptes(pve->pv_pmap, scr3);	/* unlocks pmap */
1966 		pmap_destroy(pve->pv_pmap);
1967 		pool_put(&pmap_pv_pool, pve);
1968 		mtx_enter(&pg->mdpage.pv_mtx);
1969 	}
1970 	mtx_leave(&pg->mdpage.pv_mtx);
1971 
1972 	pmap_tlb_shootwait();
1973 
1974 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1975 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1976 		uvm_pagefree(ptp);
1977 	}
1978 }
1979 
1980 /*
1981  * p m a p   a t t r i b u t e  f u n c t i o n s
1982  * functions that test/change managed page's attributes
1983  * since a page can be mapped multiple times we must check each PTE that
1984  * maps it by going down the pv lists.
1985  */
1986 
1987 /*
1988  * pmap_test_attrs: test a page's attributes
1989  */
1990 
1991 int
1992 pmap_test_attrs(struct vm_page *pg, unsigned int testbits)
1993 {
1994 	struct pv_entry *pve;
1995 	pt_entry_t *ptes;
1996 	int level, offs;
1997 	u_long mybits, testflags;
1998 
1999 	testflags = pmap_pte2flags(testbits);
2000 
2001 	if (pg->pg_flags & testflags)
2002 		return 1;
2003 
2004 	mybits = 0;
2005 	mtx_enter(&pg->mdpage.pv_mtx);
2006 	for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
2007 	    pve = pve->pv_next) {
2008 		level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
2009 		    &offs);
2010 		mybits |= (ptes[offs] & testbits);
2011 	}
2012 	mtx_leave(&pg->mdpage.pv_mtx);
2013 
2014 	if (mybits == 0)
2015 		return 0;
2016 
2017 	atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
2018 
2019 	return 1;
2020 }
2021 
2022 /*
2023  * pmap_clear_attrs: change a page's attributes
2024  *
2025  * => we return 1 if we cleared one of the bits we were asked to
2026  */
2027 
2028 int
2029 pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits)
2030 {
2031 	struct pv_entry *pve;
2032 	pt_entry_t *ptes, opte;
2033 	u_long clearflags;
2034 	int result, level, offs;
2035 
2036 	clearflags = pmap_pte2flags(clearbits);
2037 
2038 	result = pg->pg_flags & clearflags;
2039 	if (result)
2040 		atomic_clearbits_int(&pg->pg_flags, clearflags);
2041 
2042 	mtx_enter(&pg->mdpage.pv_mtx);
2043 	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
2044 		level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
2045 		    &offs);
2046 		opte = ptes[offs];
2047 		if (opte & clearbits) {
2048 			result = 1;
2049 			pmap_pte_clearbits(&ptes[offs], (opte & clearbits));
2050 			pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va,
2051 				pmap_is_curpmap(pve->pv_pmap));
2052 		}
2053 	}
2054 	mtx_leave(&pg->mdpage.pv_mtx);
2055 
2056 	pmap_tlb_shootwait();
2057 
2058 	return (result != 0);
2059 }
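
/*
 * A minimal usage sketch for the two attribute helpers above, assuming
 * the usual amd64 accessed/modified PTE bits PG_U and PG_M; the
 * example_* wrapper names are hypothetical:
 *
 *	int
 *	example_page_is_modified(struct vm_page *pg)
 *	{
 *		return pmap_test_attrs(pg, PG_M);
 *	}
 *
 *	int
 *	example_page_clear_referenced(struct vm_page *pg)
 *	{
 *		return pmap_clear_attrs(pg, PG_U);
 *	}
 *
 * The MI pmap_is_modified()/pmap_clear_reference() style interfaces are
 * typically thin wrappers of exactly this shape.
 */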
2060 
2061 /*
2062  * p m a p   p r o t e c t i o n   f u n c t i o n s
2063  */
2064 
2065 /*
2066  * pmap_page_protect: change the protection of all recorded mappings
2067  *	of a managed page
2068  *
2069  * => NOTE: this is an inline function in pmap.h
2070  */
2071 
2072 /* see pmap.h */
2073 
2074 /*
2075  * pmap_protect: set the protection of the pages in a pmap
2076  *
2077  * => NOTE: this is an inline function in pmap.h
2078  */
2079 
2080 /* see pmap.h */
2081 
2082 /*
2083  * pmap_write_protect: write-protect pages in a pmap
2084  */
2085 
2086 void
2087 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
2088 {
2089 	pt_entry_t nx, *spte, *epte;
2090 	vaddr_t blockend;
2091 	int shootall = 0, shootself;
2092 	vaddr_t va;
2093 	paddr_t scr3;
2094 
2095 	scr3 = pmap_map_ptes(pmap);
2096 	shootself = (scr3 == 0);
2097 
2098 	/* should be ok, but just in case ... */
2099 	sva &= PG_FRAME;
2100 	eva &= PG_FRAME;
2101 
2102 	nx = 0;
2103 	if (!(prot & PROT_EXEC))
2104 		nx = pg_nx;
2105 
2106 	if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS)
2107 		shootall = 1;
2108 
2109 	for (va = sva; va < eva ; va = blockend) {
2110 		blockend = (va & L2_FRAME) + NBPD_L2;
2111 		if (blockend > eva)
2112 			blockend = eva;
2113 
2114 		/*
2115 		 * XXXCDC: our PTE mappings should never be write-protected!
2116 		 *
2117 		 * the long term solution is to move the PTEs out of
2118 		 * user address space and into kernel address space (up
2119 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
2120 		 * be VM_MAX_ADDRESS.
2121 		 */
2122 
2123 		/* XXXCDC: ugly hack to avoid freeing PDP here */
2124 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
2125 			continue;
2126 
2127 		/* empty block? */
2128 		if (!pmap_pdes_valid(va, NULL))
2129 			continue;
2130 
2131 #ifdef DIAGNOSTIC
2132 		if (va >= VM_MAXUSER_ADDRESS && va < VM_MAX_ADDRESS)
2133 			panic("%s: PTE space", __func__);
2134 #endif
2135 
2136 		spte = &PTE_BASE[pl1_i(va)];
2137 		epte = &PTE_BASE[pl1_i(blockend)];
2138 
2139 		for (/*null */; spte < epte ; spte++) {
2140 			if (!pmap_valid_entry(*spte))
2141 				continue;
2142 			pmap_pte_clearbits(spte, PG_RW);
2143 			pmap_pte_setbits(spte, nx);
2144 		}
2145 	}
2146 
2147 	if (shootall)
2148 		pmap_tlb_shoottlb(pmap, shootself);
2149 	else
2150 		pmap_tlb_shootrange(pmap, sva, eva, shootself);
2151 
2152 	pmap_unmap_ptes(pmap, scr3);
2153 	pmap_tlb_shootwait();
2154 }
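
/*
 * A minimal usage sketch for pmap_write_protect(): revoke write (and,
 * because PROT_EXEC is not passed, execute) permission on a
 * page-aligned kernel range.  The range variables are hypothetical:
 *
 *	extern vaddr_t example_start, example_end;
 *
 *	pmap_write_protect(pmap_kernel(), example_start, example_end,
 *	    PROT_READ);
 *
 * This is the same shape of call typically used to enforce read-only
 * kernel ranges such as rodata.
 */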
2155 
2156 /*
2157  * end of protection functions
2158  */
2159 
2160 /*
2161  * pmap_unwire: clear the wired bit in the PTE
2162  *
2163  * => mapping should already be in map
2164  */
2165 
2166 void
2167 pmap_unwire(struct pmap *pmap, vaddr_t va)
2168 {
2169 	pt_entry_t *ptes;
2170 	int level, offs;
2171 
2172 	level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
2173 
2174 	if (level == 0) {
2175 
2176 #ifdef DIAGNOSTIC
2177 		if (!pmap_valid_entry(ptes[offs]))
2178 			panic("%s: invalid (unmapped) va 0x%lx", __func__, va);
2179 #endif
2180 		if (__predict_true((ptes[offs] & PG_W) != 0)) {
2181 			pmap_pte_clearbits(&ptes[offs], PG_W);
2182 			pmap->pm_stats.wired_count--;
2183 		}
2184 #ifdef DIAGNOSTIC
2185 		else {
2186 			printf("%s: wiring for pmap %p va 0x%lx "
2187 			       "didn't change!\n", __func__, pmap, va);
2188 		}
2189 #endif
2190 	}
2191 #ifdef DIAGNOSTIC
2192 	else {
2193 		panic("%s: invalid PDE", __func__);
2194 	}
2195 #endif
2196 }
2197 
2198 /*
2199  * pmap_collect: free resources held by a pmap
2200  *
2201  * => optional function.
2202  * => called when a process is swapped out to free memory.
2203  */
2204 
2205 void
2206 pmap_collect(struct pmap *pmap)
2207 {
2208 	/*
2209 	 * free all of the pt pages by removing the physical mappings
2210 	 * for its entire address space.
2211 	 */
2212 
2213 /*	pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS,
2214 	    PMAP_REMOVE_SKIPWIRED);
2215 */
2216 }
2217 
2218 /*
2219  * pmap_copy: copy mappings from one pmap to another
2220  *
2221  * => optional function
2222  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
2223  */
2224 
2225 /*
2226  * defined as a macro in pmap.h
2227  */
2228 
2229 void
2230 pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot)
2231 {
2232 	uint64_t l4idx, l3idx, l2idx, l1idx;
2233 	pd_entry_t *pd, *ptp;
2234 	paddr_t npa;
2235 	struct pmap *pmap = pmap_kernel();
2236 	pt_entry_t *ptes;
2237 	int level, offs;
2238 
2239 	/* If CPU is secure, no need to do anything */
2240 	if (!cpu_meltdown)
2241 		return;
2242 
2243 	/* Must be kernel VA */
2244 	if (va < VM_MIN_KERNEL_ADDRESS)
2245 		panic("%s: invalid special mapping va 0x%lx requested",
2246 		    __func__, va);
2247 
2248 	if (pmap->pm_pdir_intel == NULL)
2249 		pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool,
2250 		    PR_WAITOK | PR_ZERO);
2251 
2252 	l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */
2253 	l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
2254 	l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */
2255 	l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */
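	/*
	 * Worked example of the index split above (standard 4-level
	 * x86-64 paging, 9 bits per level starting at bit 12): a
	 * hypothetical va of 0xffffffff81000000 yields l4idx=511,
	 * l3idx=510, l2idx=8 and l1idx=0.
	 */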
2256 
2257 	DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld "
2258 	    "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va,
2259 	    (uint64_t)pa, l4idx, l3idx, l2idx, l1idx);
2260 
2261 	/* Start at PML4 / top level */
2262 	pd = pmap->pm_pdir_intel;
2263 
2264 	if (pd == NULL)
2265 		panic("%s: PML4 not initialized for pmap @ %p", __func__,
2266 		    pmap);
2267 
2268 	/* npa = physaddr of PDPT */
2269 	npa = pd[l4idx] & PMAP_PA_MASK;
2270 
2271 	/* Valid PML4e for the 512GB region containing va? */
2272 	if (!npa) {
2273 		/* No valid PML4E - allocate PDPT page and set PML4E */
2274 
2275 		ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
2276 
2277 		if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2278 			panic("%s: can't locate PDPT page", __func__);
2279 
2280 		pd[l4idx] = (npa | PG_RW | PG_V);
2281 
2282 		DPRINTF("%s: allocated new PDPT page at phys 0x%llx, "
2283 		    "setting PML4e[%lld] = 0x%llx\n", __func__,
2284 		    (uint64_t)npa, l4idx, pd[l4idx]);
2285 	}
2286 
2287 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2288 	if (pd == NULL)
2289 		panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
2290 		    (uint64_t)npa);
2291 
2292 	/* npa = physaddr of PD page */
2293 	npa = pd[l3idx] & PMAP_PA_MASK;
2294 
2295 	/* Valid PDPTe for the 1GB region containing va? */
2296 	if (!npa) {
2297 		/* No valid PDPTe - allocate PD page and set PDPTe */
2298 
2299 		ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
2300 
2301 		if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2302 			panic("%s: can't locate PD page", __func__);
2303 
2304 		pd[l3idx] = (npa | PG_RW | PG_V);
2305 
2306 		DPRINTF("%s: allocated new PD page at phys 0x%llx, "
2307 		    "setting PDPTe[%lld] = 0x%llx\n", __func__,
2308 		    (uint64_t)npa, l3idx, pd[l3idx]);
2309 	}
2310 
2311 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2312 	if (pd == NULL)
2313 		panic("%s: can't locate PD page @ pa=0x%llx", __func__,
2314 		    (uint64_t)npa);
2315 
2316 	/* npa = physaddr of PT page */
2317 	npa = pd[l2idx] & PMAP_PA_MASK;
2318 
2319 	/* Valid PDE for the 2MB region containing va? */
2320 	if (!npa) {
2321 		/* No valid PDE - allocate PT page and set PDE */
2322 
2323 		ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO);
2324 
2325 		if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2326 			panic("%s: can't locate PT page", __func__);
2327 
2328 		pd[l2idx] = (npa | PG_RW | PG_V);
2329 
2330 		DPRINTF("%s: allocated new PT page at phys 0x%llx, "
2331 		    "setting PDE[%lld] = 0x%llx\n", __func__,
2332 		    (uint64_t)npa, l2idx, pd[l2idx]);
2333 	}
2334 
2335 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2336 	if (pd == NULL)
2337 		panic("%s: can't locate PT page @ pa=0x%llx", __func__,
2338 		    (uint64_t)npa);
2339 
2340 	DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot "
2341 	    "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd,
2342 	    (uint64_t)prot, (uint64_t)pd[l1idx]);
2343 
2344 	pd[l1idx] = pa | protection_codes[prot] | PG_V | PG_W;
2345 
2346 	/*
2347 	 * Look up the corresponding U+K entry.  If we're installing the
2348 	 * same PA into the U-K map then set the PG_G bit on both and copy
2349 	 * the cache-control bits from the U+K entry to the U-K entry.
2350 	 */
2351 	level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
2352 	if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) {
2353 		if (((pd[l1idx] ^ ptes[offs]) & PG_FRAME) == 0) {
2354 			pd[l1idx] |= PG_G | (ptes[offs] & (PG_N | PG_WT));
2355 			ptes[offs] |= PG_G;
2356 		} else {
2357 			DPRINTF("%s: special differing mapping at %llx\n",
2358 			    __func__, (long long)va);
2359 		}
2360 	} else
2361 		DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
2362 
2363 	DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]);
2364 }
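
/*
 * A minimal usage sketch for pmap_enter_special(): when cpu_meltdown is
 * set, mirror a kernel page that must remain visible on the user-space
 * (U-K) page tables, e.g. a trampoline or descriptor page.  The
 * variable names are hypothetical:
 *
 *	vaddr_t example_va;
 *	paddr_t example_pa;
 *
 *	pmap_enter_special(example_va, example_pa, PROT_READ);
 */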
2365 
2366 void
2367 pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa)
2368 {
2369 	vaddr_t v;
2370 #if NVMM > 0
2371 	struct vmx_invept_descriptor vid;
2372 #endif /* NVMM > 0 */
2373 
2374 	DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa,
2375 	    (uint64_t)egpa);
2376 	for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE)
2377 		pmap_do_remove_ept(pmap, v);
2378 
2379 #if NVMM > 0
2380 	if (pmap->eptp != 0) {
2381 		memset(&vid, 0, sizeof(vid));
2382 		vid.vid_eptp = pmap->eptp;
2383 		DPRINTF("%s: flushing EPT TLB for EPTP 0x%llx\n", __func__,
2384 		    vid.vid_eptp);
2385 		invept(IA32_VMX_INVEPT_SINGLE_CTX, &vid);
2386 	}
2387 #endif /* NVMM > 0 */
2388 }
2389 
2390 void
2391 pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
2392 {
2393 	uint64_t l4idx, l3idx, l2idx, l1idx;
2394 	struct vm_page *pg3, *pg2, *pg1;
2395 	paddr_t npa3, npa2, npa1;
2396 	pd_entry_t *pd4, *pd3, *pd2, *pd1;
2397 	pd_entry_t *pptes;
2398 
2399 	l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */
2400 	l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
2401 	l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
2402 	l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
2403 
2404 	/* Start at PML4 / top level */
2405 	pd4 = (pd_entry_t *)pmap->pm_pdir;
2406 
2407 	if (pd4 == NULL)
2408 		return;
2409 
2410 	/* npa3 = physaddr of PDPT */
2411 	npa3 = pd4[l4idx] & PMAP_PA_MASK;
2412 	if (!npa3)
2413 		return;
2414 	pd3 = (pd_entry_t *)PMAP_DIRECT_MAP(npa3);
2415 	pg3 = PHYS_TO_VM_PAGE(npa3);
2416 
2417 	/* npa2 = physaddr of PD page */
2418 	npa2 = pd3[l3idx] & PMAP_PA_MASK;
2419 	if (!npa2)
2420 		return;
2421 	pd2 = (pd_entry_t *)PMAP_DIRECT_MAP(npa2);
2422 	pg2 = PHYS_TO_VM_PAGE(npa2);
2423 
2424 	/* npa1 = physaddr of PT page */
2425 	npa1 = pd2[l2idx] & PMAP_PA_MASK;
2426 	if (!npa1)
2427 		return;
2428 	pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1);
2429 	pg1 = PHYS_TO_VM_PAGE(npa1);
2430 
2431 	if (pd1[l1idx] == 0)
2432 		return;
2433 
2434 	pd1[l1idx] = 0;
2435 	pg1->wire_count--;
2436 	pmap->pm_stats.resident_count--;
2437 
2438 	if (pg1->wire_count > 1)
2439 		return;
2440 
2441 	pg1->wire_count = 0;
2442 	pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa2);
2443 	pptes[l2idx] = 0;
2444 	uvm_pagefree(pg1);
2445 	pmap->pm_stats.resident_count--;
2446 
2447 	pg2->wire_count--;
2448 	if (pg2->wire_count > 1)
2449 		return;
2450 
2451 	pg2->wire_count = 0;
2452 	pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa3);
2453 	pptes[l3idx] = 0;
2454 	uvm_pagefree(pg2);
2455 	pmap->pm_stats.resident_count--;
2456 
2457 	pg3->wire_count--;
2458 	if (pg3->wire_count > 1)
2459 		return;
2460 
2461 	pg3->wire_count = 0;
2462 	pptes = pd4;
2463 	pptes[l4idx] = 0;
2464 	uvm_pagefree(pg3);
2465 	pmap->pm_stats.resident_count--;
2466 }
2467 
2468 int
2469 pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot)
2470 {
2471 	uint64_t l4idx, l3idx, l2idx, l1idx;
2472 	pd_entry_t *pd, npte;
2473 	struct vm_page *ptp, *pptp;
2474 	paddr_t npa;
2475 	struct uvm_object *obj;
2476 
2477 	if (gpa > MAXDSIZ)
2478 		return ENOMEM;
2479 
2480 	l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */
2481 	l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */
2482 	l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
2483 	l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
2484 
2485 	/* Start at PML4 / top level */
2486 	pd = (pd_entry_t *)pmap->pm_pdir;
2487 
2488 	if (pd == NULL)
2489 		return ENOMEM;
2490 
2491 	/* npa = physaddr of PDPT */
2492 	npa = pd[l4idx] & PMAP_PA_MASK;
2493 
2494 	/* Valid PML4e for the 512GB region containing gpa? */
2495 	if (!npa) {
2496 		/* No valid PML4e - allocate PDPT page and set PML4e */
2497 		obj = &pmap->pm_obj[2];	/* PML4 UVM object */
2498 		ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 3), NULL,
2499 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2500 
2501 		if (ptp == NULL)
2502 			return ENOMEM;
2503 
2504 		/*
2505 		 * New PDPT page - we are setting the first entry, so set
2506 		 * the wired count to 1
2507 		 */
2508 		ptp->wire_count = 1;
2509 
2510 		/* Calculate phys address of this new PDPT page */
2511 		npa = VM_PAGE_TO_PHYS(ptp);
2512 
2513 		/*
2514 		 * Higher levels get full perms; specific permissions are
2515 		 * entered at the lowest level.
2516 		 */
2517 		pd[l4idx] = (npa | EPT_R | EPT_W | EPT_X);
2518 
2519 		pmap->pm_stats.resident_count++;
2520 
2521 		pptp = ptp;
2522 	} else {
2523 		/* Already allocated PML4e */
2524 		pptp = PHYS_TO_VM_PAGE(npa);
2525 	}
2526 
2527 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2528 	if (pd == NULL)
2529 		panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
2530 		    (uint64_t)npa);
2531 
2532 	/* npa = physaddr of PD page */
2533 	npa = pd[l3idx] & PMAP_PA_MASK;
2534 
2535 	/* Valid PDPTe for the 1GB region containing gpa? */
2536 	if (!npa) {
2537 		/* No valid PDPTe - allocate PD page and set PDPTe */
2538 		obj = &pmap->pm_obj[1];	/* PDPT UVM object */
2539 		ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 2), NULL,
2540 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2541 
2542 		if (ptp == NULL)
2543 			return ENOMEM;
2544 
2545 		/*
2546 		 * New PD page - we are setting the first entry, so set
2547 		 * the wired count to 1
2548 		 */
2549 		ptp->wire_count = 1;
2550 		pptp->wire_count++;
2551 
2552 		npa = VM_PAGE_TO_PHYS(ptp);
2553 
2554 		/*
2555 		 * Higher levels get full perms; specific permissions are
2556 		 * entered at the lowest level.
2557 		 */
2558 		pd[l3idx] = (npa | EPT_R | EPT_W | EPT_X);
2559 
2560 		pmap->pm_stats.resident_count++;
2561 
2562 		pptp = ptp;
2563 	} else {
2564 		/* Already allocated PDPTe */
2565 		pptp = PHYS_TO_VM_PAGE(npa);
2566 	}
2567 
2568 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2569 	if (pd == NULL)
2570 		panic("%s: can't locate PD page @ pa=0x%llx", __func__,
2571 		    (uint64_t)npa);
2572 
2573 	/* npa = physaddr of PT page */
2574 	npa = pd[l2idx] & PMAP_PA_MASK;
2575 
2576 	/* Valid PDE for the 2MB region containing gpa? */
2577 	if (!npa) {
2578 		/* No valid PDE - allocate PT page and set PDE */
2579 		obj = &pmap->pm_obj[0];	/* PDE UVM object */
2580 		ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 1), NULL,
2581 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2582 
2583 		if (ptp == NULL)
2584 			return ENOMEM;
2585 
2586 		pptp->wire_count++;
2587 
2588 		npa = VM_PAGE_TO_PHYS(ptp);
2589 
2590 		/*
2591 		 * Higher levels get full perms; specific permissions are
2592 		 * entered at the lowest level.
2593 		 */
2594 		pd[l2idx] = (npa | EPT_R | EPT_W | EPT_X);
2595 
2596 		pmap->pm_stats.resident_count++;
2597 
2598 	} else {
2599 		/* Find final ptp */
2600 		ptp = PHYS_TO_VM_PAGE(npa);
2601 		if (ptp == NULL)
2602 			panic("%s: ptp page vanished?", __func__);
2603 	}
2604 
2605 	pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa);
2606 	if (pd == NULL)
2607 		panic("%s: can't locate PT page @ pa=0x%llx", __func__,
2608 		    (uint64_t)npa);
2609 
2610 	npte = hpa | EPT_WB;
2611 	if (prot & PROT_READ)
2612 		npte |= EPT_R;
2613 	if (prot & PROT_WRITE)
2614 		npte |= EPT_W;
2615 	if (prot & PROT_EXEC)
2616 		npte |= EPT_X;
2617 
2618 	if (pd[l1idx] == 0) {
2619 		ptp->wire_count++;
2620 		pmap->pm_stats.resident_count++;
2621 	} else {
2622 		/* XXX flush ept */
2623 	}
2624 
2625 	pd[l1idx] = npte;
2626 
2627 	return 0;
2628 }
2629 
2630 /*
2631  * pmap_enter: enter a mapping into a pmap
2632  *
2633  * => must be done "now" ... no lazy-evaluation
2634  */
2635 
2636 int
2637 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
2638 {
2639 	pt_entry_t opte, npte;
2640 	struct vm_page *ptp, *pg = NULL;
2641 	struct pv_entry *pve, *opve = NULL;
2642 	int ptpdelta, wireddelta, resdelta;
2643 	int wired = (flags & PMAP_WIRED) != 0;
2644 	int nocache = (pa & PMAP_NOCACHE) != 0;
2645 	int wc = (pa & PMAP_WC) != 0;
2646 	int error, shootself;
2647 	paddr_t scr3;
2648 
2649 	if (pmap->pm_type == PMAP_TYPE_EPT)
2650 		return pmap_enter_ept(pmap, va, pa, prot);
2651 
2652 	KASSERT(!(wc && nocache));
2653 	pa &= PMAP_PA_MASK;
2654 
2655 #ifdef DIAGNOSTIC
2656 	if (va == (vaddr_t) PDP_BASE)
2657 		panic("%s: trying to map over PDP!", __func__);
2658 
2659 	/* sanity check: kernel PTPs should already have been pre-allocated */
2660 	if (va >= VM_MIN_KERNEL_ADDRESS &&
2661 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
2662 		panic("%s: missing kernel PTP for va %lx!", __func__, va);
2663 
2664 #endif
2665 
2666 	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
2667 	if (pve == NULL) {
2668 		if (flags & PMAP_CANFAIL) {
2669 			error = ENOMEM;
2670 			goto out;
2671 		}
2672 		panic("%s: no pv entries available", __func__);
2673 	}
2674 
2675 	/*
2676 	 * map in ptes and get a pointer to our PTP (unless we are the kernel)
2677 	 */
2678 
2679 	scr3 = pmap_map_ptes(pmap);
2680 	shootself = (scr3 == 0);
2681 	if (pmap == pmap_kernel()) {
2682 		ptp = NULL;
2683 	} else {
2684 		ptp = pmap_get_ptp(pmap, va);
2685 		if (ptp == NULL) {
2686 			if (flags & PMAP_CANFAIL) {
2687 				pmap_unmap_ptes(pmap, scr3);
2688 				error = ENOMEM;
2689 				goto out;
2690 			}
2691 			panic("%s: get ptp failed", __func__);
2692 		}
2693 	}
2694 	opte = PTE_BASE[pl1_i(va)];		/* old PTE */
2695 
2696 	/*
2697 	 * is there currently a valid mapping at our VA?
2698 	 */
2699 
2700 	if (pmap_valid_entry(opte)) {
2701 		/*
2702 		 * first, calculate pm_stats updates.  resident count will not
2703 		 * change since we are replacing/changing a valid mapping.
2704 		 * wired count might change...
2705 		 */
2706 
2707 		resdelta = 0;
2708 		if (wired && (opte & PG_W) == 0)
2709 			wireddelta = 1;
2710 		else if (!wired && (opte & PG_W) != 0)
2711 			wireddelta = -1;
2712 		else
2713 			wireddelta = 0;
2714 		ptpdelta = 0;
2715 
2716 		/*
2717 		 * is the currently mapped PA the same as the one we
2718 		 * want to map?
2719 		 */
2720 
2721 		if ((opte & PG_FRAME) == pa) {
2722 
2723 			/* if this is on the PVLIST, sync R/M bit */
2724 			if (opte & PG_PVLIST) {
2725 				pg = PHYS_TO_VM_PAGE(pa);
2726 #ifdef DIAGNOSTIC
2727 				if (pg == NULL)
2728 					panic("%s: same pa, PG_PVLIST "
2729 					    "mapping with unmanaged page: "
2730 					    "va 0x%lx, opte 0x%llx, pa 0x%lx",
2731 					    __func__, va, opte, pa);
2732 #endif
2733 				pmap_sync_flags_pte(pg, opte);
2734 			} else {
2735 #ifdef DIAGNOSTIC
2736 				if (PHYS_TO_VM_PAGE(pa) != NULL)
2737 					panic("%s: same pa, no PG_PVLIST "
2738 					    "mapping with managed page: "
2739 					    "va 0x%lx, opte 0x%llx, pa 0x%lx",
2740 					    __func__, va, opte, pa);
2741 #endif
2742 			}
2743 			goto enter_now;
2744 		}
2745 
2746 		/*
2747 		 * changing PAs: we must remove the old one first
2748 		 */
2749 
2750 		/*
2751 		 * if current mapping is on a pvlist,
2752 		 * remove it (sync R/M bits)
2753 		 */
2754 
2755 		if (opte & PG_PVLIST) {
2756 			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
2757 #ifdef DIAGNOSTIC
2758 			if (pg == NULL)
2759 				panic("%s: PG_PVLIST mapping with unmanaged "
2760 				    "page: va 0x%lx, opte 0x%llx, pa 0x%lx",
2761 				    __func__, va, opte, pa);
2762 #endif
2763 			pmap_sync_flags_pte(pg, opte);
2764 			opve = pmap_remove_pv(pg, pmap, va);
2765 			pg = NULL; /* This is not the page we are looking for */
2766 		}
2767 	} else {	/* opte not valid */
2768 		resdelta = 1;
2769 		if (wired)
2770 			wireddelta = 1;
2771 		else
2772 			wireddelta = 0;
2773 		if (ptp != NULL)
2774 			ptpdelta = 1;
2775 		else
2776 			ptpdelta = 0;
2777 	}
2778 
2779 	/*
2780 	 * pve is either NULL or points to a now-free pv_entry structure
2781 	 * (the latter case is if we called pmap_remove_pv above).
2782 	 *
2783 	 * if this entry is to be on a pvlist, enter it now.
2784 	 */
2785 
2786 	if (pmap_initialized)
2787 		pg = PHYS_TO_VM_PAGE(pa);
2788 
2789 	if (pg != NULL) {
2790 		pmap_enter_pv(pg, pve, pmap, va, ptp);
2791 		pve = NULL;
2792 	}
2793 
2794 enter_now:
2795 	/*
2796 	 * at this point pg is !NULL if we want the PG_PVLIST bit set
2797 	 */
2798 
2799 	pmap->pm_stats.resident_count += resdelta;
2800 	pmap->pm_stats.wired_count += wireddelta;
2801 	if (ptp != NULL)
2802 		ptp->wire_count += ptpdelta;
2803 
2804 	KASSERT(pg == PHYS_TO_VM_PAGE(pa));
2805 
2806 	npte = pa | protection_codes[prot] | PG_V;
2807 	if (pg != NULL) {
2808 		npte |= PG_PVLIST;
2809 		/*
2810 		 * make sure that if the page is write combined all
2811 		 * instances of pmap_enter make it so.
2812 		 */
2813 		if (pg->pg_flags & PG_PMAP_WC) {
2814 			KASSERT(nocache == 0);
2815 			wc = 1;
2816 		}
2817 	}
2818 	if (wc)
2819 		npte |= pmap_pg_wc;
2820 	if (wired)
2821 		npte |= PG_W;
2822 	if (nocache)
2823 		npte |= PG_N;
2824 	if (va < VM_MAXUSER_ADDRESS)
2825 		npte |= PG_u;
2826 	else if (va < VM_MAX_ADDRESS)
2827 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
2828 	if (pmap == pmap_kernel())
2829 		npte |= pg_g_kern;
2830 
2831 	PTE_BASE[pl1_i(va)] = npte;		/* zap! */
2832 
2833 	/*
2834 	 * If we changed anything other than modified/used bits,
2835 	 * flush the TLB.  (is this overkill?)
2836 	 */
2837 	if (pmap_valid_entry(opte)) {
2838 		if (nocache && (opte & PG_N) == 0)
2839 			wbinvd_on_all_cpus();
2840 		pmap_tlb_shootpage(pmap, va, shootself);
2841 	}
2842 
2843 	pmap_unmap_ptes(pmap, scr3);
2844 	pmap_tlb_shootwait();
2845 
2846 	error = 0;
2847 
2848 out:
2849 	if (pve != NULL)
2850 		pool_put(&pmap_pv_pool, pve);
2851 	if (opve != NULL)
2852 		pool_put(&pmap_pv_pool, opve);
2853 
2854 	return error;
2855 }
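
/*
 * A minimal usage sketch for pmap_enter(), assuming a hypothetical
 * caller that maps one wired, writable page and is prepared to handle
 * allocation failure:
 *
 *	int
 *	example_map_wired(struct pmap *pm, vaddr_t va, paddr_t pa)
 *	{
 *		return pmap_enter(pm, va, pa, PROT_READ | PROT_WRITE,
 *		    PMAP_WIRED | PMAP_CANFAIL);
 *	}
 *
 * With PMAP_CANFAIL the caller must cope with an ENOMEM return when no
 * pv entry or PTP can be allocated; without it, such failures panic.
 */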
2856 
2857 int
2858 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
2859 {
2860 	struct vm_page *ptp;
2861 	struct pmap *kpm = pmap_kernel();
2862 
2863 	if (uvm.page_init_done == 0) {
2864 		vaddr_t va;
2865 
2866 		/*
2867 		 * we're growing the kernel pmap early (from
2868 		 * uvm_pageboot_alloc()).  this case must be
2869 		 * handled a little differently.
2870 		 */
2871 
2872 		va = pmap_steal_memory(PAGE_SIZE, NULL, NULL);
2873 		*paddrp = PMAP_DIRECT_UNMAP(va);
2874 	} else {
2875 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
2876 				    ptp_va2o(va, level), NULL,
2877 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2878 		if (ptp == NULL)
2879 			panic("%s: out of memory", __func__);
2880 		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
2881 		ptp->wire_count = 1;
2882 		*paddrp = VM_PAGE_TO_PHYS(ptp);
2883 	}
2884 	kpm->pm_stats.resident_count++;
2885 	return 1;
2886 }
2887 
2888 /*
2889  * Allocate the specified number of ptps for a ptp level, and populate
2890  * all levels below accordingly, mapping virtual addresses starting at
2891  * kva.
2892  *
2893  * Used by pmap_growkernel.
2894  */
2895 void
2896 pmap_alloc_level(vaddr_t kva, int lvl, long *needed_ptps)
2897 {
2898 	unsigned long i;
2899 	vaddr_t va;
2900 	paddr_t pa;
2901 	unsigned long index, endindex;
2902 	int level;
2903 	pd_entry_t *pdep;
2904 
2905 	for (level = lvl; level > 1; level--) {
2906 		if (level == PTP_LEVELS)
2907 			pdep = pmap_kernel()->pm_pdir;
2908 		else
2909 			pdep = normal_pdes[level - 2];
2910 		va = kva;
2911 		index = pl_i(kva, level);
2912 		endindex = index + needed_ptps[level - 1];
2913 		/*
2914 		 * XXX special case for first time call.
2915 		 */
2916 		if (nkptp[level - 1] != 0)
2917 			index++;
2918 		else
2919 			endindex--;
2920 
2921 		for (i = index; i <= endindex; i++) {
2922 			pmap_get_physpage(va, level - 1, &pa);
2923 			pdep[i] = pa | PG_RW | PG_V | pg_nx;
2924 			nkptp[level - 1]++;
2925 			va += nbpd[level - 1];
2926 		}
2927 	}
2928 }
2929 
2930 /*
2931  * pmap_growkernel: increase usage of KVM space
2932  *
2933  * => we allocate new PTPs for the kernel and install them in all
2934  *	the pmaps on the system.
2935  */
2936 
2937 static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS;
2938 
2939 vaddr_t
2940 pmap_growkernel(vaddr_t maxkvaddr)
2941 {
2942 	struct pmap *kpm = pmap_kernel(), *pm;
2943 	int s, i;
2944 	unsigned newpdes;
2945 	long needed_kptp[PTP_LEVELS], target_nptp, old;
2946 
2947 	if (maxkvaddr <= pmap_maxkvaddr)
2948 		return pmap_maxkvaddr;
2949 
2950 	maxkvaddr = x86_round_pdr(maxkvaddr);
2951 	old = nkptp[PTP_LEVELS - 1];
2952 	/*
2953 	 * This loop could be optimized more, but pmap_growkernel()
2954 	 * is called infrequently.
2955 	 */
2956 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
2957 		target_nptp = pl_i(maxkvaddr, i + 1) -
2958 		    pl_i(VM_MIN_KERNEL_ADDRESS, i + 1);
2959 		/*
2960 		 * XXX only need to check toplevel.
2961 		 */
2962 		if (target_nptp > nkptpmax[i])
2963 			panic("%s: out of KVA space", __func__);
2964 		needed_kptp[i] = target_nptp - nkptp[i] + 1;
2965 	}
2966 
2967 
2968 	s = splhigh();	/* to be safe */
2969 	pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
2970 
2971 	/*
2972 	 * If the number of top level entries changed, update all
2973 	 * pmaps.
2974 	 */
2975 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
2976 		newpdes = nkptp[PTP_LEVELS - 1] - old;
2977 		LIST_FOREACH(pm, &pmaps, pm_list) {
2978 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
2979 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
2980 			       newpdes * sizeof (pd_entry_t));
2981 		}
2982 	}
2983 	pmap_maxkvaddr = maxkvaddr;
2984 	splx(s);
2985 
2986 	return maxkvaddr;
2987 }
2988 
2989 vaddr_t
2990 pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end)
2991 {
2992 	int segno;
2993 	u_int npg;
2994 	vaddr_t va;
2995 	paddr_t pa;
2996 	struct vm_physseg *seg;
2997 
2998 	size = round_page(size);
2999 	npg = atop(size);
3000 
3001 	for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) {
3002 		if (seg->avail_end - seg->avail_start < npg)
3003 			continue;
3004 		/*
3005 		 * We can only steal at an ``unused'' segment boundary,
3006 		 * i.e. either at the start or at the end.
3007 		 */
3008 		if (seg->avail_start == seg->start ||
3009 		    seg->avail_end == seg->end)
3010 			break;
3011 	}
3012 	if (segno == vm_nphysseg) {
3013 		panic("%s: out of memory", __func__);
3014 	} else {
3015 		if (seg->avail_start == seg->start) {
3016 			pa = ptoa(seg->avail_start);
3017 			seg->avail_start += npg;
3018 			seg->start += npg;
3019 		} else {
3020 			pa = ptoa(seg->avail_end) - size;
3021 			seg->avail_end -= npg;
3022 			seg->end -= npg;
3023 		}
3024 		/*
3025 		 * If the whole segment has now been consumed, remove it.
3026 		 * Note that the crash dump code still knows about it
3027 		 * and will dump it correctly.
3028 		 */
3029 		if (seg->start == seg->end) {
3030 			if (vm_nphysseg-- == 1)
3031 				panic("%s: out of memory", __func__);
3032 			while (segno < vm_nphysseg) {
3033 				seg[0] = seg[1]; /* struct copy */
3034 				seg++;
3035 				segno++;
3036 			}
3037 		}
3038 
3039 		va = PMAP_DIRECT_MAP(pa);
3040 		memset((void *)va, 0, size);
3041 	}
3042 
3043 	if (start != NULL)
3044 		*start = virtual_avail;
3045 	if (end != NULL)
3046 		*end = VM_MAX_KERNEL_ADDRESS;
3047 
3048 	return (va);
3049 }
3050 
3051 void
3052 pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
3053 {
3054 	*vstartp = virtual_avail;
3055 	*vendp = VM_MAX_KERNEL_ADDRESS;
3056 }
3057 
3058 /*
3059  * pmap_convert
3060  *
3061  * Converts 'pmap' to the new 'mode'.
3062  *
3063  * Parameters:
3064  *  pmap: the pmap to convert
3065  *  mode: the new mode (see pmap.h, PMAP_TYPE_xxx)
3066  *
3067  * Return value:
3068  *  always 0
3069  */
3070 int
3071 pmap_convert(struct pmap *pmap, int mode)
3072 {
3073 	pt_entry_t *pte;
3074 
3075 	pmap->pm_type = mode;
3076 
3077 	if (mode == PMAP_TYPE_EPT) {
3078 		/* Clear PML4 */
3079 		pte = (pt_entry_t *)pmap->pm_pdir;
3080 		memset(pte, 0, PAGE_SIZE);
3081 
3082 		/* Give back the meltdown pdir */
3083 		if (pmap->pm_pdir_intel != NULL) {
3084 			pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
3085 			pmap->pm_pdir_intel = NULL;
3086 		}
3087 	}
3088 
3089 	return (0);
3090 }
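
/*
 * A minimal sketch of how a hypervisor-style caller might combine
 * pmap_convert() and pmap_enter(): once the pmap is converted to
 * PMAP_TYPE_EPT, pmap_enter() dispatches to pmap_enter_ept() and the
 * va/pa arguments are interpreted as guest-physical and host-physical
 * addresses.  The variable names are hypothetical:
 *
 *	struct pmap *example_guest_pmap;
 *	paddr_t example_gpa, example_hpa;
 *
 *	pmap_convert(example_guest_pmap, PMAP_TYPE_EPT);
 *	pmap_enter(example_guest_pmap, example_gpa, example_hpa,
 *	    PROT_READ | PROT_WRITE, 0);
 */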
3091 
3092 #ifdef MULTIPROCESSOR
3093 /*
3094  * Locking for tlb shootdown.
3095  *
3096  * We lock by setting tlb_shoot_wait to the number of cpus that will
3097  * receive our tlb shootdown. After sending the IPIs, we don't need to
3098  * worry about locking order or interrupts spinning for the lock because
3099  * the call that grabs the "lock" isn't the one that releases it. And
3100  * there is nothing that can block the IPI that releases the lock.
3101  *
3102  * The functions are organized so that we first count the number of
3103  * cpus we need to send the IPI to, then we grab the counter, then
3104  * we send the IPIs, then we finally do our own shootdown.
3105  *
3106  * Our shootdown is last to make it parallel with the other cpus
3107  * to shorten the spin time.
3108  *
3109  * Notice that we depend on failures to send IPIs only being able to
3110  * happen during boot. If they happen later, the above assumption
3111  * doesn't hold since we can end up in situations where no one will
3112  * release the lock if we get an interrupt in a bad moment.
3113  */
3114 #ifdef MP_LOCKDEBUG
3115 #include <ddb/db_output.h>
3116 extern int __mp_lock_spinout;
3117 #endif
3118 
3119 volatile long tlb_shoot_wait __attribute__((section(".kudata")));
3120 
3121 volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
3122 volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));
3123 volatile int tlb_shoot_first_pcid __attribute__((section(".kudata")));
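
/*
 * A condensed sketch of the protocol described above, seen from the
 * initiator (the responder side lives in the IPI handlers, which do
 * the invalidation and then decrement tlb_shoot_wait):
 *
 *	count the other running cpus that need the flush into wait;
 *	if (wait > 0) {
 *		spin while atomic_cas_ulong(&tlb_shoot_wait, 0, wait)
 *		    fails, i.e. until the previous shootdown drains;
 *		publish the target in tlb_shoot_addr1/tlb_shoot_addr2;
 *		send the IPIs;
 *	}
 *	perform the local invalidation;
 *	the caller later spins in pmap_tlb_shootwait() until
 *	tlb_shoot_wait drops back to zero.
 */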
3124 
3125 void
3126 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
3127 {
3128 	struct cpu_info *ci, *self = curcpu();
3129 	CPU_INFO_ITERATOR cii;
3130 	long wait = 0;
3131 	u_int64_t mask = 0;
3132 	int is_kva = va >= VM_MIN_KERNEL_ADDRESS;
3133 
3134 	CPU_INFO_FOREACH(cii, ci) {
3135 		if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
3136 			continue;
3137 		if (!is_kva && !pmap_is_active(pm, ci->ci_cpuid))
3138 			continue;
3139 		mask |= (1ULL << ci->ci_cpuid);
3140 		wait++;
3141 	}
3142 
3143 	if (wait > 0) {
3144 		int s = splvm();
3145 
3146 		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
3147 #ifdef MP_LOCKDEBUG
3148 			int nticks = __mp_lock_spinout;
3149 #endif
3150 			while (tlb_shoot_wait != 0) {
3151 				CPU_BUSY_CYCLE();
3152 #ifdef MP_LOCKDEBUG
3153 
3154 				if (--nticks <= 0) {
3155 					db_printf("%s: spun out", __func__);
3156 					db_enter();
3157 					nticks = __mp_lock_spinout;
3158 				}
3159 #endif
3160 			}
3161 		}
3162 		tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
3163 		tlb_shoot_addr1 = va;
3164 		CPU_INFO_FOREACH(cii, ci) {
3165 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3166 				continue;
3167 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
3168 				panic("%s: ipi failed", __func__);
3169 		}
3170 		splx(s);
3171 	}
3172 
3173 	if (!pmap_use_pcid) {
3174 		if (shootself)
3175 			pmap_update_pg(va);
3176 	} else if (is_kva) {
3177 		invpcid(INVPCID_ADDR, PCID_PROC, va);
3178 		invpcid(INVPCID_ADDR, PCID_KERN, va);
3179 	} else if (shootself) {
3180 		invpcid(INVPCID_ADDR, PCID_PROC, va);
3181 		if (cpu_meltdown)
3182 			invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3183 	}
3184 }
3185 
3186 void
3187 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
3188 {
3189 	struct cpu_info *ci, *self = curcpu();
3190 	CPU_INFO_ITERATOR cii;
3191 	long wait = 0;
3192 	u_int64_t mask = 0;
3193 	int is_kva = sva >= VM_MIN_KERNEL_ADDRESS;
3194 	vaddr_t va;
3195 
3196 	CPU_INFO_FOREACH(cii, ci) {
3197 		if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
3198 			continue;
3199 		if (!is_kva && !pmap_is_active(pm, ci->ci_cpuid))
3200 			continue;
3201 		mask |= (1ULL << ci->ci_cpuid);
3202 		wait++;
3203 	}
3204 
3205 	if (wait > 0) {
3206 		int s = splvm();
3207 
3208 		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
3209 #ifdef MP_LOCKDEBUG
3210 			int nticks = __mp_lock_spinout;
3211 #endif
3212 			while (tlb_shoot_wait != 0) {
3213 				CPU_BUSY_CYCLE();
3214 #ifdef MP_LOCKDEBUG
3215 
3216 				if (--nticks <= 0) {
3217 					db_printf("%s: spun out", __func__);
3218 					db_enter();
3219 					nticks = __mp_lock_spinout;
3220 				}
3221 #endif
3222 			}
3223 		}
3224 		tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
3225 		tlb_shoot_addr1 = sva;
3226 		tlb_shoot_addr2 = eva;
3227 		CPU_INFO_FOREACH(cii, ci) {
3228 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3229 				continue;
3230 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
3231 				panic("%s: ipi failed", __func__);
3232 		}
3233 		splx(s);
3234 	}
3235 
3236 	if (!pmap_use_pcid) {
3237 		if (shootself) {
3238 			for (va = sva; va < eva; va += PAGE_SIZE)
3239 				pmap_update_pg(va);
3240 		}
3241 	} else if (is_kva) {
3242 		for (va = sva; va < eva; va += PAGE_SIZE) {
3243 			invpcid(INVPCID_ADDR, PCID_PROC, va);
3244 			invpcid(INVPCID_ADDR, PCID_KERN, va);
3245 		}
3246 	} else if (shootself) {
3247 		if (cpu_meltdown) {
3248 			for (va = sva; va < eva; va += PAGE_SIZE) {
3249 				invpcid(INVPCID_ADDR, PCID_PROC, va);
3250 				invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3251 			}
3252 		} else {
3253 			for (va = sva; va < eva; va += PAGE_SIZE)
3254 				invpcid(INVPCID_ADDR, PCID_PROC, va);
3255 		}
3256 	}
3257 }
3258 
3259 void
3260 pmap_tlb_shoottlb(struct pmap *pm, int shootself)
3261 {
3262 	struct cpu_info *ci, *self = curcpu();
3263 	CPU_INFO_ITERATOR cii;
3264 	long wait = 0;
3265 	u_int64_t mask = 0;
3266 
3267 	KASSERT(pm != pmap_kernel());
3268 
3269 	CPU_INFO_FOREACH(cii, ci) {
3270 		if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) ||
3271 		    !(ci->ci_flags & CPUF_RUNNING))
3272 			continue;
3273 		mask |= (1ULL << ci->ci_cpuid);
3274 		wait++;
3275 	}
3276 
3277 	if (wait) {
3278 		int s = splvm();
3279 
3280 		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
3281 #ifdef MP_LOCKDEBUG
3282 			int nticks = __mp_lock_spinout;
3283 #endif
3284 			while (tlb_shoot_wait != 0) {
3285 				CPU_BUSY_CYCLE();
3286 #ifdef MP_LOCKDEBUG
3287 
3288 				if (--nticks <= 0) {
3289 					db_printf("%s: spun out", __func__);
3290 					db_enter();
3291 					nticks = __mp_lock_spinout;
3292 				}
3293 #endif
3294 			}
3295 		}
3296 
3297 		CPU_INFO_FOREACH(cii, ci) {
3298 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3299 				continue;
3300 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
3301 				panic("%s: ipi failed", __func__);
3302 		}
3303 		splx(s);
3304 	}
3305 
3306 	if (shootself) {
3307 		if (!pmap_use_pcid)
3308 			tlbflush();
3309 		else {
3310 			invpcid(INVPCID_PCID, PCID_PROC, 0);
3311 			if (cpu_meltdown)
3312 				invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
3313 		}
3314 	}
3315 }
3316 
3317 void
3318 pmap_tlb_shootwait(void)
3319 {
3320 #ifdef MP_LOCKDEBUG
3321 	int nticks = __mp_lock_spinout;
3322 #endif
3323 	while (tlb_shoot_wait != 0) {
3324 		CPU_BUSY_CYCLE();
3325 #ifdef MP_LOCKDEBUG
3326 		if (--nticks <= 0) {
3327 			db_printf("%s: spun out", __func__);
3328 			db_enter();
3329 			nticks = __mp_lock_spinout;
3330 		}
3331 #endif
3332 	}
3333 }
3334 
3335 #else /* MULTIPROCESSOR */
3336 
3337 void
3338 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
3339 {
3340 	if (!pmap_use_pcid) {
3341 		if (shootself)
3342 			pmap_update_pg(va);
3343 	} else if (va >= VM_MIN_KERNEL_ADDRESS) {
3344 		invpcid(INVPCID_ADDR, PCID_PROC, va);
3345 		invpcid(INVPCID_ADDR, PCID_KERN, va);
3346 	} else if (shootself) {
3347 		invpcid(INVPCID_ADDR, PCID_PROC, va);
3348 		if (cpu_meltdown)
3349 			invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3350 	}
3351 }
3352 
3353 void
3354 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
3355 {
3356 	vaddr_t va;
3357 
3358 	if (!pmap_use_pcid) {
3359 		if (shootself) {
3360 			for (va = sva; va < eva; va += PAGE_SIZE)
3361 				pmap_update_pg(va);
3362 		}
3363 	} else if (sva >= VM_MIN_KERNEL_ADDRESS) {
3364 		for (va = sva; va < eva; va += PAGE_SIZE) {
3365 			invpcid(INVPCID_ADDR, PCID_PROC, va);
3366 			invpcid(INVPCID_ADDR, PCID_KERN, va);
3367 		}
3368 	} else if (shootself) {
3369 		if (cpu_meltdown) {
3370 			for (va = sva; va < eva; va += PAGE_SIZE) {
3371 				invpcid(INVPCID_ADDR, PCID_PROC, va);
3372 				invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3373 			}
3374 		} else {
3375 			for (va = sva; va < eva; va += PAGE_SIZE)
3376 				invpcid(INVPCID_ADDR, PCID_PROC, va);
3377 		}
3378 	}
3379 }
3380 
3381 void
3382 pmap_tlb_shoottlb(struct pmap *pm, int shootself)
3383 {
3384 	if (shootself) {
3385 		if (!pmap_use_pcid)
3386 			tlbflush();
3387 		else {
3388 			invpcid(INVPCID_PCID, PCID_PROC, 0);
3389 			if (cpu_meltdown)
3390 				invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
3391 		}
3392 	}
3393 }
3394 #endif /* MULTIPROCESSOR */
3395