1 /*	$OpenBSD: pmap.c,v 1.99 2016/06/07 06:23:19 dlg Exp $	*/
2 /*	$NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $	*/
3 
4 /*
5  * Copyright (c) 1997 Charles D. Cranor and Washington University.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright 2001 (c) Wasabi Systems, Inc.
31  * All rights reserved.
32  *
33  * Written by Frank van der Linden for Wasabi Systems, Inc.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgement:
45  *      This product includes software developed for the NetBSD Project by
46  *      Wasabi Systems, Inc.
47  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
48  *    or promote products derived from this software without specific prior
49  *    written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
53  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
54  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
55  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
56  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
57  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
58  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
59  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
60  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
61  * POSSIBILITY OF SUCH DAMAGE.
62  */
63 
64 /*
65  * This is the i386 pmap modified and generalized to support x86-64
66  * as well. The idea is to hide the upper N levels of the page tables
67  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
68  * is mostly untouched, except that it uses some more generalized
69  * macros and interfaces.
70  *
71  * This pmap has been tested on the i386 as well, and it can be easily
72  * adapted to PAE.
73  *
74  * fvdl@wasabisystems.com 18-Jun-2001
75  */
76 
77 /*
78  * pmap.c: i386 pmap module rewrite
79  * Chuck Cranor <chuck@ccrc.wustl.edu>
80  * 11-Aug-97
81  *
82  * history of this pmap module: in addition to my own input, i used
83  *    the following references for this rewrite of the i386 pmap:
84  *
85  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
86  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
87  *     it was then ported to the i386 by William Jolitz of UUNET
88  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
89  *     project fixed some bugs and provided some speed ups.
90  *
91  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
92  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
93  *     and David Greenman.
94  *
95  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
96  *     between several processors.   the VAX version was done by
97  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
98  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
99  *     David Golub, and Richard Draves.    the alpha version was
100  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
101  *     (NetBSD/alpha).
102  */
103 
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/atomic.h>
107 #include <sys/proc.h>
108 #include <sys/malloc.h>
109 #include <sys/pool.h>
110 #include <sys/user.h>
111 #include <sys/kernel.h>
112 #include <sys/mutex.h>
113 #include <sys/sched.h>
114 
115 #include <uvm/uvm.h>
116 
117 #include <machine/lock.h>
118 #include <machine/cpu.h>
119 #include <machine/specialreg.h>
120 #ifdef MULTIPROCESSOR
121 #include <machine/i82489reg.h>
122 #include <machine/i82489var.h>
123 #endif
124 
125 
126 #include <machine/isa_machdep.h>
127 
128 #include "acpi.h"
129 
130 /*
131  * general info:
132  *
133  *  - for an explanation of how the i386 MMU hardware works see
134  *    the comments in <machine/pte.h>.
135  *
136  *  - for an explanation of the general memory structure used by
137  *    this pmap (including the recursive mapping), see the comments
138  *    in <machine/pmap.h>.
139  *
140  * this file contains the code for the "pmap module."   the module's
141  * job is to manage the hardware's virtual to physical address mappings.
142  * note that there are two levels of mapping in the VM system:
143  *
144  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
145  *      to map ranges of virtual address space to objects/files.  for
146  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
147  *      to the file /bin/ls starting at offset zero."   note that
148  *      the upper layer mapping is not concerned with how individual
149  *      vm_pages are mapped.
150  *
151  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
152  *      from virtual addresses.   it is concerned with which vm_page is
153  *      mapped where.   for example, when you run /bin/ls and start
154  *      at page 0x1000 the fault routine may look up the correct page
155  *      of the /bin/ls file and then ask the pmap layer to establish
156  *      a mapping for it.
157  *
158  * note that information in the lower layer of the VM system can be
159  * thrown away since it can easily be reconstructed from the info
160  * in the upper layer.
161  *
162  * data structures we use include:
163  *  - struct pmap: describes the address space of one process
164  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
165  *  - struct pg_to_free: a list of page table pages whose freeing is
166  *	deferred until the TLB shootdown for them has completed.
167  */
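
/*
 * Illustrative sketch (added, not from the original source): walking the
 * pv list of a managed page.  Each pv_entry records one <PMAP,VA>
 * mapping of the page; the list head lives in the page's mdpage and is
 * protected by pv_mtx (see pmap_enter_pv()/pmap_remove_pv() below).
 *
 *	struct pv_entry *pve;
 *
 *	mtx_enter(&pg->mdpage.pv_mtx);
 *	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next)
 *		printf("mapped at va 0x%lx in pmap %p\n",
 *		    pve->pv_va, pve->pv_pmap);
 *	mtx_leave(&pg->mdpage.pv_mtx);
 */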
168 
169 /*
170  * memory allocation
171  *
172  *  - there are three data structures that we must dynamically allocate:
173  *
174  * [A] new process' page directory page (PDP)
175  *	- plan 1: done at pmap_create() we use
176  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
177  *	  allocation.
178  *
179  * if we are low in free physical memory then we sleep in
180  * uvm_km_alloc -- in this case this is ok since we are creating
181  * a new pmap and should not be holding any locks.
182  *
183  * if the kernel is totally out of virtual space
184  * (i.e. uvm_km_alloc returns NULL), then we panic.
185  *
186  * XXX: the fork code currently has no way to return an "out of
187  * memory, try again" error code since uvm_fork [fka vm_fork]
188  * is a void function.
189  *
190  * [B] new page tables pages (PTP)
191  * 	call uvm_pagealloc()
192  * 		=> success: zero page, add to pm_pdir
193  * 		=> failure: we are out of free vm_pages, let pmap_enter()
194  *		   tell UVM about it.
195  *
196  * note: for kernel PTPs, we start with NKPTP of them.   as we map
197  * kernel memory (at uvm_map time) we check to see if we've grown
198  * the kernel pmap.   if so, we call the optional function
199  * pmap_growkernel() to grow the kernel PTPs in advance.
200  *
201  * [C] pv_entry structures
202  *	- try to allocate one from the pool.
203  *	If we fail, we simply let pmap_enter() tell UVM about it.
204  */
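
/*
 * Illustrative sketch (added) of strategy [C] above, matching what
 * pmap_enter() does further down: grab a pv_entry from pmap_pv_pool
 * without sleeping and let the caller decide what to do when the pool
 * is empty (PMAP_CANFAIL lets UVM retry later).
 *
 *	struct pv_entry *pve;
 *
 *	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
 *	if (pve == NULL) {
 *		if (flags & PMAP_CANFAIL)
 *			return (ENOMEM);
 *		panic("no pv entries available");
 *	}
 */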
205 
206 vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
207 int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
208 long nkptp[] = NKPTP_INITIALIZER;
209 long nkptpmax[] = NKPTPMAX_INITIALIZER;
210 long nbpd[] = NBPD_INITIALIZER;
211 pd_entry_t *normal_pdes[] = PDES_INITIALIZER;
212 
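/*
 * Note added for clarity: PTE updates below go through atomic operations
 * because the MMU can set PG_U/PG_M in a PTE concurrently with a
 * software update.  pmap_pte_set() swaps in a new PTE and returns the
 * previous contents, which is what the "save the old PTE and zap it"
 * idiom used throughout this file relies on.
 */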
213 #define pmap_pte_set(p, n)		atomic_swap_64(p, n)
214 #define pmap_pte_clearbits(p, b)	x86_atomic_clearbits_u64(p, b)
215 #define pmap_pte_setbits(p, b)		x86_atomic_setbits_u64(p, b)
216 
217 /*
218  * global data structures
219  */
220 
221 struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
222 
223 /*
224  * pmap_pg_wc: if our processor supports PAT then we set this
225  * to be the pte bits for Write Combining. Otherwise we fall back to
226  * UC- so the MTRRs can override the cacheability.
227  */
228 int pmap_pg_wc = PG_UCMINUS;
229 
230 /*
231  * other data structures
232  */
233 
234 pt_entry_t protection_codes[8];     /* maps MI prot to i386 prot code */
235 boolean_t pmap_initialized = FALSE; /* pmap_init done yet? */
236 
237 /*
238  * pv management structures.
239  */
240 struct pool pmap_pv_pool;
241 
242 /*
243  * linked list of all non-kernel pmaps
244  */
245 
246 struct pmap_head pmaps;
247 
248 /*
249  * pool that pmap structures are allocated from
250  */
251 
252 struct pool pmap_pmap_pool;
253 
254 /*
255  * When we're freeing a ptp, we need to delay the freeing until all
256  * tlb shootdown has been done. This is the list of the to-be-freed pages.
257  */
258 TAILQ_HEAD(pg_to_free, vm_page);
259 
260 /*
261  * pool that PDPs are allocated from
262  */
263 
264 struct pool pmap_pdp_pool;
265 void pmap_pdp_ctor(pd_entry_t *);
266 
267 extern vaddr_t msgbuf_vaddr;
268 extern paddr_t msgbuf_paddr;
269 
270 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
271 extern paddr_t idt_paddr;
272 
273 extern vaddr_t lo32_vaddr;
274 extern vaddr_t lo32_paddr;
275 
276 vaddr_t virtual_avail;
277 extern int end;
278 
279 /*
280  * local prototypes
281  */
282 
283 void pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *,
284     vaddr_t, struct vm_page *);
285 struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, pd_entry_t **);
286 struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
287 int pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs);
288 void pmap_free_ptp(struct pmap *, struct vm_page *,
289     vaddr_t, pt_entry_t *, pd_entry_t **, struct pg_to_free *);
290 void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *);
291 static boolean_t pmap_is_active(struct pmap *, int);
292 void pmap_map_ptes(struct pmap *, pt_entry_t **, pd_entry_t ***, paddr_t *);
293 struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t);
294 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
295 boolean_t pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
296     vaddr_t, int, struct pv_entry **);
297 void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t,
298     vaddr_t, vaddr_t, int, struct pv_entry **);
299 #define PMAP_REMOVE_ALL		0	/* remove all mappings */
300 #define PMAP_REMOVE_SKIPWIRED	1	/* skip wired mappings */
301 
302 void pmap_unmap_ptes(struct pmap *, paddr_t);
303 boolean_t pmap_get_physpage(vaddr_t, int, paddr_t *);
304 boolean_t pmap_pdes_valid(vaddr_t, pd_entry_t **, pd_entry_t *);
305 void pmap_alloc_level(pd_entry_t **, vaddr_t, int, long *);
306 
307 void pmap_sync_flags_pte(struct vm_page *, u_long);
308 
309 void pmap_tlb_shootpage(struct pmap *, vaddr_t, int);
310 void pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t, int);
311 void pmap_tlb_shoottlb(struct pmap *, int);
312 #ifdef MULTIPROCESSOR
313 void pmap_tlb_shootwait(void);
314 #else
315 #define	pmap_tlb_shootwait()
316 #endif
317 
318 
319 /*
320  * p m a p   i n l i n e   h e l p e r   f u n c t i o n s
321  */
322 
323 /*
324  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
325  *		of course the kernel is always loaded
326  */
327 
328 static __inline boolean_t
329 pmap_is_curpmap(struct pmap *pmap)
330 {
331 	return((pmap == pmap_kernel()) ||
332 	       (pmap->pm_pdirpa == (paddr_t) rcr3()));
333 }
334 
335 /*
336  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
337  */
338 
339 static __inline boolean_t
340 pmap_is_active(struct pmap *pmap, int cpu_id)
341 {
342 	return (pmap == pmap_kernel() ||
343 	    (pmap->pm_cpus & (1ULL << cpu_id)) != 0);
344 }
345 
346 static __inline u_int
347 pmap_pte2flags(u_long pte)
348 {
349 	return (((pte & PG_U) ? PG_PMAP_REF : 0) |
350 	    ((pte & PG_M) ? PG_PMAP_MOD : 0));
351 }
352 
353 void
354 pmap_sync_flags_pte(struct vm_page *pg, u_long pte)
355 {
356 	if (pte & (PG_U|PG_M)) {
357 		atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte));
358 	}
359 }
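
/*
 * Worked example (added): a PTE with both PG_U and PG_M set makes
 * pmap_pte2flags() return PG_PMAP_REF|PG_PMAP_MOD, so
 * pmap_sync_flags_pte() marks the vm_page as both referenced and
 * modified in pg_flags; a PTE with neither bit set is a no-op.
 */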
360 
361 /*
362  * pmap_map_ptes: map a pmap's PTEs into KVM
363  */
364 
365 void
366 pmap_map_ptes(struct pmap *pmap, pt_entry_t **ptepp, pd_entry_t ***pdeppp, paddr_t *save_cr3)
367 {
368 	paddr_t cr3 = rcr3();
369 
370 	/* the kernel's pmap is always accessible */
371 	if (pmap == pmap_kernel() || pmap->pm_pdirpa == cr3) {
372 		*save_cr3 = 0;
373 	} else {
374 		*save_cr3 = cr3;
375 
376 		/*
377 		 * Not sure if we need this, but better be safe.
378 		 * We don't have the current pmap in order to unset its
379 		 * active bit, but this just means that we may receive
380 		 * an unnecessary cross-CPU TLB flush now and then.
381 		 */
382 		x86_atomic_setbits_u64(&pmap->pm_cpus, (1ULL << cpu_number()));
383 
384 		lcr3(pmap->pm_pdirpa);
385 	}
386 
387 	if (pmap != pmap_kernel())
388 		mtx_enter(&pmap->pm_mtx);
389 
390 	*ptepp = PTE_BASE;
391 	*pdeppp = normal_pdes;
392 }
393 
394 void
395 pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
396 {
397 	if (pmap != pmap_kernel())
398 		mtx_leave(&pmap->pm_mtx);
399 
400 	if (save_cr3 != 0) {
401 		x86_atomic_clearbits_u64(&pmap->pm_cpus, (1ULL << cpu_number()));
402 		lcr3(save_cr3);
403 	}
404 }
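
/*
 * Usage sketch (added for illustration; pmap_do_remove() below shows the
 * real pattern): pmap_map_ptes()/pmap_unmap_ptes() bracket any access to
 * a pmap's PTEs through the recursive mapping.  A zero save_cr3 means
 * the pmap was already loaded, which callers also use as their
 * "shootself" hint for TLB shootdowns.
 *
 *	pt_entry_t *ptes;
 *	pd_entry_t **pdes;
 *	paddr_t scr3;
 *
 *	pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
 *	... inspect or modify ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, scr3);
 */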
405 
406 /*
407  * pmap_fix_ept
408  *
409  * Fixes up an EPT PTE for vaddr 'va' by reconfiguring the low bits to
410  * conform to the EPT format (separate R/W/X bits and various "must be
411  * 0 bits")
412  *
413  * Parameters:
414  *  pm: The pmap in question
415  *  va: The VA to fix up
416  */
417 void
418 pmap_fix_ept(struct pmap *pm, vaddr_t va)
419 {
420 	u_long mask, shift;
421 	pd_entry_t pde, *pd;
422 	paddr_t pdpa;
423 	int lev, offs;
424 
425 	pdpa = pm->pm_pdirpa;
426 	shift = L4_SHIFT;
427 	mask = L4_MASK;
428 	for (lev = PTP_LEVELS; lev > 0; lev--) {
429 		pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa);
430 		offs = (VA_SIGN_POS(va) & mask) >> shift;
431 
432 		pd[offs] |= EPT_R | EPT_W | EPT_X;
433 		/*
434 		 * Levels 3-4 have bits 3:7 'must be 0'
435 		 * Level 2 has bits 3:6 'must be 0', and bit 7 is always
436 		 * 0 in our EPT format (thus, bits 3:7 == 0)
437 		 */
438 		switch (lev) {
439 		case 4:
440 		case 3:
441 		case 2:
442 			/* Bits 3:7 = 0 */
443 			pd[offs] &= ~(0xF8);
444 			break;
445 		case 1: pd[offs] |= EPT_WB;
446 			break;
447 		}
448 
449 		pde = pd[offs];
450 
451 		/* EPT entries here must be present and must not be large pages. */
452 		if ((pde & (PG_PS|PG_V)) != PG_V)
453 			panic("pmap_fix_ept: large page in EPT");
454 
455 		pdpa = (pd[offs] & PG_FRAME);
456 		/* 4096/8 == 512 == 2^9 entries per level */
457 		shift -= 9;
458 		mask >>= 9;
459 	}
460 }
461 
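/*
 * pmap_find_pte_direct: walk the page tables through the direct map.
 * (This comment block is an added clarification, not from the original.)
 *
 * Returns 0 when the walk reached the level 1 PTE slot (callers still
 * check its validity), otherwise "level - 1" for the level at which it
 * stopped on a non-present or large-page entry.  In all cases *pd and
 * *offs point at the last table and index that were examined, so a
 * return of 0 means the PTE of interest is (*pd)[*offs], and a return
 * of 1 with PG_PS set means a 2MB large page (see pmap_extract()).
 */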
462 int
463 pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs)
464 {
465 	u_long mask, shift;
466 	pd_entry_t pde;
467 	paddr_t pdpa;
468 	int lev;
469 
470 	pdpa = pm->pm_pdirpa;
471 	shift = L4_SHIFT;
472 	mask = L4_MASK;
473 	for (lev = PTP_LEVELS; lev > 0; lev--) {
474 		*pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa);
475 		*offs = (VA_SIGN_POS(va) & mask) >> shift;
476 		pde = (*pd)[*offs];
477 
478 		/* Large pages are different, break early if we run into one. */
479 		if ((pde & (PG_PS|PG_V)) != PG_V)
480 			return (lev - 1);
481 
482 		pdpa = ((*pd)[*offs] & PG_FRAME);
483 		/* 4096/8 == 512 == 2^9 entries per level */
484 		shift -= 9;
485 		mask >>= 9;
486 	}
487 
488 	return (0);
489 }
490 
491 
492 /*
493  * p m a p   k e n t e r   f u n c t i o n s
494  *
495  * functions to quickly enter/remove pages from the kernel address
496  * space.   pmap_kremove is exported to MI kernel.  we make use of
497  * the recursive PTE mappings.
498  */
499 
500 /*
501  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
502  *
503  * => no need to lock anything, assume va is already allocated
504  * => should be faster than normal pmap enter function
505  */
506 
507 void
508 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
509 {
510 	pt_entry_t *pte, opte, npte;
511 
512 	pte = kvtopte(va);
513 
514 	npte = (pa & PMAP_PA_MASK) | ((prot & PROT_WRITE) ? PG_RW : PG_RO) |
515 	    ((pa & PMAP_NOCACHE) ? PG_N : 0) |
516 	    ((pa & PMAP_WC) ? pmap_pg_wc : 0) | PG_V;
517 
518 	/* special 1:1 mappings in the first 2MB must not be global */
519 	if (va >= (vaddr_t)NBPD_L2)
520 		npte |= PG_G;
521 
522 	if (!(prot & PROT_EXEC))
523 		npte |= pg_nx;
524 	opte = pmap_pte_set(pte, npte);
525 #ifdef LARGEPAGES
526 	/* XXX For now... */
527 	if (opte & PG_PS)
528 		panic("%s: PG_PS", __func__);
529 #endif
530 	if (pmap_valid_entry(opte)) {
531 		if (pa & PMAP_NOCACHE && (opte & PG_N) == 0)
532 			wbinvd();
533 		/* This shouldn't happen */
534 		pmap_tlb_shootpage(pmap_kernel(), va, 1);
535 		pmap_tlb_shootwait();
536 	}
537 }
538 
539 /*
540  * pmap_kremove: remove kernel mapping(s) without R/M (pv_entry) tracking
541  *
542  * => no need to lock anything
543  * => caller must dispose of any vm_page mapped in the va range
544  * => note: not an inline function
545  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
546  * => we assume kernel only unmaps valid addresses and thus don't bother
547  *    checking the valid bit before doing TLB flushing
548  */
549 
550 void
551 pmap_kremove(vaddr_t sva, vsize_t len)
552 {
553 	pt_entry_t *pte, opte;
554 	vaddr_t va, eva;
555 
556 	eva = sva + len;
557 
558 	for (va = sva; va != eva; va += PAGE_SIZE) {
559 		pte = kvtopte(va);
560 
561 		opte = pmap_pte_set(pte, 0);
562 #ifdef LARGEPAGES
563 		KASSERT((opte & PG_PS) == 0);
564 #endif
565 		KASSERT((opte & PG_PVLIST) == 0);
566 	}
567 
568 	pmap_tlb_shootrange(pmap_kernel(), sva, eva, 1);
569 	pmap_tlb_shootwait();
570 }
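
/*
 * Usage sketch (added for illustration; "va" is assumed to have been
 * allocated by the caller, e.g. with km_alloc(), and "pa" is the
 * physical page to expose): map a page uncached into kernel virtual
 * address space and tear the mapping down again.
 *
 *	pmap_kenter_pa(va, pa | PMAP_NOCACHE, PROT_READ | PROT_WRITE);
 *	... access the page through va ...
 *	pmap_kremove(va, PAGE_SIZE);
 */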
571 
572 /*
573  * p m a p   i n i t   f u n c t i o n s
574  *
575  * pmap_bootstrap and pmap_init are called during system startup
576  * to init the pmap module.   pmap_bootstrap() does a low level
577  * init just to get things rolling.   pmap_init() finishes the job.
578  */
579 
580 /*
581  * pmap_bootstrap: get the system in a state where it can run with VM
582  *	properly enabled (called before main()).   the VM system is
583  *      fully init'd later...
584  *
585  * => on i386, locore.s has already enabled the MMU by allocating
586  *	a PDP for the kernel, and nkpde PTP's for the kernel.
587  * => kva_start is the first free virtual address in kernel space
588  */
589 
590 paddr_t
591 pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
592 {
593 	vaddr_t kva, kva_end, kva_start = VM_MIN_KERNEL_ADDRESS;
594 	struct pmap *kpm;
595 	int i;
596 	unsigned long p1i;
597 	long ndmpdp;
598 	paddr_t dmpd, dmpdp;
599 
600 	/*
601 	 * define the boundaries of the managed kernel virtual address
602 	 * space.
603 	 */
604 
605 	virtual_avail = kva_start;		/* first free KVA */
606 
607 	/*
608 	 * set up protection_codes: we need to be able to convert from
609 	 * a MI protection code (some combo of VM_PROT...) to something
610 	 * we can jam into an i386 PTE.
611 	 */
612 
613 	protection_codes[PROT_NONE] = pg_nx;			/* --- */
614 	protection_codes[PROT_EXEC] = PG_RO;			/* --x */
615 	protection_codes[PROT_READ] = PG_RO | pg_nx;		/* -r- */
616 	protection_codes[PROT_READ | PROT_EXEC] = PG_RO;	/* -rx */
617 	protection_codes[PROT_WRITE] = PG_RW | pg_nx;		/* w-- */
618 	protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW;	/* w-x */
619 	protection_codes[PROT_WRITE | PROT_READ] = PG_RW | pg_nx; /* wr- */
620 	protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW;	/* wrx */
621 
622 	/*
623 	 * now we init the kernel's pmap
624 	 *
625 	 * the kernel pmap's pm_obj is not used for much.   however, in
626 	 * user pmaps the pm_obj contains the list of active PTPs.
627 	 * the pm_obj currently does not have a pager.   it might be possible
628 	 * to add a pager that would allow a process to read-only mmap its
629 	 * own page tables (fast user level vtophys?).   this may or may not
630 	 * be useful.
631 	 */
632 
633 	kpm = pmap_kernel();
634 	for (i = 0; i < PTP_LEVELS - 1; i++) {
635 		uvm_objinit(&kpm->pm_obj[i], NULL, 1);
636 		kpm->pm_ptphint[i] = NULL;
637 	}
638 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
639 	kpm->pm_pdir = (pd_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE);
640 	kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
641 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
642 		atop(kva_start - VM_MIN_KERNEL_ADDRESS);
643 
644 	kpm->pm_type = PMAP_TYPE_NORMAL;
645 
646 	/*
647 	 * the above is just a rough estimate and not critical to the proper
648 	 * operation of the system.
649 	 */
650 
651 	curpcb->pcb_pmap = kpm;	/* proc0's pcb */
652 
653 	/*
654 	 * enable global TLB entries.
655 	 */
656 	/* add PG_G attribute to already mapped kernel pages */
657 #if KERNBASE == VM_MIN_KERNEL_ADDRESS
658 	for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
659 #else
660 	kva_end = roundup((vaddr_t)&end, PAGE_SIZE);
661 	for (kva = KERNBASE; kva < kva_end ;
662 #endif
663 	     kva += PAGE_SIZE) {
664 		p1i = pl1_i(kva);
665 		if (pmap_valid_entry(PTE_BASE[p1i]))
666 			PTE_BASE[p1i] |= PG_G;
667 	}
668 
669 	/*
670 	 * Map the direct map. The first 4GB were mapped in locore, here
671 	 * we map the rest if it exists. We actually use the direct map
672 	 * here to set up the page tables; we're assuming that we're still
673 	 * operating in the lower 4GB of memory.
674 	 */
675 	ndmpdp = (max_pa + NBPD_L3 - 1) >> L3_SHIFT;
676 	if (ndmpdp < NDML2_ENTRIES)
677 		ndmpdp = NDML2_ENTRIES;		/* At least 4GB */
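	/*
	 * Worked example (comment added for clarity): each L3 slot covers
	 * NBPD_L3 = 1GB of physical memory, so a machine with max_pa = 16GB
	 * gets ndmpdp = 16; anything under 4GB is rounded up to
	 * NDML2_ENTRIES.
	 */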
678 
679 	dmpdp = kpm->pm_pdir[PDIR_SLOT_DIRECT] & PG_FRAME;
680 
681 	dmpd = first_avail; first_avail += ndmpdp * PAGE_SIZE;
682 
683 	for (i = NDML2_ENTRIES; i < NPDPG * ndmpdp; i++) {
684 		paddr_t pdp;
685 		vaddr_t va;
686 
687 		pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
688 		va = PMAP_DIRECT_MAP(pdp);
689 
690 		*((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT);
691 		*((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | PG_G | PG_U |
692 		    PG_M | pg_nx;
693 	}
694 
695 	for (i = NDML2_ENTRIES; i < ndmpdp; i++) {
696 		paddr_t pdp;
697 		vaddr_t va;
698 
699 		pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
700 		va = PMAP_DIRECT_MAP(pdp);
701 
702 		*((pd_entry_t *)va) = dmpd + (i << PAGE_SHIFT);
703 		*((pd_entry_t *)va) |= PG_RW | PG_V | PG_U | PG_M | pg_nx;
704 	}
705 
706 	kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_V | PG_KW | PG_U |
707 	    PG_M | pg_nx;
708 
709 	tlbflush();
710 
711 	msgbuf_vaddr = virtual_avail;
712 	virtual_avail += round_page(MSGBUFSIZE);
713 
714 	idt_vaddr = virtual_avail;
715 	virtual_avail += 2 * PAGE_SIZE;
716 	idt_paddr = first_avail;			/* steal a page */
717 	first_avail += 2 * PAGE_SIZE;
718 
719 #if defined(MULTIPROCESSOR) || \
720     (NACPI > 0 && !defined(SMALL_KERNEL))
721 	/*
722 	 * Grab a page below 4G for things that need it (i.e.
723 	 * having an initial %cr3 for the MP trampoline).
724 	 */
725 	lo32_vaddr = virtual_avail;
726 	virtual_avail += PAGE_SIZE;
727 	lo32_paddr = first_avail;
728 	first_avail += PAGE_SIZE;
729 #endif
730 
731 	/*
732 	 * init the global lists.
733 	 */
734 	LIST_INIT(&pmaps);
735 
736 	/*
737 	 * initialize the pmap pool.
738 	 */
739 
740 	pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, 0, 0,
741 	    "pmappl", NULL);
742 	pool_setipl(&pmap_pmap_pool, IPL_NONE);
743 	pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, 0, 0, "pvpl",
744 	    &pool_allocator_single);
745 	pool_setipl(&pmap_pv_pool, IPL_VM);
746 	pool_sethiwat(&pmap_pv_pool, 32 * 1024);
747 
748 	/*
749 	 * initialize the PDE pool.
750 	 */
751 
752 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, PR_WAITOK, "pdppl", NULL);
753 	pool_setipl(&pmap_pdp_pool, IPL_NONE);
754 
755 	/*
756 	 * ensure the TLB is sync'd with reality by flushing it...
757 	 */
758 
759 	tlbflush();
760 
761 	return first_avail;
762 }
763 
764 /*
765  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
766  * trampoline code can be entered.
767  */
768 paddr_t
769 pmap_prealloc_lowmem_ptps(paddr_t first_avail)
770 {
771 	pd_entry_t *pdes;
772 	int level;
773 	paddr_t newp;
774 
775 	pdes = pmap_kernel()->pm_pdir;
776 	level = PTP_LEVELS;
777 	for (;;) {
778 		newp = first_avail; first_avail += PAGE_SIZE;
779 		memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
780 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
781 		level--;
782 		if (level <= 1)
783 			break;
784 		pdes = normal_pdes[level - 2];
785 	}
786 
787 	return first_avail;
788 }
789 
790 /*
791  * pmap_init: called from uvm_init, our job is to get the pmap
792  * system ready to manage mappings... this mainly means initing
793  * the pv_entry stuff.
794  */
795 
796 void
797 pmap_init(void)
798 {
799 	/*
800 	 * done: pmap module is up (and ready for business)
801 	 */
802 
803 	pmap_initialized = TRUE;
804 }
805 
806 /*
807  * p v _ e n t r y   f u n c t i o n s
808  */
809 
810 /*
811  * main pv_entry manipulation functions:
812  *   pmap_enter_pv: enter a mapping onto a pv list
813  *   pmap_remove_pv: remove a mapping from a pv list
814  */
815 
816 /*
817  * pmap_enter_pv: enter a mapping onto a pv list
818  *
819  * => caller should adjust ptp's wire_count before calling
820  *
821  * pve: preallocated pve for us to use
822  * ptp: PTP in pmap that maps this VA
823  */
824 
825 void
826 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap,
827     vaddr_t va, struct vm_page *ptp)
828 {
829 	pve->pv_pmap = pmap;
830 	pve->pv_va = va;
831 	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
832 	mtx_enter(&pg->mdpage.pv_mtx);
833 	pve->pv_next = pg->mdpage.pv_list;	/* add to ... */
834 	pg->mdpage.pv_list = pve;		/* ... list */
835 	mtx_leave(&pg->mdpage.pv_mtx);
836 }
837 
838 /*
839  * pmap_remove_pv: try to remove a mapping from a pv_list
840  *
841  * => caller should adjust ptp's wire_count and free PTP if needed
842  * => we return the removed pve
843  */
844 
845 struct pv_entry *
846 pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va)
847 {
848 	struct pv_entry *pve, **prevptr;
849 
850 	mtx_enter(&pg->mdpage.pv_mtx);
851 	prevptr = &pg->mdpage.pv_list;
852 	while ((pve = *prevptr) != NULL) {
853 		if (pve->pv_pmap == pmap && pve->pv_va == va) {	/* match? */
854 			*prevptr = pve->pv_next;		/* remove it! */
855 			break;
856 		}
857 		prevptr = &pve->pv_next;		/* previous pointer */
858 	}
859 	mtx_leave(&pg->mdpage.pv_mtx);
860 	return(pve);				/* return removed pve */
861 }
862 
863 /*
864  * p t p   f u n c t i o n s
865  */
866 
867 struct vm_page *
868 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
869 {
870 	int lidx = level - 1;
871 	struct vm_page *pg;
872 
873 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
874 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx]))
875 		return (pmap->pm_ptphint[lidx]);
876 
877 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
878 
879 	return pg;
880 }
881 
882 void
883 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level,
884     struct pg_to_free *pagelist)
885 {
886 	int lidx;
887 	struct uvm_object *obj;
888 
889 	lidx = level - 1;
890 
891 	obj = &pmap->pm_obj[lidx];
892 	pmap->pm_stats.resident_count--;
893 	if (pmap->pm_ptphint[lidx] == ptp)
894 		pmap->pm_ptphint[lidx] = RB_ROOT(&obj->memt);
895 	ptp->wire_count = 0;
896 	uvm_pagerealloc(ptp, NULL, 0);
897 	TAILQ_INSERT_TAIL(pagelist, ptp, pageq);
898 }
899 
900 void
901 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
902     pt_entry_t *ptes, pd_entry_t **pdes, struct pg_to_free *pagelist)
903 {
904 	unsigned long index;
905 	int level;
906 	vaddr_t invaladdr;
907 	pd_entry_t opde;
908 
909 	level = 1;
910 	do {
911 		pmap_freepage(pmap, ptp, level, pagelist);
912 		index = pl_i(va, level + 1);
913 		opde = pmap_pte_set(&pdes[level - 1][index], 0);
914 		invaladdr = level == 1 ? (vaddr_t)ptes :
915 		    (vaddr_t)pdes[level - 2];
916 		pmap_tlb_shootpage(curpcb->pcb_pmap,
917 		    invaladdr + index * PAGE_SIZE,
918 		    pmap_is_curpmap(curpcb->pcb_pmap));
919 #if defined(MULTIPROCESSOR)
920 		invaladdr = level == 1 ? (vaddr_t)PTE_BASE :
921 		    (vaddr_t)normal_pdes[level - 2];
922 		pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE,
923 		    pmap_is_curpmap(curpcb->pcb_pmap));
924 #endif
925 		if (level < PTP_LEVELS - 1) {
926 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
927 			ptp->wire_count--;
928 			if (ptp->wire_count > 1)
929 				break;
930 		}
931 	} while (++level < PTP_LEVELS);
932 }
933 
934 /*
935  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
936  *
937  * => pmap should NOT be pmap_kernel()
938  */
939 
940 
941 struct vm_page *
942 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes)
943 {
944 	struct vm_page *ptp, *pptp;
945 	int i;
946 	unsigned long index;
947 	pd_entry_t *pva;
948 	paddr_t ppa, pa;
949 	struct uvm_object *obj;
950 
951 	ptp = NULL;
952 	pa = (paddr_t)-1;
953 
954 	/*
955 	 * Loop through all page table levels seeing if we need to
956 	 * add a new page to that level.
957 	 */
958 	for (i = PTP_LEVELS; i > 1; i--) {
959 		/*
960 		 * Save values from previous round.
961 		 */
962 		pptp = ptp;
963 		ppa = pa;
964 
965 		index = pl_i(va, i);
966 		pva = pdes[i - 2];
967 
968 		if (pmap_valid_entry(pva[index])) {
969 			ppa = pva[index] & PG_FRAME;
970 			ptp = NULL;
971 			continue;
972 		}
973 
974 		obj = &pmap->pm_obj[i-2];
975 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
976 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
977 
978 		if (ptp == NULL)
979 			return NULL;
980 
981 		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
982 		ptp->wire_count = 1;
983 		pmap->pm_ptphint[i - 2] = ptp;
984 		pa = VM_PAGE_TO_PHYS(ptp);
985 		pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V);
986 		pmap->pm_stats.resident_count++;
987 		/*
988 		 * If we're not in the top level, increase the
989 		 * wire count of the parent page.
990 		 */
991 		if (i < PTP_LEVELS) {
992 			if (pptp == NULL)
993 				pptp = pmap_find_ptp(pmap, va, ppa, i);
994 #ifdef DIAGNOSTIC
995 			if (pptp == NULL)
996 				panic("%s: pde page disappeared", __func__);
997 #endif
998 			pptp->wire_count++;
999 		}
1000 	}
1001 
1002 	/*
1003 	 * ptp is not NULL if we just allocated a new ptp. If it's
1004 	 * still NULL, we must look up the existing one.
1005 	 */
1006 	if (ptp == NULL) {
1007 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
1008 #ifdef DIAGNOSTIC
1009 		if (ptp == NULL) {
1010 			printf("va %lx ppa %lx\n", (unsigned long)va,
1011 			    (unsigned long)ppa);
1012 			panic("%s: unmanaged user PTP", __func__);
1013 		}
1014 #endif
1015 	}
1016 
1017 	pmap->pm_ptphint[0] = ptp;
1018 	return(ptp);
1019 }
1020 
1021 /*
1022  * p m a p  l i f e c y c l e   f u n c t i o n s
1023  */
1024 
1025 /*
1026  * pmap_pdp_ctor: constructor for the PDP cache.
1027  */
1028 
1029 void
1030 pmap_pdp_ctor(pd_entry_t *pdir)
1031 {
1032 	paddr_t pdirpa;
1033 	int npde;
1034 
1035 	/* fetch the physical address of the page directory. */
1036 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
1037 
1038 	/* zero init area */
1039 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
1040 
1041 	/* put in recursive PDE to map the PTEs */
1042 	pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW | pg_nx;
1043 
1044 	npde = nkptp[PTP_LEVELS - 1];
1045 
1046 	/* put in kernel VM PDEs */
1047 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
1048 	    npde * sizeof(pd_entry_t));
1049 
1050 	/* zero the rest */
1051 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
1052 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
1053 
1054 	pdir[PDIR_SLOT_DIRECT] = pmap_kernel()->pm_pdir[PDIR_SLOT_DIRECT];
1055 
1056 #if VM_MIN_KERNEL_ADDRESS != KERNBASE
1057 	pdir[pl4_pi(KERNBASE)] = PDP_BASE[pl4_pi(KERNBASE)];
1058 #endif
1059 }
1060 
1061 /*
1062  * pmap_create: create a pmap
1063  *
1064  * => note: the old pmap interface took a "size" arg which allowed for
1065  *	the creation of "software only" pmaps (not in bsd).
1066  */
1067 
1068 struct pmap *
1069 pmap_create(void)
1070 {
1071 	struct pmap *pmap;
1072 	int i;
1073 
1074 	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
1075 
1076 	mtx_init(&pmap->pm_mtx, IPL_VM);
1077 
1078 	/* init uvm_object */
1079 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1080 		uvm_objinit(&pmap->pm_obj[i], NULL, 1);
1081 		pmap->pm_ptphint[i] = NULL;
1082 	}
1083 	pmap->pm_stats.wired_count = 0;
1084 	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
1085 	pmap->pm_cpus = 0;
1086 	pmap->pm_type = PMAP_TYPE_NORMAL;
1087 
1088 	/* allocate PDP */
1089 
1090 	/*
1091 	 * note that there is no need to splvm to protect us from
1092 	 * malloc since malloc allocates out of a submap and we should
1093 	 * have already allocated kernel PTPs to cover the range...
1094 	 */
1095 
1096 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
1097 	pmap_pdp_ctor(pmap->pm_pdir);
1098 
1099 	pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME;
1100 
1101 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1102 	return (pmap);
1103 }
1104 
1105 /*
1106  * pmap_destroy: drop reference count on pmap.   free pmap if
1107  *	reference count goes to zero.
1108  */
1109 
1110 void
1111 pmap_destroy(struct pmap *pmap)
1112 {
1113 	struct vm_page *pg;
1114 	int refs;
1115 	int i;
1116 
1117 	/*
1118 	 * drop reference count
1119 	 */
1120 
1121 	refs = atomic_dec_int_nv(&pmap->pm_obj[0].uo_refs);
1122 	if (refs > 0) {
1123 		return;
1124 	}
1125 
1126 	/*
1127 	 * reference count is zero, free pmap resources and then free pmap.
1128 	 */
1129 
1130 #ifdef DIAGNOSTIC
1131 	if (__predict_false(pmap->pm_cpus != 0))
1132 		printf("%s: pmap %p cpus=0x%llx\n", __func__,
1133 		    (void *)pmap, pmap->pm_cpus);
1134 #endif
1135 
1136 	/*
1137 	 * remove it from global list of pmaps
1138 	 */
1139 	LIST_REMOVE(pmap, pm_list);
1140 
1141 	/*
1142 	 * free any remaining PTPs
1143 	 */
1144 
1145 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1146 		while ((pg = RB_ROOT(&pmap->pm_obj[i].memt)) != NULL) {
1147 			KASSERT((pg->pg_flags & PG_BUSY) == 0);
1148 
1149 			pg->wire_count = 0;
1150 			uvm_pagefree(pg);
1151 		}
1152 	}
1153 
1154 	/* XXX: need to flush it out of other processor's space? */
1155 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
1156 
1157 	pool_put(&pmap_pmap_pool, pmap);
1158 }
1159 
1160 /*
1161  *	Add a reference to the specified pmap.
1162  */
1163 
1164 void
1165 pmap_reference(struct pmap *pmap)
1166 {
1167 	atomic_inc_int(&pmap->pm_obj[0].uo_refs);
1168 }
1169 
1170 /*
1171  * pmap_activate: activate a process' pmap (fill in %cr3)
1172  *
1173  * => called from cpu_fork() and when switching pmaps during exec
1174  * => if p is the curproc, then load it into the MMU
1175  */
1176 
1177 void
1178 pmap_activate(struct proc *p)
1179 {
1180 	struct pcb *pcb = &p->p_addr->u_pcb;
1181 	struct pmap *pmap = p->p_vmspace->vm_map.pmap;
1182 
1183 	pcb->pcb_pmap = pmap;
1184 	pcb->pcb_cr3 = pmap->pm_pdirpa;
1185 	if (p == curproc) {
1186 		lcr3(pcb->pcb_cr3);
1187 
1188 		/*
1189 		 * mark the pmap in use by this processor.
1190 		 */
1191 		x86_atomic_setbits_u64(&pmap->pm_cpus, (1ULL << cpu_number()));
1192 	}
1193 }
1194 
1195 /*
1196  * pmap_deactivate: deactivate a process' pmap
1197  */
1198 
1199 void
1200 pmap_deactivate(struct proc *p)
1201 {
1202 	struct pmap *pmap = p->p_vmspace->vm_map.pmap;
1203 
1204 	/*
1205 	 * mark the pmap no longer in use by this processor.
1206 	 */
1207 	x86_atomic_clearbits_u64(&pmap->pm_cpus, (1ULL << cpu_number()));
1208 }
1209 
1210 /*
1211  * end of lifecycle functions
1212  */
1213 
1214 /*
1215  * some misc. functions
1216  */
1217 
1218 boolean_t
1219 pmap_pdes_valid(vaddr_t va, pd_entry_t **pdes, pd_entry_t *lastpde)
1220 {
1221 	int i;
1222 	unsigned long index;
1223 	pd_entry_t pde;
1224 
1225 	for (i = PTP_LEVELS; i > 1; i--) {
1226 		index = pl_i(va, i);
1227 		pde = pdes[i - 2][index];
1228 		if (!pmap_valid_entry(pde))
1229 			return FALSE;
1230 	}
1231 	if (lastpde != NULL)
1232 		*lastpde = pde;
1233 	return TRUE;
1234 }
1235 
1236 /*
1237  * pmap_extract: extract a PA for the given VA
1238  */
1239 
1240 boolean_t
1241 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1242 {
1243 	pt_entry_t *ptes;
1244 	int level, offs;
1245 
1246 	if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE &&
1247 	    va < PMAP_DIRECT_END) {
1248 		*pap = va - PMAP_DIRECT_BASE;
1249 		return (TRUE);
1250 	}
1251 
1252 	level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
1253 
1254 	if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) {
1255 		if (pap != NULL)
1256 			*pap = (ptes[offs] & PG_FRAME) | (va & PAGE_MASK);
1257 		return (TRUE);
1258 	}
1259 	if (level == 1 && (ptes[offs] & (PG_PS|PG_V)) == (PG_PS|PG_V)) {
1260 		if (pap != NULL)
1261 			*pap = (ptes[offs] & PG_LGFRAME) | (va & PAGE_MASK_L2);
1262 		return (TRUE);
1263 	}
1264 
1265 	return FALSE;
1266 }
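
/*
 * Usage sketch (added for illustration): translate a kernel virtual
 * address to its physical address, handling the "not mapped" case.
 *
 *	paddr_t pa;
 *
 *	if (!pmap_extract(pmap_kernel(), va, &pa))
 *		panic("va 0x%lx not mapped", va);
 */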
1267 
1268 /*
1269  * pmap_zero_page: zero a page
1270  */
1271 
1272 void
1273 pmap_zero_page(struct vm_page *pg)
1274 {
1275 	pagezero(pmap_map_direct(pg));
1276 }
1277 
1278 /*
1279  * pmap_flush_cache: flush the cache for a virtual address.
1280  */
1281 void
1282 pmap_flush_cache(vaddr_t addr, vsize_t len)
1283 {
1284 	vaddr_t	i;
1285 
1286 	if (curcpu()->ci_cflushsz == 0) {
1287 		wbinvd();
1288 		return;
1289 	}
1290 
1291 	/* all cpus that have clflush also have mfence. */
1292 	mfence();
1293 	for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz)
1294 		clflush(i);
1295 	mfence();
1296 }
1297 
1298 /*
1299  * pmap_copy_page: copy a page
1300  */
1301 
1302 void
1303 pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg)
1304 {
1305 	vaddr_t srcva = pmap_map_direct(srcpg);
1306 	vaddr_t dstva = pmap_map_direct(dstpg);
1307 
1308 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
1309 }
1310 
1311 /*
1312  * p m a p   r e m o v e   f u n c t i o n s
1313  *
1314  * functions that remove mappings
1315  */
1316 
1317 /*
1318  * pmap_remove_ptes: remove PTEs from a PTP
1319  *
1320  * => must have proper locking on pmap_master_lock
1321  * => PTP must be mapped into KVA
1322  * => PTP should be null if pmap == pmap_kernel()
1323  */
1324 
1325 void
1326 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1327     vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
1328 {
1329 	struct pv_entry *pve;
1330 	pt_entry_t *pte = (pt_entry_t *) ptpva;
1331 	struct vm_page *pg;
1332 	pt_entry_t opte;
1333 
1334 	/*
1335 	 * note that ptpva points to the PTE that maps startva.   this may
1336 	 * or may not be the first PTE in the PTP.
1337 	 *
1338 	 * we loop through the PTP while there are still PTEs to look at
1339 	 * and the wire_count is greater than 1 (because we use the wire_count
1340 	 * to keep track of the number of real PTEs in the PTP).
1341 	 */
1342 
1343 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
1344 			     ; pte++, startva += PAGE_SIZE) {
1345 		if (!pmap_valid_entry(*pte))
1346 			continue;			/* VA not mapped */
1347 		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
1348 			continue;
1349 		}
1350 
1351 		/* atomically save the old PTE and zap! it */
1352 		opte = pmap_pte_set(pte, 0);
1353 
1354 		if (opte & PG_W)
1355 			pmap->pm_stats.wired_count--;
1356 		pmap->pm_stats.resident_count--;
1357 
1358 		if (ptp)
1359 			ptp->wire_count--;		/* dropping a PTE */
1360 
1361 		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1362 
1363 		/*
1364 		 * if we are not on a pv list we are done.
1365 		 */
1366 
1367 		if ((opte & PG_PVLIST) == 0) {
1368 #ifdef DIAGNOSTIC
1369 			if (pg != NULL)
1370 				panic("%s: managed page without PG_PVLIST "
1371 				      "for 0x%lx", __func__, startva);
1372 #endif
1373 			continue;
1374 		}
1375 
1376 #ifdef DIAGNOSTIC
1377 		if (pg == NULL)
1378 			panic("%s: unmanaged page marked PG_PVLIST, "
1379 			      "va = 0x%lx, pa = 0x%lx", __func__,
1380 			      startva, (u_long)(opte & PG_FRAME));
1381 #endif
1382 
1383 		/* sync R/M bits */
1384 		pmap_sync_flags_pte(pg, opte);
1385 		pve = pmap_remove_pv(pg, pmap, startva);
1386 		if (pve) {
1387 			pve->pv_next = *free_pvs;
1388 			*free_pvs = pve;
1389 		}
1390 
1391 		/* end of "for" loop: time for next pte */
1392 	}
1393 }
1394 
1395 
1396 /*
1397  * pmap_remove_pte: remove a single PTE from a PTP
1398  *
1399  * => must have proper locking on pmap_master_lock
1400  * => PTP must be mapped into KVA
1401  * => PTP should be null if pmap == pmap_kernel()
1402  * => returns true if we removed a mapping
1403  */
1404 
1405 boolean_t
1406 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
1407     vaddr_t va, int flags, struct pv_entry **free_pvs)
1408 {
1409 	struct pv_entry *pve;
1410 	struct vm_page *pg;
1411 	pt_entry_t opte;
1412 
1413 	if (!pmap_valid_entry(*pte))
1414 		return(FALSE);		/* VA not mapped */
1415 	if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
1416 		return(FALSE);
1417 	}
1418 
1419 	/* atomically save the old PTE and zap! it */
1420 	opte = pmap_pte_set(pte, 0);
1421 
1422 	if (opte & PG_W)
1423 		pmap->pm_stats.wired_count--;
1424 	pmap->pm_stats.resident_count--;
1425 
1426 	if (ptp)
1427 		ptp->wire_count--;		/* dropping a PTE */
1428 
1429 	pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
1430 
1431 	/*
1432 	 * if we are not on a pv list we are done.
1433 	 */
1434 	if ((opte & PG_PVLIST) == 0) {
1435 #ifdef DIAGNOSTIC
1436 		if (pg != NULL)
1437 			panic("%s: managed page without PG_PVLIST for 0x%lx",
1438 			      __func__, va);
1439 #endif
1440 		return(TRUE);
1441 	}
1442 
1443 #ifdef DIAGNOSTIC
1444 	if (pg == NULL)
1445 		panic("%s: unmanaged page marked PG_PVLIST, va = 0x%lx, "
1446 		      "pa = 0x%lx", __func__, va, (u_long)(opte & PG_FRAME));
1447 #endif
1448 
1449 	/* sync R/M bits */
1450 	pmap_sync_flags_pte(pg, opte);
1451 	pve = pmap_remove_pv(pg, pmap, va);
1452 	if (pve) {
1453 		pve->pv_next = *free_pvs;
1454 		*free_pvs = pve;
1455 	}
1456 
1457 	return(TRUE);
1458 }
1459 
1460 /*
1461  * pmap_remove: top level mapping removal function
1462  *
1463  * => caller should not be holding any pmap locks
1464  */
1465 
1466 void
1467 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
1468 {
1469 	pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
1470 }
1471 
1472 /*
1473  * pmap_do_remove: mapping removal guts
1474  *
1475  * => caller should not be holding any pmap locks
1476  */
1477 
1478 void
1479 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
1480 {
1481 	pt_entry_t *ptes;
1482 	pd_entry_t **pdes, pde;
1483 	boolean_t result;
1484 	paddr_t ptppa;
1485 	vaddr_t blkendva;
1486 	struct vm_page *ptp;
1487 	struct pv_entry *pve;
1488 	struct pv_entry *free_pvs = NULL;
1489 	vaddr_t va;
1490 	int shootall = 0, shootself;
1491 	struct pg_to_free empty_ptps;
1492 	paddr_t scr3;
1493 
1494 	TAILQ_INIT(&empty_ptps);
1495 
1496 	pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
1497 	shootself = (scr3 == 0);
1498 
1499 	/*
1500 	 * removing one page?  take shortcut function.
1501 	 */
1502 
1503 	if (sva + PAGE_SIZE == eva) {
1504 		if (pmap_pdes_valid(sva, pdes, &pde)) {
1505 
1506 			/* PA of the PTP */
1507 			ptppa = pde & PG_FRAME;
1508 
1509 			/* get PTP if non-kernel mapping */
1510 
1511 			if (pmap == pmap_kernel()) {
1512 				/* we never free kernel PTPs */
1513 				ptp = NULL;
1514 			} else {
1515 				ptp = pmap_find_ptp(pmap, sva, ptppa, 1);
1516 #ifdef DIAGNOSTIC
1517 				if (ptp == NULL)
1518 					panic("%s: unmanaged PTP detected",
1519 					      __func__);
1520 #endif
1521 			}
1522 
1523 			/* do it! */
1524 			result = pmap_remove_pte(pmap, ptp,
1525 			    &ptes[pl1_i(sva)], sva, flags, &free_pvs);
1526 
1527 			/*
1528 			 * if mapping removed and the PTP is no longer
1529 			 * being used, free it!
1530 			 */
1531 
1532 			if (result && ptp && ptp->wire_count <= 1)
1533 				pmap_free_ptp(pmap, ptp, sva, ptes, pdes,
1534 				    &empty_ptps);
1535 			pmap_tlb_shootpage(pmap, sva, shootself);
1536 			pmap_unmap_ptes(pmap, scr3);
1537 			pmap_tlb_shootwait();
1538 		} else {
1539 			pmap_unmap_ptes(pmap, scr3);
1540 		}
1541 
1542 		goto cleanup;
1543 	}
1544 
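	/*
	 * Comment added for clarity: for a range of more than 32 pages in a
	 * user pmap it is cheaper to flush that pmap's TLB entries in one go
	 * than to shoot every page individually; the kernel pmap always
	 * takes the per-page/per-range path.
	 */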
1545 	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1546 		shootall = 1;
1547 
1548 	for (va = sva; va < eva; va = blkendva) {
1549 		/* determine range of block */
1550 		blkendva = x86_round_pdr(va + 1);
1551 		if (blkendva > eva)
1552 			blkendva = eva;
1553 
1554 		/*
1555 		 * XXXCDC: our PTE mappings should never be removed
1556 		 * with pmap_remove!  if we allow this (and why would
1557 		 * we?) then we end up freeing the pmap's page
1558 		 * directory page (PDP) before we are finished using
1559 		 * it when we hit it in the recursive mapping.  this
1560 		 * is BAD.
1561 		 *
1562 		 * the long term solution is to move the PTEs out of user
1563 		 * address space and into kernel address space (up
1564 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
1565 		 * be VM_MAX_ADDRESS.
1566 		 */
1567 
1568 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
1569 			/* XXXCDC: ugly hack to avoid freeing PDP here */
1570 			continue;
1571 
1572 		if (!pmap_pdes_valid(va, pdes, &pde))
1573 			continue;
1574 
1575 		/* PA of the PTP */
1576 		ptppa = pde & PG_FRAME;
1577 
1578 		/* get PTP if non-kernel mapping */
1579 		if (pmap == pmap_kernel()) {
1580 			/* we never free kernel PTPs */
1581 			ptp = NULL;
1582 		} else {
1583 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
1584 #ifdef DIAGNOSTIC
1585 			if (ptp == NULL)
1586 				panic("%s: unmanaged PTP detected", __func__);
1587 #endif
1588 		}
1589 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)],
1590 		    va, blkendva, flags, &free_pvs);
1591 
1592 		/* if PTP is no longer being used, free it! */
1593 		if (ptp && ptp->wire_count <= 1) {
1594 			pmap_free_ptp(pmap, ptp, va, ptes, pdes, &empty_ptps);
1595 		}
1596 	}
1597 
1598 	if (shootall)
1599 		pmap_tlb_shoottlb(pmap, shootself);
1600 	else
1601 		pmap_tlb_shootrange(pmap, sva, eva, shootself);
1602 
1603 	pmap_unmap_ptes(pmap, scr3);
1604 	pmap_tlb_shootwait();
1605 
1606 cleanup:
1607 	while ((pve = free_pvs) != NULL) {
1608 		free_pvs = pve->pv_next;
1609 		pool_put(&pmap_pv_pool, pve);
1610 	}
1611 
1612 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1613 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1614 		uvm_pagefree(ptp);
1615 	}
1616 }
1617 
1618 /*
1619  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1620  *
1621  * => R/M bits are sync'd back to attrs
1622  */
1623 
1624 void
1625 pmap_page_remove(struct vm_page *pg)
1626 {
1627 	struct pv_entry *pve;
1628 	struct pmap *pm;
1629 	pt_entry_t *ptes, opte;
1630 	pd_entry_t **pdes;
1631 #ifdef DIAGNOSTIC
1632 	pd_entry_t pde;
1633 #endif
1634 	struct pg_to_free empty_ptps;
1635 	struct vm_page *ptp;
1636 	paddr_t scr3;
1637 	int shootself;
1638 
1639 	TAILQ_INIT(&empty_ptps);
1640 
1641 	mtx_enter(&pg->mdpage.pv_mtx);
1642 	while ((pve = pg->mdpage.pv_list) != NULL) {
1643 		pmap_reference(pve->pv_pmap);
1644 		pm = pve->pv_pmap;
1645 		mtx_leave(&pg->mdpage.pv_mtx);
1646 
1647 		/* XXX use direct map? */
1648 		pmap_map_ptes(pm, &ptes, &pdes, &scr3);	/* locks pmap */
1649 		shootself = (scr3 == 0);
1650 
1651 		/*
1652 		 * We dropped the pvlist lock before grabbing the pmap
1653 		 * lock to avoid lock ordering problems.  This means
1654 		 * we have to check the pvlist again since somebody
1655 		 * else might have modified it.  All we care about is
1656 		 * that the pvlist entry matches the pmap we just
1657 		 * locked.  If it doesn't, unlock the pmap and try
1658 		 * again.
1659 		 */
1660 		mtx_enter(&pg->mdpage.pv_mtx);
1661 		if ((pve = pg->mdpage.pv_list) == NULL ||
1662 		    pve->pv_pmap != pm) {
1663 			mtx_leave(&pg->mdpage.pv_mtx);
1664 			pmap_unmap_ptes(pm, scr3);	/* unlocks pmap */
1665 			pmap_destroy(pm);
1666 			mtx_enter(&pg->mdpage.pv_mtx);
1667 			continue;
1668 		}
1669 
1670 		pg->mdpage.pv_list = pve->pv_next;
1671 		mtx_leave(&pg->mdpage.pv_mtx);
1672 
1673 #ifdef DIAGNOSTIC
1674 		if (pve->pv_ptp && pmap_pdes_valid(pve->pv_va, pdes, &pde) &&
1675 		   (pde & PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
1676 			printf("%s: pg=%p: va=%lx, pv_ptp=%p\n", __func__,
1677 			       pg, pve->pv_va, pve->pv_ptp);
1678 			printf("%s: PTP's phys addr: "
1679 			       "actual=%lx, recorded=%lx\n", __func__,
1680 			       (unsigned long)(pde & PG_FRAME),
1681 				VM_PAGE_TO_PHYS(pve->pv_ptp));
1682 			panic("%s: mapped managed page has "
1683 			      "invalid pv_ptp field", __func__);
1684 		}
1685 #endif
1686 
1687 		/* atomically save the old PTE and zap it */
1688 		opte = pmap_pte_set(&ptes[pl1_i(pve->pv_va)], 0);
1689 
1690 		if (opte & PG_W)
1691 			pve->pv_pmap->pm_stats.wired_count--;
1692 		pve->pv_pmap->pm_stats.resident_count--;
1693 
1694 		pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, shootself);
1695 
1696 		pmap_sync_flags_pte(pg, opte);
1697 
1698 		/* update the PTP reference count.  free if last reference. */
1699 		if (pve->pv_ptp) {
1700 			pve->pv_ptp->wire_count--;
1701 			if (pve->pv_ptp->wire_count <= 1) {
1702 				pmap_free_ptp(pve->pv_pmap, pve->pv_ptp,
1703 				    pve->pv_va, ptes, pdes, &empty_ptps);
1704 			}
1705 		}
1706 		pmap_unmap_ptes(pve->pv_pmap, scr3);	/* unlocks pmap */
1707 		pmap_destroy(pve->pv_pmap);
1708 		pool_put(&pmap_pv_pool, pve);
1709 		mtx_enter(&pg->mdpage.pv_mtx);
1710 	}
1711 	mtx_leave(&pg->mdpage.pv_mtx);
1712 
1713 	pmap_tlb_shootwait();
1714 
1715 	while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1716 		TAILQ_REMOVE(&empty_ptps, ptp, pageq);
1717 		uvm_pagefree(ptp);
1718 	}
1719 }
1720 
1721 /*
1722  * p m a p   a t t r i b u t e  f u n c t i o n s
1723  * functions that test/change managed page's attributes
1724  * since a page can be mapped multiple times we must check each PTE that
1725  * maps it by going down the pv lists.
1726  */
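
/*
 * Usage sketch (added for illustration): PG_U and PG_M are the hardware
 * referenced/modified bits, so for example
 *
 *	if (pmap_test_attrs(pg, PG_M))
 *		... some mapping has written to the page ...
 *
 * and pmap_clear_attrs(pg, PG_M) clears the bit in every mapping and
 * reports whether it was set anywhere.
 */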
1727 
1728 /*
1729  * pmap_test_attrs: test a page's attributes
1730  */
1731 
1732 boolean_t
1733 pmap_test_attrs(struct vm_page *pg, unsigned int testbits)
1734 {
1735 	struct pv_entry *pve;
1736 	pt_entry_t *ptes;
1737 	int level, offs;
1738 	u_long mybits, testflags;
1739 
1740 	testflags = pmap_pte2flags(testbits);
1741 
1742 	if (pg->pg_flags & testflags)
1743 		return (TRUE);
1744 
1745 	mybits = 0;
1746 	mtx_enter(&pg->mdpage.pv_mtx);
1747 	for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
1748 	    pve = pve->pv_next) {
1749 		level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
1750 		    &offs);
1751 		mybits |= (ptes[offs] & testbits);
1752 	}
1753 	mtx_leave(&pg->mdpage.pv_mtx);
1754 
1755 	if (mybits == 0)
1756 		return (FALSE);
1757 
1758 	atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits));
1759 
1760 	return (TRUE);
1761 }
1762 
1763 /*
1764  * pmap_clear_attrs: change a page's attributes
1765  *
1766  * => we return TRUE if we cleared one of the bits we were asked to
1767  */
1768 
1769 boolean_t
1770 pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits)
1771 {
1772 	struct pv_entry *pve;
1773 	pt_entry_t *ptes, opte;
1774 	u_long clearflags;
1775 	int result, level, offs;
1776 
1777 	clearflags = pmap_pte2flags(clearbits);
1778 
1779 	result = pg->pg_flags & clearflags;
1780 	if (result)
1781 		atomic_clearbits_int(&pg->pg_flags, clearflags);
1782 
1783 	mtx_enter(&pg->mdpage.pv_mtx);
1784 	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
1785 		level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
1786 		    &offs);
1787 		opte = ptes[offs];
1788 		if (opte & clearbits) {
1789 			result = 1;
1790 			pmap_pte_clearbits(&ptes[offs], (opte & clearbits));
1791 			pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va,
1792 				pmap_is_curpmap(pve->pv_pmap));
1793 		}
1794 	}
1795 	mtx_leave(&pg->mdpage.pv_mtx);
1796 
1797 	pmap_tlb_shootwait();
1798 
1799 	return (result != 0);
1800 }
1801 
1802 /*
1803  * p m a p   p r o t e c t i o n   f u n c t i o n s
1804  */
1805 
1806 /*
1807  * pmap_page_protect: change the protection of all recorded mappings
1808  *	of a managed page
1809  *
1810  * => NOTE: this is an inline function in pmap.h
1811  */
1812 
1813 /* see pmap.h */
1814 
1815 /*
1816  * pmap_protect: set the protection of the pages in a pmap
1817  *
1818  * => NOTE: this is an inline function in pmap.h
1819  */
1820 
1821 /* see pmap.h */
1822 
1823 /*
1824  * pmap_write_protect: write-protect pages in a pmap
1825  */
1826 
1827 void
1828 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
1829 {
1830 	pt_entry_t nx, *ptes, *spte, *epte;
1831 	pd_entry_t **pdes;
1832 	vaddr_t blockend;
1833 	int shootall = 0, shootself;
1834 	vaddr_t va;
1835 	paddr_t scr3;
1836 
1837 	pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
1838 	shootself = (scr3 == 0);
1839 
1840 	/* should be ok, but just in case ... */
1841 	sva &= PG_FRAME;
1842 	eva &= PG_FRAME;
1843 
1844 	nx = 0;
1845 	if (!(prot & PROT_EXEC))
1846 		nx = pg_nx;
1847 
1848 	if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
1849 		shootall = 1;
1850 
1851 	for (va = sva; va < eva ; va = blockend) {
1852 		blockend = (va & L2_FRAME) + NBPD_L2;
1853 		if (blockend > eva)
1854 			blockend = eva;
1855 
1856 		/*
1857 		 * XXXCDC: our PTE mappings should never be write-protected!
1858 		 *
1859 		 * the long term solution is to move the PTEs out of user
1860 		 * address space and into kernel address space (up
1861 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
1862 		 * be VM_MAX_ADDRESS.
1863 		 */
1864 
1865 		/* XXXCDC: ugly hack to avoid freeing PDP here */
1866 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
1867 			continue;
1868 
1869 		/* empty block? */
1870 		if (!pmap_pdes_valid(va, pdes, NULL))
1871 			continue;
1872 
1873 #ifdef DIAGNOSTIC
1874 		if (va >= VM_MAXUSER_ADDRESS && va < VM_MAX_ADDRESS)
1875 			panic("%s: PTE space", __func__);
1876 #endif
1877 
1878 		spte = &ptes[pl1_i(va)];
1879 		epte = &ptes[pl1_i(blockend)];
1880 
1881 		for (/*null */; spte < epte ; spte++) {
1882 			if (!pmap_valid_entry(*spte))
1883 				continue;
1884 			pmap_pte_clearbits(spte, PG_RW);
1885 			pmap_pte_setbits(spte, nx);
1886 		}
1887 	}
1888 
1889 	if (shootall)
1890 		pmap_tlb_shoottlb(pmap, shootself);
1891 	else
1892 		pmap_tlb_shootrange(pmap, sva, eva, shootself);
1893 
1894 	pmap_unmap_ptes(pmap, scr3);
1895 	pmap_tlb_shootwait();
1896 }
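
/*
 * Usage sketch (added for illustration): revoking write (and execute)
 * permission on a previously writable range.
 *
 *	pmap_write_protect(pmap, sva, eva, PROT_READ);
 *
 * Only PG_RW is cleared (and pg_nx possibly set); the mappings stay in
 * place, so no pv list manipulation is needed here.
 */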
1897 
1898 /*
1899  * end of protection functions
1900  */
1901 
1902 /*
1903  * pmap_unwire: clear the wired bit in the PTE
1904  *
1905  * => mapping should already be in map
1906  */
1907 
1908 void
1909 pmap_unwire(struct pmap *pmap, vaddr_t va)
1910 {
1911 	pt_entry_t *ptes;
1912 	int level, offs;
1913 
1914 	level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
1915 
1916 	if (level == 0) {
1917 
1918 #ifdef DIAGNOSTIC
1919 		if (!pmap_valid_entry(ptes[offs]))
1920 			panic("%s: invalid (unmapped) va 0x%lx", __func__, va);
1921 #endif
1922 		if (__predict_true((ptes[offs] & PG_W) != 0)) {
1923 			pmap_pte_clearbits(&ptes[offs], PG_W);
1924 			pmap->pm_stats.wired_count--;
1925 		}
1926 #ifdef DIAGNOSTIC
1927 		else {
1928 			printf("%s: wiring for pmap %p va 0x%lx "
1929 			       "didn't change!\n", __func__, pmap, va);
1930 		}
1931 #endif
1932 	}
1933 #ifdef DIAGNOSTIC
1934 	else {
1935 		panic("%s: invalid PDE", __func__);
1936 	}
1937 #endif
1938 }
1939 
1940 /*
1941  * pmap_collect: free resources held by a pmap
1942  *
1943  * => optional function.
1944  * => called when a process is swapped out to free memory.
1945  */
1946 
1947 void
1948 pmap_collect(struct pmap *pmap)
1949 {
1950 	/*
1951 	 * free all of the pt pages by removing the physical mappings
1952 	 * for its entire address space.
1953 	 */
1954 
1955 /*	pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS,
1956 	    PMAP_REMOVE_SKIPWIRED);
1957 */
1958 }
1959 
1960 /*
1961  * pmap_copy: copy mappings from one pmap to another
1962  *
1963  * => optional function
1964  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
1965  */
1966 
1967 /*
1968  * defined as macro in pmap.h
1969  */
1970 
1971 /*
1972  * pmap_enter: enter a mapping into a pmap
1973  *
1974  * => must be done "now" ... no lazy-evaluation
1975  */
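
/*
 * Illustrative use only (not from this file); va, pa and error are
 * placeholders.  A caller entering one wired kernel mapping that
 * tolerates allocation failure might do:
 *
 *	if (pmap_enter(pmap_kernel(), va, pa, PROT_READ | PROT_WRITE,
 *	    PMAP_WIRED | PMAP_CANFAIL) != 0)
 *		error = ENOMEM;
 */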
1976 
1977 int
1978 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
1979 {
1980 	pt_entry_t *ptes, opte, npte;
1981 	pd_entry_t **pdes;
1982 	struct vm_page *ptp, *pg = NULL;
1983 	struct pv_entry *pve, *opve = NULL;
1984 	int ptpdelta, wireddelta, resdelta;
1985 	boolean_t wired = (flags & PMAP_WIRED) != 0;
1986 	boolean_t nocache = (pa & PMAP_NOCACHE) != 0;
1987 	boolean_t wc = (pa & PMAP_WC) != 0;
1988 	int error, shootself;
1989 	paddr_t scr3;
1990 
1991 	KASSERT(!(wc && nocache));
1992 	pa &= PMAP_PA_MASK;
1993 
1994 #ifdef DIAGNOSTIC
1995 	if (va == (vaddr_t) PDP_BASE)
1996 		panic("%s: trying to map over PDP!", __func__);
1997 
1998 	/* sanity check: kernel PTPs should already have been pre-allocated */
1999 	if (va >= VM_MIN_KERNEL_ADDRESS &&
2000 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
2001 		panic("%s: missing kernel PTP for va %lx!", __func__, va);
2002 
2003 #endif
2004 
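	/*
	 * Allocate a pv_entry up front; if the new mapping turns out not
	 * to need one it is returned to the pool at "out" below.
	 */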
2005 	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
2006 	if (pve == NULL) {
2007 		if (flags & PMAP_CANFAIL) {
2008 			error = ENOMEM;
2009 			goto out;
2010 		}
2011 		panic("%s: no pv entries available", __func__);
2012 	}
2013 
2014 	/*
2015 	 * map in ptes and get a pointer to our PTP (unless we are the kernel)
2016 	 */
2017 
2018 	pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
2019 	shootself = (scr3 == 0);
2020 	if (pmap == pmap_kernel()) {
2021 		ptp = NULL;
2022 	} else {
2023 		ptp = pmap_get_ptp(pmap, va, pdes);
2024 		if (ptp == NULL) {
2025 			if (flags & PMAP_CANFAIL) {
2026 				pmap_unmap_ptes(pmap, scr3);
2027 				error = ENOMEM;
2028 				goto out;
2029 			}
2030 			panic("%s: get ptp failed", __func__);
2031 		}
2032 	}
2033 	opte = ptes[pl1_i(va)];		/* old PTE */
2034 
2035 	/*
2036 	 * is there currently a valid mapping at our VA?
2037 	 */
2038 
2039 	if (pmap_valid_entry(opte)) {
2040 		/*
2041 		 * first, calculate pm_stats updates.  resident count will not
2042 		 * change since we are replacing/changing a valid mapping.
2043 		 * wired count might change...
2044 		 */
2045 
2046 		resdelta = 0;
2047 		if (wired && (opte & PG_W) == 0)
2048 			wireddelta = 1;
2049 		else if (!wired && (opte & PG_W) != 0)
2050 			wireddelta = -1;
2051 		else
2052 			wireddelta = 0;
2053 		ptpdelta = 0;
2054 
2055 		/*
2056 		 * is the currently mapped PA the same as the one we
2057 		 * want to map?
2058 		 */
2059 
2060 		if ((opte & PG_FRAME) == pa) {
2061 
2062 			/* if this is on the PVLIST, sync R/M bit */
2063 			if (opte & PG_PVLIST) {
2064 				pg = PHYS_TO_VM_PAGE(pa);
2065 #ifdef DIAGNOSTIC
2066 				if (pg == NULL)
2067 					panic("%s: same pa PG_PVLIST "
2068 					      "mapping with unmanaged page "
2069 					      "pa = 0x%lx (0x%lx)", __func__,
2070 					      pa, atop(pa));
2071 #endif
2072 				pmap_sync_flags_pte(pg, opte);
2073 			} else {
2074 #ifdef DIAGNOSTIC
2075 				if (PHYS_TO_VM_PAGE(pa) != NULL)
2076 					panic("%s: same pa, managed "
2077 					    "page, no PG_PVLIST pa: 0x%lx\n",
2078 					    __func__, pa);
2079 #endif
2080 			}
2081 			goto enter_now;
2082 		}
2083 
2084 		/*
2085 		 * changing PAs: we must remove the old one first
2086 		 */
2087 
2088 		/*
2089 		 * if current mapping is on a pvlist,
2090 		 * remove it (sync R/M bits)
2091 		 */
2092 
2093 		if (opte & PG_PVLIST) {
2094 			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
2095 #ifdef DIAGNOSTIC
2096 			if (pg == NULL)
2097 				panic("%s: PG_PVLIST mapping with unmanaged "
2098 				      "page pa = 0x%lx (0x%lx)",
2099 				      __func__, pa, atop(pa));
2100 #endif
2101 			pmap_sync_flags_pte(pg, opte);
2102 			opve = pmap_remove_pv(pg, pmap, va);
2103 			pg = NULL; /* This is not the page we are looking for */
2104 		}
2105 	} else {	/* opte not valid */
2106 		resdelta = 1;
2107 		if (wired)
2108 			wireddelta = 1;
2109 		else
2110 			wireddelta = 0;
2111 		if (ptp)
2112 			ptpdelta = 1;
2113 		else
2114 			ptpdelta = 0;
2115 	}
2116 
2117 	/*
2118 	 * pve points to the pv_entry allocated above.  opve, if not NULL,
2119 	 * is the entry removed from the old mapping's pv list (freed at "out").
2120 	 *
2121 	 * if this entry is to be on a pvlist, enter it now.
2122 	 */
2123 
2124 	if (pmap_initialized)
2125 		pg = PHYS_TO_VM_PAGE(pa);
2126 
2127 	if (pg != NULL) {
2128 		pmap_enter_pv(pg, pve, pmap, va, ptp);
2129 		pve = NULL;
2130 	}
2131 
2132 enter_now:
2133 	/*
2134 	 * at this point pg is !NULL if we want the PG_PVLIST bit set
2135 	 */
2136 
2137 	pmap->pm_stats.resident_count += resdelta;
2138 	pmap->pm_stats.wired_count += wireddelta;
2139 	if (ptp)
2140 		ptp->wire_count += ptpdelta;
2141 
2142 	KASSERT(pg == PHYS_TO_VM_PAGE(pa));
2143 
2144 	npte = pa | protection_codes[prot] | PG_V;
2145 	if (pg != NULL) {
2146 		npte |= PG_PVLIST;
2147 		/*
2148 		 * make sure that if the page is write combined all
2149 		 * instances of pmap_enter make it so.
2150 		 */
2151 		if (pg->pg_flags & PG_PMAP_WC) {
2152 			KASSERT(nocache == 0);
2153 			wc = TRUE;
2154 		}
2155 	}
2156 	if (wc)
2157 		npte |= pmap_pg_wc;
2158 	if (wired)
2159 		npte |= PG_W;
2160 	if (nocache)
2161 		npte |= PG_N;
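	/*
	 * User addresses get PG_u; mappings entered into the kernel pmap
	 * get PG_G so they survive ordinary %cr3 reloads.
	 */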
2162 	if (va < VM_MAXUSER_ADDRESS)
2163 		npte |= PG_u;
2164 	else if (va < VM_MAX_ADDRESS)
2165 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
2166 	if (pmap == pmap_kernel())
2167 		npte |= PG_G;
2168 
2169 	ptes[pl1_i(va)] = npte;		/* zap! */
2170 
2171 	/*
2172 	 * If we changed anything other than modified/used bits,
2173 	 * flush the TLB.  (is this overkill?)
2174 	 */
2175 	if (pmap_valid_entry(opte)) {
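		/* going from cacheable to uncacheable: flush the caches first */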
2176 		if (nocache && (opte & PG_N) == 0)
2177 			wbinvd();
2178 		pmap_tlb_shootpage(pmap, va, shootself);
2179 	}
2180 
2181 	pmap_unmap_ptes(pmap, scr3);
2182 	pmap_tlb_shootwait();
2183 
2184 	error = 0;
2185 
2186 	if (pmap->pm_type == PMAP_TYPE_EPT)
2187 		pmap_fix_ept(pmap, va);
2188 
2189 out:
2190 	if (pve)
2191 		pool_put(&pmap_pv_pool, pve);
2192 	if (opve)
2193 		pool_put(&pmap_pv_pool, opve);
2194 
2195 	return error;
2196 }
2197 
2198 boolean_t
2199 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
2200 {
2201 	struct vm_page *ptp;
2202 	struct pmap *kpm = pmap_kernel();
2203 
2204 	if (uvm.page_init_done == FALSE) {
2205 		vaddr_t va;
2206 
2207 		/*
2208 		 * we're growing the kernel pmap early (from
2209 		 * uvm_pageboot_alloc()).  this case must be
2210 		 * handled a little differently.
2211 		 */
2212 
2213 		va = pmap_steal_memory(PAGE_SIZE, NULL, NULL);
2214 		*paddrp = PMAP_DIRECT_UNMAP(va);
2215 	} else {
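		/* normal case: allocate a zeroed page from the kernel PTP object */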
2216 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
2217 				    ptp_va2o(va, level), NULL,
2218 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2219 		if (ptp == NULL)
2220 			panic("%s: out of memory", __func__);
2221 		atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
2222 		ptp->wire_count = 1;
2223 		*paddrp = VM_PAGE_TO_PHYS(ptp);
2224 	}
2225 	kpm->pm_stats.resident_count++;
2226 	return TRUE;
2227 }
2228 
2229 /*
2230  * Allocate the specified number of ptps for a ptp level, and populate
2231  * all levels below accordingly, mapping virtual addresses starting at
2232  * kva.
2233  *
2234  * Used by pmap_growkernel.
2235  */
2236 void
2237 pmap_alloc_level(pd_entry_t **pdes, vaddr_t kva, int lvl, long *needed_ptps)
2238 {
2239 	unsigned long i;
2240 	vaddr_t va;
2241 	paddr_t pa;
2242 	unsigned long index, endindex;
2243 	int level;
2244 	pd_entry_t *pdep;
2245 
2246 	for (level = lvl; level > 1; level--) {
2247 		if (level == PTP_LEVELS)
2248 			pdep = pmap_kernel()->pm_pdir;
2249 		else
2250 			pdep = pdes[level - 2];
2251 		va = kva;
2252 		index = pl_i(kva, level);
2253 		endindex = index + needed_ptps[level - 1];
2254 		/*
2255 		 * XXX special case for first time call.
2256 		 */
2257 		if (nkptp[level - 1] != 0)
2258 			index++;
2259 		else
2260 			endindex--;
2261 
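		/*
		 * Allocate a physical page for each missing PTP at the
		 * level below and enter it into this level's directory.
		 */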
2262 		for (i = index; i <= endindex; i++) {
2263 			pmap_get_physpage(va, level - 1, &pa);
2264 			pdep[i] = pa | PG_RW | PG_V | pg_nx;
2265 			nkptp[level - 1]++;
2266 			va += nbpd[level - 1];
2267 		}
2268 	}
2269 }
2270 
2271 /*
2272  * pmap_growkernel: increase usage of KVM space
2273  *
2274  * => we allocate new PTPs for the kernel and install them in all
2275  *	the pmaps on the system.
2276  */
2277 
2278 static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS;
2279 
2280 vaddr_t
2281 pmap_growkernel(vaddr_t maxkvaddr)
2282 {
2283 	struct pmap *kpm = pmap_kernel(), *pm;
2284 	int s, i;
2285 	unsigned newpdes;
2286 	long needed_kptp[PTP_LEVELS], target_nptp, old;
2287 
2288 	if (maxkvaddr <= pmap_maxkvaddr)
2289 		return pmap_maxkvaddr;
2290 
2291 	maxkvaddr = x86_round_pdr(maxkvaddr);
2292 	old = nkptp[PTP_LEVELS - 1];
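	/* remember the old top level count; used below to copy new PDEs */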
2293 	/*
2294 	 * This loop could be optimized more, but pmap_growkernel()
2295 	 * is called infrequently.
2296 	 */
2297 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
2298 		target_nptp = pl_i(maxkvaddr, i + 1) -
2299 		    pl_i(VM_MIN_KERNEL_ADDRESS, i + 1);
2300 		/*
2301 		 * XXX only need to check toplevel.
2302 		 */
2303 		if (target_nptp > nkptpmax[i])
2304 			panic("%s: out of KVA space", __func__);
2305 		needed_kptp[i] = target_nptp - nkptp[i] + 1;
2306 	}
2307 
2309 	s = splhigh();	/* to be safe */
2310 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS,
2311 	    needed_kptp);
2312 
2313 	/*
2314 	 * If the number of top level entries changed, update all
2315 	 * pmaps.
2316 	 */
2317 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
2318 		newpdes = nkptp[PTP_LEVELS - 1] - old;
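		/*
		 * Every pmap carries its own copy of the kernel's top
		 * level PDEs, so the newly added ones must be copied
		 * into each of them.
		 */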
2319 		LIST_FOREACH(pm, &pmaps, pm_list) {
2320 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
2321 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
2322 			       newpdes * sizeof (pd_entry_t));
2323 		}
2324 
2325 		/* Invalidate the PDP cache. */
2326 #if 0
2327 		pool_cache_invalidate(&pmap_pdp_cache);
2328 #endif
2329 	}
2330 	pmap_maxkvaddr = maxkvaddr;
2331 	splx(s);
2332 
2333 	return maxkvaddr;
2334 }
2335 
2336 vaddr_t
2337 pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end)
2338 {
2339 	int segno;
2340 	u_int npg;
2341 	vaddr_t va;
2342 	paddr_t pa;
2343 	struct vm_physseg *seg;
2344 
2345 	size = round_page(size);
2346 	npg = atop(size);
2347 
2348 	for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) {
2349 		if (seg->avail_end - seg->avail_start < npg)
2350 			continue;
2351 		/*
2352 		 * We can only steal at an ``unused'' segment boundary,
2353 		 * i.e. either at the start or at the end.
2354 		 */
2355 		if (seg->avail_start == seg->start ||
2356 		    seg->avail_end == seg->end)
2357 			break;
2358 	}
2359 	if (segno == vm_nphysseg) {
2360 		panic("%s: out of memory", __func__);
2361 	} else {
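		/* carve the pages off whichever end of the segment is unused */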
2362 		if (seg->avail_start == seg->start) {
2363 			pa = ptoa(seg->avail_start);
2364 			seg->avail_start += npg;
2365 			seg->start += npg;
2366 		} else {
2367 			pa = ptoa(seg->avail_end) - size;
2368 			seg->avail_end -= npg;
2369 			seg->end -= npg;
2370 		}
2371 		/*
2372 		 * If the whole segment has now been consumed, remove it.
2373 		 * Note that the crash dump code still knows about it
2374 		 * and will dump it correctly.
2375 		 */
2376 		if (seg->start == seg->end) {
2377 			if (vm_nphysseg-- == 1)
2378 				panic("%s: out of memory", __func__);
2379 			while (segno < vm_nphysseg) {
2380 				seg[0] = seg[1]; /* struct copy */
2381 				seg++;
2382 				segno++;
2383 			}
2384 		}
2385 
2386 		va = PMAP_DIRECT_MAP(pa);
2387 		memset((void *)va, 0, size);
2388 	}
2389 
2390 	if (start != NULL)
2391 		*start = virtual_avail;
2392 	if (end != NULL)
2393 		*end = VM_MAX_KERNEL_ADDRESS;
2394 
2395 	return (va);
2396 }
2397 
2398 #ifdef DEBUG
2399 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
2400 
2401 /*
2402  * pmap_dump: dump all the mappings from a pmap
2403  *
2404  * => caller should not be holding any pmap locks
2405  */
2406 
2407 void
2408 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
2409 {
2410 	pt_entry_t *ptes, *pte;
2411 	pd_entry_t **pdes;
2412 	vaddr_t blkendva;
2413 	paddr_t scr3;
2414 
2415 	/*
2416 	 * if the end is out of range, or does not lie beyond the start,
2417 	 * clamp it to the maximum user address.
2418 	 */
2419 
2420 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
2421 		eva = VM_MAXUSER_ADDRESS;
2422 
2423 	pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
2424 
2425 	/*
2426 	 * dumping a range of pages: we dump in PTP sized blocks (2MB)
2427 	 */
2428 
2429 	for (/* null */ ; sva < eva ; sva = blkendva) {
2430 
2431 		/* determine range of block */
2432 		blkendva = x86_round_pdr(sva+1);
2433 		if (blkendva > eva)
2434 			blkendva = eva;
2435 
2436 		/* valid block? */
2437 		if (!pmap_pdes_valid(sva, pdes, NULL))
2438 			continue;
2439 
2440 		pte = &ptes[pl1_i(sva)];
2441 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
2442 			if (!pmap_valid_entry(*pte))
2443 				continue;
2444 			printf("va %#lx -> pa %#llx (pte=%#llx)\n",
2445 			       sva, *pte & PG_FRAME, *pte);
2446 		}
2447 	}
2448 	pmap_unmap_ptes(pmap, scr3);
2449 }
2450 #endif
2451 
2452 void
2453 pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
2454 {
2455 	*vstartp = virtual_avail;
2456 	*vendp = VM_MAX_KERNEL_ADDRESS;
2457 }
2458 
2459 /*
2460  * pmap_convert
2461  *
2462  * Converts 'pmap' to the new 'mode'.
2463  *
2464  * Parameters:
2465  *  pmap: the pmap to convert
2466  *  mode: the new mode (see pmap.h, PMAP_TYPE_xxx)
2467  *
2468  * Return value:
2469  *  always 0
2470  */
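
/*
 * Illustrative use only (not from this file): a caller creating an EPT
 * guest pmap would switch it over with
 *
 *	pmap_convert(pmap, PMAP_TYPE_EPT);
 *
 * before installing guest mappings.
 */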
2471 int
2472 pmap_convert(struct pmap *pmap, int mode)
2473 {
2474 	pt_entry_t *pte;
2475 
2476 	pmap->pm_type = mode;
2477 
2478 	if (mode == PMAP_TYPE_EPT) {
2479 		/* Clear low 512GB region (first PML4E) */
2480 		pte = (pt_entry_t *)pmap->pm_pdir;
2481 		*pte = 0;
2482 	}
2483 
2484 	return (0);
2485 }
2486 
2487 #ifdef MULTIPROCESSOR
2488 /*
2489  * Locking for tlb shootdown.
2490  *
2491  * We lock by setting tlb_shoot_wait to the number of cpus that will
2492  * receive our tlb shootdown. After sending the IPIs, we don't need to
2493  * worry about locking order or interrupts spinning for the lock because
2494  * the call that grabs the "lock" isn't the one that releases it. And
2495  * there is nothing that can block the IPI that releases the lock.
2496  *
2497  * The functions are organized so that we first count the number of
2498  * cpus we need to send the IPI to, then we grab the counter, then
2499  * we send the IPIs, then we finally do our own shootdown.
2500  *
2501  * We do our own shootdown last so that it runs in parallel with the
2502  * other cpus and shortens the total spin time.
2503  *
2504  * Notice that we depend on failures to send IPIs only being able to
2505  * happen during boot. If they happen later, the above assumption
2506  * doesn't hold since we can end up in situations where no one will
2507  * release the lock if we get an interrupt at a bad moment.
2508  */
2509 
2510 volatile long tlb_shoot_wait;
2511 
2512 volatile vaddr_t tlb_shoot_addr1;
2513 volatile vaddr_t tlb_shoot_addr2;
2514 
2515 void
2516 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
2517 {
2518 	struct cpu_info *ci, *self = curcpu();
2519 	CPU_INFO_ITERATOR cii;
2520 	long wait = 0;
2521 	u_int64_t mask = 0;
2522 
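	/*
	 * Count the other running cpus that currently have this pmap
	 * loaded; only they need to receive the invalidation IPI.
	 */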
2523 	CPU_INFO_FOREACH(cii, ci) {
2524 		if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) ||
2525 		    !(ci->ci_flags & CPUF_RUNNING))
2526 			continue;
2527 		mask |= (1ULL << ci->ci_cpuid);
2528 		wait++;
2529 	}
2530 
2531 	if (wait > 0) {
2532 		int s = splvm();
2533 
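		/*
		 * Take the shootdown "lock" by swinging tlb_shoot_wait
		 * from 0 to our cpu count; spin if another shootdown is
		 * still in progress.
		 */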
2534 		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
2535 			while (tlb_shoot_wait != 0)
2536 				SPINLOCK_SPIN_HOOK;
2537 		}
2538 		tlb_shoot_addr1 = va;
2539 		CPU_INFO_FOREACH(cii, ci) {
2540 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2541 				continue;
2542 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
2543 				panic("%s: ipi failed", __func__);
2544 		}
2545 		splx(s);
2546 	}
2547 
2548 	if (shootself)
2549 		pmap_update_pg(va);
2550 }
2551 
2552 void
2553 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
2554 {
2555 	struct cpu_info *ci, *self = curcpu();
2556 	CPU_INFO_ITERATOR cii;
2557 	long wait = 0;
2558 	u_int64_t mask = 0;
2559 	vaddr_t va;
2560 
2561 	CPU_INFO_FOREACH(cii, ci) {
2562 		if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) ||
2563 		    !(ci->ci_flags & CPUF_RUNNING))
2564 			continue;
2565 		mask |= (1ULL << ci->ci_cpuid);
2566 		wait++;
2567 	}
2568 
2569 	if (wait > 0) {
2570 		int s = splvm();
2571 
2572 		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
2573 			while (tlb_shoot_wait != 0)
2574 				SPINLOCK_SPIN_HOOK;
2575 		}
2576 		tlb_shoot_addr1 = sva;
2577 		tlb_shoot_addr2 = eva;
2578 		CPU_INFO_FOREACH(cii, ci) {
2579 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2580 				continue;
2581 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
2582 				panic("%s: ipi failed", __func__);
2583 		}
2584 		splx(s);
2585 	}
2586 
2587 	if (shootself)
2588 		for (va = sva; va < eva; va += PAGE_SIZE)
2589 			pmap_update_pg(va);
2590 }
2591 
2592 void
2593 pmap_tlb_shoottlb(struct pmap *pm, int shootself)
2594 {
2595 	struct cpu_info *ci, *self = curcpu();
2596 	CPU_INFO_ITERATOR cii;
2597 	long wait = 0;
2598 	u_int64_t mask = 0;
2599 
2600 	CPU_INFO_FOREACH(cii, ci) {
2601 		if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) ||
		    !(ci->ci_flags & CPUF_RUNNING))
2602 			continue;
2603 		mask |= (1ULL << ci->ci_cpuid);
2604 		wait++;
2605 	}
2606 
2607 	if (wait) {
2608 		int s = splvm();
2609 
2610 		while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
2611 			while (tlb_shoot_wait != 0)
2612 				SPINLOCK_SPIN_HOOK;
2613 		}
2614 
2615 		CPU_INFO_FOREACH(cii, ci) {
2616 			if ((mask & (1ULL << ci->ci_cpuid)) == 0)
2617 				continue;
2618 			if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
2619 				panic("%s: ipi failed", __func__);
2620 		}
2621 		splx(s);
2622 	}
2623 
2624 	if (shootself)
2625 		tlbflush();
2626 }
2627 
2628 void
2629 pmap_tlb_shootwait(void)
2630 {
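	/* spin until every cpu that was sent an IPI has acknowledged it */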
2631 	while (tlb_shoot_wait != 0)
2632 		SPINLOCK_SPIN_HOOK;
2633 }
2634 
2635 #else
2636 
2637 void
2638 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
2639 {
2640 	if (shootself)
2641 		pmap_update_pg(va);
2642 
2643 }
2644 
2645 void
2646 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
2647 {
2648 	vaddr_t va;
2649 
2650 	if (!shootself)
2651 		return;
2652 
2653 	for (va = sva; va < eva; va += PAGE_SIZE)
2654 		pmap_update_pg(va);
2655 
2656 }
2657 
2658 void
2659 pmap_tlb_shoottlb(struct pmap *pm, int shootself)
2660 {
2661 	if (shootself)
2662 		tlbflush();
2663 }
2664 #endif /* MULTIPROCESSOR */
2665