xref: /netbsd/sys/arch/x86/x86/pmap.c (revision 6550d01e)
1 /*	$NetBSD: pmap.c,v 1.116 2011/02/05 13:50:08 yamt Exp $	*/
2 
3 /*
4  * Copyright (c) 2007 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  */
27 
28 /*
29  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
30  *
31  * Permission to use, copy, modify, and distribute this software for any
32  * purpose with or without fee is hereby granted, provided that the above
33  * copyright notice and this permission notice appear in all copies.
34  *
35  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
36  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
37  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
38  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
39  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
40  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
41  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
42  */
43 
44 /*
45  * Copyright (c) 1997 Charles D. Cranor and Washington University.
46  * All rights reserved.
47  *
48  * Redistribution and use in source and binary forms, with or without
49  * modification, are permitted provided that the following conditions
50  * are met:
51  * 1. Redistributions of source code must retain the above copyright
52  *    notice, this list of conditions and the following disclaimer.
53  * 2. Redistributions in binary form must reproduce the above copyright
54  *    notice, this list of conditions and the following disclaimer in the
55  *    documentation and/or other materials provided with the distribution.
56  *
57  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
58  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
59  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
60  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
61  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
62  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
63  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
64  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
65  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
66  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
67  */
68 
69 /*
70  * Copyright 2001 (c) Wasabi Systems, Inc.
71  * All rights reserved.
72  *
73  * Written by Frank van der Linden for Wasabi Systems, Inc.
74  *
75  * Redistribution and use in source and binary forms, with or without
76  * modification, are permitted provided that the following conditions
77  * are met:
78  * 1. Redistributions of source code must retain the above copyright
79  *    notice, this list of conditions and the following disclaimer.
80  * 2. Redistributions in binary form must reproduce the above copyright
81  *    notice, this list of conditions and the following disclaimer in the
82  *    documentation and/or other materials provided with the distribution.
83  * 3. All advertising materials mentioning features or use of this software
84  *    must display the following acknowledgement:
85  *      This product includes software developed for the NetBSD Project by
86  *      Wasabi Systems, Inc.
87  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
88  *    or promote products derived from this software without specific prior
89  *    written permission.
90  *
91  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
92  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
93  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
94  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
95  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
96  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
97  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
98  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
99  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
100  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
101  * POSSIBILITY OF SUCH DAMAGE.
102  */
103 
104 /*
105  * This is the i386 pmap modified and generalized to support x86-64
106  * as well. The idea is to hide the upper N levels of the page tables
107  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
108  * is mostly untouched, except that it uses some more generalized
109  * macros and interfaces.
110  *
111  * This pmap has been tested on the i386 as well, and it can be easily
112  * adapted to PAE.
113  *
114  * fvdl@wasabisystems.com 18-Jun-2001
115  */
116 
117 /*
118  * pmap.c: i386 pmap module rewrite
119  * Chuck Cranor <chuck@netbsd>
120  * 11-Aug-97
121  *
122  * history of this pmap module: in addition to my own input, i used
123  *    the following references for this rewrite of the i386 pmap:
124  *
125  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
126  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
127  *     it was then ported to the i386 by William Jolitz of UUNET
128  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
129  *     project fixed some bugs and provided some speed ups.
130  *
131  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
132  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
133  *     and David Greenman.
134  *
135  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
136  *     between several processors.   the VAX version was done by
137  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
138  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
139  *     David Golub, and Richard Draves.    the alpha version was
140  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
141  *     (NetBSD/alpha).
142  */
143 
144 #include <sys/cdefs.h>
145 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.116 2011/02/05 13:50:08 yamt Exp $");
146 
147 #include "opt_user_ldt.h"
148 #include "opt_lockdebug.h"
149 #include "opt_multiprocessor.h"
150 #include "opt_xen.h"
151 #if !defined(__x86_64__)
152 #include "opt_kstack_dr0.h"
153 #endif /* !defined(__x86_64__) */
154 
155 #include <sys/param.h>
156 #include <sys/systm.h>
157 #include <sys/proc.h>
158 #include <sys/pool.h>
159 #include <sys/kernel.h>
160 #include <sys/atomic.h>
161 #include <sys/cpu.h>
162 #include <sys/intr.h>
163 #include <sys/xcall.h>
164 
165 #include <uvm/uvm.h>
166 
167 #include <dev/isa/isareg.h>
168 
169 #include <machine/specialreg.h>
170 #include <machine/gdt.h>
171 #include <machine/isa_machdep.h>
172 #include <machine/cpuvar.h>
173 
174 #include <x86/pmap.h>
175 #include <x86/pmap_pv.h>
176 
177 #include <x86/i82489reg.h>
178 #include <x86/i82489var.h>
179 
180 #ifdef XEN
181 #include <xen/xen3-public/xen.h>
182 #include <xen/hypervisor.h>
183 #endif
184 
185 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */
186 #if defined(XEN) && defined(__x86_64__)
187 #define PG_k PG_u
188 #else
189 #define PG_k 0
190 #endif
191 
192 /*
193  * general info:
194  *
195  *  - for an explanation of how the i386 MMU hardware works see
196  *    the comments in <machine/pte.h>.
197  *
198  *  - for an explanation of the general memory structure used by
199  *    this pmap (including the recursive mapping), see the comments
200  *    in <machine/pmap.h>.
201  *
202  * this file contains the code for the "pmap module."   the module's
203  * job is to manage the hardware's virtual to physical address mappings.
204  * note that there are two levels of mapping in the VM system:
205  *
206  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
207  *      to map ranges of virtual address space to objects/files.  for
208  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
209  *      to the file /bin/ls starting at offset zero."   note that
210  *      the upper layer mapping is not concerned with how individual
211  *      vm_pages are mapped.
212  *
213  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
214  *      from virtual addresses.   it is concerned with which vm_page is
215  *      mapped where.   for example, when you run /bin/ls and start
216  *      at page 0x1000 the fault routine may lookup the correct page
217  *      of the /bin/ls file and then ask the pmap layer to establish
218  *      a mapping for it.
219  *
220  * note that information in the lower layer of the VM system can be
221  * thrown away since it can easily be reconstructed from the info
222  * in the upper layer.
223  *
224  * data structures we use include:
225  *
226  *  - struct pmap: describes the address space of one thread
227  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
228  *  - struct pv_head: there is one pv_head per managed page of
229  *	physical memory.   the pv_head points to a list of pv_entry
230  *	structures which describe all the <PMAP,VA> pairs that this
231  *      page is mapped in.    this is critical for page based operations
232  *      such as pmap_page_protect() [change protection on _all_ mappings
233  *      of a page]
234  */
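
/*
 * Illustrative sketch: a simplified view of the structures described above.
 * The real definitions live in <x86/pmap.h> and <x86/pmap_pv.h>; the
 * "*_sketch" names below are hypothetical and show only some of the fields
 * this file actually touches.
 */
#if 0	/* sketch only, not compiled */
struct pv_pte_sketch {				/* one <PMAP,VA> reference */
	struct vm_page *pte_ptp;		/* PTP holding the PTE (NULL: kernel) */
	vaddr_t pte_va;				/* VA mapped by that PTE */
};

struct pv_entry_sketch {			/* one mapping of a managed page */
	struct pv_pte_sketch pve_pte;		/* the <PMAP,VA> pair itself */
	LIST_ENTRY(pv_entry_sketch) pve_list;	/* chained off the page's pv_head */
	SLIST_ENTRY(pv_entry_sketch) pve_hash;	/* global pv hash, see pv_hash_heads[] */
};
#endif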
235 
236 /*
237  * memory allocation
238  *
239  *  - there are three data structures that we must dynamically allocate:
240  *
241  * [A] new process' page directory page (PDP)
242  *	- plan 1: done at pmap_create() we use
243  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
244  *	  allocation.
245  *
246  * if we are low in free physical memory then we sleep in
247  * uvm_km_alloc -- in this case this is ok since we are creating
248  * a new pmap and should not be holding any locks.
249  *
250  * if the kernel is totally out of virtual space
251  * (i.e. uvm_km_alloc returns NULL), then we panic.
252  *
253  * [B] new page tables pages (PTP)
254  * 	- call uvm_pagealloc()
255  * 		=> success: zero page, add to pm_pdir
256  * 		=> failure: we are out of free vm_pages, let pmap_enter()
257  *		   tell UVM about it.
258  *
259  * note: for kernel PTPs, we start with NKPTP of them.   as we map
260  * kernel memory (at uvm_map time) we check to see if we've grown
261  * the kernel pmap.   if so, we call the optional function
262  * pmap_growkernel() to grow the kernel PTPs in advance.
263  *
264  * [C] pv_entry structures
265  */
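
/*
 * Illustrative sketch of [A] and [B] above.  This is only a rough outline of
 * the allocation calls the comment describes; the real code is in
 * pmap_create(), pmap_get_ptp() and pmap_growkernel() further down.  The
 * locals "pdp_va", "ptp", "pmap" and "va" are hypothetical.
 */
#if 0	/* sketch only, not compiled */
	vaddr_t pdp_va;
	struct vm_page *ptp;

	/* [A] PDP for a new pmap: may sleep, so hold no locks here. */
	pdp_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
	    UVM_KMF_WIRED | UVM_KMF_ZERO);
	if (pdp_va == 0)
		panic("out of kernel VA for PDP");

	/* [B] PTP backing user VA "va": failure goes back to pmap_enter(). */
	ptp = uvm_pagealloc(&pmap->pm_obj[0], ptp_va2o(va, 1), NULL,
	    UVM_PGA_USERESERVE | UVM_PGA_ZERO);
	if (ptp == NULL)
		return ENOMEM;
#endif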
266 
267 /*
268  * locking
269  *
270  * we have the following locks that we must contend with:
271  *
272  * mutexes:
273  *
274  * - pmap lock (per pmap, part of uvm_object)
275  *   this lock protects the fields in the pmap structure including
276  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
277  *   in the alternate PTE space (since that is determined by the
278  *   entry in the PDP).
279  *
280  * - pvh_lock (per pv_head)
281  *   this lock protects the pv_entry list which is chained off the
282  *   pv_head structure for a specific managed PA.   it is locked
283  *   when traversing the list (e.g. adding/removing mappings,
284  *   syncing R/M bits, etc.)
285  *
286  * - pmaps_lock
287  *   this lock protects the list of active pmaps (headed by "pmaps").
288  *   we lock it when adding or removing pmaps from this list.
289  *
290  * tlb shootdown
291  *
292  * tlb shootdowns are hard interrupts that operate outside the spl
293  * framework: they don't need to be blocked provided that the pmap module
294  * gets the order of events correct.  the calls are made by talking directly
295  * to the lapic.  the stubs to handle the interrupts are quite short and do
296  * one of the following: invalidate a single page, a range of pages, all
297  * user tlb entries or the entire tlb.
298  *
299  * the cpus synchronize with each other using pmap_mbox structures which are
300  * aligned on 64-byte cache lines.  tlb shootdowns against the kernel pmap
301  * use a global mailbox and are generated using a broadcast ipi (broadcast
302  * to all but the sending cpu).  shootdowns against regular pmaps use
303  * per-cpu mailboxes and are multicast.  kernel and user shootdowns can
304  * execute simultaneously, as can shootdowns within different multithreaded
305  * processes.  TODO:
306  *
307  *   1. figure out which waitpoints can be deferred to pmap_update().
308  *   2. see if there is a cheap way to batch some updates.
309  */
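
/*
 * Illustrative sketch of the shootdown pattern used throughout this file
 * (compare pmap_kenter_pa() and pmap_apte_flush() below): change the PTE
 * first, then invalidate any TLB entries that may still refer to the old
 * mapping.  "pmap", "pte", "va" and "npte" are hypothetical here.
 */
#if 0	/* sketch only, not compiled */
	pt_entry_t opte;

	kpreempt_disable();
	opte = pmap_pte_testset(pte, npte);	/* atomically swap the PTE */
	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
		/* old mapping was valid and used: it may be cached in a TLB */
		pmap_tlb_shootdown(pmap, va, 0, opte);
		pmap_tlb_shootwait();		/* or defer to pmap_update() */
	}
	kpreempt_enable();
#endif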
310 
311 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
312 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
313 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
314 const long nbpd[] = NBPD_INITIALIZER;
315 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
316 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
317 
318 long nkptp[] = NKPTP_INITIALIZER;
319 
320 static kmutex_t pmaps_lock;
321 
322 static vaddr_t pmap_maxkvaddr;
323 
324 #define COUNT(x)	/* nothing */
325 
326 /*
327  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
328  * actual locking is done by pm_lock.
329  */
330 #if defined(DIAGNOSTIC)
331 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
332 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
333 	if ((idx) != 0) \
334 		mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock)
335 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
336 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
337 	if ((idx) != 0) \
338 		mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock)
339 #else /* defined(DIAGNOSTIC) */
340 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
341 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
342 #endif /* defined(DIAGNOSTIC) */
343 
344 /*
345  * Misc. event counters.
346  */
347 struct evcnt pmap_iobmp_evcnt;
348 struct evcnt pmap_ldt_evcnt;
349 
350 /*
351  * Global TLB shootdown mailbox.
352  */
353 struct evcnt pmap_tlb_evcnt __aligned(64);
354 struct pmap_mbox pmap_mbox __aligned(64);
355 
356 /*
357  * PAT
358  */
359 #define	PATENTRY(n, type)	(type << ((n) * 8))
360 #define	PAT_UC		0x0ULL
361 #define	PAT_WC		0x1ULL
362 #define	PAT_WT		0x4ULL
363 #define	PAT_WP		0x5ULL
364 #define	PAT_WB		0x6ULL
365 #define	PAT_UCMINUS	0x7ULL
366 
367 static bool cpu_pat_enabled = false;
368 
369 
370 /*
371  * Per-CPU data.  The pmap mailbox is cache-intensive, so it gets its
372  * own cache line.  Note that the mailbox must be the first item.
373  */
374 struct pmap_cpu {
375 	/* TLB shootdown */
376 	struct pmap_mbox pc_mbox;
377 };
378 
379 union {
380 	struct pmap_cpu pc;
381 	uint8_t padding[64];
382 } pmap_cpu[MAXCPUS] __aligned(64);
383 
384 /*
385  * global data structures
386  */
387 
388 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
389 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
390 
391 /*
392  * pmap_pg_g: if our processor supports PG_G in the PTE then we
393  * set pmap_pg_g to PG_G (otherwise it is zero).
394  */
395 
396 int pmap_pg_g = 0;
397 
398 /*
399  * pmap_largepages: if our processor supports PG_PS and we are
400  * using it, this is set to true.
401  */
402 
403 int pmap_largepages;
404 
405 /*
406  * i386 physical memory comes in a big contig chunk with a small
407  * hole toward the front of it...  the following two paddr_t's
408  * (shared with machdep.c) describe the physical address space
409  * of this machine.
410  */
411 paddr_t avail_start;	/* PA of first available physical page */
412 paddr_t avail_end;	/* PA of last available physical page */
413 
414 #ifdef XEN
415 #ifdef __x86_64__
416 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
417 static paddr_t xen_dummy_user_pgd;
418 #endif /* __x86_64__ */
419 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
420 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
421 #endif /* XEN */
422 
423 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
424 
425 #define	pp_lock(pp)	mutex_spin_enter(&(pp)->pp_lock)
426 #define	pp_unlock(pp)	mutex_spin_exit(&(pp)->pp_lock)
427 #define	pp_locked(pp)	mutex_owned(&(pp)->pp_lock)
428 
429 #define	PV_HASH_SIZE		32768
430 #define	PV_HASH_LOCK_CNT	32
431 
432 struct pv_hash_lock {
433 	kmutex_t lock;
434 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
435     __aligned(CACHE_LINE_SIZE);
436 
437 struct pv_hash_head {
438 	SLIST_HEAD(, pv_entry) hh_list;
439 } pv_hash_heads[PV_HASH_SIZE];
440 
441 static u_int
442 pvhash_hash(struct vm_page *ptp, vaddr_t va)
443 {
444 
445 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
446 }
447 
448 static struct pv_hash_head *
449 pvhash_head(u_int hash)
450 {
451 
452 	return &pv_hash_heads[hash % PV_HASH_SIZE];
453 }
454 
455 static kmutex_t *
456 pvhash_lock(u_int hash)
457 {
458 
459 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
460 }
461 
462 static struct pv_entry *
463 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
464 {
465 	struct pv_entry *pve;
466 	struct pv_entry *prev;
467 
468 	prev = NULL;
469 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
470 		if (pve->pve_pte.pte_ptp == ptp &&
471 		    pve->pve_pte.pte_va == va) {
472 			if (prev != NULL) {
473 				SLIST_REMOVE_AFTER(prev, pve_hash);
474 			} else {
475 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
476 			}
477 			break;
478 		}
479 		prev = pve;
480 	}
481 	return pve;
482 }
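
/*
 * Illustrative sketch: how the three pvhash helpers above combine on the
 * insert side (the real insertion code is insert_pv() further down; removal
 * uses pvhash_remove() above).  "pve" is a hypothetical pv_entry pointer.
 */
#if 0	/* sketch only, not compiled */
	u_int hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
	kmutex_t *lock = pvhash_lock(hash);
	struct pv_hash_head *hh = pvhash_head(hash);

	mutex_spin_enter(lock);
	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
	mutex_spin_exit(lock);
#endif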
483 
484 /*
485  * other data structures
486  */
487 
488 static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
489 static bool pmap_initialized = false;	/* pmap_init done yet? */
490 
491 /*
492  * the following two vaddr_t's are used during system startup
493  * to keep track of how much of the kernel's VM space we have used.
494  * once the system is started, the management of the remaining kernel
495  * VM space is turned over to the kernel_map vm_map.
496  */
497 
498 static vaddr_t virtual_avail;	/* VA of first free KVA */
499 static vaddr_t virtual_end;	/* VA of last free KVA */
500 
501 /*
502  * linked list of all non-kernel pmaps
503  */
504 
505 static struct pmap_head pmaps;
506 
507 /*
508  * pool that pmap structures are allocated from
509  */
510 
511 static struct pool_cache pmap_cache;
512 
513 /*
514  * pv_entry cache
515  */
516 
517 static struct pool_cache pmap_pv_cache;
518 
519 /*
520  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
521  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
522  * due to false sharing.
523  */
524 
525 #ifdef MULTIPROCESSOR
526 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
527 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
528 #else
529 #define PTESLEW(pte, id) (pte)
530 #define VASLEW(va,id) (va)
531 #endif
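
/*
 * Illustrative sketch of how the slewed slots are meant to be used, loosely
 * modeled on routines such as pmap_zero_page() later in this file: each CPU
 * indexes its own PTE/VA pair, so two CPUs never share a cache line of PTEs.
 * "pa" is a hypothetical physical page address.
 */
#if 0	/* sketch only, not compiled */
	int id = cpu_number();
	pt_entry_t *zpte = PTESLEW(zero_pte, id);
	void *zva = VASLEW(zerop, id);

	*zpte = pmap_pa2pte(pa) | PG_V | PG_RW | PG_k;	/* map the page */
	pmap_pte_flush();
	pmap_update_pg((vaddr_t)zva);			/* flush stale TLB entry */
	memset(zva, 0, PAGE_SIZE);			/* use the mapping */
	*zpte = 0;					/* unmap when done */
#endif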
532 
533 /*
534  * special VAs and the PTEs that map them
535  */
536 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
537 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
538 
539 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
540 
541 /*
542  * pool and cache that PDPs are allocated from
543  */
544 
545 static struct pool_cache pmap_pdp_cache;
546 int	pmap_pdp_ctor(void *, void *, int);
547 void	pmap_pdp_dtor(void *, void *);
548 #ifdef PAE
549 /* need to allocate items of 4 pages */
550 void *pmap_pdp_alloc(struct pool *, int);
551 void pmap_pdp_free(struct pool *, void *);
552 static struct pool_allocator pmap_pdp_allocator = {
553 	.pa_alloc = pmap_pdp_alloc,
554 	.pa_free = pmap_pdp_free,
555 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
556 };
557 #endif /* PAE */
558 
559 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
560 
561 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
562 extern paddr_t idt_paddr;
563 
564 #ifdef _LP64
565 extern vaddr_t lo32_vaddr;
566 extern vaddr_t lo32_paddr;
567 #endif
568 
569 extern int end;
570 
571 #ifdef i386
572 /* stuff to fix the pentium f00f bug */
573 extern vaddr_t pentium_idt_vaddr;
574 #endif
575 
576 
577 /*
578  * local prototypes
579  */
580 
581 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
582 				      pd_entry_t * const *);
583 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
584 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
585 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
586 				       vaddr_t, pt_entry_t *,
587 				       pd_entry_t * const *);
588 static bool		 pmap_is_curpmap(struct pmap *);
589 static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
590 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
591 					 pt_entry_t *, vaddr_t,
592 					 struct pv_entry **);
593 static pt_entry_t	 pmap_remove_ptes(struct pmap *, struct vm_page *,
594 					  vaddr_t, vaddr_t, vaddr_t,
595 					  struct pv_entry **);
596 
597 static void		 pmap_unmap_apdp(void);
598 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
599 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
600 					  long *);
601 
602 static bool		 pmap_reactivate(struct pmap *);
603 
604 /*
605  * p m a p   h e l p e r   f u n c t i o n s
606  */
607 
608 static inline void
609 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
610 {
611 
612 	if (pmap == pmap_kernel()) {
613 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
614 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
615 	} else {
616 		KASSERT(mutex_owned(&pmap->pm_lock));
617 		pmap->pm_stats.resident_count += resid_diff;
618 		pmap->pm_stats.wired_count += wired_diff;
619 	}
620 }
621 
622 static inline void
623 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
624 {
625 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
626 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
627 
628 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
629 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
630 
631 	pmap_stats_update(pmap, resid_diff, wired_diff);
632 }
633 
634 /*
635  * ptp_to_pmap: lookup pmap by ptp
636  */
637 
638 static struct pmap *
639 ptp_to_pmap(struct vm_page *ptp)
640 {
641 	struct pmap *pmap;
642 
643 	if (ptp == NULL) {
644 		return pmap_kernel();
645 	}
646 	pmap = (struct pmap *)ptp->uobject;
647 	KASSERT(pmap != NULL);
648 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
649 	return pmap;
650 }
651 
652 static inline struct pv_pte *
653 pve_to_pvpte(struct pv_entry *pve)
654 {
655 
656 	KASSERT((void *)&pve->pve_pte == (void *)pve);
657 	return &pve->pve_pte;
658 }
659 
660 static inline struct pv_entry *
661 pvpte_to_pve(struct pv_pte *pvpte)
662 {
663 	struct pv_entry *pve = (void *)pvpte;
664 
665 	KASSERT(pve_to_pvpte(pve) == pvpte);
666 	return pve;
667 }
668 
669 /*
670  * pv_pte_first, pv_pte_next: PV list iterator.
671  */
672 
673 static struct pv_pte *
674 pv_pte_first(struct pmap_page *pp)
675 {
676 
677 	KASSERT(pp_locked(pp));
678 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
679 		return &pp->pp_pte;
680 	}
681 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
682 }
683 
684 static struct pv_pte *
685 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
686 {
687 
688 	KASSERT(pvpte != NULL);
689 	KASSERT(pp_locked(pp));
690 	if (pvpte == &pp->pp_pte) {
691 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
692 		return NULL;
693 	}
694 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
695 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
696 }
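
/*
 * Illustrative sketch: walking every <PMAP,VA> mapping of a managed page
 * with the iterator above, under the page's pp_lock.  This is the pattern
 * behind the page-based operations described at the top of this file.
 * "pg" is a hypothetical struct vm_page pointer.
 */
#if 0	/* sketch only, not compiled */
	struct pmap_page *pp = VM_PAGE_TO_PP(pg);
	struct pv_pte *pvpte;

	pp_lock(pp);
	for (pvpte = pv_pte_first(pp); pvpte != NULL;
	    pvpte = pv_pte_next(pp, pvpte)) {
		/* pvpte->pte_ptp and pvpte->pte_va name one mapping */
	}
	pp_unlock(pp);
#endif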
697 
698 /*
699  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
700  *		of course the kernel is always loaded
701  */
702 
703 inline static bool
704 pmap_is_curpmap(struct pmap *pmap)
705 {
706 #if defined(XEN) && defined(__x86_64__)
707 	/*
708 	 * Only the kernel pmap is physically loaded.
709 	 * The user PGD may be active, but the TLB will be flushed
710 	 * by HYPERVISOR_iret anyway, so say no.
711 	 */
712 	return(pmap == pmap_kernel());
713 #else /* XEN && __x86_64__*/
714 	return((pmap == pmap_kernel()) ||
715 	       (pmap == curcpu()->ci_pmap));
716 #endif
717 }
718 
719 /*
720  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
721  */
722 
723 inline static bool
724 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
725 {
726 
727 	return (pmap == pmap_kernel() ||
728 	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
729 	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
730 }
731 
732 static void
733 pmap_apte_flush(struct pmap *pmap)
734 {
735 
736 	KASSERT(kpreempt_disabled());
737 
738 	/*
739 	 * Flush the APTE mapping from all other CPUs that
740 	 * are using the pmap we are using (whose APTE space
741 	 * is the one we've just modified).
742 	 *
743 	 * XXXthorpej -- find a way to defer the IPI.
744 	 */
745 	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0);
746 	pmap_tlb_shootwait();
747 }
748 
749 /*
750  * Unmap the content of APDP PDEs
751  */
752 static void
753 pmap_unmap_apdp(void)
754 {
755 	int i;
756 
757 	for (i = 0; i < PDP_SIZE; i++) {
758 		pmap_pte_set(APDP_PDE+i, 0);
759 #if defined (XEN) && defined (PAE)
760 		/* clear shadow entries too */
761 		pmap_pte_set(APDP_PDE_SHADOW+i, 0);
762 #endif
763 	}
764 }
765 
766 /*
767  *	Add a reference to the specified pmap.
768  */
769 
770 inline void
771 pmap_reference(struct pmap *pmap)
772 {
773 
774 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
775 }
776 
777 /*
778  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
779  *
780  * => we lock enough pmaps to keep things locked in
781  * => must be undone with pmap_unmap_ptes before returning
782  */
783 
784 void
785 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
786 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
787 {
788 	pd_entry_t opde, npde;
789 	struct pmap *ourpmap;
790 	struct cpu_info *ci;
791 	struct lwp *l;
792 	bool iscurrent;
793 	uint64_t ncsw;
794 #ifdef XEN
795 	int s, i;
796 #endif
797 
798 	/* the kernel's pmap is always accessible */
799 	if (pmap == pmap_kernel()) {
800 		*pmap2 = NULL;
801 		*ptepp = PTE_BASE;
802 		*pdeppp = normal_pdes;
803 		return;
804 	}
805 	KASSERT(kpreempt_disabled());
806 
807  retry:
808 	l = curlwp;
809 	ncsw = l->l_ncsw;
810  	ourpmap = NULL;
811 	ci = curcpu();
812 #if defined(XEN) && defined(__x86_64__)
813 	/*
814 	 * curpmap can only be pmap_kernel, so at this point
815 	 * pmap_is_curpmap() is always false.
816 	 */
817 	iscurrent = 0;
818 	ourpmap = pmap_kernel();
819 #else /* XEN && __x86_64__*/
820 	if (ci->ci_want_pmapload &&
821 	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
822 		pmap_load();
823 		if (l->l_ncsw != ncsw)
824 			goto retry;
825 	}
826 	iscurrent = pmap_is_curpmap(pmap);
827 	/* if curpmap then we are always mapped */
828 	if (iscurrent) {
829 		mutex_enter(&pmap->pm_lock);
830 		*pmap2 = NULL;
831 		*ptepp = PTE_BASE;
832 		*pdeppp = normal_pdes;
833 		goto out;
834 	}
835 	ourpmap = ci->ci_pmap;
836 #endif /* XEN && __x86_64__ */
837 
838 	/* need to lock both curpmap and pmap: use ordered locking */
839 	pmap_reference(ourpmap);
840 	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
841 		mutex_enter(&pmap->pm_lock);
842 		mutex_enter(&ourpmap->pm_lock);
843 	} else {
844 		mutex_enter(&ourpmap->pm_lock);
845 		mutex_enter(&pmap->pm_lock);
846 	}
847 
848 	if (l->l_ncsw != ncsw)
849 		goto unlock_and_retry;
850 
851 	/* need to load a new alternate pt space into curpmap? */
852 	COUNT(apdp_pde_map);
853 	opde = *APDP_PDE;
854 	if (!pmap_valid_entry(opde) ||
855 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
856 #ifdef XEN
857 		s = splvm();
858 		/* Make recursive entry usable in user PGD */
859 		for (i = 0; i < PDP_SIZE; i++) {
860 			npde = pmap_pa2pte(
861 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
862 			xpq_queue_pte_update(
863 			    xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
864 			    npde);
865 			xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
866 			    npde);
867 #ifdef PAE
868 			/* update shadow entry too */
869 			xpq_queue_pte_update(
870 			    xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
871 #endif /* PAE */
872 			xpq_queue_invlpg(
873 			    (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
874 		}
875 		if (pmap_valid_entry(opde))
876 			pmap_apte_flush(ourpmap);
877 		splx(s);
878 #else /* XEN */
879 		int i;
880 		for (i = 0; i < PDP_SIZE; i++) {
881 			npde = pmap_pa2pte(
882 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_RW | PG_V;
883 			pmap_pte_set(APDP_PDE+i, npde);
884 		}
885 		pmap_pte_flush();
886 		if (pmap_valid_entry(opde))
887 			pmap_apte_flush(ourpmap);
888 #endif /* XEN */
889 	}
890 	*pmap2 = ourpmap;
891 	*ptepp = APTE_BASE;
892 	*pdeppp = alternate_pdes;
893 	KASSERT(l->l_ncsw == ncsw);
894 #if !defined(XEN) || !defined(__x86_64__)
895  out:
896 #endif
897  	/*
898  	 * might have blocked, need to retry?
899  	 */
900 	if (l->l_ncsw != ncsw) {
901  unlock_and_retry:
902 	    	if (ourpmap != NULL) {
903 			mutex_exit(&ourpmap->pm_lock);
904 			pmap_destroy(ourpmap);
905 		}
906 		mutex_exit(&pmap->pm_lock);
907 		goto retry;
908 	}
909 
910 	return;
911 }
912 
913 /*
914  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
915  */
916 
917 void
918 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
919 {
920 
921 	if (pmap == pmap_kernel()) {
922 		return;
923 	}
924 	KASSERT(kpreempt_disabled());
925 	if (pmap2 == NULL) {
926 		mutex_exit(&pmap->pm_lock);
927 	} else {
928 #if defined(XEN) && defined(__x86_64__)
929 		KASSERT(pmap2 == pmap_kernel());
930 #else
931 		KASSERT(curcpu()->ci_pmap == pmap2);
932 #endif
933 #if defined(MULTIPROCESSOR)
934 		pmap_unmap_apdp();
935 		pmap_pte_flush();
936 		pmap_apte_flush(pmap2);
937 #endif
938 		COUNT(apdp_pde_unmap);
939 		mutex_exit(&pmap->pm_lock);
940 		mutex_exit(&pmap2->pm_lock);
941 		pmap_destroy(pmap2);
942 	}
943 }
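
/*
 * Illustrative sketch: the usual bracket around access to another pmap's
 * PTEs via pmap_map_ptes()/pmap_unmap_ptes().  "pmap" and "va" are
 * hypothetical; the real callers are pmap_remove(), pmap_enter() and
 * friends further down.
 */
#if 0	/* sketch only, not compiled */
	struct pmap *pmap2;
	pt_entry_t *ptes;
	pd_entry_t * const *pdes;

	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
	/* ... check pdes[] and read/modify ptes[pl1_i(va)] here ... */
	pmap_unmap_ptes(pmap, pmap2);			/* unlocks pmap */
	kpreempt_enable();
#endif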
944 
945 inline static void
946 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
947 {
948 
949 #if !defined(__x86_64__)
950 	if (curproc == NULL || curproc->p_vmspace == NULL ||
951 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
952 		return;
953 
954 	if ((opte ^ npte) & PG_X)
955 		pmap_update_pg(va);
956 
957 	/*
958 	 * Executability was removed on the last executable change.
959 	 * Reset the code segment to something conservative and
960 	 * let the trap handler deal with setting the right limit.
961 	 * We can't compute that here because of vm_map locking constraints.
962 	 */
963 
964 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
965 		struct trapframe *tf = curlwp->l_md.md_regs;
966 
967 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
968 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
969 	}
970 #endif /* !defined(__x86_64__) */
971 }
972 
973 #if !defined(__x86_64__)
974 /*
975  * Fixup the code segment to cover all potential executable mappings.
976  * returns 0 if no changes to the code segment were made.
977  */
978 
979 int
980 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
981 {
982 	struct vm_map_entry *ent;
983 	struct pmap *pm = vm_map_pmap(map);
984 	vaddr_t va = 0;
985 
986 	vm_map_lock_read(map);
987 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
988 
989 		/*
990 		 * This entry has greater va than the entries before.
991 		 * We need to make it point to the last page, not past it.
992 		 */
993 
994 		if (ent->protection & VM_PROT_EXECUTE)
995 			va = trunc_page(ent->end) - PAGE_SIZE;
996 	}
997 	vm_map_unlock_read(map);
998 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
999 		return (0);
1000 
1001 	pm->pm_hiexec = va;
1002 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
1003 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
1004 	} else {
1005 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
1006 		return (0);
1007 	}
1008 	return (1);
1009 }
1010 #endif /* !defined(__x86_64__) */
1011 
1012 void
1013 pat_init(struct cpu_info *ci)
1014 {
1015 	uint64_t pat;
1016 
1017 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
1018 		return;
1019 
1020 	/* We change WT to WC.  Leave all other entries at their default values. */
1021 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
1022 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
1023 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
1024 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
1025 
1026 	wrmsr(MSR_CR_PAT, pat);
1027 	cpu_pat_enabled = true;
1028 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
1029 }
1030 
1031 static pt_entry_t
1032 pmap_pat_flags(u_int flags)
1033 {
1034 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
1035 
1036 	if (!cpu_pat_enabled) {
1037 		switch (cacheflags) {
1038 		case PMAP_NOCACHE:
1039 		case PMAP_NOCACHE_OVR:
1040 			/* results in PGC_UCMINUS on cpus which have
1041 			 * the cpuid PAT but PAT "disabled"
1042 			 */
1043 			return PG_N;
1044 		default:
1045 			return 0;
1046 		}
1047 	}
1048 
1049 	switch (cacheflags) {
1050 	case PMAP_NOCACHE:
1051 		return PGC_UC;
1052 	case PMAP_WRITE_COMBINE:
1053 		return PGC_WC;
1054 	case PMAP_WRITE_BACK:
1055 		return PGC_WB;
1056 	case PMAP_NOCACHE_OVR:
1057 		return PGC_UCMINUS;
1058 	}
1059 
1060 	return 0;
1061 }
1062 
1063 /*
1064  * p m a p   k e n t e r   f u n c t i o n s
1065  *
1066  * functions to quickly enter/remove pages from the kernel address
1067  * space.   pmap_kremove is exported to MI kernel.  we make use of
1068  * the recursive PTE mappings.
1069  */
1070 
1071 /*
1072  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1073  *
1074  * => no need to lock anything, assume va is already allocated
1075  * => should be faster than normal pmap enter function
1076  */
1077 
1078 void
1079 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1080 {
1081 	pt_entry_t *pte, opte, npte;
1082 
1083 	KASSERT(!(prot & ~VM_PROT_ALL));
1084 
1085 	if (va < VM_MIN_KERNEL_ADDRESS)
1086 		pte = vtopte(va);
1087 	else
1088 		pte = kvtopte(va);
1089 #ifdef DOM0OPS
1090 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1091 #ifdef DEBUG
1092 		printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64
1093 		    " outside range\n", (int64_t)pa, (int64_t)va);
1094 #endif /* DEBUG */
1095 		npte = pa;
1096 	} else
1097 #endif /* DOM0OPS */
1098 		npte = pmap_pa2pte(pa);
1099 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1100 	npte |= pmap_pat_flags(flags);
1101 	opte = pmap_pte_testset(pte, npte); /* zap! */
1102 #if defined(DIAGNOSTIC)
1103 	/* XXX For now... */
1104 	if (opte & PG_PS)
1105 		panic("pmap_kenter_pa: PG_PS");
1106 #endif
1107 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1108 		/* This should not happen, so no need to batch updates. */
1109 		kpreempt_disable();
1110 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1111 		kpreempt_enable();
1112 	}
1113 }
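
/*
 * Illustrative sketch: a typical pmap_kenter_pa() caller mapping a device
 * page at the hypothetical PA "dev_pa", using one of the PMAP_* cache flags
 * handled by pmap_pat_flags() above.
 */
#if 0	/* sketch only, not compiled */
	vaddr_t va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY);

	pmap_kenter_pa(va, dev_pa, VM_PROT_READ | VM_PROT_WRITE,
	    PMAP_NOCACHE);		/* PGC_UC with PAT, plain PG_N without */
	pmap_update(pmap_kernel());
#endif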
1114 
1115 void
1116 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1117 {
1118 	pt_entry_t *pte, opte, npte;
1119 
1120 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1121 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1122 
1123 #ifdef DOM0OPS
1124 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1125 		npte = pa;
1126 	} else
1127 #endif
1128 		npte = pmap_pa2pte(pa);
1129 
1131 	npte |= protection_codes[prot] | PG_k | PG_V;
1132 	opte = pmap_pte_testset(pte, npte);
1133 }
1134 
1135 /*
1136  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1137  */
1138 void
1139 pmap_emap_sync(bool canload)
1140 {
1141 	struct cpu_info *ci = curcpu();
1142 	struct pmap *pmap;
1143 
1144 	KASSERT(kpreempt_disabled());
1145 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1146 		/*
1147 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1148 		 * not perform TLB flush, if state has not changed.
1149 		 */
1150 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1151 		if (__predict_false(pmap == ci->ci_pmap)) {
1152 			const uint32_t cpumask = ci->ci_cpumask;
1153 			atomic_and_32(&pmap->pm_cpus, ~cpumask);
1154 		}
1155 		pmap_load();
1156 		KASSERT(ci->ci_want_pmapload == 0);
1157 	} else {
1158 		tlbflush();
1159 	}
1160 
1161 }
1162 
1163 void
1164 pmap_emap_remove(vaddr_t sva, vsize_t len)
1165 {
1166 	pt_entry_t *pte, xpte = 0;
1167 	vaddr_t va, eva = sva + len;
1168 
1169 	for (va = sva; va < eva; va += PAGE_SIZE) {
1170 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1171 		xpte |= pmap_pte_testset(pte, 0);
1172 	}
1173 }
1174 
1175 __weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1176 
1177 #if defined(__x86_64__)
1178 /*
1179  * Change protection for a virtual address. Local for a CPU only, don't
1180  * care about TLB shootdowns.
1181  *
1182  * => must be called with preemption disabled
1183  */
1184 void
1185 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1186 {
1187 	pt_entry_t *pte, opte, npte;
1188 
1189 	KASSERT(kpreempt_disabled());
1190 
1191 	if (va < VM_MIN_KERNEL_ADDRESS)
1192 		pte = vtopte(va);
1193 	else
1194 		pte = kvtopte(va);
1195 
1196 	npte = opte = *pte;
1197 
1198 	if ((prot & VM_PROT_WRITE) != 0)
1199 		npte |= PG_RW;
1200 	else
1201 		npte &= ~PG_RW;
1202 
1203 	if (opte != npte) {
1204 		pmap_pte_set(pte, npte);
1205 		pmap_pte_flush();
1206 		invlpg(va);
1207 	}
1208 }
1209 #endif /* defined(__x86_64__) */
1210 
1211 /*
1212  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1213  *
1214  * => no need to lock anything
1215  * => caller must dispose of any vm_page mapped in the va range
1216  * => note: not an inline function
1217  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1218  * => we assume kernel only unmaps valid addresses and thus don't bother
1219  *    checking the valid bit before doing TLB flushing
1220  * => must be followed by call to pmap_update() before reuse of page
1221  */
1222 
1223 void
1224 pmap_kremove(vaddr_t sva, vsize_t len)
1225 {
1226 	pt_entry_t *pte, xpte;
1227 	vaddr_t va, eva;
1228 
1229 	eva = sva + len;
1230 	xpte = 0;
1231 
1232 	for (va = sva; va < eva; va += PAGE_SIZE) {
1233 		if (va < VM_MIN_KERNEL_ADDRESS)
1234 			pte = vtopte(va);
1235 		else
1236 			pte = kvtopte(va);
1237 		xpte |= pmap_pte_testset(pte, 0); /* zap! */
1238 #if defined(DIAGNOSTIC)
1239 		/* XXX For now... */
1240 		if (xpte & PG_PS)
1241 			panic("pmap_kremove: PG_PS");
1242 		if (xpte & PG_PVLIST)
1243 			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
1244 			      va);
1245 #endif
1246 	}
1247 	if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1248 		kpreempt_disable();
1249 		pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte);
1250 		kpreempt_enable();
1251 	}
1252 }
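
/*
 * Illustrative sketch: tearing down a transient kernel mapping set up with
 * pmap_kenter_pa().  As the comment above notes, pmap_update() must run
 * before the VA or the underlying pages are reused.  "va" and "npages"
 * are hypothetical.
 */
#if 0	/* sketch only, not compiled */
	pmap_kremove(va, (vsize_t)npages << PAGE_SHIFT);
	pmap_update(pmap_kernel());	/* flush deferred TLB shootdowns */
	uvm_km_free(kernel_map, va, (vsize_t)npages << PAGE_SHIFT,
	    UVM_KMF_VAONLY);
#endif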
1253 
1254 /*
1255  * p m a p   i n i t   f u n c t i o n s
1256  *
1257  * pmap_bootstrap and pmap_init are called during system startup
1258  * to init the pmap module.   pmap_bootstrap() does a low level
1259  * init just to get things rolling.   pmap_init() finishes the job.
1260  */
1261 
1262 /*
1263  * pmap_bootstrap: get the system in a state where it can run with VM
1264  *	properly enabled (called before main()).   the VM system is
1265  *      fully init'd later...
1266  *
1267  * => on i386, locore.s has already enabled the MMU by allocating
1268  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1269  * => kva_start is the first free virtual address in kernel space
1270  */
1271 
1272 void
1273 pmap_bootstrap(vaddr_t kva_start)
1274 {
1275 	struct pmap *kpm;
1276 	pt_entry_t *pte;
1277 	int i;
1278 	vaddr_t kva;
1279 #ifndef XEN
1280 	unsigned long p1i;
1281 	vaddr_t kva_end;
1282 #endif
1283 
1284 	pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1285 
1286 	/*
1287 	 * set up our local static global vars that keep track of the
1288 	 * usage of KVM before kernel_map is set up
1289 	 */
1290 
1291 	virtual_avail = kva_start;		/* first free KVA */
1292 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1293 
1294 	/*
1295 	 * set up protection_codes: we need to be able to convert from
1296 	 * a MI protection code (some combo of VM_PROT...) to something
1297 	 * we can jam into a i386 PTE.
1298 	 */
1299 
1300 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1301 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1302 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1303 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1304 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1305 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1306 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1307 								/* wr- */
1308 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1309 
1310 	/*
1311 	 * now we init the kernel's pmap
1312 	 *
1313 	 * the kernel pmap's pm_obj is not used for much.   however, in
1314 	 * user pmaps the pm_obj contains the list of active PTPs.
1315 	 * the pm_obj currently does not have a pager.   it might be possible
1316 	 * to add a pager that would allow a process to read-only mmap its
1317 	 * own page tables (fast user level vtophys?).   this may or may not
1318 	 * be useful.
1319 	 */
1320 
1321 	kpm = pmap_kernel();
1322 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1323 		UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1);
1324 		kpm->pm_ptphint[i] = NULL;
1325 	}
1326 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1327 
1328 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1329 	for (i = 0; i < PDP_SIZE; i++)
1330 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1331 
1332 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1333 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1334 
1335 	/*
1336 	 * the above is just a rough estimate and not critical to the proper
1337 	 * operation of the system.
1338 	 */
1339 
1340 #ifndef XEN
1341 	/*
1342 	 * Begin to enable global TLB entries if they are supported.
1343 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1344 	 * which happens in cpu_init(), which is run on each cpu
1345 	 * (and happens later)
1346 	 */
1347 
1348 	if (cpu_feature[0] & CPUID_PGE) {
1349 		pmap_pg_g = PG_G;		/* enable software */
1350 
1351 		/* add PG_G attribute to already mapped kernel pages */
1352 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1353 			kva_end = virtual_avail;
1354 		} else {
1355 			extern vaddr_t eblob, esym;
1356 			kva_end = (vaddr_t)&end;
1357 			if (esym > kva_end)
1358 				kva_end = esym;
1359 			if (eblob > kva_end)
1360 				kva_end = eblob;
1361 			kva_end = roundup(kva_end, PAGE_SIZE);
1362 		}
1363 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1364 			p1i = pl1_i(kva);
1365 			if (pmap_valid_entry(PTE_BASE[p1i]))
1366 				PTE_BASE[p1i] |= PG_G;
1367 		}
1368 	}
1369 
1370 	/*
1371 	 * enable large pages if they are supported.
1372 	 */
1373 
1374 	if (cpu_feature[0] & CPUID_PSE) {
1375 		paddr_t pa;
1376 		pd_entry_t *pde;
1377 		extern char __data_start;
1378 
1379 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1380 		pmap_largepages = 1;	/* enable software */
1381 
1382 		/*
1383 		 * the TLB must be flushed after enabling large pages
1384 		 * on Pentium CPUs, according to section 3.6.2.2 of
1385 		 * "Intel Architecture Software Developer's Manual,
1386 		 * Volume 3: System Programming".
1387 		 */
1388 		tlbflush();
1389 
1390 		/*
1391 		 * now, remap the kernel text using large pages.  we
1392 		 * assume that the linker has properly aligned the
1393 		 * .data segment to a NBPD_L2 boundary.
1394 		 */
1395 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1396 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1397 		     kva += NBPD_L2, pa += NBPD_L2) {
1398 			pde = &L2_BASE[pl2_i(kva)];
1399 			*pde = pa | pmap_pg_g | PG_PS |
1400 			    PG_KR | PG_V;	/* zap! */
1401 			tlbflush();
1402 		}
1403 #if defined(DEBUG)
1404 		aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1405 		    "pages and %" PRIuPSIZE " normal pages\n",
1406 		    howmany(kva - KERNBASE, NBPD_L2),
1407 		    howmany((vaddr_t)&__data_start - kva, NBPD_L1));
1408 #endif /* defined(DEBUG) */
1409 	}
1410 #endif /* !XEN */
1411 
1412 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1413 		/*
1414 		 * zero_pte is stuck at the end of mapped space for the kernel
1415 		 * image (disjunct from kva space). This is done so that it
1416 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1417 		 * when it's called for the first time.
1418 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1419 		 */
1420 
1421 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1422 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1423 	}
1424 
1425 	/*
1426 	 * now we allocate the "special" VAs which are used for tmp mappings
1427 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1428 	 * virtual_avail (note that there are no pages mapped at these VAs).
1429 	 * we find the PTE that maps the allocated VA via the linear PTE
1430 	 * mapping.
1431 	 */
1432 
1433 	pte = PTE_BASE + pl1_i(virtual_avail);
1434 
1435 #ifdef MULTIPROCESSOR
1436 	/*
1437 	 * Waste some VA space to avoid false sharing of cache lines
1438 	 * for page table pages: Give each possible CPU a cache line
1439 	 * of PTE's (8) to play with, though we only need 4.  We could
1440 	 * recycle some of this waste by putting the idle stacks here
1441 	 * as well; we could waste less space if we knew the largest
1442 	 * CPU ID beforehand.
1443 	 */
1444 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1445 
1446 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1447 
1448 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1449 
1450 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1451 
1452 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1453 	pte += maxcpus * NPTECL;
1454 #else
1455 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1456 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1457 
1458 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1459 	virtual_avail += PAGE_SIZE; pte++;
1460 
1461 	zerop = (void *) virtual_avail;  zero_pte = pte;
1462 	virtual_avail += PAGE_SIZE; pte++;
1463 
1464 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1465 	virtual_avail += PAGE_SIZE; pte++;
1466 #endif
1467 
1468 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1469 		early_zerop = zerop;
1470 		early_zero_pte = zero_pte;
1471 	}
1472 
1473 	/*
1474 	 * Nothing after this point actually needs pte;
1475 	 */
1476 	pte = (void *)0xdeadbeef;
1477 
1478 	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1479 	/* XXXfvdl PTEs not needed here */
1480 	vmmap = (char *)virtual_avail;			/* don't need pte */
1481 	virtual_avail += PAGE_SIZE; pte++;
1482 
1483 #ifdef XEN
1484 #ifdef __x86_64__
1485 	/*
1486 	 * We want a dummy page directory for Xen:
1487 	 * when we deactivate a pmap, Xen will still consider it active.
1488 	 * So we set the user PGD to this one to lift all protection on
1489 	 * the now-inactive set of page tables.
1490 	 */
1491 	xen_dummy_user_pgd = avail_start;
1492 	avail_start += PAGE_SIZE;
1493 
1494 	/* Zero-fill it; the fewer checks Xen has to do, the better. */
1495 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1496 	/* Mark read-only */
1497 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1498 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1499 	/* Pin as L4 */
1500 	xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1501 #endif /* __x86_64__ */
1502 	idt_vaddr = virtual_avail;                      /* don't need pte */
1503 	idt_paddr = avail_start;                        /* steal a page */
1504 	/*
1505 	 * Xen requires one more page, as we can't store the
1506 	 * GDT and LDT on the same page.
1507 	 */
1508 	virtual_avail += 3 * PAGE_SIZE;
1509 	avail_start += 3 * PAGE_SIZE;
1510 #else /* XEN */
1511 	idt_vaddr = virtual_avail;			/* don't need pte */
1512 	idt_paddr = avail_start;			/* steal a page */
1513 #if defined(__x86_64__)
1514 	virtual_avail += 2 * PAGE_SIZE; pte += 2;
1515 	avail_start += 2 * PAGE_SIZE;
1516 #else /* defined(__x86_64__) */
1517 	virtual_avail += PAGE_SIZE; pte++;
1518 	avail_start += PAGE_SIZE;
1519 	/* pentium f00f bug stuff */
1520 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1521 	virtual_avail += PAGE_SIZE; pte++;
1522 #endif /* defined(__x86_64__) */
1523 #endif /* XEN */
1524 
1525 #ifdef _LP64
1526 	/*
1527 	 * Grab a page below 4G for things that need it (i.e.
1528 	 * having an initial %cr3 for the MP trampoline).
1529 	 */
1530 	lo32_vaddr = virtual_avail;
1531 	virtual_avail += PAGE_SIZE; pte++;
1532 	lo32_paddr = avail_start;
1533 	avail_start += PAGE_SIZE;
1534 #endif
1535 
1536 	/*
1537 	 * now we reserve some VM for mapping pages when doing a crash dump
1538 	 */
1539 
1540 	virtual_avail = reserve_dumppages(virtual_avail);
1541 
1542 	/*
1543 	 * init the static-global locks and global lists.
1544 	 *
1545 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1546 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1547 	 *	again is never taken from interrupt context.
1548 	 */
1549 
1550 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1551 	LIST_INIT(&pmaps);
1552 	pmap_cpu_init_early(curcpu());
1553 
1554 	/*
1555 	 * initialize caches.
1556 	 */
1557 
1558 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1559 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1560 #ifdef PAE
1561 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
1562 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1563 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1564 #else /* PAE */
1565 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
1566 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1567 #endif /* PAE */
1568 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1569 	    PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
1570 	    NULL, NULL);
1571 
1572 	/*
1573 	 * ensure the TLB is sync'd with reality by flushing it...
1574 	 */
1575 
1576 	tlbflush();
1577 
1578 	/*
1579 	 * calculate pmap_maxkvaddr from nkptp[].
1580 	 */
1581 
1582 	kva = VM_MIN_KERNEL_ADDRESS;
1583 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1584 		kva += nkptp[i] * nbpd[i];
1585 	}
1586 	pmap_maxkvaddr = kva;
1587 }
1588 
1589 #if defined(__x86_64__)
1590 /*
1591  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1592  * trampoline code can be entered.
1593  */
1594 void
1595 pmap_prealloc_lowmem_ptps(void)
1596 {
1597 #ifdef XEN
1598 	int level;
1599 	paddr_t newp;
1600 	paddr_t pdes_pa;
1601 
1602 	pdes_pa = pmap_pdirpa(pmap_kernel(), 0);
1603 	level = PTP_LEVELS;
1604 	for (;;) {
1605 		newp = avail_start;
1606 		avail_start += PAGE_SIZE;
1607 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1608 		    xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
1609 		memset((void *)early_zerop, 0, PAGE_SIZE);
1610 		/* Mark R/O before installing */
1611 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1612 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1613 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1614 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1615 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1616 		xpq_queue_pte_update (
1617 			xpmap_ptom_masked(pdes_pa)
1618 			+ (pl_i(0, level) * sizeof (pd_entry_t)),
1619 			xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
1620 		level--;
1621 		if (level <= 1)
1622 			break;
1623 		pdes_pa = newp;
1624 	}
1625 #else /* XEN */
1626 	pd_entry_t *pdes;
1627 	int level;
1628 	paddr_t newp;
1629 
1630 	pdes = pmap_kernel()->pm_pdir;
1631 	level = PTP_LEVELS;
1632 	for (;;) {
1633 		newp = avail_start;
1634 		avail_start += PAGE_SIZE;
1635 		*early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW;
1636 		pmap_update_pg((vaddr_t)early_zerop);
1637 		memset(early_zerop, 0, PAGE_SIZE);
1638 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1639 		level--;
1640 		if (level <= 1)
1641 			break;
1642 		pdes = normal_pdes[level - 2];
1643 	}
1644 #endif /* XEN */
1645 }
1646 #endif /* defined(__x86_64__) */
1647 
1648 /*
1649  * pmap_init: called from uvm_init, our job is to get the pmap
1650  * system ready to manage mappings...
1651  */
1652 
1653 void
1654 pmap_init(void)
1655 {
1656 	int i;
1657 
1658 	for (i = 0; i < PV_HASH_SIZE; i++) {
1659 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1660 	}
1661 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1662 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1663 	}
1664 
1665 	/*
1666 	 * done: pmap module is up (and ready for business)
1667 	 */
1668 
1669 	pmap_initialized = true;
1670 }
1671 
1672 /*
1673  * pmap_cpu_init_early: perform early per-CPU initialization.
1674  */
1675 
1676 void
1677 pmap_cpu_init_early(struct cpu_info *ci)
1678 {
1679 	struct pmap_cpu *pc;
1680 	static uint8_t pmap_cpu_alloc;
1681 
1682 	pc = &pmap_cpu[pmap_cpu_alloc++].pc;
1683 	ci->ci_pmap_cpu = pc;
1684 }
1685 
1686 /*
1687  * pmap_cpu_init_late: perform late per-CPU initialization.
1688  */
1689 
1690 void
1691 pmap_cpu_init_late(struct cpu_info *ci)
1692 {
1693 
1694 	if (ci == &cpu_info_primary) {
1695 		evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
1696 		    NULL, "global", "TLB IPI");
1697 		evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1698 		    NULL, "x86", "io bitmap copy");
1699 		evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1700 		    NULL, "x86", "ldt sync");
1701 	}
1702 
1703 	evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC,
1704 	    NULL, device_xname(ci->ci_dev), "TLB IPI");
1705 
1706 #ifdef PAE
1707 	int ret;
1708 	struct pglist pg;
1709 	struct vm_page *vmap;
1710 
1711 	/* The BP already has its own L3 page allocated in locore.S. */
1712 	if (ci == &cpu_info_primary)
1713 		return;
1714 
1715 	/*
1716 	 * Allocate a page for the per-CPU L3 PD.  cr3 being 32 bits, its PA
1717 	 * must reside below the 4GB boundary.
1718 	 */
1719 	ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
1720 	vmap = TAILQ_FIRST(&pg);
1721 
1722 	if (ret != 0 || vmap == NULL)
1723 		panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
1724 			__func__, cpu_index(ci), ret);
1725 
1726 	ci->ci_pae_l3_pdirpa = vmap->phys_addr;
1727 
1728 	ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
1729 		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
1730 	if (ci->ci_pae_l3_pdir == NULL)
1731 		panic("%s: failed to allocate L3 PD for CPU %d\n",
1732 			__func__, cpu_index(ci));
1733 
1734 	pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
1735 		VM_PROT_READ | VM_PROT_WRITE, 0);
1736 
1737 	pmap_update(pmap_kernel());
1738 #endif
1739 }
1740 
1741 /*
1742  * p v _ e n t r y   f u n c t i o n s
1743  */
1744 
1745 /*
1746  * pmap_free_pvs: free a list of pv_entrys
1747  */
1748 
1749 static void
1750 pmap_free_pvs(struct pv_entry *pve)
1751 {
1752 	struct pv_entry *next;
1753 
1754 	for ( /* null */ ; pve != NULL ; pve = next) {
1755 		next = pve->pve_next;
1756 		pool_cache_put(&pmap_pv_cache, pve);
1757 	}
1758 }
1759 
1760 /*
1761  * main pv_entry manipulation functions:
1762  *   pmap_enter_pv: enter a mapping onto a pv_head list
1763  *   pmap_remove_pv: remove a mapping from a pv_head list
1764  *
1765  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1766  *       the pvh before calling
1767  */
1768 
1769 /*
1770  * insert_pv: a helper of pmap_enter_pv
1771  */
1772 
1773 static void
1774 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1775 {
1776 	struct pv_hash_head *hh;
1777 	kmutex_t *lock;
1778 	u_int hash;
1779 
1780 	KASSERT(pp_locked(pp));
1781 
1782 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1783 	lock = pvhash_lock(hash);
1784 	hh = pvhash_head(hash);
1785 	mutex_spin_enter(lock);
1786 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1787 	mutex_spin_exit(lock);
1788 
1789 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1790 }
1791 
1792 /*
1793  * pmap_enter_pv: enter a mapping onto a pv_head list
1794  *
1795  * => caller should have the pp_lock locked
1796  * => caller should adjust ptp's wire_count before calling
1797  */
1798 
1799 static struct pv_entry *
1800 pmap_enter_pv(struct pmap_page *pp,
1801 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1802 	      struct pv_entry **sparepve,
1803 	      struct vm_page *ptp,
1804 	      vaddr_t va)
1805 {
1806 
1807 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1808 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1809 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1810 	KASSERT(pp_locked(pp));
1811 
1812 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1813 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1814 			pp->pp_flags |= PP_EMBEDDED;
1815 			pp->pp_pte.pte_ptp = ptp;
1816 			pp->pp_pte.pte_va = va;
1817 
1818 			return pve;
1819 		}
1820 	} else {
1821 		struct pv_entry *pve2;
1822 
1823 		pve2 = *sparepve;
1824 		*sparepve = NULL;
1825 
1826 		pve2->pve_pte = pp->pp_pte;
1827 		pp->pp_flags &= ~PP_EMBEDDED;
1828 		LIST_INIT(&pp->pp_head.pvh_list);
1829 		insert_pv(pp, pve2);
1830 	}
1831 
1832 	pve->pve_pte.pte_ptp = ptp;
1833 	pve->pve_pte.pte_va = va;
1834 	insert_pv(pp, pve);
1835 
1836 	return NULL;
1837 }
1838 
1839 /*
1840  * pmap_remove_pv: try to remove a mapping from a pv_list
1841  *
1842  * => caller should hold pp_lock [so that attrs can be adjusted]
1843  * => caller should adjust ptp's wire_count and free PTP if needed
1844  * => we return the removed pve
1845  */
1846 
1847 static struct pv_entry *
1848 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1849 {
1850 	struct pv_hash_head *hh;
1851 	struct pv_entry *pve;
1852 	kmutex_t *lock;
1853 	u_int hash;
1854 
1855 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1856 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1857 	KASSERT(pp_locked(pp));
1858 
1859 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1860 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1861 		KASSERT(pp->pp_pte.pte_va == va);
1862 
1863 		pp->pp_flags &= ~PP_EMBEDDED;
1864 		LIST_INIT(&pp->pp_head.pvh_list);
1865 
1866 		return NULL;
1867 	}
1868 
1869 	hash = pvhash_hash(ptp, va);
1870 	lock = pvhash_lock(hash);
1871 	hh = pvhash_head(hash);
1872 	mutex_spin_enter(lock);
1873 	pve = pvhash_remove(hh, ptp, va);
1874 	mutex_spin_exit(lock);
1875 
1876 	LIST_REMOVE(pve, pve_list);
1877 
1878 	return pve;
1879 }
1880 
1881 /*
1882  * p t p   f u n c t i o n s
1883  */
1884 
1885 static inline struct vm_page *
1886 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1887 {
1888 	int lidx = level - 1;
1889 	struct vm_page *pg;
1890 
1891 	KASSERT(mutex_owned(&pmap->pm_lock));
1892 
1893 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1894 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1895 		return (pmap->pm_ptphint[lidx]);
1896 	}
1897 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1898 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1899 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1900 
1901 	KASSERT(pg == NULL || pg->wire_count >= 1);
1902 	return pg;
1903 }
1904 
1905 static inline void
1906 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1907 {
1908 	lwp_t *l;
1909 	int lidx;
1910 	struct uvm_object *obj;
1911 
1912 	KASSERT(ptp->wire_count == 1);
1913 
1914 	lidx = level - 1;
1915 
1916 	obj = &pmap->pm_obj[lidx];
1917 	pmap_stats_update(pmap, -1, 0);
1918 	if (lidx != 0)
1919 		mutex_enter(&obj->vmobjlock);
1920 	if (pmap->pm_ptphint[lidx] == ptp)
1921 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1922 	ptp->wire_count = 0;
1923 	uvm_pagerealloc(ptp, NULL, 0);
1924 	l = curlwp;
1925 	KASSERT((l->l_pflag & LP_INTR) == 0);
1926 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1927 	l->l_md.md_gc_ptp = ptp;
1928 	if (lidx != 0)
1929 		mutex_exit(&obj->vmobjlock);
1930 }
1931 
1932 static void
1933 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1934 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1935 {
1936 	unsigned long index;
1937 	int level;
1938 	vaddr_t invaladdr;
1939 #ifdef MULTIPROCESSOR
1940 	vaddr_t invaladdr2;
1941 #endif
1942 	pd_entry_t opde;
1943 	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1944 
1945 	KASSERT(pmap != pmap_kernel());
1946 	KASSERT(mutex_owned(&pmap->pm_lock));
1947 	KASSERT(kpreempt_disabled());
1948 
1949 	level = 1;
1950 	do {
1951 		index = pl_i(va, level + 1);
1952 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1953 #if defined(XEN) && defined(__x86_64__)
1954 		/*
1955 		 * If ptp is a L3 currently mapped in kernel space,
1956 		 * If ptp is an L3 currently mapped in kernel space,
1957 		 */
1958 		if (pmap_pdirpa(pmap, 0) == curcpu()->ci_xen_current_user_pgd
1959 		    && level == PTP_LEVELS - 1)
1960 			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
1961 #endif /* XEN && __x86_64__ */
1962 		pmap_freepage(pmap, ptp, level);
1963 		invaladdr = level == 1 ? (vaddr_t)ptes :
1964 		    (vaddr_t)pdes[level - 2];
1965 		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
1966 		    0, opde);
1967 #if defined(MULTIPROCESSOR)
1968 		invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE :
1969 		    (vaddr_t)normal_pdes[level - 2];
1970 		if (pmap != curpmap || invaladdr != invaladdr2) {
1971 			pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE,
1972 			    0, opde);
1973 		}
1974 #endif
1975 		if (level < PTP_LEVELS - 1) {
1976 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1977 			ptp->wire_count--;
1978 			if (ptp->wire_count > 1)
1979 				break;
1980 		}
1981 	} while (++level < PTP_LEVELS);
1982 	pmap_pte_flush();
1983 }
1984 
1985 /*
1986  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1987  *
1988  * => pmap should NOT be pmap_kernel()
1989  * => pmap should be locked
1990  * => preemption should be disabled
1991  */
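/*
 * Hedged usage sketch (illustrative only, not compiled): given the
 * preconditions above, a caller is expected to look roughly like this.
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	(locks pmap)
 *	ptp = pmap_get_ptp(pmap, va, pdes);
 *	if (ptp == NULL) {
 *		(allocation failed; undo and either wait or return ENOMEM)
 *	}
 *	... install the new PTE through ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);			(unlocks pmap)
 *	kpreempt_enable();
 */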
1992 
1993 static struct vm_page *
1994 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1995 {
1996 	struct vm_page *ptp, *pptp;
1997 	int i;
1998 	unsigned long index;
1999 	pd_entry_t *pva;
2000 	paddr_t ppa, pa;
2001 	struct uvm_object *obj;
2002 
2003 	KASSERT(pmap != pmap_kernel());
2004 	KASSERT(mutex_owned(&pmap->pm_lock));
2005 	KASSERT(kpreempt_disabled());
2006 
2007 	ptp = NULL;
2008 	pa = (paddr_t)-1;
2009 
2010 	/*
2011 	 * Loop through all page table levels seeing if we need to
2012 	 * add a new page to that level.
2013 	 */
2014 	for (i = PTP_LEVELS; i > 1; i--) {
2015 		/*
2016 		 * Save values from previous round.
2017 		 */
2018 		pptp = ptp;
2019 		ppa = pa;
2020 
2021 		index = pl_i(va, i);
2022 		pva = pdes[i - 2];
2023 
2024 		if (pmap_valid_entry(pva[index])) {
2025 			ppa = pmap_pte2pa(pva[index]);
2026 			ptp = NULL;
2027 			continue;
2028 		}
2029 
2030 		obj = &pmap->pm_obj[i-2];
2031 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2032 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
2033 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2034 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2035 
2036 		if (ptp == NULL)
2037 			return NULL;
2038 
2039 		ptp->flags &= ~PG_BUSY; /* never busy */
2040 		ptp->wire_count = 1;
2041 		pmap->pm_ptphint[i - 2] = ptp;
2042 		pa = VM_PAGE_TO_PHYS(ptp);
2043 		pmap_pte_set(&pva[index], (pd_entry_t)
2044 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2045 #if defined(XEN) && defined(__x86_64__)
2046 		/*
2047 		 * Under Xen we must also enter the mapping in the kernel map
2048 		 * if pmap is current and we modify the top level (PGD).
2049 		 */
2050 		if (i == PTP_LEVELS && pmap != pmap_kernel()) {
2051 		        pmap_pte_set(&pmap_kernel()->pm_pdir[index],
2052 		                (pd_entry_t) (pmap_pa2pte(pa)
2053 		                        | PG_u | PG_RW | PG_V));
2054 		}
2055 #endif /* XEN && __x86_64__ */
2056 		pmap_pte_flush();
2057 		pmap_stats_update(pmap, 1, 0);
2058 		/*
2059 		 * If we're not in the top level, increase the
2060 		 * wire count of the parent page.
2061 		 */
2062 		if (i < PTP_LEVELS) {
2063 			if (pptp == NULL)
2064 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2065 #ifdef DIAGNOSTIC
2066 			if (pptp == NULL)
2067 				panic("pde page disappeared");
2068 #endif
2069 			pptp->wire_count++;
2070 		}
2071 	}
2072 
2073 	/*
2074 	 * ptp is not NULL if we just allocated a new ptp. If it's
2075 	 * still NULL, we must look up the existing one.
2076 	 */
2077 	if (ptp == NULL) {
2078 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2079 #ifdef DIAGNOSTIC
2080 		if (ptp == NULL) {
2081 			printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n",
2082 			    va, ppa);
2083 			panic("pmap_get_ptp: unmanaged user PTP");
2084 		}
2085 #endif
2086 	}
2087 
2088 	pmap->pm_ptphint[0] = ptp;
2089 	return(ptp);
2090 }
2091 
2092 /*
2093  * p m a p  l i f e c y c l e   f u n c t i o n s
2094  */
2095 
2096 /*
2097  * pmap_pdp_ctor: constructor for the PDP cache.
2098  */
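/*
 * Summary of the constructor below (descriptive only): zero the user part
 * of the new PDP, install the recursive PDEs at PDIR_SLOT_PTE, copy the
 * kernel's PDEs into PDIR_SLOT_KERN, and, under Xen, remap the PDP pages
 * read-only and pin them as page tables.  The Xen/amd64 case instead marks
 * the recursive entry invalid, since such a PDP is never used by the
 * kernel itself.
 */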
2099 
2100 int
2101 pmap_pdp_ctor(void *arg, void *v, int flags)
2102 {
2103 	pd_entry_t *pdir = v;
2104 	paddr_t pdirpa = 0;	/* XXX: GCC */
2105 	vaddr_t object;
2106 	int i;
2107 
2108 #if !defined(XEN) || !defined(__x86_64__)
2109 	int npde;
2110 #endif
2111 #ifdef XEN
2112 	int s;
2113 #endif
2114 
2115 	/*
2116 	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
2117 	 */
2118 
2119 #if defined(XEN) && defined(__x86_64__)
2120 	/* fetch the physical address of the page directory. */
2121 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2122 
2123 	/* zero init area */
2124 	memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2125 	/*
2126 	 * this pdir will NEVER be active in kernel mode
2127 	 * so mark recursive entry invalid
2128 	 */
2129 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2130 	/*
2131 	 * A PDP constructed this way will never be used by the kernel,
2132 	 * hence we don't put kernel mappings in it on Xen.
2133 	 * But we need to make pmap_create() happy, so put a dummy (without
2134 	 * PG_V) value at the right place.
2135 	 */
2136 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2137 	     (pd_entry_t)-1 & PG_FRAME;
2138 #else /* XEN && __x86_64__ */
2139 	/* zero init area */
2140 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2141 
2142 	object = (vaddr_t)v;
2143 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2144 		/* fetch the physical address of the page directory. */
2145 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2146 		/* put in recursive PDE to map the PTEs */
2147 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2148 #ifndef XEN
2149 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2150 #endif
2151 	}
2152 
2153 	/* copy kernel's PDE */
2154 	npde = nkptp[PTP_LEVELS - 1];
2155 
2156 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2157 	    npde * sizeof(pd_entry_t));
2158 
2159 	/* zero the rest */
2160 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
2161 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
2162 
2163 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2164 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2165 
2166 		pdir[idx] = PDP_BASE[idx];
2167 	}
2168 #endif /* XEN && __x86_64__ */
2169 #ifdef XEN
2170 	s = splvm();
2171 	object = (vaddr_t)v;
2172 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2173 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2174 		/* remap this page RO */
2175 		pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0);
2176 		pmap_update(pmap_kernel());
2177 		/*
2178 		 * pin as an L2/L4 page; we have to do the page with the
2179 		 * PDIR_SLOT_PTE entries last
2180 		 */
2181 #ifdef PAE
2182 		if (i == l2tol3(PDIR_SLOT_PTE))
2183 			continue;
2184 #endif
2185 		xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2186 	}
2187 #ifdef PAE
2188 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2189 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2190 	xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2191 #endif
2192 	splx(s);
2193 #endif /* XEN */
2194 
2195 	return (0);
2196 }
2197 
2198 /*
2199  * pmap_pdp_dtor: destructor for the PDP cache.
2200  */
2201 
2202 void
2203 pmap_pdp_dtor(void *arg, void *v)
2204 {
2205 #ifdef XEN
2206 	paddr_t pdirpa = 0;	/* XXX: GCC */
2207 	vaddr_t object = (vaddr_t)v;
2208 	int i;
2209 	int s = splvm();
2210 	pt_entry_t *pte;
2211 
2212 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2213 		/* fetch the physical address of the page directory. */
2214 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2215 		/* unpin page table */
2216 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2217 	}
2218 	object = (vaddr_t)v;
2219 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2220 		/* Set page RW again */
2221 		pte = kvtopte(object);
2222 		xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW);
2223 		xpq_queue_invlpg((vaddr_t)object);
2224 	}
2225 	splx(s);
2226 #endif  /* XEN */
2227 }
2228 
2229 #ifdef PAE
2230 
2231 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2232 
2233 void *
2234 pmap_pdp_alloc(struct pool *pp, int flags)
2235 {
2236 	return (void *)uvm_km_alloc(kernel_map,
2237 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2238 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2239 	    | UVM_KMF_WIRED);
2240 }
2241 
2242 /*
2243  * pmap_pdp_free: free a PDP
2244  */
2245 
2246 void
2247 pmap_pdp_free(struct pool *pp, void *v)
2248 {
2249 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2250 	    UVM_KMF_WIRED);
2251 }
2252 #endif /* PAE */
2253 
2254 /*
2255  * pmap_create: create a pmap
2256  *
2257  * => note: old pmap interface took a "size" arg which allowed for
2258  *	the creation of "software only" pmaps (not in bsd).
2259  */
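/*
 * Rough lifecycle sketch (illustrative only; the calls named here are the
 * ones provided by this file, the surrounding code is assumed):
 *
 *	struct pmap *pm = pmap_create();
 *	... pmap_enter()/pmap_remove() on pm under UVM's control ...
 *	pmap_remove_all(pm);	(optional hint just before teardown)
 *	... final pmap_remove() calls ...
 *	pmap_destroy(pm);	(drops the last reference, frees the PDP)
 */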
2260 
2261 struct pmap *
2262 pmap_create(void)
2263 {
2264 	struct pmap *pmap;
2265 	int i;
2266 
2267 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2268 
2269 	/* init uvm_object */
2270 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2271 		UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1);
2272 		pmap->pm_ptphint[i] = NULL;
2273 	}
2274 	pmap->pm_stats.wired_count = 0;
2275 	/* count the PDP allocd below */
2276 	pmap->pm_stats.resident_count = PDP_SIZE;
2277 #if !defined(__x86_64__)
2278 	pmap->pm_hiexec = 0;
2279 #endif /* !defined(__x86_64__) */
2280 	pmap->pm_flags = 0;
2281 	pmap->pm_cpus = 0;
2282 	pmap->pm_kernel_cpus = 0;
2283 
2284 	/* init the LDT */
2285 	pmap->pm_ldt = NULL;
2286 	pmap->pm_ldt_len = 0;
2287 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2288 
2289 	/* allocate PDP */
2290  try_again:
2291 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2292 
2293 	mutex_enter(&pmaps_lock);
2294 
2295 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2296 		mutex_exit(&pmaps_lock);
2297 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2298 		goto try_again;
2299 	}
2300 
2301 	for (i = 0; i < PDP_SIZE; i++)
2302 		pmap->pm_pdirpa[i] =
2303 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2304 
2305 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2306 
2307 	mutex_exit(&pmaps_lock);
2308 
2309 	return (pmap);
2310 }
2311 
2312 /*
2313  * pmap_destroy: drop reference count on pmap.   free pmap if
2314  *	reference count goes to zero.
2315  */
2316 
2317 void
2318 pmap_destroy(struct pmap *pmap)
2319 {
2320 	int i;
2321 #ifdef DIAGNOSTIC
2322 	struct cpu_info *ci;
2323 	CPU_INFO_ITERATOR cii;
2324 #endif /* DIAGNOSTIC */
2325 
2326 	/*
2327 	 * if we have torn down this pmap, process deferred frees and
2328 	 * invalidations now.
2329 	 */
2330 	if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) {
2331 		pmap_update(pmap);
2332 	}
2333 
2334 	/*
2335 	 * drop reference count
2336 	 */
2337 
2338 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2339 		return;
2340 	}
2341 
2342 #ifdef DIAGNOSTIC
2343 	for (CPU_INFO_FOREACH(cii, ci))
2344 		if (ci->ci_pmap == pmap)
2345 			panic("destroying pmap being used");
2346 #endif /* DIAGNOSTIC */
2347 
2348 	/*
2349 	 * reference count is zero, free pmap resources and then free pmap.
2350 	 */
2351 #ifdef XEN
2352 	/*
2353 	 * Xen lazy APDP handling:
2354 	 * clear APDP_PDE if this pmap is the one currently mapped there
2355 	 */
2356 	if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) {
2357 		kpreempt_disable();
2358 		pmap_unmap_apdp();
2359 		pmap_pte_flush();
2360 		pmap_apte_flush(pmap_kernel());
2361 		kpreempt_enable();
2362 	}
2363 #endif
2364 
2365 	/*
2366 	 * remove it from global list of pmaps
2367 	 */
2368 
2369 	mutex_enter(&pmaps_lock);
2370 	LIST_REMOVE(pmap, pm_list);
2371 	mutex_exit(&pmaps_lock);
2372 
2373 	/*
2374 	 * destroyed pmap shouldn't have remaining PTPs
2375 	 */
2376 
2377 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2378 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2379 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2380 	}
2381 
2382 	/*
2383 	 * MULTIPROCESSOR -- no need to flush out of other processors'
2384 	 * APTE space because we do that in pmap_unmap_ptes().
2385 	 */
2386 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2387 
2388 #ifdef USER_LDT
2389 	if (pmap->pm_ldt != NULL) {
2390 		/*
2391 		 * no need to switch the LDT; this address space is gone,
2392 		 * nothing is using it.
2393 		 *
2394 		 * No need to lock the pmap for ldt_free (or anything else),
2395 		 * we're the last one to use it.
2396 		 */
2397 		mutex_enter(&cpu_lock);
2398 		ldt_free(pmap->pm_ldt_sel);
2399 		mutex_exit(&cpu_lock);
2400 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2401 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2402 	}
2403 #endif
2404 
2405 	for (i = 0; i < PTP_LEVELS - 1; i++)
2406 		mutex_destroy(&pmap->pm_obj[i].vmobjlock);
2407 	pool_cache_put(&pmap_cache, pmap);
2408 }
2409 
2410 /*
2411  * pmap_remove_all: pmap is being torn down by the current thread.
2412  * avoid unnecessary invalidations.
2413  */
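/*
 * Sketch of the teardown sequence this is assumed to optimize (the exact
 * caller lives outside this file):
 *
 *	pmap_remove_all(pmap);		(records pmap in curlwp->l_md.md_gc_pmap)
 *	pmap_remove(pmap, sva, eva);	(shootdowns for pmap may now be elided)
 *	pmap_update(pmap);		(processes the deferred work)
 *	pmap_destroy(pmap);		(also calls pmap_update if still deferred)
 */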
2414 
2415 void
2416 pmap_remove_all(struct pmap *pmap)
2417 {
2418 	lwp_t *l = curlwp;
2419 
2420 	KASSERT(l->l_md.md_gc_pmap == NULL);
2421 
2422 	l->l_md.md_gc_pmap = pmap;
2423 }
2424 
2425 #if defined(PMAP_FORK)
2426 /*
2427  * pmap_fork: perform any necessary data structure manipulation when
2428  * a VM space is forked.
2429  */
2430 
2431 void
2432 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2433 {
2434 #ifdef USER_LDT
2435 	union descriptor *new_ldt;
2436 	size_t len;
2437 	int sel;
2438 
2439 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2440 		return;
2441 	}
2442 
2443  retry:
2444 	if (pmap1->pm_ldt != NULL) {
2445 		len = pmap1->pm_ldt_len;
2446 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2447 		    UVM_KMF_WIRED);
2448 		mutex_enter(&cpu_lock);
2449 		sel = ldt_alloc(new_ldt, len);
2450 		if (sel == -1) {
2451 			mutex_exit(&cpu_lock);
2452 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2453 			    UVM_KMF_WIRED);
2454 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2455 			return;
2456 		}
2457 	} else {
2458 		len = -1;
2459 		new_ldt = NULL;
2460 		sel = -1;
2461 		mutex_enter(&cpu_lock);
2462 	}
2463 
2464  	/* Copy the LDT, if necessary. */
2465  	if (pmap1->pm_ldt != NULL) {
2466 		if (len != pmap1->pm_ldt_len) {
2467 			if (len != -1) {
2468 				ldt_free(sel);
2469 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2470 				    len, UVM_KMF_WIRED);
2471 			}
2472 			mutex_exit(&cpu_lock);
2473 			goto retry;
2474 		}
2475 
2476 		memcpy(new_ldt, pmap1->pm_ldt, len);
2477 		pmap2->pm_ldt = new_ldt;
2478 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2479 		pmap2->pm_ldt_sel = sel;
2480 		len = -1;
2481 	}
2482 
2483 	if (len != -1) {
2484 		ldt_free(sel);
2485 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2486 		    UVM_KMF_WIRED);
2487 	}
2488 	mutex_exit(&cpu_lock);
2489 #endif /* USER_LDT */
2490 }
2491 #endif /* PMAP_FORK */
2492 
2493 #ifdef USER_LDT
2494 
2495 /*
2496  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2497  * is active, reload LDTR.
2498  */
2499 static void
2500 pmap_ldt_xcall(void *arg1, void *arg2)
2501 {
2502 	struct pmap *pm;
2503 
2504 	kpreempt_disable();
2505 	pm = arg1;
2506 	if (curcpu()->ci_pmap == pm) {
2507 		lldt(pm->pm_ldt_sel);
2508 	}
2509 	kpreempt_enable();
2510 }
2511 
2512 /*
2513  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2514  * in the new selector on all CPUs.
2515  */
2516 void
2517 pmap_ldt_sync(struct pmap *pm)
2518 {
2519 	uint64_t where;
2520 
2521 	KASSERT(mutex_owned(&cpu_lock));
2522 
2523 	pmap_ldt_evcnt.ev_count++;
2524 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2525 	xc_wait(where);
2526 }
2527 
2528 /*
2529  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2530  * restore the default.
2531  */
2532 
2533 void
2534 pmap_ldt_cleanup(struct lwp *l)
2535 {
2536 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2537 	union descriptor *dp = NULL;
2538 	size_t len = 0;
2539 	int sel = -1;
2540 
2541 	if (__predict_true(pmap->pm_ldt == NULL)) {
2542 		return;
2543 	}
2544 
2545 	mutex_enter(&cpu_lock);
2546 	if (pmap->pm_ldt != NULL) {
2547 		sel = pmap->pm_ldt_sel;
2548 		dp = pmap->pm_ldt;
2549 		len = pmap->pm_ldt_len;
2550 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2551 		pmap->pm_ldt = NULL;
2552 		pmap->pm_ldt_len = 0;
2553 		pmap_ldt_sync(pmap);
2554 		ldt_free(sel);
2555 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2556 	}
2557 	mutex_exit(&cpu_lock);
2558 }
2559 #endif /* USER_LDT */
2560 
2561 /*
2562  * pmap_activate: activate a process' pmap
2563  *
2564  * => must be called with kernel preemption disabled
2565  * => if lwp is the curlwp, then set ci_want_pmapload so that
2566  *    actual MMU context switch will be done by pmap_load() later
2567  */
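/*
 * Illustrative sketch (not part of the build) of how the MD context-switch
 * path is assumed to use this pair of functions:
 *
 *	kpreempt_disable();
 *	pmap_activate(l);	(may only set ci_want_pmapload)
 *	kpreempt_enable();
 *	...
 *	(on the way back to userspace)
 *	if (curcpu()->ci_want_pmapload)
 *		pmap_load();	(performs the real %cr3/LDT switch)
 */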
2568 
2569 void
2570 pmap_activate(struct lwp *l)
2571 {
2572 	struct cpu_info *ci;
2573 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2574 
2575 	KASSERT(kpreempt_disabled());
2576 
2577 	ci = curcpu();
2578 
2579 	if (l == ci->ci_curlwp) {
2580 		KASSERT(ci->ci_want_pmapload == 0);
2581 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2582 #ifdef KSTACK_CHECK_DR0
2583 		/*
2584 		 * set up a breakpoint on the top of the stack
2585 		 */
2586 		if (l == &lwp0)
2587 			dr0(0, 0, 0, 0);
2588 		else
2589 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2590 #endif
2591 
2592 		/*
2593 		 * no need to switch to kernel vmspace because
2594 		 * it's a subset of any vmspace.
2595 		 */
2596 
2597 		if (pmap == pmap_kernel()) {
2598 			ci->ci_want_pmapload = 0;
2599 			return;
2600 		}
2601 
2602 		ci->ci_want_pmapload = 1;
2603 	}
2604 }
2605 
2606 /*
2607  * pmap_reactivate: try to regain reference to the pmap.
2608  *
2609  * => must be called with kernel preemption disabled
2610  */
2611 
2612 static bool
2613 pmap_reactivate(struct pmap *pmap)
2614 {
2615 	struct cpu_info *ci;
2616 	uint32_t cpumask;
2617 	bool result;
2618 	uint32_t oldcpus;
2619 
2620 	ci = curcpu();
2621 	cpumask = ci->ci_cpumask;
2622 
2623 	KASSERT(kpreempt_disabled());
2624 #if defined(XEN) && defined(__x86_64__)
2625 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2626 #elif defined(PAE)
2627 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2628 #elif !defined(XEN)
2629 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2630 #endif
2631 
2632 	/*
2633 	 * if we still have a lazy reference to this pmap,
2634 	 * we can assume that there was no tlb shootdown
2635 	 * for this pmap in the meantime.
2636 	 *
2637 	 * the order of events here is important as we must
2638 	 * synchronize with TLB shootdown interrupts.  declare
2639 	 * interest in invalidations (TLBSTATE_VALID) and then
2640 	 * check the cpumask, which the IPIs can change only
2641 	 * when the state is TLBSTATE_LAZY.
2642 	 */
2643 
2644 	ci->ci_tlbstate = TLBSTATE_VALID;
2645 	oldcpus = pmap->pm_cpus;
2646 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
2647 	if (oldcpus & cpumask) {
2648 		/* got it */
2649 		result = true;
2650 	} else {
2651 		/* must reload */
2652 		atomic_or_32(&pmap->pm_cpus, cpumask);
2653 		result = false;
2654 	}
2655 
2656 	return result;
2657 }
2658 
2659 /*
2660  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
2661  */
2662 
2663 void
2664 pmap_load(void)
2665 {
2666 	struct cpu_info *ci;
2667 	uint32_t cpumask;
2668 	struct pmap *pmap;
2669 	struct pmap *oldpmap;
2670 	struct lwp *l;
2671 	struct pcb *pcb;
2672 	uint64_t ncsw;
2673 
2674 	kpreempt_disable();
2675  retry:
2676 	ci = curcpu();
2677 	if (!ci->ci_want_pmapload) {
2678 		kpreempt_enable();
2679 		return;
2680 	}
2681 	cpumask = ci->ci_cpumask;
2682 	l = ci->ci_curlwp;
2683 	ncsw = l->l_ncsw;
2684 
2685 	/* should be able to take ipis. */
2686 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2687 #ifdef XEN
2688 	/* XXX not yet KASSERT(x86_read_psl() != 0); */
2689 #else
2690 	KASSERT((x86_read_psl() & PSL_I) != 0);
2691 #endif
2692 
2693 	KASSERT(l != NULL);
2694 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2695 	KASSERT(pmap != pmap_kernel());
2696 	oldpmap = ci->ci_pmap;
2697 	pcb = lwp_getpcb(l);
2698 
2699 	if (pmap == oldpmap) {
2700 		if (!pmap_reactivate(pmap)) {
2701 			u_int gen = uvm_emap_gen_return();
2702 
2703 			/*
2704 			 * the pmap has been changed while it was deactivated.
2705 			 * our TLB may be stale.
2706 			 */
2707 
2708 			tlbflush();
2709 			uvm_emap_update(gen);
2710 		}
2711 
2712 		ci->ci_want_pmapload = 0;
2713 		kpreempt_enable();
2714 		return;
2715 	}
2716 
2717 	/*
2718 	 * grab a reference to the new pmap.
2719 	 */
2720 
2721 	pmap_reference(pmap);
2722 
2723 	/*
2724 	 * actually switch pmap.
2725 	 */
2726 
2727 	atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
2728 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
2729 
2730 #if defined(XEN) && defined(__x86_64__)
2731 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2732 	    oldpmap == pmap_kernel());
2733 #elif defined(PAE)
2734 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2735 #elif !defined(XEN)
2736 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2737 #endif
2738 	KASSERT((pmap->pm_cpus & cpumask) == 0);
2739 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
2740 
2741 	/*
2742 	 * mark the pmap in use by this processor.  again we must
2743 	 * synchronize with TLB shootdown interrupts, so set the
2744 	 * state VALID first, then register us for shootdown events
2745 	 * on this pmap.
2746 	 */
2747 
2748 	ci->ci_tlbstate = TLBSTATE_VALID;
2749 	atomic_or_32(&pmap->pm_cpus, cpumask);
2750 	atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
2751 	ci->ci_pmap = pmap;
2752 
2753 	/*
2754 	 * update tss.  now that we have registered for invalidations
2755 	 * from other CPUs, we're good to load the page tables.
2756 	 */
2757 #ifdef PAE
2758 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2759 #else
2760 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2761 #endif
2762 
2763 #ifdef i386
2764 #ifdef XEN
2765 	/*
2766 	 * clear APDP slot, in case it points to a page table that has
2767 	 * been freed
2768 	 */
2769 	if (*APDP_PDE) {
2770 		pmap_unmap_apdp();
2771 	}
2772 	/* lldt() does pmap_pte_flush() */
2773 #endif /* XEN */
2774 
2775 #ifndef XEN
2776 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2777 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2778 #endif /* !XEN */
2779 #endif /* i386 */
2780 
2781 	lldt(pmap->pm_ldt_sel);
2782 
2783 	u_int gen = uvm_emap_gen_return();
2784 	cpu_load_pmap(pmap);
2785 	uvm_emap_update(gen);
2786 
2787 	ci->ci_want_pmapload = 0;
2788 
2789 	/*
2790 	 * we're now running with the new pmap.  drop the reference
2791 	 * to the old pmap.  if we block, we need to go around again.
2792 	 */
2793 
2794 	pmap_destroy(oldpmap);
2795 	if (l->l_ncsw != ncsw) {
2796 		goto retry;
2797 	}
2798 
2799 	kpreempt_enable();
2800 }
2801 
2802 /*
2803  * pmap_deactivate: deactivate a process' pmap
2804  *
2805  * => must be called with kernel preemption disabled (high SPL is enough)
2806  */
2807 
2808 void
2809 pmap_deactivate(struct lwp *l)
2810 {
2811 	struct pmap *pmap;
2812 	struct cpu_info *ci;
2813 
2814 	KASSERT(kpreempt_disabled());
2815 
2816 	if (l != curlwp) {
2817 		return;
2818 	}
2819 
2820 	/*
2821 	 * wait for pending TLB shootdowns to complete.  necessary
2822 	 * because TLB shootdown state is per-CPU, and the LWP may
2823 	 * be coming off the CPU before it has a chance to call
2824 	 * pmap_update().
2825 	 */
2826 	pmap_tlb_shootwait();
2827 
2828 	ci = curcpu();
2829 
2830 	if (ci->ci_want_pmapload) {
2831 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2832 		    != pmap_kernel());
2833 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2834 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2835 
2836 		/*
2837 		 * userspace has not been touched.
2838 		 * nothing to do here.
2839 		 */
2840 
2841 		ci->ci_want_pmapload = 0;
2842 		return;
2843 	}
2844 
2845 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2846 
2847 	if (pmap == pmap_kernel()) {
2848 		return;
2849 	}
2850 
2851 #if defined(XEN) && defined(__x86_64__)
2852 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2853 #elif defined(PAE)
2854 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2855 #elif !defined(XEN)
2856 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2857 #endif
2858 	KASSERT(ci->ci_pmap == pmap);
2859 
2860 	/*
2861 	 * we aren't interested in TLB invalidations for this pmap,
2862 	 * at least for the time being.
2863 	 */
2864 
2865 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2866 	ci->ci_tlbstate = TLBSTATE_LAZY;
2867 }
2868 
2869 /*
2870  * end of lifecycle functions
2871  */
2872 
2873 /*
2874  * some misc. functions
2875  */
2876 
2877 int
2878 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2879 {
2880 	int i;
2881 	unsigned long index;
2882 	pd_entry_t pde;
2883 
2884 	for (i = PTP_LEVELS; i > 1; i--) {
2885 		index = pl_i(va, i);
2886 		pde = pdes[i - 2][index];
2887 		if ((pde & PG_V) == 0)
2888 			return i;
2889 	}
2890 	if (lastpde != NULL)
2891 		*lastpde = pde;
2892 	return 0;
2893 }
2894 
2895 /*
2896  * pmap_extract: extract a PA for the given VA
2897  */
2898 
2899 bool
2900 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2901 {
2902 	pt_entry_t *ptes, pte;
2903 	pd_entry_t pde;
2904 	pd_entry_t * const *pdes;
2905 	struct pmap *pmap2;
2906 	struct cpu_info *ci;
2907 	paddr_t pa;
2908 	lwp_t *l;
2909 	bool hard, rv;
2910 
2911 	rv = false;
2912 	pa = 0;
2913 	l = curlwp;
2914 
2915 	KPREEMPT_DISABLE(l);
2916 	ci = l->l_cpu;
2917 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2918 	    pmap == pmap_kernel()) {
2919 		/*
2920 		 * no need to lock, because it's pmap_kernel() or our
2921 		 * own pmap and is active.  if a user pmap, the caller
2922 		 * will hold the vm_map write/read locked and so prevent
2923 		 * entries from disappearing while we are here.  ptps
2924 		 * can disappear via pmap_remove() and pmap_protect(),
2925 		 * but they are called with the vm_map write locked.
2926 		 */
2927 		hard = false;
2928 		ptes = PTE_BASE;
2929 		pdes = normal_pdes;
2930 	} else {
2931 		/* we lose, do it the hard way. */
2932 		hard = true;
2933 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2934 	}
2935 	if (pmap_pdes_valid(va, pdes, &pde)) {
2936 		pte = ptes[pl1_i(va)];
2937 		if (pde & PG_PS) {
2938 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2939 			rv = true;
2940 		} else if (__predict_true((pte & PG_V) != 0)) {
2941 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2942 			rv = true;
2943 		}
2944 	}
2945 	if (__predict_false(hard)) {
2946 		pmap_unmap_ptes(pmap, pmap2);
2947 	}
2948 	KPREEMPT_ENABLE(l);
2949 	if (pap != NULL) {
2950 		*pap = pa;
2951 	}
2952 	return rv;
2953 }
2954 
2955 
2956 /*
2957  * vtophys: virtual address to physical address.  For use by
2958  * machine-dependent code only.
2959  */
2960 
2961 paddr_t
2962 vtophys(vaddr_t va)
2963 {
2964 	paddr_t pa;
2965 
2966 	if (pmap_extract(pmap_kernel(), va, &pa))
2967 		return (pa);
2968 	return (0);
2969 }
2970 
2971 __weak_alias(pmap_extract_ma, pmap_extract);
2972 
2973 #ifdef XEN
2974 
2975 /*
2976  * vtomach: virtual address to machine address.  For use by
2977  * machine-dependent code only.
2978  */
2979 
2980 paddr_t
2981 vtomach(vaddr_t va)
2982 {
2983 	paddr_t pa;
2984 
2985 	if (pmap_extract_ma(pmap_kernel(), va, &pa))
2986 		return (pa);
2987 	return (0);
2988 }
2989 
2990 #endif /* XEN */
2991 
2992 /*
2993  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2994  *	determine the bounds of the kernel virtual address space.
2995  */
2996 
2997 void
2998 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
2999 {
3000 	*startp = virtual_avail;
3001 	*endp = virtual_end;
3002 }
3003 
3004 /*
3005  * pmap_map: map a range of PAs into kvm.
3006  *
3007  * => used during crash dump
3008  * => XXX: pmap_map() should be phased out?
3009  */
3010 
3011 vaddr_t
3012 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
3013 {
3014 	while (spa < epa) {
3015 		pmap_kenter_pa(va, spa, prot, 0);
3016 		va += PAGE_SIZE;
3017 		spa += PAGE_SIZE;
3018 	}
3019 	pmap_update(pmap_kernel());
3020 	return va;
3021 }
3022 
3023 /*
3024  * pmap_zero_page: zero a page
3025  */
3026 
3027 void
3028 pmap_zero_page(paddr_t pa)
3029 {
3030 	pt_entry_t *zpte;
3031 	void *zerova;
3032 	int id;
3033 
3034 	kpreempt_disable();
3035 	id = cpu_number();
3036 	zpte = PTESLEW(zero_pte, id);
3037 	zerova = VASLEW(zerop, id);
3038 
3039 #ifdef DIAGNOSTIC
3040 	if (*zpte)
3041 		panic("pmap_zero_page: lock botch");
3042 #endif
3043 
3044 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3045 	pmap_pte_flush();
3046 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3047 
3048 	memset(zerova, 0, PAGE_SIZE);
3049 
3050 #if defined(DIAGNOSTIC) || defined(XEN)
3051 	pmap_pte_set(zpte, 0);				/* zap ! */
3052 	pmap_pte_flush();
3053 #endif
3054 	kpreempt_enable();
3055 }
3056 
3057 /*
3058  * pmap_pageidlezero: the same, for the idle loop page zero'er.
3059  * Returns true if the page was zero'd, false if we aborted for
3060  * some reason.
3061  */
3062 
3063 bool
3064 pmap_pageidlezero(paddr_t pa)
3065 {
3066 	pt_entry_t *zpte;
3067 	void *zerova;
3068 	bool rv;
3069 	int id;
3070 
3071 	id = cpu_number();
3072 	zpte = PTESLEW(zero_pte, id);
3073 	zerova = VASLEW(zerop, id);
3074 
3075 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3076 	KASSERT(*zpte == 0);
3077 
3078 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3079 	pmap_pte_flush();
3080 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3081 
3082 	rv = sse2_idlezero_page(zerova);
3083 
3084 #if defined(DIAGNOSTIC) || defined(XEN)
3085 	pmap_pte_set(zpte, 0);				/* zap ! */
3086 	pmap_pte_flush();
3087 #endif
3088 
3089 	return rv;
3090 }
3091 
3092 /*
3093  * pmap_copy_page: copy a page
3094  */
3095 
3096 void
3097 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3098 {
3099 	pt_entry_t *spte;
3100 	pt_entry_t *dpte;
3101 	void *csrcva;
3102 	void *cdstva;
3103 	int id;
3104 
3105 	kpreempt_disable();
3106 	id = cpu_number();
3107 	spte = PTESLEW(csrc_pte,id);
3108 	dpte = PTESLEW(cdst_pte,id);
3109 	csrcva = VASLEW(csrcp, id);
3110 	cdstva = VASLEW(cdstp, id);
3111 
3112 	KASSERT(*spte == 0 && *dpte == 0);
3113 
3114 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3115 	pmap_pte_set(dpte,
3116 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3117 	pmap_pte_flush();
3118 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3119 
3120 	memcpy(cdstva, csrcva, PAGE_SIZE);
3121 
3122 #if defined(DIAGNOSTIC) || defined(XEN)
3123 	pmap_pte_set(spte, 0);
3124 	pmap_pte_set(dpte, 0);
3125 	pmap_pte_flush();
3126 #endif
3127 	kpreempt_enable();
3128 }
3129 
3130 static pt_entry_t *
3131 pmap_map_ptp(struct vm_page *ptp)
3132 {
3133 	pt_entry_t *ptppte;
3134 	void *ptpva;
3135 	int id;
3136 
3137 	KASSERT(kpreempt_disabled());
3138 
3139 	id = cpu_number();
3140 	ptppte = PTESLEW(ptp_pte, id);
3141 	ptpva = VASLEW(ptpp, id);
3142 #if !defined(XEN)
3143 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3144 	    PG_RW | PG_U | PG_k);
3145 #else
3146 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3147 	    PG_U | PG_k);
3148 #endif
3149 	pmap_pte_flush();
3150 	pmap_update_pg((vaddr_t)ptpva);
3151 
3152 	return (pt_entry_t *)ptpva;
3153 }
3154 
3155 static void
3156 pmap_unmap_ptp(void)
3157 {
3158 #if defined(DIAGNOSTIC) || defined(XEN)
3159 	pt_entry_t *pte;
3160 
3161 	KASSERT(kpreempt_disabled());
3162 
3163 	pte = PTESLEW(ptp_pte, cpu_number());
3164 	if (*pte != 0) {
3165 		pmap_pte_set(pte, 0);
3166 		pmap_pte_flush();
3167 	}
3168 #endif
3169 }
3170 
3171 static pt_entry_t *
3172 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3173 {
3174 
3175 	KASSERT(kpreempt_disabled());
3176 	if (pmap_is_curpmap(pmap)) {
3177 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3178 	}
3179 	KASSERT(ptp != NULL);
3180 	return pmap_map_ptp(ptp) + pl1_pi(va);
3181 }
3182 
3183 static void
3184 pmap_unmap_pte(void)
3185 {
3186 
3187 	KASSERT(kpreempt_disabled());
3188 
3189 	pmap_unmap_ptp();
3190 }
3191 
3192 /*
3193  * p m a p   r e m o v e   f u n c t i o n s
3194  *
3195  * functions that remove mappings
3196  */
3197 
3198 /*
3199  * pmap_remove_ptes: remove PTEs from a PTP
3200  *
3201  * => must have proper locking on pmap_master_lock
3202  * => caller must hold pmap's lock
3203  * => PTP must be mapped into KVA
3204  * => PTP should be null if pmap == pmap_kernel()
3205  * => must be called with kernel preemption disabled
3206  * => returns composite pte if at least one page should be shot down
3207  */
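/*
 * The composite pte returned here is only a hint: pmap_remove() below ORs
 * the return values together and issues one ranged TLB shootdown if any
 * of the zapped PTEs had been referenced, roughly:
 *
 *	xpte |= pmap_remove_ptes(pmap, ptp, ptpva, va, blkendva, &pv_tofree);
 *	if ((xpte & PG_U) != 0)
 *		pmap_tlb_shootdown(pmap, sva, eva, xpte);
 */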
3208 
3209 static pt_entry_t
3210 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3211 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3212 {
3213 	struct pv_entry *pve;
3214 	pt_entry_t *pte = (pt_entry_t *) ptpva;
3215 	pt_entry_t opte, xpte = 0;
3216 
3217 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3218 	KASSERT(kpreempt_disabled());
3219 
3220 	/*
3221 	 * note that ptpva points to the PTE that maps startva.   this may
3222 	 * or may not be the first PTE in the PTP.
3223 	 *
3224 	 * we loop through the PTP while there are still PTEs to look at
3225 	 * and the wire_count is greater than 1 (because we use the wire_count
3226 	 * to keep track of the number of real PTEs in the PTP).
3227 	 */
3228 
3229 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
3230 			     ; pte++, startva += PAGE_SIZE) {
3231 		struct vm_page *pg;
3232 		struct pmap_page *pp;
3233 
3234 		if (!pmap_valid_entry(*pte))
3235 			continue;			/* VA not mapped */
3236 
3237 		/* atomically save the old PTE and zap! it */
3238 		opte = pmap_pte_testset(pte, 0);
3239 		if (!pmap_valid_entry(opte)) {
3240 			continue;
3241 		}
3242 
3243 		pmap_exec_account(pmap, startva, opte, 0);
3244 		pmap_stats_update_bypte(pmap, 0, opte);
3245 		xpte |= opte;
3246 
3247 		if (ptp) {
3248 			ptp->wire_count--;		/* dropping a PTE */
3249 			/* Make sure that the PDE is flushed */
3250 			if (ptp->wire_count <= 1)
3251 				xpte |= PG_U;
3252 		}
3253 
3254 		/*
3255 		 * if we are not on a pv_head list we are done.
3256 		 */
3257 
3258 		if ((opte & PG_PVLIST) == 0) {
3259 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3260 			if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3261 				panic("pmap_remove_ptes: managed page without "
3262 				      "PG_PVLIST for %#" PRIxVADDR, startva);
3263 #endif
3264 			continue;
3265 		}
3266 
3267 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3268 #ifdef DIAGNOSTIC
3269 		if (pg == NULL)
3270 			panic("pmap_remove_ptes: unmanaged page marked "
3271 			      "PG_PVLIST, va = %#" PRIxVADDR ", "
3272 			      "pa = %#" PRIxPADDR,
3273 			      startva, (paddr_t)pmap_pte2pa(opte));
3274 #endif
3275 
3276 		/* sync R/M bits */
3277 		pp = VM_PAGE_TO_PP(pg);
3278 		pp_lock(pp);
3279 		pp->pp_attrs |= opte;
3280 		pve = pmap_remove_pv(pp, ptp, startva);
3281 		pp_unlock(pp);
3282 
3283 		if (pve != NULL) {
3284 			pve->pve_next = *pv_tofree;
3285 			*pv_tofree = pve;
3286 		}
3287 
3288 		/* end of "for" loop: time for next pte */
3289 	}
3290 
3291 	return xpte;
3292 }
3293 
3294 
3295 /*
3296  * pmap_remove_pte: remove a single PTE from a PTP
3297  *
3298  * => must have proper locking on pmap_master_lock
3299  * => caller must hold pmap's lock
3300  * => PTP must be mapped into KVA
3301  * => PTP should be null if pmap == pmap_kernel()
3302  * => returns true if we removed a mapping
3303  * => must be called with kernel preemption disabled
3304  */
3305 
3306 static bool
3307 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3308 		vaddr_t va, struct pv_entry **pv_tofree)
3309 {
3310 	pt_entry_t opte;
3311 	struct pv_entry *pve;
3312 	struct vm_page *pg;
3313 	struct pmap_page *pp;
3314 
3315 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3316 	KASSERT(pmap == pmap_kernel() || kpreempt_disabled());
3317 
3318 	if (!pmap_valid_entry(*pte))
3319 		return(false);		/* VA not mapped */
3320 
3321 	/* atomically save the old PTE and zap! it */
3322 	opte = pmap_pte_testset(pte, 0);
3323 	if (!pmap_valid_entry(opte)) {
3324 		return false;
3325 	}
3326 
3327 	pmap_exec_account(pmap, va, opte, 0);
3328 	pmap_stats_update_bypte(pmap, 0, opte);
3329 
3330 	if (opte & PG_U)
3331 		pmap_tlb_shootdown(pmap, va, 0, opte);
3332 
3333 	if (ptp) {
3334 		ptp->wire_count--;		/* dropping a PTE */
3335 		/* Make sure that the PDE is flushed */
3336 		if ((ptp->wire_count <= 1) && !(opte & PG_U))
3337 			pmap_tlb_shootdown(pmap, va, 0, opte);
3338 	}
3339 
3340 	/*
3341 	 * if we are not on a pv_head list we are done.
3342 	 */
3343 
3344 	if ((opte & PG_PVLIST) == 0) {
3345 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3346 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3347 			panic("pmap_remove_pte: managed page without "
3348 			      "PG_PVLIST for %#" PRIxVADDR, va);
3349 #endif
3350 		return(true);
3351 	}
3352 
3353 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3354 #ifdef DIAGNOSTIC
3355 	if (pg == NULL)
3356 		panic("pmap_remove_pte: unmanaged page marked "
3357 		    "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR,
3358 		    va, (paddr_t)pmap_pte2pa(opte));
3359 #endif
3360 
3361 	/* sync R/M bits */
3362 	pp = VM_PAGE_TO_PP(pg);
3363 	pp_lock(pp);
3364 	pp->pp_attrs |= opte;
3365 	pve = pmap_remove_pv(pp, ptp, va);
3366 	pp_unlock(pp);
3367 
3368 	if (pve) {
3369 		pve->pve_next = *pv_tofree;
3370 		*pv_tofree = pve;
3371 	}
3372 
3373 	return(true);
3374 }
3375 
3376 /*
3377  * pmap_remove: mapping removal function.
3378  *
3379  * => caller should not be holding any pmap locks
3380  */
3381 
3382 void
3383 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3384 {
3385 	pt_entry_t *ptes, xpte = 0;
3386 	pd_entry_t pde;
3387 	pd_entry_t * const *pdes;
3388 	struct pv_entry *pv_tofree = NULL;
3389 	bool result;
3390 	int i;
3391 	paddr_t ptppa;
3392 	vaddr_t blkendva, va = sva;
3393 	struct vm_page *ptp;
3394 	struct pmap *pmap2;
3395 
3396 	kpreempt_disable();
3397 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3398 
3399 	/*
3400 	 * removing one page?  take shortcut function.
3401 	 */
3402 
3403 	if (va + PAGE_SIZE == eva) {
3404 		if (pmap_pdes_valid(va, pdes, &pde)) {
3405 
3406 			/* PA of the PTP */
3407 			ptppa = pmap_pte2pa(pde);
3408 
3409 			/* get PTP if non-kernel mapping */
3410 			if (pmap == pmap_kernel()) {
3411 				/* we never free kernel PTPs */
3412 				ptp = NULL;
3413 			} else {
3414 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3415 #ifdef DIAGNOSTIC
3416 				if (ptp == NULL)
3417 					panic("pmap_remove: unmanaged "
3418 					      "PTP detected");
3419 #endif
3420 			}
3421 
3422 			/* do it! */
3423 			result = pmap_remove_pte(pmap, ptp,
3424 			    &ptes[pl1_i(va)], va, &pv_tofree);
3425 
3426 			/*
3427 			 * if mapping removed and the PTP is no longer
3428 			 * being used, free it!
3429 			 */
3430 
3431 			if (result && ptp && ptp->wire_count <= 1)
3432 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3433 		}
3434 	} else for (/* null */ ; va < eva ; va = blkendva) {
3435 		int lvl;
3436 
3437 		/* determine range of block */
3438 		blkendva = x86_round_pdr(va+1);
3439 		if (blkendva > eva)
3440 			blkendva = eva;
3441 
3442 		/*
3443 		 * XXXCDC: our PTE mappings should never be removed
3444 		 * with pmap_remove!  if we allow this (and why would
3445 		 * we?) then we end up freeing the pmap's page
3446 		 * directory page (PDP) before we are finished using
3447 		 * it when we hit it in the recursive mapping.  this
3448 		 * is BAD.
3449 		 *
3450 		 * long term solution is to move the PTEs out of user
3451 		 * address space and into kernel address space (up
3452 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3453 		 * be VM_MAX_ADDRESS.
3454 		 */
3455 
3456 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3457 		for (i = 0; i < PDP_SIZE; i++) {
3458 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3459 				continue;
3460 		}
3461 
3462 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3463 		if (lvl != 0) {
3464 			/*
3465 			 * skip a range corresponding to an invalid pde.
3466 			 */
3467 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3468  			continue;
3469 		}
3470 
3471 		/* PA of the PTP */
3472 		ptppa = pmap_pte2pa(pde);
3473 
3474 		/* get PTP if non-kernel mapping */
3475 		if (pmap == pmap_kernel()) {
3476 			/* we never free kernel PTPs */
3477 			ptp = NULL;
3478 		} else {
3479 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3480 #ifdef DIAGNOSTIC
3481 			if (ptp == NULL)
3482 				panic("pmap_remove: unmanaged PTP "
3483 				      "detected");
3484 #endif
3485 		}
3486 		xpte |= pmap_remove_ptes(pmap, ptp,
3487 		    (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree);
3488 
3489 		/* if PTP is no longer being used, free it! */
3490 		if (ptp && ptp->wire_count <= 1) {
3491 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3492 		}
3493 		if ((xpte & PG_U) != 0)
3494 			pmap_tlb_shootdown(pmap, sva, eva, xpte);
3495 	}
3496 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3497 	kpreempt_enable();
3498 
3499 	/* Now we free unused PVs */
3500 	if (pv_tofree)
3501 		pmap_free_pvs(pv_tofree);
3502 }
3503 
3504 /*
3505  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3506  *
3507  * => called with pp_lock held. (thus preemption disabled)
3508  * => issues tlb shootdowns if necessary.
3509  */
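/*
 * A minimal sketch of the retry pattern used by the callers below when
 * this returns EAGAIN (the real callers also juggle the kernel lock):
 *
 *	int count = SPINLOCK_BACKOFF_MIN;
 * startover:
 *	pp_lock(pp);
 *	error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
 *	if (error == EAGAIN) {
 *		pp_unlock(pp);
 *		SPINLOCK_BACKOFF(count);
 *		goto startover;
 *	}
 *	pp->pp_attrs |= opte;
 *	pp_unlock(pp);
 */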
3510 
3511 static int
3512 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3513     pt_entry_t *optep)
3514 {
3515 	struct pmap *pmap;
3516 	struct vm_page *ptp;
3517 	vaddr_t va;
3518 	pt_entry_t *ptep;
3519 	pt_entry_t opte;
3520 	pt_entry_t npte;
3521 	bool need_shootdown;
3522 
3523 	ptp = pvpte->pte_ptp;
3524 	va = pvpte->pte_va;
3525 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3526 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3527 	pmap = ptp_to_pmap(ptp);
3528 
3529 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3530 	KASSERT((expect & PG_V) != 0);
3531 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3532 	KASSERT(kpreempt_disabled());
3533 
3534 	ptep = pmap_map_pte(pmap, ptp, va);
3535 	do {
3536 		opte = *ptep;
3537 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3538 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3539 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3540 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3541 
3542 			/*
3543 			 * we lost a race with a V->P operation like
3544 			 * pmap_remove().  wait for the competitor to finish
3545 			 * reflecting the pte bits into pp_attrs.
3546 			 *
3547 			 * issue a redundant TLB shootdown so that
3548 			 * we can wait for its completion.
3549 			 */
3550 
3551 			pmap_unmap_pte();
3552 			if (clearbits != 0) {
3553 				pmap_tlb_shootdown(pmap, va, 0,
3554 				    (pmap == pmap_kernel() ? PG_G : 0));
3555 			}
3556 			return EAGAIN;
3557 		}
3558 
3559 		/*
3560 		 * check if there's anything to do on this pte.
3561 		 */
3562 
3563 		if ((opte & clearbits) == 0) {
3564 			need_shootdown = false;
3565 			break;
3566 		}
3567 
3568 		/*
3569 		 * we need a shootdown if the pte is cached. (PG_U)
3570 		 *
3571 		 * ...unless we are clearing only the PG_RW bit and
3572 		 * it isn't cached as RW. (PG_M)
3573 		 */
3574 
3575 		need_shootdown = (opte & PG_U) != 0 &&
3576 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3577 
3578 		npte = opte & ~clearbits;
3579 
3580 		/*
3581 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3582 		 */
3583 
3584 		if (need_shootdown) {
3585 			npte &= ~(PG_U | PG_M);
3586 		}
3587 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3588 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3589 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3590 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3591 
3592 	if (need_shootdown) {
3593 		pmap_tlb_shootdown(pmap, va, 0, opte);
3594 	}
3595 	pmap_unmap_pte();
3596 
3597 	*optep = opte;
3598 	return 0;
3599 }
3600 
3601 /*
3602  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3603  *
3604  * => R/M bits are sync'd back to attrs
3605  */
3606 
3607 void
3608 pmap_page_remove(struct vm_page *pg)
3609 {
3610 	struct pmap_page *pp;
3611 	struct pv_pte *pvpte;
3612 	struct pv_entry *killlist = NULL;
3613 	struct vm_page *ptp;
3614 	pt_entry_t expect;
3615 	lwp_t *l;
3616 	int count;
3617 
3618 	l = curlwp;
3619 	pp = VM_PAGE_TO_PP(pg);
3620 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3621 	count = SPINLOCK_BACKOFF_MIN;
3622 	kpreempt_disable();
3623 startover:
3624 	pp_lock(pp);
3625 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3626 		struct pmap *pmap;
3627 		struct pv_entry *pve;
3628 		pt_entry_t opte;
3629 		vaddr_t va;
3630 		int error;
3631 
3632 		/*
3633 		 * add a reference to the pmap before clearing the pte.
3634 		 * otherwise the pmap can disappear behind us.
3635 		 */
3636 
3637 		ptp = pvpte->pte_ptp;
3638 		pmap = ptp_to_pmap(ptp);
3639 		if (ptp != NULL) {
3640 			pmap_reference(pmap);
3641 		}
3642 
3643 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3644 		if (error == EAGAIN) {
3645 			int hold_count;
3646 			pp_unlock(pp);
3647 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3648 			if (ptp != NULL) {
3649 				pmap_destroy(pmap);
3650 			}
3651 			SPINLOCK_BACKOFF(count);
3652 			KERNEL_LOCK(hold_count, curlwp);
3653 			goto startover;
3654 		}
3655 
3656 		pp->pp_attrs |= opte;
3657 		va = pvpte->pte_va;
3658 		pve = pmap_remove_pv(pp, ptp, va);
3659 		pp_unlock(pp);
3660 
3661 		/* update the PTP reference count.  free if last reference. */
3662 		if (ptp != NULL) {
3663 			struct pmap *pmap2;
3664 			pt_entry_t *ptes;
3665 			pd_entry_t * const *pdes;
3666 
3667 			KASSERT(pmap != pmap_kernel());
3668 
3669 			pmap_tlb_shootwait();
3670 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3671 			pmap_stats_update_bypte(pmap, 0, opte);
3672 			ptp->wire_count--;
3673 			if (ptp->wire_count <= 1) {
3674 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3675 			}
3676 			pmap_unmap_ptes(pmap, pmap2);
3677 			pmap_destroy(pmap);
3678 		} else {
3679 			KASSERT(pmap == pmap_kernel());
3680 			pmap_stats_update_bypte(pmap, 0, opte);
3681 		}
3682 
3683 		if (pve != NULL) {
3684 			pve->pve_next = killlist;	/* mark it for death */
3685 			killlist = pve;
3686 		}
3687 		pp_lock(pp);
3688 	}
3689 	pp_unlock(pp);
3690 	kpreempt_enable();
3691 
3692 	/* Now free unused pvs. */
3693 	pmap_free_pvs(killlist);
3694 }
3695 
3696 /*
3697  * p m a p   a t t r i b u t e  f u n c t i o n s
3698  * functions that test/change a managed page's attributes.  since a page
3699  * can be mapped multiple times, we must check each PTE that
3700  * maps it by going down the pv lists.
3701  */
3702 
3703 /*
3704  * pmap_test_attrs: test a page's attributes
3705  */
3706 
3707 bool
3708 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3709 {
3710 	struct pmap_page *pp;
3711 	struct pv_pte *pvpte;
3712 	pt_entry_t expect;
3713 	u_int result;
3714 
3715 	pp = VM_PAGE_TO_PP(pg);
3716 	if ((pp->pp_attrs & testbits) != 0) {
3717 		return true;
3718 	}
3719 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3720 	pp_lock(pp);
3721 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3722 		pt_entry_t opte;
3723 		int error;
3724 
3725 		if ((pp->pp_attrs & testbits) != 0) {
3726 			break;
3727 		}
3728 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3729 		if (error == 0) {
3730 			pp->pp_attrs |= opte;
3731 		}
3732 	}
3733 	result = pp->pp_attrs & testbits;
3734 	pp_unlock(pp);
3735 
3736 	/*
3737 	 * note that we will exit the for loop early if we have found
3738 	 * the bits we are testing for.
3739 	 */
3740 
3741 	return result != 0;
3742 }
3743 
3744 /*
3745  * pmap_clear_attrs: clear the specified attribute for a page.
3746  *
3747  * => we return true if we cleared one of the bits we were asked to
3748  */
3749 
3750 bool
3751 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3752 {
3753 	struct pmap_page *pp;
3754 	struct pv_pte *pvpte;
3755 	u_int result;
3756 	pt_entry_t expect;
3757 	int count;
3758 
3759 	pp = VM_PAGE_TO_PP(pg);
3760 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3761 	count = SPINLOCK_BACKOFF_MIN;
3762 	kpreempt_disable();
3763 startover:
3764 	pp_lock(pp);
3765 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3766 		pt_entry_t opte;
3767 		int error;
3768 
3769 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3770 		if (error == EAGAIN) {
3771 			int hold_count;
3772 			pp_unlock(pp);
3773 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3774 			SPINLOCK_BACKOFF(count);
3775 			KERNEL_LOCK(hold_count, curlwp);
3776 			goto startover;
3777 		}
3778 		pp->pp_attrs |= opte;
3779 	}
3780 	result = pp->pp_attrs & clearbits;
3781 	pp->pp_attrs &= ~clearbits;
3782 	pp_unlock(pp);
3783 	kpreempt_enable();
3784 
3785 	return result != 0;
3786 }
3787 
3788 
3789 /*
3790  * p m a p   p r o t e c t i o n   f u n c t i o n s
3791  */
3792 
3793 /*
3794  * pmap_page_protect: change the protection of all recorded mappings
3795  *	of a managed page
3796  *
3797  * => NOTE: this is an inline function in pmap.h
3798  */
3799 
3800 /* see pmap.h */
3801 
3802 /*
3803  * pmap_protect: set the protection of the pages in a pmap
3804  *
3805  * => NOTE: this is an inline function in pmap.h
3806  */
3807 
3808 /* see pmap.h */
3809 
3810 /*
3811  * pmap_write_protect: write-protect pages in a pmap
3812  */
3813 
3814 void
3815 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3816 {
3817 	int i;
3818 	pt_entry_t *ptes, *epte;
3819 	pt_entry_t *spte;
3820 	pd_entry_t * const *pdes;
3821 	vaddr_t blockend, va;
3822 	pt_entry_t opte;
3823 	struct pmap *pmap2;
3824 
3825 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3826 
3827 	kpreempt_disable();
3828 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3829 
3830 	/* should be ok, but just in case ... */
3831 	sva &= PG_FRAME;
3832 	eva &= PG_FRAME;
3833 
3834 	for (va = sva ; va < eva ; va = blockend) {
3835 
3836 		blockend = (va & L2_FRAME) + NBPD_L2;
3837 		if (blockend > eva)
3838 			blockend = eva;
3839 
3840 		/*
3841 		 * XXXCDC: our PTE mappings should never be write-protected!
3842 		 *
3843 		 * long term solution is to move the PTEs out of user
3844 		 * address space and into kernel address space (up
3845 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3846 		 * be VM_MAX_ADDRESS.
3847 		 */
3848 
3849 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3850 		for (i = 0; i < PDP_SIZE; i++) {
3851 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3852 				continue;
3853 		}
3854 
3855 		/* empty block? */
3856 		if (!pmap_pdes_valid(va, pdes, NULL))
3857 			continue;
3858 
3859 #ifdef DIAGNOSTIC
3860 		if (va >= VM_MAXUSER_ADDRESS &&
3861 		    va < VM_MAX_ADDRESS)
3862 			panic("pmap_write_protect: PTE space");
3863 #endif
3864 
3865 		spte = &ptes[pl1_i(va)];
3866 		epte = &ptes[pl1_i(blockend)];
3867 
3868 		for (/*null */; spte < epte ; spte++) {
3869 			pt_entry_t npte;
3870 
3871 			do {
3872 				opte = *spte;
3873 				if ((~opte & (PG_RW | PG_V)) != 0) {
3874 					goto next;
3875 				}
3876 				npte = opte & ~PG_RW;
3877 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3878 			if ((opte & PG_M) != 0) {
3879 				vaddr_t tva;
3880 
3881 				tva = x86_ptob(spte - ptes);
3882 				pmap_tlb_shootdown(pmap, tva, 0, opte);
3883 			}
3884 next:;
3885 		}
3886 	}
3887 
3888 	pmap_unmap_ptes(pmap, pmap2);	/* unlocks pmap */
3889 	kpreempt_enable();
3890 }
3891 
3892 /*
3893  * end of protection functions
3894  */
3895 
3896 /*
3897  * pmap_unwire: clear the wired bit in the PTE
3898  *
3899  * => mapping should already be in map
3900  */
3901 
3902 void
3903 pmap_unwire(struct pmap *pmap, vaddr_t va)
3904 {
3905 	pt_entry_t *ptes;
3906 	pd_entry_t * const *pdes;
3907 	struct pmap *pmap2;
3908 
3909 	kpreempt_disable();
3910 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3911 
3912 	if (pmap_pdes_valid(va, pdes, NULL)) {
3913 		pt_entry_t *ptep = &ptes[pl1_i(va)];
3914 		pt_entry_t opte = *ptep;
3915 
3916 #ifdef DIAGNOSTIC
3917 		if (!pmap_valid_entry(opte))
3918 			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
3919 #endif
3920 		if ((opte & PG_W) != 0) {
3921 			pt_entry_t npte = opte & ~PG_W;
3922 
3923 			opte = pmap_pte_testset(ptep, npte);
3924 			pmap_stats_update_bypte(pmap, npte, opte);
3925 		}
3926 #ifdef DIAGNOSTIC
3927 		else {
3928 			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3929 			       "didn't change!\n", pmap, va);
3930 		}
3931 #endif
3932 		pmap_unmap_ptes(pmap, pmap2);		/* unlocks map */
3933 	}
3934 #ifdef DIAGNOSTIC
3935 	else {
3936 		panic("pmap_unwire: invalid PDE");
3937 	}
3938 #endif
3939 	kpreempt_enable();
3940 }
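
/*
 * For illustration only: a hypothetical caller that unwires a page-aligned
 * range one page at a time, the way uvm's unwiring paths walk a map entry.
 * The helper name is invented for this sketch.
 */
#if 0	/* illustrative sketch only */
static void
example_unwire_range(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
	vaddr_t va;

	for (va = sva; va < eva; va += PAGE_SIZE)
		pmap_unwire(pmap, va);	/* clears PG_W; mapping must exist */
}
#endif	/* illustrative sketch only */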
3941 
3942 /*
3943  * pmap_copy: copy mappings from one pmap to another
3944  *
3945  * => optional function
3946  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3947  */
3948 
3949 /*
3950  * defined as macro in pmap.h
3951  */
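
/*
 * For illustration only: since pmap_copy() is optional, ports commonly
 * make the pmap.h macro a no-op.  A null definition like the one below
 * is one plausible shape; it is not a quotation of pmap.h.
 */
#if 0	/* illustrative sketch only */
#define	pmap_copy(DP, SP, D, L, S)	((void)0)	/* nothing to do */
#endif	/* illustrative sketch only */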
3952 
3953 __weak_alias(pmap_enter, pmap_enter_default);
3954 
3955 int
3956 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3957     u_int flags)
3958 {
3959 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3960 }
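
/*
 * For illustration only: a hypothetical caller of pmap_enter().  With
 * PMAP_CANFAIL the call may return ENOMEM (see the pv-entry and PTP
 * allocation failure paths in pmap_enter_ma() below) instead of
 * panicking, and the caller finishes with pmap_update() to drain any
 * deferred TLB work.  The helper name is invented for this sketch.
 */
#if 0	/* illustrative sketch only */
static int
example_enter_one_page(struct pmap *pmap, vaddr_t va, paddr_t pa)
{
	int error;

	error = pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
	    VM_PROT_READ | VM_PROT_WRITE | PMAP_CANFAIL);
	if (error == 0)
		pmap_update(pmap);
	return error;
}
#endif	/* illustrative sketch only */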
3961 
3962 /*
3963  * pmap_enter: enter a mapping into a pmap
3964  *
3965  * => must be done "now" ... no lazy-evaluation
3966  * => we set pmap => pv_head locking
3967  */
3968 int
3969 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3970 	   vm_prot_t prot, u_int flags, int domid)
3971 {
3972 	pt_entry_t *ptes, opte, npte;
3973 	pt_entry_t *ptep;
3974 	pd_entry_t * const *pdes;
3975 	struct vm_page *ptp, *pg;
3976 	struct pmap_page *new_pp;
3977 	struct pmap_page *old_pp;
3978 	struct pv_entry *old_pve = NULL;
3979 	struct pv_entry *new_pve;
3980 	struct pv_entry *new_pve2;
3981 	int error;
3982 	bool wired = (flags & PMAP_WIRED) != 0;
3983 	struct pmap *pmap2;
3984 
3985 	KASSERT(pmap_initialized);
3986 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3987 
3988 #ifdef DIAGNOSTIC
3989 	/* sanity check: totally out of range? */
3990 	if (va >= VM_MAX_KERNEL_ADDRESS)
3991 		panic("pmap_enter: too big");
3992 
3993 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
3994 		panic("pmap_enter: trying to map over PDP/APDP!");
3995 
3996 	/* sanity check: kernel PTPs should already have been pre-allocated */
3997 	if (va >= VM_MIN_KERNEL_ADDRESS &&
3998 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
3999 		panic("pmap_enter: missing kernel PTP for va %lx!", va);
4000 #endif /* DIAGNOSTIC */
4001 #ifdef XEN
4002 	KASSERT(domid == DOMID_SELF || pa == 0);
4003 #endif /* XEN */
4004 
4005 	npte = ma | protection_codes[prot] | PG_V;
4006 	npte |= pmap_pat_flags(flags);
4007 	if (wired)
4008 	        npte |= PG_W;
4009 	if (va < VM_MAXUSER_ADDRESS)
4010 		npte |= PG_u;
4011 	else if (va < VM_MAX_ADDRESS)
4012 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
4013 	else
4014 		npte |= PG_k;
4015 	if (pmap == pmap_kernel())
4016 		npte |= pmap_pg_g;
4017 	if (flags & VM_PROT_ALL) {
4018 		npte |= PG_U;
4019 		if (flags & VM_PROT_WRITE) {
4020 			KASSERT((npte & PG_RW) != 0);
4021 			npte |= PG_M;
4022 		}
4023 	}
4024 
4025 #ifdef XEN
4026 	if (domid != DOMID_SELF)
4027 		pg = NULL;
4028 	else
4029 #endif
4030 		pg = PHYS_TO_VM_PAGE(pa);
4031 	if (pg != NULL) {
4032 		/* This is a managed page */
4033 		npte |= PG_PVLIST;
4034 		new_pp = VM_PAGE_TO_PP(pg);
4035 	} else {
4036 		new_pp = NULL;
4037 	}
4038 
4039 	/* get pves. */
4040 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4041 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4042 	if (new_pve == NULL || new_pve2 == NULL) {
4043 		if (flags & PMAP_CANFAIL) {
4044 			error = ENOMEM;
4045 			goto out2;
4046 		}
4047 		panic("pmap_enter: pve allocation failed");
4048 	}
4049 
4050 	kpreempt_disable();
4051 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4052 	if (pmap == pmap_kernel()) {
4053 		ptp = NULL;
4054 	} else {
4055 		ptp = pmap_get_ptp(pmap, va, pdes);
4056 		if (ptp == NULL) {
4057 			pmap_unmap_ptes(pmap, pmap2);
4058 			if (flags & PMAP_CANFAIL) {
4059 				error = ENOMEM;
4060 				goto out;
4061 			}
4062 			panic("pmap_enter: get ptp failed");
4063 		}
4064 	}
4065 
4066 	/*
4067 	 * update the pte.
4068 	 */
4069 
4070 	ptep = &ptes[pl1_i(va)];
4071 	do {
4072 		opte = *ptep;
4073 
4074 		/*
4075 		 * if the same page, inherit PG_U and PG_M.
4076 		 */
4077 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4078 			npte |= opte & (PG_U | PG_M);
4079 		}
4080 #if defined(XEN)
4081 		if (domid != DOMID_SELF) {
4082 			/* pmap_pte_cas with error handling */
4083 			int s = splvm();
4084 			if (opte != *ptep) {
4085 				splx(s);
4086 				continue;
4087 			}
4088 			error = xpq_update_foreign(
4089 			    vtomach((vaddr_t)ptep), npte, domid);
4090 			splx(s);
4091 			if (error) {
4092 				if (ptp != NULL && ptp->wire_count <= 1) {
4093 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4094 				}
4095 				pmap_unmap_ptes(pmap, pmap2);
4096 				goto out;
4097 			}
4098 			break;
4099 		}
4100 #endif /* defined(XEN) */
4101 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4102 
4103 	/*
4104 	 * update statistics and PTP's reference count.
4105 	 */
4106 
4107 	pmap_stats_update_bypte(pmap, npte, opte);
4108 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4109 		ptp->wire_count++;
4110 	}
4111 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4112 
4113 	/*
4114 	 * if the same page, we can skip pv_entry handling.
4115 	 */
4116 
4117 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4118 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4119 		goto same_pa;
4120 	}
4121 
4122 	/*
4123 	 * if old page is managed, remove pv_entry from its list.
4124 	 */
4125 
4126 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4127 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
4128 #ifdef DIAGNOSTIC
4129 		if (pg == NULL)
4130 			panic("pmap_enter: PG_PVLIST mapping with "
4131 			      "unmanaged page "
4132 			      "pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
4133 			      (int64_t)pa, (int64_t)atop(pa));
4134 #endif
4135 		old_pp = VM_PAGE_TO_PP(pg);
4136 
4137 		pp_lock(old_pp);
4138 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4139 		old_pp->pp_attrs |= opte;
4140 		pp_unlock(old_pp);
4141 	}
4142 
4143 	/*
4144 	 * if new page is managed, insert pv_entry into its list.
4145 	 */
4146 
4147 	if (new_pp) {
4148 		pp_lock(new_pp);
4149 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4150 		pp_unlock(new_pp);
4151 	}
4152 
4153 same_pa:
4154 	pmap_unmap_ptes(pmap, pmap2);
4155 
4156 	/*
4157 	 * shootdown tlb if necessary.
4158 	 */
4159 
4160 	if ((~opte & (PG_V | PG_U)) == 0 &&
4161 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4162 		pmap_tlb_shootdown(pmap, va, 0, opte);
4163 	}
4164 
4165 	error = 0;
4166 out:
4167 	kpreempt_enable();
4168 out2:
4169 	if (old_pve != NULL) {
4170 		pool_cache_put(&pmap_pv_cache, old_pve);
4171 	}
4172 	if (new_pve != NULL) {
4173 		pool_cache_put(&pmap_pv_cache, new_pve);
4174 	}
4175 	if (new_pve2 != NULL) {
4176 		pool_cache_put(&pmap_pv_cache, new_pve2);
4177 	}
4178 
4179 	return error;
4180 }
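
/*
 * For illustration only: the bit tests used in pmap_enter_ma() above,
 * spelled out.  The XOR form checks "same frame and same validity" with
 * a single comparison; the complement form checks that all of the given
 * bits are set.  The helper names are invented for this sketch.
 */
#if 0	/* illustrative sketch only */
static inline bool
example_same_frame_and_valid(pt_entry_t opte, pt_entry_t npte)
{

	/*
	 * equivalent to:
	 *	(opte & PG_FRAME) == (npte & PG_FRAME) &&
	 *	(opte & PG_V) == (npte & PG_V)
	 */
	return ((opte ^ npte) & (PG_FRAME | PG_V)) == 0;
}

static inline bool
example_valid_and_on_pvlist(pt_entry_t opte)
{

	/* equivalent to: (opte & PG_V) != 0 && (opte & PG_PVLIST) != 0 */
	return (~opte & (PG_V | PG_PVLIST)) == 0;
}
#endif	/* illustrative sketch only */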
4181 
4182 static bool
4183 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4184 {
4185 	struct vm_page *ptp;
4186 	struct pmap *kpm = pmap_kernel();
4187 
4188 	if (uvm.page_init_done == false) {
4189 		/*
4190 		 * we're growing the kernel pmap early (from
4191 		 * uvm_pageboot_alloc()).  this case must be
4192 		 * handled a little differently.
4193 		 */
4194 
4195 		if (uvm_page_physget(paddrp) == false)
4196 			panic("pmap_get_physpage: out of memory");
4197 		kpreempt_disable();
4198 		pmap_pte_set(early_zero_pte,
4199 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4200 		pmap_pte_flush();
4201 		pmap_update_pg((vaddr_t)early_zerop);
4202 		memset(early_zerop, 0, PAGE_SIZE);
4203 #if defined(DIAGNOSTIC) || defined (XEN)
4204 		pmap_pte_set(early_zero_pte, 0);
4205 		pmap_pte_flush();
4206 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4207 		kpreempt_enable();
4208 	} else {
4209 		/* XXX */
4210 		PMAP_SUBOBJ_LOCK(kpm, level - 1);
4211 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
4212 				    ptp_va2o(va, level), NULL,
4213 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4214 		PMAP_SUBOBJ_UNLOCK(kpm, level - 1);
4215 		if (ptp == NULL)
4216 			panic("pmap_get_physpage: out of memory");
4217 		ptp->flags &= ~PG_BUSY;
4218 		ptp->wire_count = 1;
4219 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4220 	}
4221 	pmap_stats_update(kpm, 1, 0);
4222 	return true;
4223 }
4224 
4225 /*
4226  * Allocate the specified number of PTPs for a PTP level, and populate
4227  * all levels below accordingly, mapping virtual addresses starting at
4228  * kva.
4229  *
4230  * Used by pmap_growkernel.
4231  */
4232 static void
4233 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4234     long *needed_ptps)
4235 {
4236 	unsigned long i;
4237 	vaddr_t va;
4238 	paddr_t pa;
4239 	unsigned long index, endindex;
4240 	int level;
4241 	pd_entry_t *pdep;
4242 #ifdef XEN
4243 	int s = splvm(); /* protect xpq_* */
4244 #endif
4245 
4246 	for (level = lvl; level > 1; level--) {
4247 		if (level == PTP_LEVELS)
4248 			pdep = pmap_kernel()->pm_pdir;
4249 		else
4250 			pdep = pdes[level - 2];
4251 		va = kva;
4252 		index = pl_i_roundup(kva, level);
4253 		endindex = index + needed_ptps[level - 1] - 1;
4254 
4255 
4256 		for (i = index; i <= endindex; i++) {
4257 			KASSERT(!pmap_valid_entry(pdep[i]));
4258 			pmap_get_physpage(va, level - 1, &pa);
4259 #ifdef XEN
4260 			xpq_queue_pte_update((level == PTP_LEVELS) ?
4261 			    xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) :
4262 			    xpmap_ptetomach(&pdep[i]),
4263 			    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4264 #ifdef PAE
4265 			if (level == PTP_LEVELS &&  i > L2_SLOT_KERN) {
4266 				/* update real kernel PD too */
4267 				xpq_queue_pte_update(
4268 				    xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]),
4269 				    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4270 			}
4271 #endif
4272 #else /* XEN */
4273 			pdep[i] = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4274 #endif /* XEN */
4275 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4276 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4277 			nkptp[level - 1]++;
4278 			va += nbpd[level - 1];
4279 		}
4280 		pmap_pte_flush();
4281 	}
4282 #ifdef XEN
4283 	splx(s);
4284 #endif
4285 }
4286 
4287 /*
4288  * pmap_growkernel: increase usage of KVM space
4289  *
4290  * => we allocate new PTPs for the kernel and install them in all
4291  *	the pmaps on the system.
4292  */
4293 
4294 vaddr_t
4295 pmap_growkernel(vaddr_t maxkvaddr)
4296 {
4297 	struct pmap *kpm = pmap_kernel();
4298 #if !defined(XEN) || !defined(__x86_64__)
4299 	struct pmap *pm;
4300 #endif
4301 	int s, i;
4302 	long needed_kptp[PTP_LEVELS], target_nptp, old;
4303 	bool invalidate = false;
4304 
4305 	s = splvm();	/* to be safe */
4306 	mutex_enter(&kpm->pm_lock);
4307 
4308 	if (maxkvaddr <= pmap_maxkvaddr) {
4309 		mutex_exit(&kpm->pm_lock);
4310 		splx(s);
4311 		return pmap_maxkvaddr;
4312 	}
4313 
4314 	maxkvaddr = x86_round_pdr(maxkvaddr);
4315 	old = nkptp[PTP_LEVELS - 1];
4316 	/*
4317 	 * This loop could be optimized more, but pmap_growkernel()
4318 	 * is called infrequently.
4319 	 */
4320 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4321 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4322 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4323 		/*
4324 		 * XXX only need to check toplevel.
4325 		 */
4326 		if (target_nptp > nkptpmax[i])
4327 			panic("out of KVA space");
4328 		KASSERT(target_nptp >= nkptp[i]);
4329 		needed_kptp[i] = target_nptp - nkptp[i];
4330 	}
4331 
4332 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4333 
4334 	/*
4335 	 * If the number of top level entries changed, update all
4336 	 * pmaps.
4337 	 */
4338 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4339 #ifdef XEN
4340 #ifdef __x86_64__
4341 		/* nothing, kernel entries are never entered in user pmap */
4342 #else /* __x86_64__ */
4343 		mutex_enter(&pmaps_lock);
4344 		LIST_FOREACH(pm, &pmaps, pm_list) {
4345 			int pdkidx;
4346 			for (pdkidx =  PDIR_SLOT_KERN + old;
4347 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4348 			    pdkidx++) {
4349 				xpq_queue_pte_update(
4350 				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
4351 				    kpm->pm_pdir[pdkidx]);
4352 			}
4353 			xpq_flush_queue();
4354 		}
4355 		mutex_exit(&pmaps_lock);
4356 #endif /* __x86_64__ */
4357 #else /* XEN */
4358 		unsigned newpdes;
4359 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4360 		mutex_enter(&pmaps_lock);
4361 		LIST_FOREACH(pm, &pmaps, pm_list) {
4362 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4363 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4364 			       newpdes * sizeof (pd_entry_t));
4365 		}
4366 		mutex_exit(&pmaps_lock);
4367 #endif
4368 		invalidate = true;
4369 	}
4370 	pmap_maxkvaddr = maxkvaddr;
4371 	mutex_exit(&kpm->pm_lock);
4372 	splx(s);
4373 
4374 	if (invalidate) {
4375 		/* Invalidate the PDP cache. */
4376 		pool_cache_invalidate(&pmap_pdp_cache);
4377 	}
4378 
4379 	return maxkvaddr;
4380 }
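
/*
 * Worked example (amd64 figures, for illustration only; i386 differs):
 * each L1 PTP maps NBPD_L2 = 2MB of KVA, each L2 PTP maps NBPD_L3 = 1GB
 * and each L3 PTP maps NBPD_L4 = 512GB.  Growing pmap_maxkvaddr by an
 * aligned 16MB within an already-populated 1GB region therefore needs
 * needed_kptp[1] = 8 new L1 PTPs and nothing at the higher levels; only
 * when the growth adds top-level entries (needed_kptp[PTP_LEVELS - 1]
 * != 0) are the other pmaps updated and the PDP cache invalidated above.
 */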
4381 
4382 #ifdef DEBUG
4383 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4384 
4385 /*
4386  * pmap_dump: dump all the mappings from a pmap
4387  *
4388  * => caller should not be holding any pmap locks
4389  */
4390 
4391 void
4392 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4393 {
4394 	pt_entry_t *ptes, *pte;
4395 	pd_entry_t * const *pdes;
4396 	struct pmap *pmap2;
4397 	vaddr_t blkendva;
4398 
4399 	/*
4400 	 * if end is out of range truncate.
4401 	 * if (end == start) update to max.
4402 	 */
4403 
4404 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4405 		eva = VM_MAXUSER_ADDRESS;
4406 
4407 	/*
4408 	 * we lock in the pmap => pv_head direction
4409 	 */
4410 
4411 	kpreempt_disable();
4412 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4413 
4414 	/*
4415 	 * dumping a range of pages: we dump in PTP-sized blocks (NBPD_L2 bytes)
4416 	 */
4417 
4418 	for (/* null */ ; sva < eva ; sva = blkendva) {
4419 
4420 		/* determine range of block */
4421 		blkendva = x86_round_pdr(sva+1);
4422 		if (blkendva > eva)
4423 			blkendva = eva;
4424 
4425 		/* valid block? */
4426 		if (!pmap_pdes_valid(sva, pdes, NULL))
4427 			continue;
4428 
4429 		pte = &ptes[pl1_i(sva)];
4430 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4431 			if (!pmap_valid_entry(*pte))
4432 				continue;
4433 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4434 			    " (pte=%#" PRIxPADDR ")\n",
4435 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4436 		}
4437 	}
4438 	pmap_unmap_ptes(pmap, pmap2);
4439 	kpreempt_enable();
4440 }
4441 #endif
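
/*
 * For illustration only: pmap_dump() is handy to call by hand from the
 * kernel debugger on a DEBUG kernel.  Passing eva <= sva selects the
 * whole user range, per the truncation rule at the top of the function.
 * The helper name is invented for this sketch.
 */
#if 0	/* illustrative sketch only */
static void
example_dump_curproc(void)
{

	/* dump every user mapping of the current process's pmap */
	pmap_dump(curproc->p_vmspace->vm_map.pmap, 0, 0);
}
#endif	/* illustrative sketch only */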
4442 
4443 /*
4444  * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm'
4445  *
4446  * => always invalidates locally before returning
4447  * => returns before remote CPUs have invalidated
4448  * => must be called with preemption disabled
4449  */
4450 
4451 void
4452 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte)
4453 {
4454 #ifdef MULTIPROCESSOR
4455 	extern bool x86_mp_online;
4456 	struct cpu_info *ci;
4457 	struct pmap_mbox *mb, *selfmb;
4458 	CPU_INFO_ITERATOR cii;
4459 	uintptr_t head;
4460 	u_int count;
4461 	int s;
4462 #endif	/* MULTIPROCESSOR */
4463 	struct cpu_info *self;
4464 	bool kernel;
4465 
4466 	KASSERT(eva == 0 || eva >= sva);
4467 	KASSERT(kpreempt_disabled());
4468 
4469 	if (pte & PG_PS)
4470 		sva &= PG_LGFRAME;
4471 	pte &= PG_G;
4472 	self = curcpu();
4473 
4474 	if (sva == (vaddr_t)-1LL) {
4475 		kernel = true;
4476 	} else {
4477 		if (eva == 0)
4478 			eva = sva + PAGE_SIZE;
4479 		kernel = sva >= VM_MAXUSER_ADDRESS;
4480 		KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS));
4481 	}
4482 
4483 	/*
4484 	 * if tearing down the pmap, do nothing.  we'll flush later
4485 	 * when we're ready to recycle/destroy it.
4486 	 */
4487 	if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) {
4488 		return;
4489 	}
4490 
4491 	/*
4492 	 * If the range is larger than 32 pages, then invalidate
4493 	 * everything.
4494 	 */
4495 	if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) {
4496 		sva = (vaddr_t)-1LL;
4497 		eva = sva;
4498 	}
4499 
4500 #ifdef MULTIPROCESSOR
4501 	if (ncpu > 1 && x86_mp_online) {
4502 		selfmb = &self->ci_pmap_cpu->pc_mbox;
4503 
4504 		/*
4505 		 * If the CPUs have no notion of global pages then
4506 		 * reload of %cr3 is sufficient.
4507 		 * a reload of %cr3 is sufficient.
4508 		if (pte != 0 && (cpu_feature[0] & CPUID_PGE) == 0)
4509 			pte = 0;
4510 
4511 		if (pm == pmap_kernel()) {
4512 			/*
4513 			 * Mapped on all CPUs: use the broadcast mechanism.
4514 			 * Once we have the lock, increment the counter.
4515 			 */
4516 			s = splvm();
4517 			mb = &pmap_mbox;
4518 			count = SPINLOCK_BACKOFF_MIN;
4519 			do {
4520 				if ((head = mb->mb_head) != mb->mb_tail) {
4521 					splx(s);
4522 					while ((head = mb->mb_head) !=
4523 					    mb->mb_tail)
4524 						SPINLOCK_BACKOFF(count);
4525 					s = splvm();
4526 				}
4527 			} while (atomic_cas_ulong(
4528 			    (volatile u_long *)&mb->mb_head,
4529 			    head, head + ncpu - 1) != head);
4530 
4531 			/*
4532 			 * Once underway we must stay at IPL_VM until the
4533 			 * IPI is dispatched.  Otherwise interrupt handlers
4534 			 * on this CPU can deadlock against us.
4535 			 */
4536 			pmap_tlb_evcnt.ev_count++;
4537 			mb->mb_pointer = self;
4538 			mb->mb_addr1 = sva;
4539 			mb->mb_addr2 = eva;
4540 			mb->mb_global = pte;
4541 			x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL,
4542 			    LAPIC_DLMODE_FIXED);
4543 			self->ci_need_tlbwait = 1;
4544 			splx(s);
4545 		} else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 ||
4546 		    (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) {
4547 			/*
4548 			 * We don't bother traversing the CPU list if only
4549 			 * used by this CPU.
4550 			 *
4551 			 * We can't do global flushes with the multicast
4552 			 * mechanism.
4553 			 */
4554 			KASSERT(pte == 0);
4555 
4556 			/*
4557 			 * Take ownership of the shootdown mailbox on each
4558 			 * CPU, fill the details and fire it off.
4559 			 */
4560 			s = splvm();
4561 			for (CPU_INFO_FOREACH(cii, ci)) {
4562 				if (ci == self ||
4563 				    !pmap_is_active(pm, ci, kernel) ||
4564 				    !(ci->ci_flags & CPUF_RUNNING))
4565 					continue;
4566 				selfmb->mb_head++;
4567 				mb = &ci->ci_pmap_cpu->pc_mbox;
4568 				count = SPINLOCK_BACKOFF_MIN;
4569 				while (atomic_cas_ulong(
4570 				    (u_long *)&mb->mb_pointer,
4571 				    0, (u_long)&selfmb->mb_tail) != 0) {
4572 				    	splx(s);
4573 					while (mb->mb_pointer != 0)
4574 						SPINLOCK_BACKOFF(count);
4575 					s = splvm();
4576 				}
4577 				mb->mb_addr1 = sva;
4578 				mb->mb_addr2 = eva;
4579 				mb->mb_global = pte;
4580 				if (x86_ipi(LAPIC_TLB_MCAST_VECTOR,
4581 				    ci->ci_cpuid, LAPIC_DLMODE_FIXED))
4582 					panic("pmap_tlb_shootdown: ipi failed");
4583 			}
4584 			self->ci_need_tlbwait = 1;
4585 			splx(s);
4586 		}
4587 	}
4588 #endif	/* MULTIPROCESSOR */
4589 
4590 	/* Update the current CPU before waiting for others. */
4591 	if (!pmap_is_active(pm, self, kernel))
4592 		return;
4593 
4594 	if (sva == (vaddr_t)-1LL) {
4595 		u_int gen = uvm_emap_gen_return();
4596 		if (pte != 0) {
4597 			tlbflushg();
4598 		} else {
4599 			tlbflush();
4600 		}
4601 		uvm_emap_update(gen);
4602 	} else {
4603 		do {
4604 			pmap_update_pg(sva);
4605 			sva += PAGE_SIZE;
4606 		} while (sva < eva);
4607 	}
4608 }
4609 
4610 /*
4611  * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete
4612  *
4613  * => only waits for operations generated by the current CPU
4614  * => must be called with preemption disabled
4615  */
4616 
4617 void
4618 pmap_tlb_shootwait(void)
4619 {
4620 	struct cpu_info *self;
4621 	struct pmap_mbox *mb;
4622 
4623 	KASSERT(kpreempt_disabled());
4624 
4625 	/*
4626 	 * Anything to do?  XXX Really we want to avoid touching the cache
4627 	 * lines of the two mailboxes, but the processor may read ahead.
4628 	 */
4629 	self = curcpu();
4630 	if (!self->ci_need_tlbwait)
4631 		return;
4632 	self->ci_need_tlbwait = 0;
4633 
4634 	/* If we own the global mailbox, wait for it to drain. */
4635 	mb = &pmap_mbox;
4636 	while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail)
4637 		x86_pause();
4638 
4639 	/* If we own other CPUs' mailboxes, wait for them to drain. */
4640 	mb = &self->ci_pmap_cpu->pc_mbox;
4641 	KASSERT(mb->mb_pointer != &mb->mb_tail);
4642 	while (mb->mb_head != mb->mb_tail)
4643 		x86_pause();
4644 }
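
/*
 * For illustration only: how the two routines above pair up.  A caller
 * queues shootdowns with preemption disabled and then either waits here
 * directly or defers the wait to pmap_update() (see below).  The helper
 * name is invented for this sketch.
 */
#if 0	/* illustrative sketch only */
static void
example_invalidate_one_page(struct pmap *pm, vaddr_t va, pt_entry_t opte)
{

	kpreempt_disable();
	pmap_tlb_shootdown(pm, va, 0, opte);	/* local now, remote queued */
	pmap_tlb_shootwait();			/* wait for remote CPUs */
	kpreempt_enable();
}
#endif	/* illustrative sketch only */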
4645 
4646 /*
4647  * pmap_update: process deferred invalidations
4648  */
4649 
4650 void
4651 pmap_update(struct pmap *pmap)
4652 {
4653 	struct vm_page *ptp, *empty_ptps;
4654 	struct pmap_page *pp;
4655 	lwp_t *l;
4656 
4657 	/*
4658 	 * if we have torn down this pmap, invalidate non-global TLB
4659 	 * entries on any processors using it.
4660 	 */
4661 	l = curlwp;
4662 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4663 		l->l_md.md_gc_pmap = NULL;
4664 		KPREEMPT_DISABLE(l);
4665 		pmap_tlb_shootdown(pmap, -1, -1, 0);
4666 		KPREEMPT_ENABLE(l);
4667 	}
4668 
4669 	/*
4670 	 * wait for tlb shootdowns to complete before returning control
4671 	 * to the caller.
4672 	 */
4673 	kpreempt_disable();
4674 	pmap_tlb_shootwait();
4675 	kpreempt_enable();
4676 
4677 	/*
4678 	 * now that shootdowns are complete, process deferred frees,
4679 	 * but not from interrupt context.
4680 	 */
4681 	if (l->l_md.md_gc_ptp != NULL) {
4682 		KASSERT((l->l_pflag & LP_INTR) == 0);
4683 		if (cpu_intr_p()) {
4684 			return;
4685 		}
4686 
4687 		empty_ptps = l->l_md.md_gc_ptp;
4688 		l->l_md.md_gc_ptp = NULL;
4689 
4690 		while ((ptp = empty_ptps) != NULL) {
4691 			ptp->flags |= PG_ZERO;
4692 			pp = VM_PAGE_TO_PP(ptp);
4693 			empty_ptps = pp->pp_link;
4694 			LIST_INIT(&pp->pp_head.pvh_list);
4695 			uvm_pagefree(ptp);
4696 		}
4697 	}
4698 }
4699 
4700 #if PTP_LEVELS > 4
4701 #error "Unsupported number of page table mappings"
4702 #endif
4703 
4704 paddr_t
4705 pmap_init_tmp_pgtbl(paddr_t pg)
4706 {
4707 	static bool maps_loaded;
4708 	static const paddr_t x86_tmp_pml_paddr[] = {
4709 	    4 * PAGE_SIZE,
4710 	    5 * PAGE_SIZE,
4711 	    6 * PAGE_SIZE,
4712 	    7 * PAGE_SIZE
4713 	};
4714 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4715 
4716 	pd_entry_t *tmp_pml, *kernel_pml;
4717 
4718 	int level;
4719 
4720 	if (!maps_loaded) {
4721 		for (level = 0; level < PTP_LEVELS; ++level) {
4722 			x86_tmp_pml_vaddr[level] =
4723 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4724 			    UVM_KMF_VAONLY);
4725 
4726 			if (x86_tmp_pml_vaddr[level] == 0)
4727 				panic("mapping of real mode PML failed\n");
4728 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4729 			    x86_tmp_pml_paddr[level],
4730 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4731 			pmap_update(pmap_kernel());
4732 		}
4733 		maps_loaded = true;
4734 	}
4735 
4736 	/* Zero levels 1-3 */
4737 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4738 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4739 		memset(tmp_pml, 0, PAGE_SIZE);
4740 	}
4741 
4742 	/* Copy PML4 */
4743 	kernel_pml = pmap_kernel()->pm_pdir;
4744 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4745 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4746 
4747 #ifdef PAE
4748 	/*
4749 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4750 	 * last entries are unlikely to be used for temporary mappings.
4751 	 * 508: maps 0->1GB (userland)
4752 	 * 509: unused
4753 	 * 510: unused
4754 	 * 511: maps 3->4GB (kernel)
4755 	 */
4756 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4757 	tmp_pml[509] = 0;
4758 	tmp_pml[510] = 0;
4759 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(),PDIR_SLOT_KERN) | PG_V;
4760 #endif
4761 
4762 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4763 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4764 
4765 		tmp_pml[pl_i(pg, level + 1)] =
4766 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4767 	}
4768 
4769 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4770 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4771 
4772 #ifdef PAE
4773 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4774 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4775 #endif
4776 
4777 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4778 }
4779